# Word2Vec Embeddings

In [1]:
import pandas as pd

from gensim.models import Word2Vec
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

Load the cleaned tweets on which the word embeddings will be trained.

In [27]:
# Only the cleaned tweet text is needed to train the embeddings.
col_list = ['CleanedTweet']

# Read every value as text. `keep_default_na=False` stops pandas from parsing empty
# cells (and literal words such as "NA" or "null") as NaN; previously those NaN values
# were stringified by `.apply(str)` into the bogus token "nan". Empty cells now load
# as empty strings, so the row count is unchanged.
data = pd.read_csv('Datasets/cleaned_vaccine_tweets.csv', encoding = 'utf-8', usecols=col_list,
                   dtype = str, keep_default_na=False)
data.head()

Unnamed: 0,CleanedTweet
0,looks like study ga device vaccine treatment l...
1,uga team say vaccine coronavirus
2,draw line wa riding followers vote jess vote a...
3,make carona vaccine make mandatory child get d...
4,exciting news second vaccine trial site


In [28]:
# Build the tokenizer once: it does not depend on the tweet being processed, so
# re-creating it on every iteration only added overhead.
# The pattern r'\w+' keeps runs of word characters and drops punctuation.
rem_tok_punc = RegexpTokenizer(r'\w+')

# One string per tweet, taken from the cleaned-text column.
indv_lines = data['CleanedTweet'].values.tolist()

# One list of word tokens per tweet, in the same order as the rows of `data`.
tweet_data_list = [rem_tok_punc.tokenize(line) for line in indv_lines]

In [29]:
# Sanity check: number of tokenised tweets that will be fed to Word2Vec.
tweet_count = len(tweet_data_list)
print(tweet_count)

195090


In [30]:
# Length of each learned word vector.
embed_dim = 100

# Training options, gathered in one place so they are easy to review:
# the vector size, 4 worker threads, and min_count = 1.
w2v_options = dict(size = embed_dim, workers = 4, min_count = 1)

# Fit the Word2Vec model on the tokenised tweets.
model = Word2Vec(sentences = tweet_data_list, **w2v_options)

In [31]:
# Export the learned vectors in the plain-text word2vec format.
model_file = 'Datasets/Word2Vec_embedding.txt'
model.wv.save_word2vec_format(
    model_file,
    binary=False,
)

# Also save the model object itself; this is the file passed to Word2Vec.load later in the notebook.
model.save("Datasets/Word2Vec.model")

In [32]:
# Restore the previously saved Word2Vec model from disk.
saved_model_path = 'Datasets/Word2Vec.model'
model = Word2Vec.load(saved_model_path)

## Exploring the vectors in the Word2Vec embeddings

In [33]:
# Words whose vectors are closest to the vector for "sad".
model.wv.most_similar(positive=['sad'])

[('awful', 0.7674503326416016),
 ('hate', 0.7516164779663086),
 ('crazy', 0.7159533500671387),
 ('terrible', 0.7071675062179565),
 ('upset', 0.7021864652633667),
 ('sorry', 0.6996557712554932),
 ('realize', 0.6966856718063354),
 ('angry', 0.691771924495697),
 ('ridiculous', 0.6901348233222961),
 ('sadly', 0.6826476454734802)]

In [34]:
# Vector arithmetic on words: queen + man - woman = ?
added_words = ['queen', 'man']
subtracted_words = ['woman']
model.wv.most_similar_cosmul(positive=added_words, negative=subtracted_words)

[('sleepy', 1.179116129875183),
 ('ball', 1.163985013961792),
 ('pack', 1.1633110046386719),
 ('kidding', 1.14608895778656),
 ('protagonist', 1.1370874643325806),
 ('golf', 1.1357206106185913),
 ('busy', 1.1287131309509277),
 ('basement', 1.1282422542572021),
 ('pcb', 1.1275078058242798),
 ('giggles', 1.1238651275634766)]

In [45]:
# Ask the model which of these words fits least well with the others.
candidate_words = ['apple', 'orange', 'banana', 'car']
odd_word = model.wv.doesnt_match(candidate_words)
print(odd_word)

car


## Visualising the word embedding vectors

In [36]:
# Importing bokeh libraries for showing how words of similar context are grouped together
import bokeh.plotting as bp

from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

In [37]:
# All words in the model's vocabulary, in the vocabulary's own iteration order.
VocabKeys = list(model.wv.vocab.keys())


#Defining the chart
output_notebook()
plot_chart = bp.figure(plot_width=700, plot_height=600, title="A Plot of 5000 Word Vectors",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

# Extract the vectors for the first 5000 vocabulary words; each vector has
# embed_dim (100) dimensions, as set when the model was trained.
# Look the words up through `model.wv`: indexing the model object directly
# (`model[w]`) is deprecated in gensim and emits a DeprecationWarning.
word_vectors = [model.wv[w] for w in VocabKeys[:5000]]

  del sys.path[0]


## Reducing dimensionality by converting the vectors to 2d vectors

## TSNE

In [38]:
from sklearn.manifold import TSNE

In [39]:
# Project the selected word vectors onto two dimensions with t-SNE
# (fixed random_state, progress logging switched on).
tsne_model = TSNE(random_state=0, verbose=1, n_components=2)
tsne_w2v = tsne_model.fit_transform(word_vectors)

# Table of 2-D coordinates, with the word each point represents alongside.
tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y']).assign(words=VocabKeys[:5000])

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 5000 samples in 0.021s...
[t-SNE] Computed neighbors for 5000 samples in 4.509s...
[t-SNE] Computed conditional probabilities for sample 1000 / 5000
[t-SNE] Computed conditional probabilities for sample 2000 / 5000
[t-SNE] Computed conditional probabilities for sample 3000 / 5000
[t-SNE] Computed conditional probabilities for sample 4000 / 5000
[t-SNE] Computed conditional probabilities for sample 5000 / 5000
[t-SNE] Mean sigma: 0.132409
[t-SNE] KL divergence after 250 iterations with early exaggeration: 79.972420
[t-SNE] KL divergence after 1000 iterations: 2.211280


In [40]:
# Draw one point per word from the t-SNE coordinates.
plot_chart.scatter(source=tsne_df, x='x', y='y')

# Configure the hover tool so that the word is shown when the pointer is over a point.
hover = plot_chart.select({'type': HoverTool})
hover.tooltips = {"word": "@words"}

show(plot_chart)

## PCA

In [41]:
from sklearn.decomposition import PCA

In [42]:
# Project the same word vectors onto their first two principal components.
pca_model = PCA(random_state=0, n_components=2)
pca_w2v = pca_model.fit_transform(word_vectors)

# Table of 2-D coordinates, with the word each point represents alongside.
pca_df = pd.DataFrame(pca_w2v, columns=['x', 'y'])
pca_df = pca_df.assign(words=VocabKeys[:5000])

In [43]:
# Draw the PCA projection on its own figure. Reusing `plot_chart` would add the PCA
# points on top of the t-SNE points already drawn on it, so the displayed chart would
# mix both projections. The figure settings mirror those used for the t-SNE chart.
pca_chart = bp.figure(plot_width=700, plot_height=600, title="A PCA Plot of 5000 Word Vectors",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

# Corresponding word appears when you hover on the data point.
pca_chart.scatter(x='x', y='y', source=pca_df)
hover = pca_chart.select(dict(type=HoverTool))
hover.tooltips={"word": "@words"}
show(pca_chart)