# Get word embeddings using Word2Vec

Based on https://towardsdatascience.com/a-beginners-guide-to-word-embedding-with-gensim-word2vec-model-5970fa56cc92#702d

In [1]:
import pandas as pd
import numpy as np

In [2]:
#pip install --upgrade gensim
from gensim.models import Word2Vec

In [3]:
# Load the pickled corpus and wrap it in a DataFrame; later cells read its
# 'tokens' column.
# NOTE: unpickling executes code stored in the file - only load trusted files.
df = pd.DataFrame(
    data=pd.read_pickle('df_tokenized'),
)

In [4]:
# Preprocessing: Word2Vec is fed the corpus as a list of token lists (one
# inner list per row), so copy the 'tokens' column into a plain Python list.
token_list = list(df['tokens'])

    

In [5]:
# Train the gensim Word2Vec model on the tokenized corpus.
model = Word2Vec(
    token_list,
    sg=1,         # 1 selects the skip-gram training algorithm
    window=3,     # context window size
    min_count=1,  # frequency threshold of 1: no token is dropped as too rare
    workers=3,    # number of training threads
)

In [34]:
# Report the embedding dimensionality straight from the model instead of
# measuring one particular word's vector, which would raise KeyError if that
# word were missing from the vocabulary.
print("size of word vector: ", model.wv.vector_size)

size of word vector:  100


In [6]:
# Cosine similarity between the learned vectors for 'love' and 'like'.
model.wv.similarity('love', 'like')

0.42362207

In [7]:
# Cosine similarity between the learned vectors for 'love' and 'hate'.
model.wv.similarity('love', 'hate')

0.47666293

In [8]:
# Nearest neighbours of 'love' in the embedding space, as (token, similarity)
# pairs; topn=10 spells out the number of neighbours returned.
model.wv.most_similar('love', topn=10)

[('satisfy', 0.7531440258026123),
 ('roni', 0.7423168420791626),
 ('protection', 0.7260222434997559),
 ('forsake', 0.7195085287094116),
 ('everlasting', 0.7191615104675293),
 ('fair', 0.7167520523071289),
 ('behave', 0.7162327766418457),
 ('hungover', 0.7139302492141724),
 ('faithfully', 0.7131261229515076),
 ('deeply', 0.7120652794837952)]

In [9]:
# Vocabulary: flatten every row's token list into one long list, then
# deduplicate it. The final expression displays the number of unique tokens.
tokens = [token for row in df.tokens for token in row]
set_tokens = set(tokens)
len(set_tokens)

16118

In [19]:
#cosine similarity
from numpy.linalg import norm
def cosine_distance(model, word, target_list, num):
    """Rank the words in ``target_list`` by cosine similarity to ``word``.

    Despite the function's name, the scores are cosine *similarities*
    (a larger value means the vectors point in closer directions), not
    distances.

    Args:
        model: Object exposing word vectors through ``model.wv[key]``.
        word: Query word. Entries of ``target_list`` equal to it are skipped.
        target_list: Iterable of candidate words; each one is looked up in
            ``model.wv``. A word that appears several times is scored once.
        num: Upper slice bound applied to the ranked result.

    Returns:
        A list of ``(word, similarity)`` tuples ordered from most to least
        similar, cut down to the first ``num`` entries.
    """
    query_vec = model.wv[word]
    query_norm = norm(query_vec)

    scores = {}
    for candidate in target_list:
        if candidate != word:
            candidate_vec = model.wv[candidate]
            scores[candidate] = np.dot(query_vec, candidate_vec) / (
                query_norm * norm(candidate_vec)
            )

    # dict.items() already yields (word, score) tuples, so the sorted list
    # can be sliced and returned as-is.
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:num]

In [20]:
# Five vocabulary tokens with the highest cosine similarity to 'love',
# computed by hand as a cross-check of model.wv.most_similar.
cosine_distance(model=model, word='love', target_list=set_tokens, num=5)

[('satisfy', 0.753144),
 ('roni', 0.74231684),
 ('protection', 0.72602224),
 ('forsake', 0.7195085),
 ('everlasting', 0.71916157)]

## embedding matrix

- Column count: equal to the embedding dimension (here: 100).
- Row count: equal to the number of unique tokens.

In [40]:
#use word vectors as features
def get_word_vectors(model, set_tokens):
    """Stack the word vectors of ``set_tokens`` into a DataFrame.

    Args:
        model: Object exposing word vectors through ``model.wv[key]``.
        set_tokens: Iterable of tokens to look up.

    Returns:
        A ``pandas.DataFrame`` with one row per token, in the iteration
        order of ``set_tokens``, and one column per vector component. The
        index is the default integer range, so the tokens themselves are
        not stored in the result.
    """
    vectors = [model.wv[token] for token in set_tokens]
    return pd.DataFrame(vectors)
        
# Build the embedding matrix for the whole vocabulary.
df_wv = get_word_vectors(model=model, set_tokens=set_tokens)


In [41]:
# Display the embedding matrix: one row per token, one column per dimension.
df_wv

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.245191,-0.150248,-0.040848,0.264840,0.185056,-0.032839,0.088284,0.186258,-0.112959,-0.218214,...,-0.156383,-0.162462,-0.189303,-0.106040,0.247375,0.066573,-0.156592,0.067085,0.068973,-0.026552
1,-0.134749,0.061137,-0.057324,0.029002,-0.138655,0.137719,-0.252304,0.055107,-0.018259,0.030587,...,-0.012598,0.115326,-0.074627,0.184383,0.442465,-0.085729,-0.077279,-0.359278,0.024115,-0.118048
2,0.175242,0.095735,-0.287328,0.323305,0.106878,0.079734,0.023414,-0.186267,-0.312048,-0.405483,...,-0.173055,0.097655,-0.478746,-0.295390,0.308281,0.223407,-0.090611,-0.339993,-0.279340,0.122755
3,-0.047591,-0.046050,-0.076501,0.083618,-0.035333,0.000125,-0.004801,0.078830,-0.117244,-0.099913,...,-0.064631,-0.011042,-0.041878,-0.008243,0.078960,0.155752,-0.075465,-0.063155,0.018741,-0.034992
4,-0.046839,-0.023255,-0.054347,0.060014,-0.010520,-0.001507,-0.033300,0.039194,-0.021815,-0.049530,...,-0.005347,0.003771,-0.015885,0.022721,0.076593,0.099197,-0.012806,-0.025491,0.023265,-0.010859
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16113,-0.062630,-0.081547,-0.033649,0.155071,0.021743,-0.083295,-0.081437,0.132574,0.011042,-0.151667,...,-0.035177,-0.044614,0.023280,0.074521,0.050393,0.189543,-0.099194,-0.057143,0.140634,-0.025571
16114,-0.020387,-0.044659,-0.035326,0.057878,-0.006151,0.005317,0.012104,0.046041,-0.010458,-0.047270,...,-0.016952,0.000870,-0.006622,0.033283,0.024029,0.077976,-0.044881,-0.024860,0.013554,-0.013609
16115,0.153072,0.058740,-0.040204,0.245315,-0.565223,0.377168,-0.033026,0.203370,-0.612176,0.062836,...,-0.163426,-0.113045,-0.347899,-0.168626,0.221976,0.380825,-0.162185,-0.684901,-0.072673,-0.379267
16116,-0.032165,-0.038052,0.002486,0.068977,0.000602,-0.030664,0.017802,0.027468,0.008319,-0.065953,...,-0.021304,-0.000508,-0.006091,0.032075,0.055027,0.090529,-0.053411,-0.020530,0.042446,0.008992
