# WORD2VEC NOTEBOOK

#### Josh Gen (jdg9vr@virginia.edu) DS 5001 Spring 2023

In [2]:
import pandas as pd
import numpy as np
from gensim.models import word2vec
from sklearn.manifold import TSNE
import plotly.express as px

In [3]:
TOKENS = pd.read_csv('./data/TOKENS.csv', index_col=['company_id', 'link_num', 'sent_num', 'token_num'])
VOCAB = pd.read_csv('./data/VOCAB.csv', index_col='term_str')

In [6]:
test = TOKENS.loc[(TOKENS['pos'].str.startswith(('VB', 'NN')))]

# gensim format for word2vec
BAG = ['company_id'] # bag is sent here

CORPUS_w2v = test\
    .groupby(BAG)\
    .term_str.apply(lambda  x:  x.tolist())\
    .reset_index()['term_str'].tolist()
CORPUS_w2v = [doc for doc in CORPUS_w2v if len(doc) > 1] # Lose single word docs

rowling_w2v_params = dict(window = 2,
                          vector_size = 256,
                          min_count = 80)

rowling_model = word2vec.Word2Vec(CORPUS_w2v, **rowling_w2v_params) # by chaps

In [16]:
rowling_model.save('./visualization_data/rowling_model.model')

In [7]:
rowling_model

<gensim.models.word2vec.Word2Vec at 0x7f95cba87730>

In [8]:
# get coords
rowling_coords = pd.DataFrame(
    dict(
        vector = [rowling_model.wv.get_vector(w) for w in rowling_model.wv.index_to_key], 
        term_str = list(rowling_model.wv.index_to_key)
    )).set_index('term_str')

# model building - set new model params if you want
learning_rate = 200
perplexity = 20
n_comps = 2
init = 'random'
n_iter = 1000
rand_state = 42

tsne_engine = TSNE(perplexity=perplexity, n_components=n_comps, init=init, n_iter=n_iter, random_state=rand_state)
rowling_tsne_model = tsne_engine.fit_transform(np.asarray(rowling_coords.vector.to_list()))

rowling_coords['x'] =rowling_tsne_model[:,0]
rowling_coords['y'] = rowling_tsne_model[:,1]

VOCAB_tsne = pd.merge(VOCAB, rowling_coords, left_index=True, right_index=True)

# for visualization, drop stopwords
VOCAB_tsne = VOCAB_tsne[VOCAB_tsne.stop == 0]

In [9]:
rowling_coords

Unnamed: 0_level_0,vector,x,y
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
quality,"[-0.0146646295, -0.13863765, 0.021005685, 0.09...",-8.333534,4.761418
casting,"[-0.036117807, -0.13921943, 0.010448922, 0.085...",-16.346945,0.786622
products,"[-0.005714393, -0.13743775, 0.020179514, 0.083...",2.973101,-2.577197
castings,"[-0.02907663, -0.13700709, 0.011769329, 0.0889...",-16.012167,0.414420
aluminum,"[-0.036465768, -0.12206493, 0.017804531, 0.077...",-19.450926,0.306172
...,...,...,...
powder,"[-0.008735385, -0.14108025, 0.024057467, 0.094...",-6.914814,0.210390
touch,"[-0.007386874, -0.12329233, 0.030975277, 0.090...",5.920400,6.849392
titanium,"[-0.009455427, -0.14131553, 0.019787958, 0.098...",-12.157339,-6.601217
resistance,"[-0.015976345, -0.13784921, 0.028640041, 0.094...",-12.025945,-5.590615


In [55]:
VOCAB_tsne.to_csv('./data/VOCAB_tsne.csv')

In [50]:
VOCAB_tsne.head()

Unnamed: 0_level_0,n,p,i,n_chars,max_pos,n_pos,cat_pos,stop,dfidf,mean_tfidf,vector,x,y
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
quality,747,0.009743,6.681354,7,NN,2,"{'NNP', 'NN'}",0,10.947234,0.019094,"[-0.047723144, -0.14348537, 0.006091879, 0.067...",-6.041846,5.582057
casting,721,0.009404,6.732463,7,NNP,2,"{'NN', 'NNP'}",0,23.3636,0.059843,"[-0.070645384, -0.14479819, -0.0047444934, 0.0...",-13.957783,6.768412
products,668,0.008713,6.842614,8,NNS,3,"{'NNS', 'NNP', 'NNPS'}",0,14.515279,0.022575,"[-0.041239113, -0.14903277, 0.010228972, 0.065...",0.773703,1.298489
castings,510,0.006652,7.231965,8,NNS,3,"{'NNS', 'NNPS', 'NNP'}",0,23.789373,0.052503,"[-0.06112058, -0.15033749, -0.0015458427, 0.06...",-14.461075,3.122146
aluminum,476,0.006209,7.3315,8,NNP,2,"{'NN', 'NNP'}",0,23.674322,0.042434,"[-0.06906438, -0.13637207, 0.003292611, 0.0593...",-15.091912,7.615201


In [29]:
# plot rowling
px.scatter(VOCAB_tsne.reset_index(), 'x', 'y', title = 'Rowling tSNE',
           text='term_str',  
           hover_name='term_str',          
           height=1000).update_traces(
                mode='markers+text', 
                textfont=dict(color='black', size=14, family='Arial'),
                textposition='top center')

In [10]:
def complete_analogy(A, B, C, model, n=2):
    cols = ['term', 'sim']
    return pd.DataFrame(model.wv.most_similar(positive=[B, C], negative=[A])[0:n], columns=cols)

def get_most_similar(positive, model, negative=None):
    return pd.DataFrame(model.wv.most_similar(positive, negative), columns=['term', 'sim'])

In [14]:
complete_analogy('metal', 'machines', 'cookies', rowling_model, 5)

Unnamed: 0,term,sim
0,use,0.991016
1,site,0.989835
2,information,0.987346
3,website,0.987052
4,web,0.986639


In [33]:
# complete_analogy('WORD1', 'WORD2', 'WORD3', rowling_model, 5)

In [42]:
get_most_similar('castings', rowling_model)

Unnamed: 0,term,sim
0,cast,0.998325
1,mold,0.998114
2,process,0.997893
3,molding,0.997855
4,metal,0.997711
5,core,0.997698
6,machines,0.997623
7,pressure,0.997437
8,casting,0.997353
9,parts,0.997297


In [53]:
get_most_similar('cookies', rowling_model)

Unnamed: 0,term,sim
0,use,0.994904
1,site,0.991266
2,data,0.990709
3,web,0.990343
4,website,0.989468
5,information,0.988451
6,terms,0.985402
7,privacy,0.980779
8,learn,0.971127
9,policy,0.965286


In [54]:
get_most_similar('policy', rowling_model)

Unnamed: 0,term,sim
0,please,0.996926
1,rights,0.99671
2,privacy,0.99601
3,service,0.995854
4,services,0.995512
5,info,0.99543
6,links,0.995226
7,learn,0.994923
8,form,0.994366
9,contact,0.993916
