In [32]:
import pandas as pd
import numpy as np
from gensim.models import word2vec
from gensim.corpora import Dictionary
from sklearn.manifold import TSNE as tsne
import plotly_express as px
import plotly.io as pio
pio.renderers.default = 'notebook_connected'

# Keyword arguments handed to the Word2Vec constructor further down.
w2v_params = {
    'window': 5,
    'vector_size': 246,
    'min_count': 50,  # THIS LIMITS OUR VOCAB
    'workers': 4,
}
from IPython.display import display


In [33]:
# Token table, indexed by the speech each row belongs to.
TOKENS = pd.read_csv('CORPUS.csv').set_index('speech_id')

# Both term-level tables come back with NaN labels in their 'term_str' index;
# those rows are dropped before anything is joined.
# NOTE(review): presumably read_csv's default NA parsing turns literal terms
# such as "null" or "nan" into missing values -- confirm against the CSVs.
BOW = (
    pd.read_csv('BOW.csv')
    .set_index('term_str')
    .loc[lambda frame: frame.index.notna()]
)

# Attach BOW's tfidf column to the vocabulary by term.
# NOTE(review): if BOW holds several rows per term, this join repeats the
# VOCAB row once per match -- verify that is intended.
VOCAB = (
    pd.read_csv('VOCAB.csv')
    .set_index('term_str')
    .loc[lambda frame: frame.index.notna()]
    .join(BOW['tfidf'])
)


## VOCAB_W2V

In [34]:
# Keep only the first row seen for each term. Row order and the 'term_str'
# index are left as they were; .copy() yields an independent frame so that
# adding columns later does not trigger a chained-assignment warning.
VOCAB = VOCAB.loc[~VOCAB.index.duplicated(keep='first')].copy()

In [35]:
# Count rows per (term, POS tag) pair, spread the tags into columns, and keep
# for each term the tag with the highest count. Assignment aligns on the
# term_str index of VOCAB.
VOCAB['pos_max'] = (
    TOKENS
    .groupby(['term_str', 'pos'])
    .size()
    .unstack('pos')
    .idxmax(axis='columns')
)


In [36]:

# One list of terms per speech_id, built from tokens whose POS tag does not
# start with NNP (the pattern also covers NNPS) and whose term is present.
# Speeches left with a single term are discarded.
docs = [
    terms
    for terms in (
        TOKENS
        .loc[lambda frame: ~frame.pos.str.match('NNPS?')]
        .dropna(subset=['term_str'])
        .groupby('speech_id')
        .term_str
        .apply(pd.Series.tolist)
    )
    if len(terms) > 1  # Lose single word docs
]

# gensim Dictionary built over the same documents.
vocab = Dictionary(docs)


In [37]:
# Fit the word2vec model on the per-speech term lists, using the settings
# collected in w2v_params.
model = word2vec.Word2Vec(sentences=docs, **w2v_params)


In [38]:
def get_vector(row):
    """Look up the word2vec vector for the term that labels ``row``.

    ``row`` only needs a ``name`` attribute, which is the case for the rows
    a DataFrame passes to ``apply(..., axis=1)``.

    Returns whatever ``model.wv[row.name]`` yields, or ``None`` when that
    lookup raises ``KeyError``.
    """
    try:
        return model.wv[row.name]
    except KeyError:
        return None

In [39]:
# Build the term-by-dimension embedding table for every VOCAB term the model
# knows, in VOCAB order.
# A single batched lookup replaces two row-wise apply() passes that created one
# pandas Series per term, which gets slow on a large vocabulary.
known_terms = [term for term in VOCAB.index if term in model.wv.key_to_index]
WV = pd.DataFrame(
    # Fall back to a (0, vector_size) array so an empty overlap still yields
    # a well-formed, empty frame instead of failing inside the lookup.
    model.wv[known_terms] if known_terms else np.empty((0, model.wv.vector_size)),
    index=pd.Index(known_terms, name=VOCAB.index.name),
)


In [40]:
# Settings for the 2-D t-SNE projection; random_state keeps the layout fixed
# between runs.
tsne_kwargs = dict(perplexity=40, n_components=2, init='pca', random_state=23)

# scikit-learn 1.5 renamed the iteration-count argument from n_iter to max_iter
# and later releases drop n_iter entirely, so use whichever name the installed
# estimator accepts.
tsne_iter_kwarg = 'max_iter' if 'max_iter' in tsne().get_params() else 'n_iter'
tsne_kwargs[tsne_iter_kwarg] = 2500

tsne_engine = tsne(**tsne_kwargs)
# One row of 2-D coordinates per row of WV, in the same order.
tsne_model = tsne_engine.fit_transform(WV.to_numpy())


In [41]:
# Label the two t-SNE output columns and re-attach WV's index to the rows.
TSNE = pd.DataFrame(data=tsne_model, index=WV.index, columns=['x', 'y'])


In [42]:
# Add the VOCAB columns to each plotted term. DataFrame.join is a left join by
# default, so every TSNE row is kept and matched on its index.
X = TSNE.join(VOCAB)


In [43]:
# Show the joined table (t-SNE coordinates plus the VOCAB columns) as the cell output.
X

Unnamed: 0_level_0,x,y,n,n_chars,p,i,max_pos,n_pos,cat_pos,stop,stem_porter,stem_snowball,stem_lancaster,max_pos_group,dfidf,tfidf,pos_max
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
01,48.897690,-22.363218,81,2,0.000021,15.532189,CD,1,{'CD'},0,01,01,01,CD,86.035409,0.021525,CD
03,47.563446,-25.568886,53,2,0.000014,16.144118,CD,1,{'CD'},0,03,03,03,CD,90.687760,0.013556,CD
1,-25.897913,43.851585,1674,1,0.000436,11.162955,CD,3,"{'NNP', 'CD', 'NN'}",0,1,1,1,CD,525.369344,0.005546,CD
10,58.060474,-22.678740,467,2,0.000122,13.004760,CD,2,"{'JJ', 'CD'}",0,10,10,10,CD,487.912711,0.022568,CD
100,56.927952,-20.268248,209,3,0.000054,14.164680,CD,1,{'CD'},0,100,100,100,CD,403.385836,0.001995,CD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yourself,-32.489567,30.633022,70,8,0.000018,15.742756,PRP,3,"{'PRP', 'VB', 'NN'}",1,yourself,yourself,yourself,PR,194.638489,0.183816,PRP
yourselves,-32.832424,30.911358,93,10,0.000024,15.332880,NNS,3,"{'VBZ', 'NNS', 'NN'}",1,yourselv,yourselv,yourselv,NN,245.342238,0.068752,NNS
youth,-26.806528,-18.179283,124,5,0.000032,14.917842,NN,5,"{'RB', 'NN', 'NNP', 'VB', 'JJ'}",0,youth,youth,you,NN,303.276978,0.026920,NN
zeal,-32.148338,-13.733883,118,4,0.000031,14.989396,NN,1,{'NN'},0,zeal,zeal,zeal,NN,311.479934,0.028841,NN


## Word2vec t-SNE Plot

In [50]:
# Scatter of the t-SNE coordinates: each term is labelled with its own text,
# coloured by pos_max and sized by tfidf. reset_index() exposes term_str as
# a regular column so it can be referenced by name.
px.scatter(
    X.reset_index(),
    x='x',
    y='y',
    text='term_str',
    color='pos_max',
    hover_name='term_str',
    size='tfidf',
    height=1000,
).update_traces(
    mode='markers+text',
    textfont={'color': 'black', 'size': 6, 'family': 'Arial'},
    textposition='top center',
)