In [8]:
#Packages
import os

import nltk
import numpy as np
import pandas as pd
import plotly_express as px
from gensim.models import word2vec
from sklearn.manifold import TSNE

path = '/Users/kritchanwong/Downloads/Plato-project-5001-main'
OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']

#Configuring Word2Vec Parameters
PARA = OHCO[:3] # Paragraphs
SENT = OHCO[:4] # Sentences
BAG = PARA

w2v_params = dict(
    window = 5,
    vector_size = 246,
    min_count = 50,
    workers = 4
)

#Importing Data 
LIB = pd.read_csv(path + '/data_out/' + 'lib_plato.csv')
TOKENS = pd.read_csv(path + '/data_out/' + 'token_plato.csv').set_index(OHCO)
VOCAB = pd.read_csv(path + '/data_out/' +'vocab_plato.csv').set_index('term_str')

In [9]:
#Creating Docs
DOCS = TOKENS[~TOKENS.pos.str.match('NNPS?')]\
    .groupby(BAG)\
    .term_str.apply(lambda  x:  x.tolist())\
    .reset_index()['term_str'].tolist()
DOCS = [doc for doc in DOCS if len(doc) > 1] # Lose single word docs


In [10]:
#T-SNE
model = word2vec.Word2Vec(DOCS, **w2v_params)
coords = pd.DataFrame(
    dict(
        vector = [model.wv.get_vector(w) for w in model.wv.key_to_index.values()], 
        term_str = model.wv.index_to_key
    )).set_index('term_str')

tsne_engine = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
tsne_model = tsne_engine.fit_transform(coords.vector.to_list())
coords['x'] = tsne_model[:,0]
coords['y'] = tsne_model[:,1]
if coords.shape[1] == 3:
    coords = coords.merge(VOCAB.reset_index(), on='term_str')
    coords = coords.set_index('term_str')
coords = coords[coords.stop == 0]
px.scatter(coords.reset_index(), 'x', 'y', 
           text='term_str', 
           color='pos_max', 
           hover_name='term_str',          
           size='tfidf_max_sum',
           height=1000).update_traces(
                mode='markers+text', 
                textfont=dict(color='black', size=14, family='Arial'),
                textposition='top center')



In [12]:
#Embedding Tables
W2V = pd.DataFrame(model.wv.get_normed_vectors(), index=model.wv.key_to_index.values())
proper_nouns = 'tommy emily tuppence agnes edmund holmes'.split()
coords2 = coords.loc[~coords.index.isin(proper_nouns), ['vector','stop','tfidf_n_sum','pos_max']]

Unnamed: 0_level_0,vector,stop,tfidf_n_sum,pos_max
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,"[0.60540736, -0.83902884, 0.3435921, 0.1108872...",0,0.000000,","
one,"[0.83924425, -1.2170938, 0.087690875, 0.379492...",0,0.345465,CD
socrates,"[0.8314621, -0.091369726, 0.28118065, -0.02998...",0,0.374749,NN
said,"[0.8598088, -0.2775373, 0.56301934, -0.7726588...",0,0.236581,VB
would,"[-0.38315946, 0.09343991, -0.26592836, -0.4381...",0,0.209604,MD
...,...,...,...,...
courts,"[0.039160714, -0.013886101, -0.04626172, -0.02...",0,0.004437,NN
advise,"[0.10763804, 0.11157992, 0.019233162, 0.015496...",0,0.004437,VB
greatly,"[0.08748324, -0.057662092, 0.07921674, -0.0051...",0,0.004437,RB
pleased,"[0.042512387, -0.1765763, 0.05734038, 0.021338...",0,0.001720,JJ
