In [None]:
import os
import sqlite3
from datetime import datetime

import gensim
import pandas as pd
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.test.utils import datapath

from params import tokenized_sents_file, embedding_results_file, chosen_wv_model, wvdf_file, binary_wv_file

## Setup

In [None]:
# Directory prefix for the trained models; the trailing slash matters because
# file names are appended to it by plain string concatenation.
model_path = "./models/"

## Train and test embedding models

In [None]:
# Hyper-parameter grid: every window size is combined with every vector size.
# Settings not listed here stay at the gensim defaults, i.e. CBOW and
# negative sampling (with negative=5).
window_size = [
    5, 7, 10,
    15, 25, 50,
]
vector_size = [
    200, 250, 300,
]

In [None]:
# Grid search: for every (window, vector size) combination train a Word2Vec
# model, save it to disk, and score it on two benchmark files that ship with
# gensim. The four lists below grow in lock-step (one entry per trained model).
r_window = []
r_vector = []
r_google = []
r_wordsim = []

# Make sure the output directory exists before any training starts, so that
# model.save() cannot fail on a missing folder after a long training run.
os.makedirs(model_path, exist_ok=True)

for window in window_size:
    for vector in vector_size:
        name = f"gensim_model_window{window}_vector{vector}"
        print(f"Starting with {name} at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        model = Word2Vec(corpus_file=tokenized_sents_file, vector_size=vector, window=window, epochs=10, min_count=15)

        # Persist the model immediately: if one of the evaluations below raises,
        # the (expensive) trained model is already safely on disk.
        filename = f"{model_path}{name}"
        model.save(filename)

        # Google analogy benchmark; element [0] of the result is the overall score.
        google_test = model.wv.evaluate_word_analogies(datapath('questions-words.txt'))
        print(google_test[0])
        # WordSim-353 word-pair similarity benchmark; the full result is kept so
        # that individual statistics can be extracted later.
        similarities = model.wv.evaluate_word_pairs(datapath('wordsim353.tsv'))
        print(similarities)
        print()
        print("-"*80)

        r_window.append(window)
        r_vector.append(vector)
        r_google.append(google_test[0])
        r_wordsim.append(similarities)

print(f"Finished at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

In [None]:
# Collect the per-model evaluation results in one table (one row per trained model).
results = pd.DataFrame(
    {
        'Window': r_window,
        'Vector': r_vector,
        'Google': r_google,
        'Wordsim': r_wordsim,
    }
)

# Each 'Wordsim' entry is the 3-part result of evaluate_word_pairs; its second
# part is the Spearman result, whose first element is the correlation itself.
results['Sp_corr'] = [spearman[0] for _, spearman, _ in results['Wordsim']]

# Store the comparison table without the positional index column.
results.to_csv(embedding_results_file, index=False)

In [None]:
# Rank the configurations by their WordSim-353 Spearman correlation, best first.
# WordSim-353 is preferred over the Google analogy results here, following:
# https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#evaluating
results.sort_values(by='Sp_corr', ascending=False)

## Save chosen model in matrix format

For use with CMD in R.

In [None]:
# Load the selected model and keep only its word vectors.
# The file is expected to hold a full Word2Vec model (the `.wv` attribute is
# read from it; presumably it is one of the models saved by the training loop),
# so it is loaded with Word2Vec.load, the loader that matches Word2Vec.save and
# verifies the type of the loaded object, rather than with KeyedVectors.load.
wv = Word2Vec.load(chosen_wv_model).wv

In [None]:
# Embedding matrix as a DataFrame: one row per vocabulary entry, one column
# per embedding dimension. Row labels come from index_to_key, the list whose
# position i names row i of wv.vectors, instead of relying on the iteration
# order of the key_to_index dict.
wvdf = pd.DataFrame(wv.vectors, index=wv.index_to_key)
# Quick sanity check: the five entries with the largest value in dimension 0.
wvdf.sort_values(0, ascending=False).head()

In [None]:
# Write the labelled embedding matrix to CSV (row labels are kept as the first column).
wvdf.to_csv(path_or_buf=wvdf_file)

In [None]:
# Export the vectors in the binary word2vec format; this file is what gets
# read into R by the word2vec library.
wv.save_word2vec_format(fname=binary_wv_file, binary=True)

## Inspect model

In [None]:
# Seed terms for the sexism topic; list the 25 vocabulary entries most similar
# to these terms taken together.
sexism_seed_terms = [
    "metoo",
    "sexism",
    "sexist",
    "sexual_harassment",
    "misogyny",
    "patriarchy",
    "sexualization",
    "sjws",
    "rape_culture",
    "toxic_masculinity",
    "harassing",
]
wv.most_similar(positive=sexism_seed_terms, topn=25)

In [None]:
# Seed terms for the racism topic; list the 25 vocabulary entries most similar
# to these terms taken together.
racism_seed_terms = [
    "blm",
    "racism",
    "racist",
    "racists",
    "african_americans",
    "racial",
    "segregation",
    "systemic_racism",
    "police_brutality",
    "white_supremacy",
    "institutional_racism",
    "race_relations",
    "bigoted",
]
wv.most_similar(positive=racism_seed_terms, topn=25)

In [None]:
# Nearest neighbours of a single query word (top 25).
wv.most_similar(positive=["discrimination"], topn=25)