In [1]:
import os
import sqlite3
from datetime import datetime

import gensim
import pandas as pd
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.test.utils import datapath

from params import tokenized_sents_file, embedding_results_file, chosen_wv_model, wvdf_file, binary_wv_file

## Setup

In [2]:
# Directory prefix under which every trained model file is written.
model_path = "./models/"

## Train and test embedding models

In [3]:
# Hyperparameter grid: every window size is combined with every vector size.
# The remaining architecture settings are left at the gensim defaults:
# CBOW and negative sampling (negative=5).
window_size = [
    5,
    7,
    10,
    15,
    25,
    50,
]
vector_size = [
    200,
    250,
    300,
]

In [5]:
# Grid search: train one Word2Vec model per (window, vector size) pair, score
# it on the two benchmark files shipped with gensim, and save it to disk.
# The four r_* lists are parallel (one entry per trained model); a later cell
# assembles them into the results table.
r_window = []
r_vector = []
r_google = []
r_wordsim = []

# Make sure the output directory exists before any training starts, so that
# model.save() cannot fail on a missing folder after a long training run.
os.makedirs(model_path, exist_ok=True)

for window in window_size:
    for vector in vector_size:
        name = f"gensim_model_window{window}_vector{vector}"
        print(f"Starting with {name} at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        model = Word2Vec(
            corpus_file=tokenized_sents_file,
            vector_size=vector,
            window=window,
            epochs=10,
            min_count=15,
        )

        # Analogy benchmark: element 0 of the result is the overall accuracy.
        google_test = model.wv.evaluate_word_analogies(datapath('questions-words.txt'))
        print(google_test[0])
        # Word-pair similarity benchmark: (Pearson result, Spearman result, OOV ratio).
        similarities = model.wv.evaluate_word_pairs(datapath('wordsim353.tsv'))
        print(similarities)
        print()
        print("-"*80)

        r_window.append(window)
        r_vector.append(vector)
        r_google.append(google_test[0])
        r_wordsim.append(similarities)

        # os.path.join works whether or not model_path ends with a separator.
        filename = os.path.join(model_path, name)
        model.save(filename)

print(f"Finished at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

Starting with gensim_model_window5_vector200 at 2022-03-16 13:31:44
0.4493013256897169
((0.5095863920553377, 1.541927184916423e-23), SpearmanrResult(correlation=0.5244258665600612, pvalue=4.585531883774589e-25), 5.099150141643059)

--------------------------------------------------------------------------------
Starting with gensim_model_window5_vector250 at 2022-03-16 13:43:42
0.4608563238982444
((0.5150148119701323, 4.347636749126706e-24), SpearmanrResult(correlation=0.5309885553611323, pvalue=9.160871165984586e-26), 5.099150141643059)

--------------------------------------------------------------------------------
Starting with gensim_model_window5_vector300 at 2022-03-16 13:57:49
0.46820136151916875
((0.5203568645293217, 1.223163290123219e-24), SpearmanrResult(correlation=0.5369358324886399, pvalue=2.064246661031938e-26), 5.099150141643059)

--------------------------------------------------------------------------------
Starting with gensim_model_window7_vector200 at 2022-03-16 1

In [19]:
# Assemble the grid-search scores into one table, one row per trained model.
# Each Wordsim entry is the full word-pair evaluation result; its [1][0]
# element (the Spearman correlation) also gets a column of its own so the
# table can be ranked by it.
spearman_corr = [pair_eval[1][0] for pair_eval in r_wordsim]
results = pd.DataFrame(
    {
        'Window': r_window,
        'Vector': r_vector,
        'Google': r_google,
        'Wordsim': r_wordsim,
        'Sp_corr': spearman_corr,
    }
)
results.to_csv(embedding_results_file, index=False)

In [22]:
# Rank the models by their WordSim-353 Spearman correlation, best first.
# WordSim-353 is preferred here over the Google analogy score; rationale given by the author:
# https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#evaluating
results.sort_values(by='Sp_corr', ascending=False)

Unnamed: 0,Window,Vector,Google,Wordsim,Sp_corr
14,25,300,0.459334,"((0.550451894211164, 6.241349046112461e-28), (...",0.568191
13,25,250,0.455751,"((0.5498585899604905, 7.301841576732996e-28), ...",0.565221
17,50,300,0.445091,"((0.5474191105066029, 1.3875124948537904e-27),...",0.560952
16,50,250,0.440971,"((0.5441118754925425, 3.285423012347421e-27), ...",0.555742
10,15,250,0.457363,"((0.5422540800257911, 5.3096794416499524e-27),...",0.55568
11,15,300,0.465872,"((0.5405432047799502, 8.23978718435479e-27), (...",0.555651
8,10,300,0.465425,"((0.535658226931238, 2.850177585865991e-26), (...",0.555303
9,15,200,0.451989,"((0.5391037080625402, 1.1902784549797487e-26),...",0.553484
12,25,200,0.444106,"((0.5394525150252026, 1.0889650634675925e-26),...",0.552079
7,10,250,0.460856,"((0.5303326582452582, 1.0777793999717055e-25),...",0.550365


## Save chosen model in matrix format

For use with CMD in R.

In [3]:
# Load the chosen model and keep only its word vectors.
# The stored object exposes `.wv`, i.e. it is a full model rather than a bare
# KeyedVectors (presumably one of the models saved by the training loop —
# confirm against params). Word2Vec.load is the matching loader and raises if
# the file does not contain a Word2Vec-compatible model.
wv = Word2Vec.load(chosen_wv_model).wv

In [26]:
# One row per vocabulary entry, one column per embedding dimension.
# index_to_key is the list in which position i names row i of wv.vectors, so
# the labels are aligned with the matrix by construction; iterating the
# key_to_index dict only gives the same order if its insertion order happens
# to match the row order.
wvdf = pd.DataFrame(wv.vectors, index=wv.index_to_key)
# Preview: the five words with the largest value in dimension 0.
wvdf.sort_values(0, ascending=False).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
chances,6.190973,-0.884685,0.1309,-3.597012,3.909777,1.380001,-1.876168,-3.580688,1.85867,-0.852451,...,-1.233898,-1.437529,1.385598,-0.891374,-2.376446,1.635361,-1.802976,0.240714,0.67824,0.609865
studios,5.800575,1.111984,0.280563,3.162533,1.477707,-0.995791,-1.747026,0.338785,-0.690987,0.305923,...,-3.896979,1.206651,-0.329749,-0.319353,3.89425,0.368445,-2.665123,0.717755,-3.042337,-0.149493
risks,5.584486,-2.027174,-0.574617,-3.894549,1.678902,-0.447972,-4.527171,-2.228954,1.6941,-3.295694,...,-0.202225,-2.852139,-1.066723,-4.874702,-1.594602,-0.594093,-4.091599,0.302917,0.890025,0.733621
years,5.546257,-2.756695,0.710377,-2.220256,0.251612,1.316917,0.209057,-1.259738,3.860453,-3.943206,...,-2.744987,0.021158,-0.326545,-0.433087,-0.302876,4.104189,-0.298957,0.943215,-2.772615,-1.226133
critics,5.461033,0.646613,0.782544,1.464622,0.613968,2.128228,-3.183236,3.162859,-0.568089,-0.712476,...,-1.038523,2.606002,0.443955,0.672234,3.590186,4.11901,0.932911,-0.603542,-2.493042,2.642019


In [27]:
# Write the word-by-dimension matrix (words as the row index) to CSV.
wvdf.to_csv(path_or_buf=wvdf_file)

In [28]:
# Export the vectors in the binary word2vec format; this file is what gets
# read into R by the word2vec library.
wv.save_word2vec_format(
    fname=binary_wv_file,
    binary=True,
)

## Inspect model

In [73]:
# Seed vocabulary for the sexism / #MeToo concept; show its 25 nearest neighbours.
sexism_seed_terms = [
    "metoo",
    "sexism",
    "sexist",
    "sexual_harassment",
    "misogyny",
    "patriarchy",
    "sexualization",
    "sjws",
    "rape_culture",
    "toxic_masculinity",
    "harassing",
]
wv.most_similar(positive=sexism_seed_terms, topn=25)

[('feminism', 0.7308980226516724),
 ('misogynistic', 0.7071645855903625),
 ('feminist', 0.689406156539917),
 ('feminists', 0.6808596849441528),
 ('sexual_assault', 0.6804274320602417),
 ('objectification', 0.6662111282348633),
 ('bigotry', 0.6656292676925659),
 ('social_justice', 0.6590590476989746),
 ('empowerment', 0.648938775062561),
 ('women', 0.6478629112243652),
 ('pedophilia', 0.6424859166145325),
 ('misogynist', 0.6418096423149109),
 ('sjw', 0.6407530903816223),
 ('harassment', 0.639790415763855),
 ('female_empowerment', 0.6394770741462708),
 ('political_correctness', 0.6337777972221375),
 ('masculinity', 0.631800651550293),
 ('discrimination', 0.6309238076210022),
 ('anti-feminist', 0.6297603249549866),
 ('leftist', 0.6270629167556763),
 ('patriarchal', 0.6264849305152893),
 ('misandry', 0.6232162117958069),
 ('male_gaze', 0.619766891002655),
 ('cancel_culture', 0.6184059381484985),
 ('incels', 0.6143620014190674)]

In [85]:
# Seed vocabulary for the racism / BLM concept; show its 25 nearest neighbours.
racism_seed_terms = [
    "blm",
    "racism",
    "racist",
    "racists",
    "african_americans",
    "racial",
    "segregation",
    "systemic_racism",
    "police_brutality",
    "white_supremacy",
    "institutional_racism",
    "race_relations",
    "bigoted",
]
wv.most_similar(positive=racism_seed_terms, topn=25)

[('racism', 0.8096297979354858),
 ('black_americans', 0.7640103101730347),
 ('bigotry', 0.7611280679702759),
 ('minorities', 0.7565397620201111),
 ('african-americans', 0.7545771598815918),
 ('discrimination', 0.7275856137275696),
 ('black_community', 0.7166815996170044),
 ('blacks', 0.7144224643707275),
 ('white_supremacists', 0.714196503162384),
 ('xenophobia', 0.70899498462677),
 ('leftist', 0.7087920308113098),
 ('whites', 0.7047181725502014),
 ('liberals', 0.7023640871047974),
 ('africans', 0.6954412460327148),
 ('socialist', 0.6943147778511047),
 ('right_wing', 0.6939700841903687),
 ('white_liberals', 0.6910309195518494),
 ('oppression', 0.6849381327629089),
 ('colonialism', 0.6834794282913208),
 ('conservatives', 0.6773111820220947),
 ('systemic', 0.6713621616363525),
 ('nationalism', 0.6711985468864441),
 ('white_supremacist', 0.6698637008666992),
 ('kkk', 0.6683135628700256),
 ('sexism', 0.6682968735694885)]

In [86]:
# The 25 nearest neighbours of a single term, for comparison with the seed-list queries.
wv.most_similar(positive="discrimination", topn=25)

[('racism', 0.7057074904441833),
 ('bigotry', 0.6801912784576416),
 ('prejudice', 0.6665652990341187),
 ('segregation', 0.6649627089500427),
 ('sexism', 0.6633039712905884),
 ('minorities', 0.6296154856681824),
 ('racial', 0.6256668567657471),
 ('homophobia', 0.6131706237792969),
 ('oppression', 0.6131154894828796),
 ('institutional_racism', 0.6027073264122009),
 ('black_americans', 0.5986418724060059),
 ('misogyny', 0.5949902534484863),
 ('systemic', 0.590713381767273),
 ('white_supremacy', 0.583132803440094),
 ('inequality', 0.5828868746757507),
 ('african_americans', 0.5782977342605591),
 ('xenophobia', 0.5758996605873108),
 ('police_brutality', 0.574889600276947),
 ('equality', 0.5737311840057373),
 ('societal', 0.5684583783149719),
 ('overt_racism', 0.5674771666526794),
 ('blacks', 0.5662500262260437),
 ('systemic_racism', 0.5615476965904236),
 ('social', 0.555654764175415),
 ('slavery', 0.5534564256668091)]