In [1]:
from gensim.models import KeyedVectors
import pandas as pd
import re
from sklearn.metrics.pairwise import cosine_similarity

# Directory prefix (note the trailing slash) prepended to the data file names
# that the cells below read and write.
PATH = 'data/'
# Directory prefix (note the trailing slash) prepended to the pretrained
# embedding file name.
EMB = 'embeddings/'

# W2V Evaluation on RG65

In [2]:
# Read the score table and discard the 'word_1_num' / 'word_2_num' columns,
# which the rest of the notebook does not use.
scores_path = PATH+'score_df.csv'
df = pd.read_csv(scores_path).drop(columns=['word_1_num', 'word_2_num'])
df.head(3)

Unnamed: 0,word_1,word_2,human_judged,pure_bigrams,ppmi,pca_10,pca_100,pca_300
0,cord,smile,0.02,0.170389,0.078244,0.444295,0.051424,0.136932
1,rooster,voyage,0.04,0.0,0.0,0.656314,0.021271,-0.07066
2,noon,string,0.04,0.285714,0.030978,0.909385,-0.008927,0.052625


In [3]:
def compute_cos_sim(w1, w2):
    """Return the cosine similarity of two vectors as a single scalar.

    Both inputs are reshaped to one-row matrices because `cosine_similarity`
    works on 2-D inputs; the only entry of the resulting 1x1 matrix is returned.
    """
    row_1 = w1.reshape(1, -1)
    row_2 = w2.reshape(1, -1)
    sim_matrix = cosine_similarity(row_1, row_2)
    return sim_matrix[0][0]

In [4]:
# Load the pretrained vectors stored in word2vec binary format under EMB.
w2v_file = EMB+'GoogleNews-vectors-negative300.bin.gz'
w2v_model = KeyedVectors.load_word2vec_format(w2v_file, binary=True)

In [5]:
def _w2v_pair_similarity(row):
    """Cosine similarity between the word2vec vectors of a row's two words."""
    first_vec = w2v_model[row['word_1']]
    second_vec = w2v_model[row['word_2']]
    return compute_cos_sim(first_vec, second_vec)


# One similarity per word pair, stored next to the other scoring methods.
df['w2w'] = df.apply(_w2v_pair_similarity, axis=1)
df.head(3)

Unnamed: 0,word_1,word_2,human_judged,pure_bigrams,ppmi,pca_10,pca_100,pca_300,w2w
0,cord,smile,0.02,0.170389,0.078244,0.444295,0.051424,0.136932,0.018116
1,rooster,voyage,0.04,0.0,0.0,0.656314,0.021271,-0.07066,0.062758
2,noon,string,0.04,0.285714,0.030978,0.909385,-0.008927,0.052625,0.021655


In [6]:
# Correlate only the numeric score columns: word_1 / word_2 hold strings, and
# pandas >= 2.0 raises on non-numeric columns instead of silently skipping them.
numeric_scores = df.select_dtypes(include='number')
corr = numeric_scores.corr(method='pearson').round(3)
# Pearson correlation of each scoring method with the human ratings, best first.
corr_with_humans = corr['human_judged'].drop('human_judged').sort_values(ascending=False)
corr_with_humans

w2w             0.772
pca_100         0.407
pca_300         0.347
pca_10          0.180
ppmi            0.093
pure_bigrams    0.074
Name: human_judged, dtype: float64

# Semantic / Syntactic analogy test - Preprocessing

In [7]:
def _drop_section_headers(rows):
    """Remove the rows whose first field is ':' and renumber the rest from 0."""
    return rows[rows['word_00'] != ':'].reset_index(drop=True)


# Read the space-separated analogy quadruples, skipping the first two lines of the file.
data = pd.read_csv(PATH+'word-test.v1.txt', sep=" ", header=None, skiprows=2)
data.columns = ['word_00', 'word_01', 'word_10', 'word_11']

# The row naming the 'gram1-adjective-to-adverb' section is the boundary:
# rows before it form the semantic set, rows after it the syntactic set.
split_ind = data.loc[data['word_01'] == 'gram1-adjective-to-adverb'].index[0]
sem_analogy = _drop_section_headers(data[:split_ind])
synt_analogy = _drop_section_headers(data[split_ind+1:])

data.head(3)

Unnamed: 0,word_00,word_01,word_10,word_11
0,Athens,Greece,Baghdad,Iraq
1,Athens,Greece,Bangkok,Thailand
2,Athens,Greece,Beijing,China


In [8]:
# Load the LSA vectors (word2vec format; `binary` is left at its default).
lsa_model = KeyedVectors.load_word2vec_format(PATH+"lsa.txt")

# gensim >= 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`;
# try the new attribute first and fall back to the old one on older installs.
try:
    lsa_vocab = list(lsa_model.key_to_index)
except AttributeError:
    lsa_vocab = list(lsa_model.vocab.keys())

In [9]:
def _keep_rows_in_vocab(analogies, vocab):
    """Return only the rows whose every word (lower-cased) is in `vocab`.

    Parameters
    ----------
    analogies : pd.DataFrame
        Frame of string columns, one analogy question per row.
    vocab : iterable of str
        Known words; converted to a set so each lookup is O(1).

    Returns
    -------
    pd.DataFrame
        Filtered copy with a fresh 0..n-1 index. Missing values count as
        out-of-vocabulary rather than raising.
    """
    known_words = set(vocab)
    in_vocab = analogies.apply(lambda col: col.str.lower().isin(known_words))
    return analogies[in_vocab.all(axis=1)].reset_index(drop=True)


# The previous loop only rebound its loop variable `df`, so neither frame was
# actually filtered (and the RG65 `df` from the first section was overwritten).
# Assign the filtered results back to the names the next cell writes to disk.
synt_analogy = _keep_rows_in_vocab(synt_analogy, lsa_vocab)
sem_analogy = _keep_rows_in_vocab(sem_analogy, lsa_vocab)

In [10]:
def _write_analogy_file(quadruples, filename):
    """Write `quadruples` to PATH+filename under a single ': main' header line."""
    # The `with` block closes the file on exit, so no explicit close is needed.
    with open(PATH+filename, 'w') as out_file:
        out_file.write(': main\n' + quadruples.to_string(header=False, index=False))


_write_analogy_file(synt_analogy, 'synt_analogy.txt')
_write_analogy_file(sem_analogy, 'sem_analogy.txt')

# Semantic / Syntactic analogy test - Evaluation

In [11]:
# Keep only the first element of each evaluation result (the overall score,
# per gensim's documentation of `evaluate_word_analogies`).
sem_lsa = lsa_model.evaluate_word_analogies(PATH+'sem_analogy.txt')[0]
synt_lsa = lsa_model.evaluate_word_analogies(PATH+'synt_analogy.txt')[0]

# The word2vec scores were computed on another machine (not enough RAM on this
# one) with the calls below, and the results are hard-coded here:
# sem_w2v = w2v_model.evaluate_word_analogies('sem_analogy.txt')[0]
# synt_w2v = w2v_model.evaluate_word_analogies('synt_analogy.txt')[0]
synt_w2v = 0.7390163
sem_w2v = 0.6800925

In [12]:
# Analogy scores for both models; the trailing "\n" on the second line leaves
# a blank line between the W2V and LSA groups.
report_lines = (
    f"W2V - Semantic analogy test: {sem_w2v:.4f}",
    f"W2V - Syntactic analogy test: {synt_w2v:.4f}\n",
    f"LSA - Semantic analogy test: {sem_lsa:.4f}",
    f"LSA - Syntactic analogy test: {synt_lsa:.4f}",
)
for line in report_lines:
    print(line)

W2V - Semantic analogy test: 0.6801
W2V - Syntactic analogy test: 0.7390

LSA - Semantic analogy test: 0.1739
LSA - Syntactic analogy test: 0.0934
