In [1]:
import pandas as pd
import numpy as np
from scipy.stats.mstats import spearmanr

In [2]:
# Location of the zipped GloVe embedding file, given relative to the notebook's working directory.
glove_orig = '../data/embeddings/glove.42B.300d.zip'

In [3]:
import pandas as pd
import csv
# One row per token: column 0 becomes the index, the remaining columns hold the vector.
# keep_default_na=False: without it pandas converts any token spelled like one of its
# NA markers ("null", "nan", "NA", ...) into a missing value, so that row could no
# longer be looked up by its word.
# QUOTE_NONE: quote characters are treated as ordinary text rather than field delimiters.
words = pd.read_table(
    glove_orig,
    sep=" ",
    index_col=0,
    header=None,
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)

In [4]:
def vec(w):
    """Look up `w` in the global `words` table and return that row's values as a NumPy array."""
    row = words.loc[w]
    return row.values

In [5]:
def cos_sim(a, b):
    """Cosine similarity of two vectors.

    Computed as the dot product of `a` and `b` divided by the product of their
    Euclidean norms. No guard is applied for zero-norm inputs.
    """
    inner = a.dot(b)
    scale = np.linalg.norm(a) * np.linalg.norm(b)
    return inner / scale

In [6]:
def measure_emb_correlation(df, all_emb_words, P):
    """Compare gold similarity scores with embedding cosine similarities, with and without P.

    Rows of `df` whose `word1` or `word2` is not in `all_emb_words` are dropped.
    For every remaining pair, the cosine similarity of the two vectors returned by
    `vec` is stored in `glove_sim`, and the cosine similarity of the same vectors
    after left-multiplication by `P` is stored in `proj_glove_sim`. The Spearman
    correlation of each new column against `df['similarity']` is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `word1`, `word2` and `similarity`. Not modified.
    all_emb_words : list-like
        Words for which `vec` can return a vector.
    P : numpy.ndarray
        Matrix applied to each word vector via `P.dot(v)`.

    Returns
    -------
    pandas.DataFrame
        A copy of the in-vocabulary rows of `df` with `glove_sim` and
        `proj_glove_sim` added.
    """
    in_vocab = df['word1'].isin(all_emb_words) & df['word2'].isin(all_emb_words)
    # Explicit copy: the columns added below must land on an independent frame,
    # not on a slice of the caller's frame (which triggers SettingWithCopyWarning).
    df = df.loc[in_vocab].copy()

    # Fetch each distinct word's vector once, even if it occurs in many pairs.
    rel_words = set(df['word1']) | set(df['word2'])
    words_glove_vec = {word: vec(word) for word in rel_words}

    glove_sim = []
    proj_glove_sim = []
    for w1, w2 in zip(df['word1'], df['word2']):
        v1, v2 = words_glove_vec[w1], words_glove_vec[w2]
        glove_sim.append(cos_sim(v1, v2))
        # The vectors are 1-D, so no transpose is needed before applying P.
        proj_glove_sim.append(cos_sim(P.dot(v1), P.dot(v2)))

    # Plain arrays are assigned positionally, matching the iteration order above.
    df['glove_sim'] = np.array(glove_sim)
    df['proj_glove_sim'] = np.array(proj_glove_sim)

    print('glove:', spearmanr(df['similarity'], df['glove_sim']))
    print('glove -P:', spearmanr(df['similarity'], df['proj_glove_sim']))
    return df

In [7]:
# Every index label of `words` as a plain Python list; used below as the vocabulary
# argument of measure_emb_correlation.
all_glove_words = words.index.tolist()

In [8]:
# np.load on an .npz returns a lazily-read archive that holds the file open;
# the context manager closes it as soon as the three arrays have been read.
with np.load("../data/saved_models/general/USV.npz") as saved_model:
    u = saved_model['u']
    s = saved_model['s']
    vh = saved_model['vh']

# Discard the first two columns of u and form P = u_r u_r^T from the rest.
# NOTE(review): if u's columns are orthonormal (as for an SVD factor), P is the
# orthogonal projector onto span(u_r) — confirm against how USV.npz was produced.
u_r = u[:, 2:]
P = u_r @ u_r.T

# SimLex-999

Data from: https://fh295.github.io/simlex.html

@article{hill2015simlex,
  title={Simlex-999: Evaluating semantic models with (genuine) similarity estimation},
  author={Hill, Felix and Reichart, Roi and Korhonen, Anna},
  journal={Computational Linguistics},
  volume={41},
  number={4},
  pages={665--695},
  year={2015},
  publisher={MIT Press}
}

In [13]:
# Tab-separated file whose first line supplies the column names; the path is relative
# to the current working directory.
df_simlex = pd.read_csv('SimLex-999.txt', sep='\t')

In [14]:
# measure_emb_correlation reads the gold score from a column called "similarity".
df_simlex = df_simlex.rename(columns={"SimLex999": "similarity"})

In [15]:
# Show the first rows to check the columns after the rename.
df_simlex.head()

Unnamed: 0,word1,word2,POS,similarity,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex)
0,old,new,A,1.58,2.72,2.81,2,7.25,1,0.41
1,smart,intelligent,A,9.2,1.75,2.46,1,7.11,1,0.67
2,hard,difficult,A,8.77,3.76,2.21,2,5.94,1,1.19
3,happy,cheerful,A,9.55,2.56,2.34,1,5.85,1,2.18
4,hard,easy,A,0.95,3.76,2.07,2,5.82,1,0.93


In [16]:
# Rebinds df_simlex to the returned frame: only pairs with both words in the vocabulary,
# plus the glove_sim / proj_glove_sim columns. The two Spearman results are printed.
df_simlex = measure_emb_correlation(df_simlex, all_glove_words, P)

glove: SpearmanrResult(correlation=0.3737987757655539, pvalue=1.752960783934323e-34)
glove -P: SpearmanrResult(correlation=0.39089092019034416, pvalue=8.130936653229414e-38)


### WordSim 353

Source: http://alfonseca.org/eng/research/wordsim353.html

@inproceedings{agirre2009study,
  title={A Study on Similarity and Relatedness Using Distributional and WordNet-based Approaches},
  author={Agirre, Eneko and Alfonseca, Enrique and Hall, Keith and Kravalov{\'a}, Jana and Pasca, Marius and Soroa, Aitor},
  booktitle={Proceedings of Human Language Technologies: The 2009 Annual Conference of the North American Chapter of the Association for Computational Linguistics},
  pages={19--27},
  year={2009}
}

In [13]:
# Passing names= makes pandas treat the tab-separated file as headerless and label the
# three columns with the names measure_emb_correlation expects.
df_353_sim = pd.read_csv('wordsim_similarity_goldstandard.txt', sep='\t', names=['word1', 'word2', 'similarity'])

In [14]:
# Rebinds df_353_sim to the in-vocabulary pairs with glove_sim / proj_glove_sim added;
# prints the Spearman correlation for each.
df_353_sim = measure_emb_correlation(df_353_sim, all_glove_words, P)

glove: SpearmanrResult(correlation=0.6953847581116936, pvalue=1.1908298243958436e-29)
glove -P: SpearmanrResult(correlation=0.7002491155403305, pvalue=3.276705177565873e-30)


In [15]:
# The relatedness scores are also stored under the column name "similarity" so the same
# evaluation function can be reused unchanged.
df_353_rel = pd.read_csv('wordsim_relatedness_goldstandard.txt', sep='\t', names=['word1', 'word2', 'similarity'])

In [16]:
# Rebinds df_353_rel to the in-vocabulary pairs with glove_sim / proj_glove_sim added;
# prints the Spearman correlation for each.
df_353_rel = measure_emb_correlation(df_353_rel, all_glove_words, P)

glove: SpearmanrResult(correlation=0.5999016814083603, pvalue=1.1815135039496508e-24)
glove -P: SpearmanrResult(correlation=0.6049124268290367, pvalue=3.830895178481072e-25)


### MTurk-771

Source: http://www2.mta.ac.il/~gideon/mturk771.html

@inproceedings{halawi2012large,
  title={Large-scale learning of word relatedness with constraints},
  author={Halawi, Guy and Dror, Gideon and Gabrilovich, Evgeniy and Koren, Yehuda},
  booktitle={Proceedings of the 18th ACM SIGKDD international conference on Knowledge discovery and data mining},
  pages={1406--1414},
  year={2012}
}

In [17]:
# Comma-separated (pandas default); names= makes pandas treat the file as headerless and
# label the columns with the names measure_emb_correlation expects.
df_mturk = pd.read_csv('MTURK-771.csv', names=['word1', 'word2', 'similarity'])

In [18]:
# Rebinds df_mturk to the in-vocabulary pairs with glove_sim / proj_glove_sim added;
# prints the Spearman correlation for each.
df_mturk = measure_emb_correlation(df_mturk, all_glove_words, P)

glove: SpearmanrResult(correlation=0.684252286603461, pvalue=1.4888186861090302e-107)
glove -P: SpearmanrResult(correlation=0.697744654653516, pvalue=1.6005660022755625e-113)


### PCA

In [9]:
# All embeddings as one NumPy matrix, one row per entry of `words`.
X = words.to_numpy()

In [10]:
# (number of rows in `words`, number of vector columns)
X.shape

(1917494, 300)

In [11]:
from sklearn.decomposition import PCA

# Fit a 195-component PCA on the full embedding matrix X.
# TODO(review): 195 is a magic number here — record why this dimensionality was chosen.
pca = PCA(n_components=195)
pca.fit(X)

PCA(n_components=195)

In [17]:
# Cosine similarity of each SimLex pair after mapping both word vectors through the
# fitted PCA. transform() expects a 2-D input, hence the one-row wrapper and the [0].
pca_sim = []
for first, second in zip(df_simlex['word1'], df_simlex['word2']):
    first_pc = pca.transform([vec(first)])[0]
    second_pc = pca.transform([vec(second)])[0]
    pca_sim.append(cos_sim(first_pc, second_pc))

In [18]:
# Stored as a plain array, so values are matched to rows by position (the order the
# loop over df_simlex produced them), not by index label.
df_simlex['pca_sim'] = np.array(pca_sim)

In [19]:
# Spearman rank correlation between the gold scores and the PCA-space cosine similarities.
spearmanr(df_simlex['similarity'], df_simlex['pca_sim'])

SpearmanrResult(correlation=0.31444164136196745, pvalue=2.3001643227649338e-24)

### Most affected examples

In [20]:
# Size of the change in cosine similarity caused by applying P, ignoring its sign.
df_simlex['abs_diff'] = (df_simlex['glove_sim'] - df_simlex['proj_glove_sim']).abs()

In [21]:
# The 20 pairs whose cosine similarity changed the most when P was applied.
df_simlex.nlargest(20, 'abs_diff')[['word1', 'word2', 'similarity', 'glove_sim', 'proj_glove_sim']]

Unnamed: 0,word1,word2,similarity,glove_sim,proj_glove_sim
446,man,victor,1.9,0.422428,0.30523
330,man,father,4.83,0.614493,0.499174
515,rabbi,minister,7.62,0.353535,0.243533
595,father,god,3.57,0.643352,0.539941
77,happy,young,2.0,0.511416,0.40817
431,god,priest,4.5,0.498079,0.400399
279,man,uncle,3.92,0.488005,0.393521
614,politician,president,7.38,0.465059,0.372126
642,baby,daughter,5.0,0.584546,0.492868
694,girl,maid,2.93,0.524451,0.434312


In [22]:
# The 10 pairs with the highest cosine similarity after applying P.
df_simlex.nlargest(10, 'proj_glove_sim')[['word1', 'word2', 'similarity', 'glove_sim', 'proj_glove_sim']]

Unnamed: 0,word1,word2,similarity,glove_sim,proj_glove_sim
8,stupid,dumb,9.58,0.902445,0.896271
115,south,north,2.2,0.892447,0.888085
234,attorney,lawyer,9.35,0.889351,0.881147
111,wife,husband,2.3,0.886078,0.857401
125,woman,man,3.33,0.804799,0.853317
9,weird,strange,8.93,0.855492,0.852946
813,understand,know,7.47,0.849868,0.84667
142,winter,summer,2.38,0.84166,0.835611
127,actress,actor,7.12,0.783117,0.82964
138,sunset,sunrise,2.47,0.825414,0.823947


In [23]:
# Ten randomly chosen pairs for a qualitative look; random_state pins the draw so the
# displayed rows are the same on every Restart & Run All.
df_simlex.sample(10, random_state=0)[['word1', 'word2', 'similarity', 'glove_sim', 'proj_glove_sim']]

Unnamed: 0,word1,word2,similarity,glove_sim,proj_glove_sim
571,happiness,luck,2.38,0.507767,0.47023
783,multiply,divide,1.75,0.476453,0.473496
325,worker,employer,5.37,0.61462,0.605656
451,chicken,rice,1.43,0.637057,0.633029
647,cup,cone,3.17,0.298568,0.286264
880,achieve,try,4.42,0.511269,0.496111
854,compare,analyze,8.1,0.507889,0.500988
731,mouse,management,0.48,0.252276,0.246741
593,activity,movement,7.15,0.448239,0.432008
669,bed,chair,3.5,0.545402,0.527672


## Rawspace

In [24]:
# Cosine similarity of each SimLex pair after applying (I - P), i.e. keeping only the
# part of each vector that P discards.
rawspace_glove_sim = []
# Size the identity from P instead of hard-coding the embedding width.
I = np.eye(P.shape[0])
# (I - P) does not change between pairs, so build it once rather than twice per row.
I_minus_P = I - P
for _, row in df_simlex.iterrows():
    w1, w2 = row['word1'], row['word2']

    p_sim = cos_sim(I_minus_P.dot(vec(w1)), I_minus_P.dot(vec(w2)))
    rawspace_glove_sim.append(p_sim)

In [25]:
# Spearman rank correlation between the gold scores and the similarities computed
# after applying (I - P).
spearmanr(df_simlex['similarity'], rawspace_glove_sim)

SpearmanrResult(correlation=0.018727572202160418, pvalue=0.5543646972332482)

In [None]:
# Collect the vector of every distinct word in the (filtered) SimLex pairs.
# The word -> vector dict built inside measure_emb_correlation is local to that
# function and not available here, so the vectors are looked up again via vec().
# The loop variable must not be named `vec`, or it would overwrite that function.
simlex_words = sorted(set(df_simlex['word1']) | set(df_simlex['word2']))
all_vecs = []
for word in simlex_words:
    all_vecs.append(vec(word))

In [None]:
# Turn the collected list of vectors into a single NumPy array.
all_vecs = np.array(all_vecs)

In [None]:
# Check the dimensions of the stacked array.
all_vecs.shape

In [None]:
# Apply (I - P) to every vector at once: transpose so vectors are columns for the
# matrix product, then transpose back so each row is again one vector.
temp = (I - P).dot(all_vecs.T).T

In [None]:
# Rank of the vectors after (I - P) has been applied.
np.linalg.matrix_rank(temp)

In [None]:
# Rank of the unmodified vectors, for comparison with the rank of `temp`.
np.linalg.matrix_rank(all_vecs)