In [1]:
import pandas as pd
import numpy as np
from scipy.stats.mstats import spearmanr

In [2]:
glove_orig = '../data/embeddings/glove.42B.300d.txt'

In [3]:
import pandas as pd
import csv

# Load the GloVe text file into a DataFrame indexed by token (column 0); the
# remaining columns hold the embedding values.
# - quoting=csv.QUOTE_NONE: quote characters are treated as ordinary text, not
#   as field delimiters.
# - keep_default_na=False: with the default NA handling, tokens such as "null",
#   "nan" or "NA" are parsed as missing values and lose their index label.
words = pd.read_table(glove_orig, sep=" ", index_col=0, header=None,
                      quoting=csv.QUOTE_NONE, keep_default_na=False)

In [4]:
def vec(w):
    """Return the embedding stored for word ``w`` as a NumPy array.

    The word is looked up by label in the module-level ``words`` DataFrame.
    A label that is missing from the index raises ``KeyError``.
    """
    # ``as_matrix()`` was deprecated in pandas 0.23 and removed in 1.0;
    # ``to_numpy()`` is the supported replacement.
    return words.loc[w].to_numpy()

In [5]:
def cos_sim(a, b):
    """Cosine similarity of vectors ``a`` and ``b``.

    Computed as their dot product divided by the product of their
    Euclidean norms.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return a.dot(b) / norm_product

In [6]:
def measure_emb_correlation(df, all_emb_words, P):
    """Compare human similarity ratings with embedding cosine similarities.

    Keeps only the rows of ``df`` whose ``word1`` and ``word2`` both appear in
    ``all_emb_words``. For each remaining pair it computes the cosine
    similarity of the original vectors (``glove_sim``) and of the vectors
    after multiplication by ``P`` (``proj_glove_sim``), then prints the
    Spearman correlation of each with the ``similarity`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``word1``, ``word2`` and ``similarity``.
    all_emb_words : iterable
        Words for which ``vec`` can return a vector.
    P : numpy.ndarray
        Matrix applied to each word vector as ``P.dot(v)``.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of ``df`` with the columns ``glove_sim`` and
        ``proj_glove_sim`` added.
    """
    in_vocab = df['word1'].isin(all_emb_words) & df['word2'].isin(all_emb_words)
    # Take an explicit copy: adding columns to a filtered slice triggers
    # pandas' SettingWithCopyWarning.
    df = df.loc[in_vocab].copy()

    # Look up and project each distinct word once instead of once per pair.
    rel_words = set(df['word1']) | set(df['word2'])
    words_glove_vec = {word: vec(word) for word in rel_words}
    words_proj_vec = {word: P.dot(v) for word, v in words_glove_vec.items()}

    pairs = list(zip(df['word1'], df['word2']))
    df['glove_sim'] = np.array(
        [cos_sim(words_glove_vec[w1], words_glove_vec[w2]) for w1, w2 in pairs])
    df['proj_glove_sim'] = np.array(
        [cos_sim(words_proj_vec[w1], words_proj_vec[w2]) for w1, w2 in pairs])

    print('glove:', spearmanr(df['similarity'], df['glove_sim']))
    print('glove -P:', spearmanr(df['similarity'], df['proj_glove_sim']))
    return df

In [7]:
# Matrix applied to word vectors as P.dot(v) in the evaluation cells below.
# NOTE(review): the file name suggests a 300-dimensional GloVe projection
# obtained after 35 iterations -- confirm against the script that produced it.
P = np.load('P.glove.dim=300.iters=35.npy')

In [8]:
# Index labels of `words` (the embedding vocabulary); passed to
# measure_emb_correlation to drop pairs containing out-of-vocabulary words.
all_glove_words = words.index.tolist()

### SimLex-999

Data from: https://fh295.github.io/simlex.html

@article{hill2015simlex,
  title={Simlex-999: Evaluating semantic models with (genuine) similarity estimation},
  author={Hill, Felix and Reichart, Roi and Korhonen, Anna},
  journal={Computational Linguistics},
  volume={41},
  number={4},
  pages={665--695},
  year={2015},
  publisher={MIT Press}
}

In [9]:
# SimLex-999 word pairs: tab-separated, column names taken from the first row.
df_simlex = pd.read_csv('SimLex-999.txt', sep='\t')

In [10]:
# measure_emb_correlation reads the human rating from a column named
# 'similarity', so rename the SimLex999 score column accordingly.
df_simlex = df_simlex.rename(columns={"SimLex999": "similarity"})

In [11]:
df_simlex.head()

Unnamed: 0,word1,word2,POS,similarity,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex)
0,old,new,A,1.58,2.72,2.81,2,7.25,1,0.41
1,smart,intelligent,A,9.2,1.75,2.46,1,7.11,1,0.67
2,hard,difficult,A,8.77,3.76,2.21,2,5.94,1,1.19
3,happy,cheerful,A,9.55,2.56,2.34,1,5.85,1,2.18
4,hard,easy,A,0.95,3.76,2.07,2,5.82,1,0.93


In [12]:
df_simlex = measure_emb_correlation(df_simlex, all_glove_words, P)

  


glove: SpearmanrResult(correlation=0.3737987757655539, pvalue=1.752960783934323e-34)
glove -P: SpearmanrResult(correlation=0.48991872475777937, pvalue=1.9533070950031485e-61)


### WordSim 353

source: http://alfonseca.org/eng/research/wordsim353.html

@inproceedings{agirre2009study,
  title={A Study on Similarity and Relatedness Using Distributional and WordNet-based Approaches},
  author={Agirre, Eneko and Alfonseca, Enrique and Hall, Keith and Kravalov{\'a}, Jana and Pasca, Marius and Soroa, Aitor},
  booktitle={Proceedings of Human Language Technologies: The 2009 Annual Conference of the North American Chapter of the Association for Computational Linguistics},
  pages={19--27},
  year={2009}
}

In [13]:
# WordSim-353 similarity subset: tab-separated; `names` supplies the column
# labels, so the first line of the file is read as data.
df_353_sim = pd.read_csv('wordsim_similarity_goldstandard.txt', sep='\t', names=['word1', 'word2', 'similarity'])

In [14]:
df_353_sim = measure_emb_correlation(df_353_sim, all_glove_words, P)

  


glove: SpearmanrResult(correlation=0.6953847581116936, pvalue=1.1908298243958436e-29)
glove -P: SpearmanrResult(correlation=0.799024897742935, pvalue=9.93294756041503e-45)


In [15]:
# WordSim-353 relatedness subset, same layout as the similarity subset. The
# rating column is still named 'similarity' because measure_emb_correlation
# reads that column name.
df_353_rel = pd.read_csv('wordsim_relatedness_goldstandard.txt', sep='\t', names=['word1', 'word2', 'similarity'])

In [16]:
df_353_rel = measure_emb_correlation(df_353_rel, all_glove_words, P)

  


glove: SpearmanrResult(correlation=0.5999016814083603, pvalue=1.1815135039496508e-24)
glove -P: SpearmanrResult(correlation=0.6980429477082863, pvalue=4.332277723661555e-36)


### MTurk-771

source: http://www2.mta.ac.il/~gideon/mturk771.html

@inproceedings{halawi2012large,
  title={Large-scale learning of word relatedness with constraints},
  author={Halawi, Guy and Dror, Gideon and Gabrilovich, Evgeniy and Koren, Yehuda},
  booktitle={Proceedings of the 18th ACM SIGKDD international conference on Knowledge discovery and data mining},
  pages={1406--1414},
  year={2012}
}

In [17]:
# MTurk-771 pairs: comma-separated (the read_csv default); `names` supplies
# the column labels, so the first line of the file is read as data.
df_mturk = pd.read_csv('MTURK-771.csv', names=['word1', 'word2', 'similarity'])

In [18]:
df_mturk = measure_emb_correlation(df_mturk, all_glove_words, P)

  


glove: SpearmanrResult(correlation=0.684252286603461, pvalue=1.4888186861090302e-107)
glove -P: SpearmanrResult(correlation=0.7280300925805863, pvalue=3.2205628584718986e-128)


### PCA

In [19]:
X = words.to_numpy()

In [20]:
X.shape

(1917494, 300)

In [21]:
from sklearn.decomposition import PCA

# Fix random_state so the fit is reproducible: with svd_solver='auto',
# scikit-learn may choose the randomized solver for an input of this size.
# NOTE(review): 195 components is presumably chosen to match the rank that
# remains after applying P -- confirm.
pca = PCA(n_components=195, random_state=0)
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=195, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [31]:
# Cosine similarity of every SimLex pair in the PCA-reduced space.
# Transform each distinct word once instead of twice per row.
pca_vec = {}
for word in set(df_simlex['word1']) | set(df_simlex['word2']):
    pca_vec[word] = pca.transform([vec(word)])[0]

# One similarity per row, in df_simlex row order.
pca_sim = [cos_sim(pca_vec[w1], pca_vec[w2])
           for w1, w2 in zip(df_simlex['word1'], df_simlex['word2'])]

  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  


In [32]:
# Attach the PCA-space similarities; pca_sim holds one value per df_simlex row,
# in row order.
df_simlex['pca_sim'] = np.array(pca_sim)

In [33]:
spearmanr(df_simlex['similarity'], df_simlex['pca_sim'])

SpearmanrResult(correlation=0.322396858158789, pvalue=1.3505991866749592e-25)

### Most affected examples

In [34]:
# Per-pair absolute change in cosine similarity between the original vectors
# and the vectors multiplied by P.
df_simlex['abs_diff'] = (df_simlex['glove_sim'] - df_simlex['proj_glove_sim']).abs()

In [37]:
df_simlex.nlargest(20, 'abs_diff')[['word1', 'word2', 'similarity', 'glove_sim', 'proj_glove_sim']]

Unnamed: 0,word1,word2,similarity,glove_sim,proj_glove_sim
596,action,course,5.45,0.466762,-0.087191
546,people,party,2.2,0.496346,-0.045304
270,strength,might,7.07,0.453564,-0.068027
78,easy,big,1.12,0.52767,0.008003
601,home,state,2.58,0.501742,-0.017328
869,get,put,1.98,0.775637,0.259433
915,get,remain,1.6,0.498241,0.003791
595,father,god,3.57,0.643352,0.160305
330,man,father,4.83,0.614493,0.133659
60,different,normal,1.08,0.605423,0.126056


In [38]:
df_simlex.nlargest(10, 'proj_glove_sim')[['word1', 'word2', 'similarity', 'glove_sim', 'proj_glove_sim']]

Unnamed: 0,word1,word2,similarity,glove_sim,proj_glove_sim
234,attorney,lawyer,9.35,0.889351,0.866089
8,stupid,dumb,9.58,0.902445,0.849372
125,woman,man,3.33,0.804799,0.818703
782,vanish,disappear,9.8,0.797108,0.81763
115,south,north,2.2,0.892447,0.801304
9,weird,strange,8.93,0.855492,0.796055
111,wife,husband,2.3,0.886078,0.793236
127,actress,actor,7.12,0.783117,0.787986
851,analyze,evaluate,8.03,0.812894,0.765822
182,employer,employee,3.65,0.802433,0.763729


In [39]:
# Seed the sample so the displayed rows are the same on every run.
df_simlex.sample(10, random_state=0)[['word1', 'word2', 'similarity', 'glove_sim', 'proj_glove_sim']]

Unnamed: 0,word1,word2,similarity,glove_sim,proj_glove_sim
148,arm,shoulder,4.85,0.623839,0.463336
145,bed,bedroom,3.4,0.724843,0.574879
791,carry,bring,5.8,0.666504,0.288778
250,motor,engine,8.65,0.60932,0.457481
130,bird,hawk,7.85,0.538856,0.355232
521,pot,appliance,2.53,0.224095,0.09724
249,rat,mouse,7.78,0.631915,0.515704
997,absorb,withdraw,2.97,0.263449,0.05563
653,cup,jar,5.13,0.42972,0.291241
22,quick,rapid,9.7,0.611912,0.480641


## Rowspace

Similarity of the SimLex pairs after applying `I - P` (the complement of `P`) instead of `P`.

In [42]:
# Cosine similarity of every SimLex pair after applying (I - P).
I = np.eye(P.shape[0])  # size taken from P instead of a hard-coded 300
complement = I - P      # loop-invariant: build once, not twice per pair

# Map each distinct word once instead of twice per row.
rawspace_vec = {
    word: complement.dot(vec(word))
    for word in set(df_simlex['word1']) | set(df_simlex['word2'])
}

# One similarity per row, in df_simlex row order.
rawspace_glove_sim = [
    cos_sim(rawspace_vec[w1], rawspace_vec[w2])
    for w1, w2 in zip(df_simlex['word1'], df_simlex['word2'])
]

  


In [43]:
spearmanr(df_simlex['similarity'], rawspace_glove_sim)

SpearmanrResult(correlation=0.19625411667628184, pvalue=3.9455540411996796e-10)

In [None]:
# `words_glove_vec` exists only as a local inside measure_emb_correlation, and
# using `vec` as a loop variable would overwrite the vec() helper, so the
# vectors are rebuilt here from the evaluated vocabulary.
# NOTE(review): assumes the SimLex words are the intended word set -- confirm.
simlex_words = sorted(set(df_simlex['word1']) | set(df_simlex['word2']))
all_vecs = [vec(word) for word in simlex_words]

In [None]:
all_vecs = np.array(all_vecs)

In [None]:
all_vecs.shape

In [None]:
# Apply (I - P) to every row vector of all_vecs: transpose so the vectors are
# columns, multiply, then transpose back to one vector per row.
temp = (I - P).dot(all_vecs.T).T

In [None]:
np.linalg.matrix_rank(temp)

In [None]:
np.linalg.matrix_rank(all_vecs)