In [35]:
import numpy as np
from numpy.linalg import norm
import pandas as pd

## Датасет по семантической близости wordsim353

In [36]:
df = pd.read_csv('./WS353_All_Langs_TXT_Format/MWS353_Russian.txt')

df

Unnamed: 0,Word1,Word2,1,2,3,4,5,6,7,8,9,10,11,12,13,Average Score
0,секс,любовь,8,9,7,8,8,8,6,8,8.0,6,10,9,8,7.92
1,кот,тигр,7,6,10,8,2,7,8,7,8.0,5,4,7,8,6.69
2,тигр,тигр,10,10,10,10,10,10,10,10,10.0,10,10,10,10,10.00
3,бумага,книга,7,8,8,7,9,7,4,5,7.0,7,8,8,7,7.08
4,клавиатура,компьютер,8,6,10,4,5,6,3,8,7.0,8,8,9,9,7.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,наводнение,ливень,8,9,8,8,5,9,8,9,7.0,5,9,8,9,7.85
346,прогноз,погода,8,10,10,8,10,9,9,9,7.0,10,10,9,9,9.08
347,область,катастрофа,6,0,4,8,1,6,4,4,4.0,4,0,5,5,3.92
348,офис,губернатор,6,2,2,7,9,6,6,4,6.0,8,3,6,1,5.08


In [37]:
wordsim353_triples = df[['Word1', 'Word2', 'Average Score']]
wordsim353_triples.head()

Unnamed: 0,Word1,Word2,Average Score
0,секс,любовь,7.92
1,кот,тигр,6.69
2,тигр,тигр,10.0
3,бумага,книга,7.08
4,клавиатура,компьютер,7.0


In [38]:
# составляем список слов, которые встретились, чтобы не считывать статистические модели полностью

needed_words = set()
needed_words.update(wordsim353_triples['Word1'])
needed_words.update(wordsim353_triples['Word2'])

## Статистические модели

In [39]:
def read_model(filepath):
    words = {}
    with open(filepath) as f:
        line = f.readline()
        line = f.readline()
        while line:
            split = line.split()
            w = split[0].partition('_')[0]
            if w in needed_words:
                words[w] = np.array([eval(i) for i in split[1:]])
            line = f.readline()
    return words    

In [40]:
araneum = read_model('araneum_upos_skipgram_300_2_2018.vec')
ruwikiruscorpora = read_model('ruwikiruscorpora_upos_skipgram_300_2_2018.vec')

In [49]:
def count_cos_sim(vec1, vec2):
    return np.dot(vec1,vec2)/(norm(vec1)*norm(vec2))

df_results = pd.DataFrame()
for index, row in wordsim353_triples.iterrows():
    w1 = row['Word1']
    w2 = row['Word2']
    if   w1 in araneum \
     and w1 in ruwikiruscorpora \
     and w2 in araneum \
     and w2 in ruwikiruscorpora:
        araneum_cos_sim = count_cos_sim(araneum[w1], araneum[w2])
        ruwikiruscorpora_cos_sim = count_cos_sim(ruwikiruscorpora[w1], ruwikiruscorpora[w2])
        df_results = pd.concat([df_results, pd.DataFrame([{
            'Word1': w1, 
            'Word2': w2, 
            'araneum': araneum_cos_sim, 
            'ruwikiruscorpora': ruwikiruscorpora_cos_sim,
            'people_opinion': row['Average Score']}])], ignore_index = True)

df_results.to_csv('./results.csv')

## Подсчет корреляции

In [56]:
df_results['araneum_order'] =  df_results['araneum'].rank()
df_results['ruwikiruscorpora_order'] =  df_results['ruwikiruscorpora'].rank()
df_results['people_opinion_order'] =  df_results['people_opinion'].rank()

df_results.head()

Unnamed: 0,Word1,Word2,araneum,ruwikiruscorpora,people_opinion,araneum_order,ruwikiruscorpora_order,people_opinion_order
0,секс,любовь,0.432412,0.395474,7.92,234.0,266.0,265.5
1,кот,тигр,0.515736,0.276931,6.69,261.0,179.0,186.0
2,тигр,тигр,1.0,1.0,10.0,309.0,309.0,309.0
3,бумага,книга,0.248152,0.289378,7.08,142.0,201.0,211.5
4,клавиатура,компьютер,0.534799,0.455964,7.0,264.0,285.0,207.5


In [68]:
def count_Spearman(df1, col1, col2):
    diff = df1.apply(lambda x: (x[col1] - x[col2])**2, axis=1).sum()
    p = 1 - 6 * diff / (df1.shape[0] ** 3 - df1.shape[0])
    return p

print(f'Spearman araneum - ruwikiruscorpora: {count_Spearman(df_results, "araneum_order", "ruwikiruscorpora_order")}')
print(f'Spearman araneum - people_opinion: {count_Spearman(df_results, "araneum_order", "people_opinion_order")}')
print(f'Spearman people_opinion - ruwikiruscorpora: {count_Spearman(df_results, "people_opinion_order", "ruwikiruscorpora_order")}')

Spearman araneum - ruwikiruscorpora: 0.6065606175847329
Spearman araneum - people_opinion: 0.652842764814265
Spearman people_opinion - ruwikiruscorpora: 0.4074288927483416
