In [86]:
import numpy as np
import pandas as pd

In [87]:
presov = np.load('bert/presov.npz')
sov = np.load('bert/sov.npz')
postsov = np.load('bert/postsov.npz')

In [88]:
! ls context-diachrony/datasets

dataset_0.csv                  dataset_1.csv
dataset_0_annotation.tsv       dataset_1_annotation.tsv
dataset_0_testset.tsv          dataset_1_testset.tsv
dataset_0_testset_filtered.tsv dataset_1_testset_filtered.tsv


In [89]:
dataset0 = pd.read_csv('context-diachrony/datasets/dataset_0_testset_filtered.tsv', sep='\t')

In [90]:
dataset0.head()

Unnamed: 0,word,COMPARE,EARLIER,LATER,delta_later,frequency_presov/sov
0,агентство,3.15,3.62,3.55,-0.07,842/333
1,богадельня,3.65,3.3,3.29,-0.01,442/190
2,больница,3.86,3.71,3.92,0.21,3337/6597
3,весна,3.58,3.55,3.6,0.05,5729/10250
4,вино,3.37,3.68,3.77,0.09,6499/6919


In [92]:
np.random.seed(42)

In [93]:
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn.linear_model import LinearRegression
from scipy.spatial.distance import jensenshannon as jsd
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor

K = 5

## KMeans

In [94]:
def calc_kmeans(word):
    presov = np.load('bert/presov.npz')
    sov = np.load('bert/sov.npz')
    emb1 = presov.get(word)
    emb2 = sov.get(word)
    if emb1.shape[0] > 10000:
        rand_idxs = np.random.choice(emb1.shape[0], 10000, replace=False)
        emb1 = emb1[rand_idxs]
    if emb2.shape[0] > 10000:
        rand_idxs = np.random.choice(emb2.shape[0], 10000, replace=False)
        emb2 = emb2[rand_idxs]

    embs = []
    embs.extend(emb1)
    embs.extend(emb2)

    kmeans = KMeans(n_clusters=K, random_state=42).fit(embs)
    dist1 = [np.count_nonzero(kmeans.labels_[:len(emb1)] == i) / len(emb1) for i in range(K)]
    dist2 = [np.count_nonzero(kmeans.labels_[len(emb1):] == i) / len(emb2) for i in range(K)]

    return {word : [dist1, dist2]}


In [95]:
from joblib import Parallel, delayed

start = time.time()
results = Parallel(n_jobs=-1)(
    delayed(calc_kmeans)(word) for word in list(dataset0['word'])
)
print(time.time() - start)

86.994136095047


In [96]:
kmeans_jsd = []
for item in results:
    for k, v in item.items():
        kmeans_jsd.append(jsd(v[0], v[1]))
        
print(spearmanr(kmeans_jsd, dataset0['COMPARE']))
print(spearmanr(kmeans_jsd, [np.abs(elem) for elem in dataset0['delta_later']]))

SpearmanrResult(correlation=-0.4428823464266702, pvalue=0.0016200021728544215)
SpearmanrResult(correlation=0.27215698052343307, pvalue=0.06129636307696684)


In [97]:
kmeans_max = []
for item in results:
    for k, v in item.items():
        kmeans_max.append(np.max(np.square(v[0]) - np.square(v[1])))
        
print(spearmanr(kmeans_max, dataset0['COMPARE']))
print(spearmanr(kmeans_max, [np.abs(elem) for elem in dataset0['delta_later']]))

SpearmanrResult(correlation=-0.357986159417072, pvalue=0.01248155948250658)
SpearmanrResult(correlation=0.2418992084372949, pvalue=0.09763019213516179)


## AffinityPropagation 

In [109]:
affinity_jsd = {}
with open('aff_jsd1.txt') as f: # sort by word in file
    for line in f.readlines():
        affinity_jsd[line.split('=')[0]] = line[:-1].split('=')[1]

affinity_jsd_list = []
for idx, row in dataset0.iterrows():
    affinity_jsd_list.append(affinity_jsd[row['word']])

In [110]:
print(spearmanr(affinity_jsd_list, dataset0['COMPARE']))
print(spearmanr(affinity_jsd_list, [np.abs(elem) for elem in dataset0['delta_later']]))

SpearmanrResult(correlation=-0.29513862715535094, pvalue=0.04170091571689449)
SpearmanrResult(correlation=0.15977426817013188, pvalue=0.27802939471531013)


## Cosine

In [101]:
def calc_cosine(words, period1, period2):
    X = []
    for idx, word in enumerate(words): 
        emb1 = period1.get(word)
        emb2 = period2.get(word)
        X.append(cosine(np.mean(emb1, axis=0), np.mean(emb2, axis=0)))
    return X

In [102]:
x_cosine = calc_cosine(dataset0['word'], presov, sov)

print(spearmanr(x_cosine, dataset0['COMPARE']))
print(spearmanr(x_cosine, [np.abs(elem) for elem in dataset0['delta_later']]))

SpearmanrResult(correlation=-0.4921156211182979, pvalue=0.00038135437526143166)
SpearmanrResult(correlation=0.3456013393393376, pvalue=0.01613124697875543)


## Freq1, Freq2, Freq1 / Freq2

In [70]:
def calc_freq(words, X, period1, period2):
    for idx, word in enumerate(words): 
        emb1 = period1.get(word)
        emb2 = period2.get(word)
        X[idx].extend([len(emb1), len(emb2), len(emb1) / len(emb2)])

In [123]:
def run_cv(X, y):
    kf = KFold(n_splits=5)
    test_scores = []

    for train_idx, test_idx in kf.split(X):
        x_train, x_test = np.array(X)[train_idx], np.array(X)[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        clf = LinearRegression()
        clf.fit(x_train, y_train)
        pred = clf.predict(x_test)
        test_scores.append(spearmanr(pred, y_test))
        print('Spearman correlation:', test_scores[-1])

    print('Mean score is:', np.mean([np.abs(score[0]) for score in test_scores]))

## Посчитаем корреляцию модели, обученной на частотах слов

In [124]:
X = [[] for _ in range(dataset0.shape[0])]

calc_freq(dataset0['word'], X, presov, sov)

run_cv(X, dataset0['COMPARE'])

Spearman correlation: SpearmanrResult(correlation=-0.17575757575757575, pvalue=0.6271883447764844)
Spearman correlation: SpearmanrResult(correlation=-0.19999999999999998, pvalue=0.5795840000000001)
Spearman correlation: SpearmanrResult(correlation=0.2606060606060606, pvalue=0.4670890543863402)
Spearman correlation: SpearmanrResult(correlation=0.08333333333333334, pvalue=0.831214097710308)
Spearman correlation: SpearmanrResult(correlation=0.41666666666666663, pvalue=0.26458604674749614)
Mean score is: 0.22727272727272724


In [125]:
run_cv(X, np.array([np.abs(elem) for elem in dataset0['delta_later']]))

Spearman correlation: SpearmanrResult(correlation=-0.28659069365997897, pvalue=0.42208631336590574)
Spearman correlation: SpearmanrResult(correlation=0.042948018339789416, pvalue=0.9062243097567702)
Spearman correlation: SpearmanrResult(correlation=-0.06686960980480709, pvalue=0.8543750609172778)
Spearman correlation: SpearmanrResult(correlation=0.33333333333333337, pvalue=0.3807131816768634)
Spearman correlation: SpearmanrResult(correlation=0.08333333333333334, pvalue=0.831214097710308)
Mean score is: 0.16261499769424842


## Добавим cosine

In [126]:
for i in range(dataset0.shape[0]):
    X[i].append(x_cosine[i])

run_cv(X, dataset0['COMPARE'])

Spearman correlation: SpearmanrResult(correlation=0.6969696969696969, pvalue=0.02509667588225183)
Spearman correlation: SpearmanrResult(correlation=-0.006060606060606061, pvalue=0.9867429111949892)
Spearman correlation: SpearmanrResult(correlation=0.6484848484848483, pvalue=0.04254012768448945)
Spearman correlation: SpearmanrResult(correlation=0.0, pvalue=1.0)
Spearman correlation: SpearmanrResult(correlation=0.48333333333333334, pvalue=0.18746985521554207)
Mean score is: 0.36696969696969695


In [127]:
run_cv(X, np.array([np.abs(elem) for elem in dataset0['delta_later']]))

Spearman correlation: SpearmanrResult(correlation=-0.024390697332764166, pvalue=0.9466770792643007)
Spearman correlation: SpearmanrResult(correlation=0.07976060548818034, pvalue=0.8266294205655288)
Spearman correlation: SpearmanrResult(correlation=0.5227987675648555, pvalue=0.12102739408380091)
Spearman correlation: SpearmanrResult(correlation=0.6, pvalue=0.0876228290414024)
Spearman correlation: SpearmanrResult(correlation=0.016666666666666666, pvalue=0.9660548039946861)
Mean score is: 0.24872334741049335


## KMeans & JSD + Cosine

In [128]:
X = [[] for _ in range(dataset0.shape[0])]

for i in range(dataset0.shape[0]):
    X[i].append(kmeans_jsd[i])
    X[i].append(x_cosine[i])
    
run_cv(X, dataset0['COMPARE'])

Spearman correlation: SpearmanrResult(correlation=0.5515151515151515, pvalue=0.09840117666963498)
Spearman correlation: SpearmanrResult(correlation=-0.13939393939393938, pvalue=0.7009318849100584)
Spearman correlation: SpearmanrResult(correlation=0.4424242424242424, pvalue=0.20042268671194224)
Spearman correlation: SpearmanrResult(correlation=0.6166666666666666, pvalue=0.07692880993776309)
Spearman correlation: SpearmanrResult(correlation=0.016666666666666666, pvalue=0.9660548039946861)
Mean score is: 0.35333333333333333


In [129]:
run_cv(X, np.array([np.abs(elem) for elem in dataset0['delta_later']]))

Spearman correlation: SpearmanrResult(correlation=-0.03658604599914625, pvalue=0.9200750642705431)
Spearman correlation: SpearmanrResult(correlation=0.12884405501936821, pvalue=0.7227860826113411)
Spearman correlation: SpearmanrResult(correlation=0.47416632407045034, pvalue=0.16619192808155497)
Spearman correlation: SpearmanrResult(correlation=0.5666666666666667, pvalue=0.11163298761149111)
Spearman correlation: SpearmanrResult(correlation=-0.2333333333333333, pvalue=0.5456987782182727)
Mean score is: 0.28791928501779296


## KMeans & JSD + Cosine + Freq

In [130]:
X = [[] for _ in range(dataset0.shape[0])]

calc_freq(dataset0['word'], X, sov, postsov)

for i in range(dataset0.shape[0]):
    X[i].append(kmeans_jsd[i])
    X[i].append(x_cosine[i])

run_cv(X, dataset0['COMPARE'])

Spearman correlation: SpearmanrResult(correlation=0.35757575757575755, pvalue=0.3103760917056799)
Spearman correlation: SpearmanrResult(correlation=0.006060606060606061, pvalue=0.9867429111949892)
Spearman correlation: SpearmanrResult(correlation=0.4787878787878787, pvalue=0.1615229280174558)
Spearman correlation: SpearmanrResult(correlation=0.6666666666666667, pvalue=0.04986723056888511)
Spearman correlation: SpearmanrResult(correlation=0.18333333333333335, pvalue=0.6368198117628943)
Mean score is: 0.3384848484848485


In [131]:
run_cv(X, np.array([np.abs(elem) for elem in dataset0['delta_later']]))

Spearman correlation: SpearmanrResult(correlation=-0.3231767396591252, pvalue=0.36237497228677046)
Spearman correlation: SpearmanrResult(correlation=0.06748974310538336, pvalue=0.8530368033430833)
Spearman correlation: SpearmanrResult(correlation=0.498482545817653, pvalue=0.14251791767625296)
Spearman correlation: SpearmanrResult(correlation=0.7166666666666667, pvalue=0.029818035695845287)
Spearman correlation: SpearmanrResult(correlation=0.0, pvalue=1.0)
Mean score is: 0.3211631390497656
