In [19]:
import numpy as np
import pandas as pd

In [20]:
# Open the three per-period embedding archives (the periods are named only by
# the file names: presov / sov / postsov).
# Later cells look words up in `sov` and `postsov` with `.get(word)`.
presov = np.load('bert/presov.npz')
sov = np.load('bert/sov.npz')
postsov = np.load('bert/postsov.npz')

In [3]:
! ls context-diachrony/datasets

dataset_0.csv                  dataset_1.csv
dataset_0_annotation.tsv       dataset_1_annotation.tsv
dataset_0_testset.tsv          dataset_1_testset.tsv
dataset_0_testset_filtered.tsv dataset_1_testset_filtered.tsv


In [21]:
# Read the filtered, tab-separated test set used by every evaluation below.
# NOTE(review): the variable is called `dataset0` but the file is
# `dataset_1_testset_filtered.tsv` -- confirm which dataset is intended.
dataset0 = pd.read_csv('context-diachrony/datasets/dataset_1_testset_filtered.tsv', sep='\t')

In [22]:
# Preview the first rows; later cells read the `word`, `COMPARE` and
# `delta_later` columns of this frame.
dataset0.head()

Unnamed: 0,word,COMPARE,EARLIER,LATER,delta_later,frequency_sov/postsov
0,ателье,3.29,3.3,3.19,-0.11,288/326
1,блочный,2.15,2.57,2.63,0.06,67/157
2,боевик,2.25,3.28,2.82,-0.46,231/2918
3,ботаник,2.52,3.2,2.16,-1.04,410/219
4,взлом,3.26,3.4,3.03,-0.37,99/99


In [23]:
# Seed NumPy's global random generator for this kernel process.
# NOTE(review): calc_kmeans is executed through joblib.Parallel -- confirm
# whether the worker processes actually see this seed.
np.random.seed(42)

In [24]:
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn.linear_model import LinearRegression
from scipy.spatial.distance import jensenshannon as jsd
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor

# Number of clusters: calc_kmeans passes it to KMeans(n_clusters=K) and builds
# one frequency entry per cluster index in range(K).
K = 5

## KMeans

In [25]:
def calc_kmeans(word, max_samples=10000, seed=42):
    """Cluster a word's embeddings from two periods jointly and compare cluster usage.

    The embeddings stored under ``word`` in ``bert/sov.npz`` and
    ``bert/postsov.npz`` are each subsampled to at most ``max_samples`` rows,
    stacked, and clustered with KMeans into ``K`` clusters. For each period
    the share of its rows assigned to every cluster is then computed.

    Parameters
    ----------
    word : str
        Key looked up in both archives.
    max_samples : int, optional
        Maximum number of rows kept per period (default 10000).
    seed : int, optional
        Seed for the subsampling generator and for KMeans (default 42).

    Returns
    -------
    dict
        ``{word: [dist1, dist2]}`` where ``dist1`` and ``dist2`` are lists of
        ``K`` cluster shares for the first and the second archive.

    Raises
    ------
    KeyError
        If ``word`` is absent from either archive.
    """
    # Open the archives per call and close them as soon as the arrays are read,
    # instead of leaving two open file handles behind on every invocation.
    with np.load('bert/sov.npz') as sov, np.load('bert/postsov.npz') as postsov:
        emb1 = sov.get(word)
        emb2 = postsov.get(word)
    if emb1 is None or emb2 is None:
        raise KeyError('no embeddings stored for word {!r}'.format(word))

    # A local generator makes the subsample depend only on `seed`, not on the
    # global NumPy RNG state of whichever process happens to run this call.
    rng = np.random.RandomState(seed)
    if emb1.shape[0] > max_samples:
        emb1 = emb1[rng.choice(emb1.shape[0], max_samples, replace=False)]
    if emb2.shape[0] > max_samples:
        emb2 = emb2[rng.choice(emb2.shape[0], max_samples, replace=False)]

    # Rows of the first period come first, so labels split at len(emb1).
    embs = np.concatenate([emb1, emb2], axis=0)
    labels = KMeans(n_clusters=K, random_state=seed).fit(embs).labels_
    n_first = len(emb1)

    dist1 = [np.count_nonzero(labels[:n_first] == i) / len(emb1) for i in range(K)]
    dist2 = [np.count_nonzero(labels[n_first:] == i) / len(emb2) for i in range(K)]

    return {word: [dist1, dist2]}


In [26]:
from joblib import Parallel, delayed
import time

# Run calc_kmeans for every test word on all available cores and print the
# elapsed wall-clock time in seconds. `results` keeps one dict per word.
start = time.time()
kmeans_task = delayed(calc_kmeans)
results = Parallel(n_jobs=-1)(
    kmeans_task(word) for word in dataset0['word'].tolist()
)
print(time.time() - start)

49.96233582496643


In [27]:
# Every entry of `results` is a one-item dict {word: [dist1, dist2]}; score each
# word by scipy's `jensenshannon` between its two cluster-share lists.
kmeans_jsd = [
    jsd(dist_first, dist_second)
    for item in results
    for dist_first, dist_second in item.values()
]

# Spearman correlation of the scores with COMPARE and with |delta_later|.
print(spearmanr(kmeans_jsd, dataset0['COMPARE']))
print(spearmanr(kmeans_jsd, dataset0['delta_later'].abs()))

SpearmanrResult(correlation=-0.48468256531816667, pvalue=0.00031306239747392104)
SpearmanrResult(correlation=0.1309041605363084, pvalue=0.3598687404381328)


In [28]:
# Score each word by the largest per-cluster value of
# (share in first period)^2 - (share in second period)^2.
# NOTE(review): this is a signed difference of squares, not a squared
# difference -- confirm that this is the intended measure.
kmeans_max = [
    np.max(np.square(dist_first) - np.square(dist_second))
    for item in results
    for dist_first, dist_second in item.values()
]

# Spearman correlation of the scores with COMPARE and with |delta_later|.
print(spearmanr(kmeans_max, dataset0['COMPARE']))
print(spearmanr(kmeans_max, dataset0['delta_later'].abs()))

SpearmanrResult(correlation=-0.4181637182434112, pvalue=0.0022618490123113427)
SpearmanrResult(correlation=0.12329976946780916, pvalue=0.38868594453138416)


## AffinityPropagation 

In [33]:
# Read precomputed per-word scores from a text file holding one `word=value`
# entry per line, and store them as floats so that rank statistics compare
# numbers rather than strings.
# NOTE(review): the file name mentions "elmo" while the embeddings in this
# notebook are loaded from `bert/` -- confirm this is the intended file.
affinity_jsd = {}
with open('aff_jsd_elmo2.txt') as f: # original note: sort by word in file
    for line in f:
        # strip() instead of line[:-1]: the last line may lack a trailing
        # newline, in which case slicing would cut off a digit.
        line = line.strip()
        if not line:
            continue
        word, _, value = line.partition('=')
        affinity_jsd[word.strip()] = float(value)

# Align the scores with the row order of the test set.
affinity_jsd_list = [affinity_jsd[word] for word in dataset0['word']]

In [34]:
# Spearman correlation of the affinity-propagation scores with COMPARE and
# with |delta_later|, printed in that order.
for target in (dataset0['COMPARE'], dataset0['delta_later'].abs()):
    print(spearmanr(affinity_jsd_list, target))

SpearmanrResult(correlation=-0.36272291714212324, pvalue=0.008899484595937337)
SpearmanrResult(correlation=0.20461799791510651, pvalue=0.14977464031870152)


## Cosine

In [31]:
def calc_cosine(words, period1, period2):
    """Cosine distance between each word's mean embedding in two periods.

    For every word the arrays returned by ``period1.get(word)`` and
    ``period2.get(word)`` are averaged over axis 0, and the two mean vectors
    are compared with ``scipy.spatial.distance.cosine``.

    Returns a list with one distance per word, in the order of ``words``.
    """
    def mean_vector(period, word):
        # Average all stored vectors of `word` in the given period.
        return np.mean(period.get(word), axis=0)

    return [
        cosine(mean_vector(period1, word), mean_vector(period2, word))
        for word in words
    ]

In [32]:
# Cosine distance between each test word's mean embedding in `sov` and in
# `postsov`, followed by its Spearman correlation with COMPARE and with
# |delta_later|.
x_cosine = calc_cosine(dataset0['word'], sov, postsov)

for target in (dataset0['COMPARE'], dataset0['delta_later'].abs()):
    print(spearmanr(x_cosine, target))

SpearmanrResult(correlation=-0.5007013897157608, pvalue=0.00018259682147381674)
SpearmanrResult(correlation=0.23283921224023868, pvalue=0.10012085755120885)


## Freq1, Freq2, Freq1 / Freq2

In [37]:
def calc_freq(words, X, period1, period2):
    """Append frequency features for every word to the rows of ``X`` in place.

    For the word at position ``i``, three values are appended to ``X[i]``:
    the length of ``period1.get(word)``, the length of ``period2.get(word)``
    and the ratio of the first to the second. Nothing is returned.
    """
    for row_no, word in enumerate(words):
        count1 = len(period1.get(word))
        count2 = len(period2.get(word))
        X[row_no].extend((count1, count2, count1 / count2))

In [55]:
def run_cv(X, y):
    """Evaluate a linear regression with 5-fold cross-validation.

    For each fold a ``LinearRegression`` is fitted on the training part and
    the Spearman correlation between its predictions and the held-out targets
    is printed. Finally the mean of the absolute per-fold correlations is
    printed.

    Parameters
    ----------
    X : sequence of rows
        Feature rows, one per sample.
    y : array-like
        Targets, one per sample; a list, NumPy array or pandas Series.

    Returns
    -------
    None
    """
    # Convert once, outside the loop. Turning `y` into a plain array also
    # guarantees that fold indices select by position: a pandas Series would
    # be indexed by label, which is only equivalent for a default RangeIndex.
    features = np.asarray(X)
    targets = np.asarray(y)

    kf = KFold(n_splits=5)
    test_scores = []

    for train_idx, test_idx in kf.split(features):
        clf = LinearRegression()
        clf.fit(features[train_idx], targets[train_idx])
        pred = clf.predict(features[test_idx])
        test_scores.append(spearmanr(pred, targets[test_idx]))
        print('Spearman correlation:', test_scores[-1])

    # NOTE(review): averaging absolute values counts a fold where predictions
    # are anti-correlated with the target as a success -- confirm this is the
    # intended summary.
    print('Mean score is:', np.mean([np.abs(score[0]) for score in test_scores]))

## Посчитаем корреляцию модели, обученной на частотах слов

In [76]:
# Feature matrix with frequency features only: start from one empty row per
# test word and let calc_freq fill the rows in place.
X = [list() for _ in dataset0.index]

calc_freq(dataset0['word'], X, sov, postsov)

# Cross-validate against the COMPARE column.
run_cv(X, dataset0['COMPARE'])

Spearman correlation: SpearmanrResult(correlation=0.20909090909090913, pvalue=0.5372209352113229)
Spearman correlation: SpearmanrResult(correlation=0.09090909090909088, pvalue=0.8027717312071619)
Spearman correlation: SpearmanrResult(correlation=-0.24848484848484845, pvalue=0.48877630451924314)
Spearman correlation: SpearmanrResult(correlation=-0.7538028741632801, pvalue=0.011794786289983676)
Spearman correlation: SpearmanrResult(correlation=-0.6363636363636362, pvalue=0.04791172612997547)
Mean score is: 0.387730271802353


In [77]:
# Same features, target is the absolute value of `delta_later`.
run_cv(X, dataset0['delta_later'].abs().to_numpy())

Spearman correlation: SpearmanrResult(correlation=0.07272727272727274, pvalue=0.831716405381337)
Spearman correlation: SpearmanrResult(correlation=-0.3719581343246535, pvalue=0.28987658207732536)
Spearman correlation: SpearmanrResult(correlation=0.01818181818181818, pvalue=0.9602404181286243)
Spearman correlation: SpearmanrResult(correlation=-0.5030303030303029, pvalue=0.13833369839449197)
Spearman correlation: SpearmanrResult(correlation=0.17575757575757575, pvalue=0.6271883447764844)
Mean score is: 0.22833102080432463


## Добавим cosine

In [78]:
# Rebuild the frequency features from scratch before adding the cosine
# distance, so that re-running this cell cannot append the cosine column a
# second time to rows left over from an earlier run.
X = [[] for _ in range(dataset0.shape[0])]
calc_freq(dataset0['word'], X, sov, postsov)

for i in range(dataset0.shape[0]):
    X[i].append(x_cosine[i])

# Cross-validate against the COMPARE column.
run_cv(X, dataset0['COMPARE'])

Spearman correlation: SpearmanrResult(correlation=0.16363636363636366, pvalue=0.6306852146425357)
Spearman correlation: SpearmanrResult(correlation=0.23636363636363633, pvalue=0.5108853175152002)
Spearman correlation: SpearmanrResult(correlation=-0.06666666666666665, pvalue=0.8548130882487426)
Spearman correlation: SpearmanrResult(correlation=0.21276694028802262, pvalue=0.5550759959985581)
Spearman correlation: SpearmanrResult(correlation=-0.5393939393939393, pvalue=0.1075931877824148)
Mean score is: 0.24376550926972573


In [79]:
# Same features, target is the absolute value of `delta_later`.
run_cv(X, dataset0['delta_later'].abs().to_numpy())

Spearman correlation: SpearmanrResult(correlation=0.11818181818181818, pvalue=0.72928477951978)
Spearman correlation: SpearmanrResult(correlation=-0.5183023183212385, pvalue=0.12483849367734694)
Spearman correlation: SpearmanrResult(correlation=0.05454545454545454, pvalue=0.8810361811618526)
Spearman correlation: SpearmanrResult(correlation=-0.32121212121212117, pvalue=0.3654683104386702)
Spearman correlation: SpearmanrResult(correlation=0.10303030303030303, pvalue=0.7769984634438898)
Mean score is: 0.22305440305818708


## KMeans & JSD + Cosine

In [80]:
# Two features per test word: the KMeans/JSD score and the cosine distance.
X = [[kmeans_jsd[i], x_cosine[i]] for i in range(dataset0.shape[0])]

# Cross-validate against the COMPARE column.
run_cv(X, dataset0['COMPARE'])

Spearman correlation: SpearmanrResult(correlation=0.14545454545454548, pvalue=0.6695786456420787)
Spearman correlation: SpearmanrResult(correlation=0.406060606060606, pvalue=0.24428229408662638)
Spearman correlation: SpearmanrResult(correlation=0.41818181818181815, pvalue=0.22911284098281892)
Spearman correlation: SpearmanrResult(correlation=0.7234075969792769, pvalue=0.018047748354494193)
Spearman correlation: SpearmanrResult(correlation=0.4424242424242424, pvalue=0.20042268671194224)
Mean score is: 0.42710576182009785


In [81]:
# Same features, target is the absolute value of `delta_later`.
run_cv(X, dataset0['delta_later'].abs().to_numpy())

Spearman correlation: SpearmanrResult(correlation=0.09090909090909091, pvalue=0.7903727377532903)
Spearman correlation: SpearmanrResult(correlation=-0.597572084652722, pvalue=0.06808978684116407)
Spearman correlation: SpearmanrResult(correlation=-0.006060606060606061, pvalue=0.9867429111949892)
Spearman correlation: SpearmanrResult(correlation=-0.33333333333333326, pvalue=0.34659350708733405)
Spearman correlation: SpearmanrResult(correlation=-0.29696969696969694, pvalue=0.4047016712701569)
Mean score is: 0.2649689623850898


## KMeans & JSD + Cosine + Freq

In [84]:
# Five features per test word: the three frequency features written by
# calc_freq, then the KMeans/JSD score and the cosine distance.
X = [list() for _ in range(len(dataset0))]
calc_freq(dataset0['word'], X, sov, postsov)

for i, row in enumerate(X):
    row.extend((kmeans_jsd[i], x_cosine[i]))

# Cross-validate against the COMPARE column.
run_cv(X, dataset0['COMPARE'])

Spearman correlation: SpearmanrResult(correlation=0.18181818181818182, pvalue=0.5926152128455)
Spearman correlation: SpearmanrResult(correlation=0.1515151515151515, pvalue=0.6760651759978538)
Spearman correlation: SpearmanrResult(correlation=0.16363636363636364, pvalue=0.6514773427962428)
Spearman correlation: SpearmanrResult(correlation=0.498482545817653, pvalue=0.14251791767625296)
Spearman correlation: SpearmanrResult(correlation=-0.5393939393939393, pvalue=0.1075931877824148)
Mean score is: 0.30696923643625784


In [85]:
# Same features, target is the absolute value of `delta_later`.
run_cv(X, dataset0['delta_later'].abs().to_numpy())

Spearman correlation: SpearmanrResult(correlation=0.08181818181818183, pvalue=0.8109903943000458)
Spearman correlation: SpearmanrResult(correlation=-0.6280604563186774, pvalue=0.05184120984456854)
Spearman correlation: SpearmanrResult(correlation=0.06666666666666665, pvalue=0.8548130882487426)
Spearman correlation: SpearmanrResult(correlation=-0.7575757575757575, pvalue=0.011143446799694208)
Spearman correlation: SpearmanrResult(correlation=-0.10303030303030303, pvalue=0.7769984634438898)
Mean score is: 0.3274302730819173
