In [1]:
import numpy as np
import pandas as pd

In [2]:
presov = np.load('bert/presov.npz')
sov = np.load('bert/sov.npz')
postsov = np.load('bert/postsov.npz')

In [3]:
! ls context-diachrony/datasets

dataset_0.csv                  dataset_1.csv
dataset_0_annotation.tsv       dataset_1_annotation.tsv
dataset_0_testset.tsv          dataset_1_testset.tsv
dataset_0_testset_filtered.tsv dataset_1_testset_filtered.tsv


In [10]:
dataset0 = pd.read_csv('context-diachrony/datasets/dataset_1_testset.tsv', sep='\t')

In [11]:
dataset0.head()

Unnamed: 0,word,COMPARE,EARLIER,LATER,delta_later
0,ателье,3.29,3.3,3.19,-0.11
1,бескомпромиссность,3.84,3.76,3.84,0.08
2,беспредел,3.55,3.1,3.25,0.15
3,благотворительность,3.58,3.46,3.69,0.23
4,блочный,2.15,2.57,2.63,0.06


In [6]:
np.random.seed(42)

In [7]:
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn.linear_model import LinearRegression
from scipy.spatial.distance import jensenshannon as jsd
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor

K = 5

## KMeans

In [8]:
def calc_kmeans(word):
    sov = np.load('bert/sov.npz')
    postsov = np.load('bert/postsov.npz')
    emb1 = sov.get(word)
    emb2 = postsov.get(word)
    if emb1.shape[0] > 10000:
        rand_idxs = np.random.choice(emb1.shape[0], 10000, replace=False)
        emb1 = emb1[rand_idxs]
    if emb2.shape[0] > 10000:
        rand_idxs = np.random.choice(emb2.shape[0], 10000, replace=False)
        emb2 = emb2[rand_idxs]

    embs = []
    embs.extend(emb1)
    embs.extend(emb2)

    kmeans = KMeans(n_clusters=K, random_state=42).fit(embs)
    dist1 = [np.count_nonzero(kmeans.labels_[:len(emb1)] == i) / len(emb1) for i in range(K)]
    dist2 = [np.count_nonzero(kmeans.labels_[len(emb1):] == i) / len(emb2) for i in range(K)]

    return {word : [dist1, dist2]}


In [13]:
from joblib import Parallel, delayed
import time

start = time.time()
results = Parallel(n_jobs=-1)(
    delayed(calc_kmeans)(word) for word in list(dataset0['word'])
)
print(time.time() - start)

72.6599748134613


In [14]:
kmeans_jsd = []
for item in results:
    for k, v in item.items():
        kmeans_jsd.append(jsd(v[0], v[1]))
        
print(spearmanr(kmeans_jsd, dataset0['COMPARE']))
print(spearmanr(kmeans_jsd, [np.abs(elem) for elem in dataset0['delta_later']]))

SpearmanrResult(correlation=-0.3712845064677236, pvalue=0.001684487148800321)
SpearmanrResult(correlation=0.18751485935065723, pvalue=0.12285976017400153)


In [15]:
kmeans_max = []
for item in results:
    for k, v in item.items():
        kmeans_max.append(np.max(np.square(v[0]) - np.square(v[1])))
        
print(spearmanr(kmeans_max, dataset0['COMPARE']))
print(spearmanr(kmeans_max, [np.abs(elem) for elem in dataset0['delta_later']]))

SpearmanrResult(correlation=-0.29210588859481534, pvalue=0.014872906867211972)
SpearmanrResult(correlation=0.1425595378417773, pvalue=0.24259125890014496)


## AffinityPropagation 

In [16]:
affinity_jsd = []
with open('aff_jsd2.txt') as f: # sort by word in file
    for line in f.readlines():
        affinity_jsd.append(line[:-1].split('=')[1])

print(spearmanr(affinity_jsd, dataset0['COMPARE']))
print(spearmanr(affinity_jsd, [np.abs(elem) for elem in dataset0['delta_later']]))

SpearmanrResult(correlation=-0.38271675168275743, pvalue=0.0011723525606733528)
SpearmanrResult(correlation=0.21338953734228122, pvalue=0.07832255290651878)


## Cosine

In [17]:
def calc_cosine(words, period1, period2):
    X = []
    for idx, word in enumerate(words): 
        emb1 = period1.get(word)
        emb2 = period2.get(word)
        X.append(cosine(np.mean(emb1, axis=0), np.mean(emb2, axis=0)))
    return X

In [18]:
x_cosine = calc_cosine(dataset0['word'], sov, postsov)

print(spearmanr(x_cosine, dataset0['COMPARE']))
print(spearmanr(x_cosine, [np.abs(elem) for elem in dataset0['delta_later']]))

SpearmanrResult(correlation=-0.42176224555005987, pvalue=0.00030694633351874116)
SpearmanrResult(correlation=0.2915517070539309, pvalue=0.01507273920765314)


## Freq1, Freq2, Freq1 / Freq2

In [70]:
def calc_freq(words, X, period1, period2):
    for idx, word in enumerate(words): 
        emb1 = period1.get(word)
        emb2 = period2.get(word)
        X[idx].extend([len(emb1), len(emb2), len(emb1) / len(emb2)])

## CatBoostRegressor cross validation

In [73]:
def run_cv_cb(X, y):
    kf = KFold(n_splits=7)
    test_scores = []

    for train_idx, test_idx in kf.split(X):
        x_train, x_test = np.array(X)[train_idx], np.array(X)[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        clf = CatBoostRegressor(verbose=False)
        clf.fit(x_train, y_train)
        pred = clf.predict(x_test)
        test_scores.append(spearmanr(pred, y_test))
        print('Features importance:', clf.feature_importances_)
        print('Spearman correlation:', test_scores[-1])

    print('Mean score is:', np.mean([np.abs(score[0]) for score in test_scores]))

## Посчитаем корреляцию модели, обученной на частотах слов

In [76]:
X = [[] for _ in range(dataset0.shape[0])]

calc_freq(dataset0['word'], X, presov, sov)

run_cv_cb(X, dataset0['COMPARE'])

Features importance: [37.61581227 33.71914597 28.66504176]
Spearman correlation: SpearmanrResult(correlation=-0.43636363636363645, pvalue=0.17966487658001123)
Features importance: [40.15206559 30.721257   29.12667741]
Spearman correlation: SpearmanrResult(correlation=-0.03647433262080388, pvalue=0.9203184602756675)
Features importance: [27.68424399 38.79872654 33.51702948]
Spearman correlation: SpearmanrResult(correlation=0.32121212121212117, pvalue=0.3654683104386702)
Features importance: [23.36105417 43.90996715 32.72897868]
Spearman correlation: SpearmanrResult(correlation=0.41818181818181815, pvalue=0.22911284098281892)
Features importance: [32.22465978 40.24522792 27.5301123 ]
Spearman correlation: SpearmanrResult(correlation=0.34545454545454546, pvalue=0.32822651147136733)
Features importance: [30.62213442 37.41263947 31.96522611]
Spearman correlation: SpearmanrResult(correlation=0.22424242424242422, pvalue=0.5334005612725947)
Features importance: [36.29699823 33.82188437 29.8811

In [78]:
run_cv_cb(X, np.array([np.abs(elem) for elem in dataset0['delta_later']]))

Features importance: [27.34557363 30.86381344 41.79061293]
Spearman correlation: SpearmanrResult(correlation=0.19590028045862823, pvalue=0.5637442329087623)
Features importance: [28.13914937 35.57545976 36.28539088]
Spearman correlation: SpearmanrResult(correlation=0.07878787878787878, pvalue=0.8287173946974606)
Features importance: [28.37580894 30.86795403 40.75623703]
Spearman correlation: SpearmanrResult(correlation=0.23708316203522517, pvalue=0.5095614092618308)
Features importance: [29.60498303 30.67942434 39.71559263]
Spearman correlation: SpearmanrResult(correlation=-0.3939393939393939, pvalue=0.25999776683488757)
Features importance: [26.4642791  37.26365416 36.27206674]
Spearman correlation: SpearmanrResult(correlation=-0.2121212121212121, pvalue=0.5563057751029299)
Features importance: [32.00616948 30.08562636 37.90820416]
Spearman correlation: SpearmanrResult(correlation=0.18787878787878787, pvalue=0.603217610029209)
Features importance: [27.35149636 32.77758924 39.87091439]

## Добавим cosine

In [79]:
for i in range(dataset0.shape[0]):
    X[i].append(kmeans_jsd[i])

run_cv_cb(X, dataset0['COMPARE'])

Features importance: [25.60114514 17.79274496 24.35921311 32.24689679]
Spearman correlation: SpearmanrResult(correlation=-0.39090909090909093, pvalue=0.23454006709519432)
Features importance: [24.95096104 15.01998189 24.65113779 35.37791927]
Spearman correlation: SpearmanrResult(correlation=-0.19452977397762067, pvalue=0.5902067758594245)
Features importance: [18.07772474 20.67008473 28.94158935 32.31060119]
Spearman correlation: SpearmanrResult(correlation=0.3696969696969697, pvalue=0.2930500752928223)
Features importance: [17.30996339 27.59452502 25.74146845 29.35404314]
Spearman correlation: SpearmanrResult(correlation=0.6, pvalue=0.06668799999999996)
Features importance: [21.8409625  27.07478616 20.26855983 30.81569151]
Spearman correlation: SpearmanrResult(correlation=0.11515151515151514, pvalue=0.7514196523258483)
Features importance: [20.6600691  23.10549687 22.64466091 33.58977312]
Spearman correlation: SpearmanrResult(correlation=0.22424242424242422, pvalue=0.5334005612725947)

In [80]:
run_cv_cb(X, np.array([np.abs(elem) for elem in dataset0['delta_later']]))

Features importance: [13.64741411 17.72618252 24.31692964 44.30947372]
Spearman correlation: SpearmanrResult(correlation=0.173121178079718, pvalue=0.6107177393174041)
Features importance: [15.74206996 22.10058045 22.23599977 39.92134982]
Spearman correlation: SpearmanrResult(correlation=0.28484848484848485, pvalue=0.42503815489214547)
Features importance: [16.89570779 21.2521629  26.46091672 35.39121259]
Spearman correlation: SpearmanrResult(correlation=0.735565707852878, pvalue=0.01532345637780498)
Features importance: [16.810539   23.18903294 22.49310279 37.50732527]
Spearman correlation: SpearmanrResult(correlation=-0.38181818181818183, pvalue=0.27625533338543595)
Features importance: [19.67107173 21.5271754  21.56654652 37.23520634]
Spearman correlation: SpearmanrResult(correlation=0.35757575757575755, pvalue=0.3103760917056799)
Features importance: [21.45903204 23.32927176 24.53970436 30.67199185]
Spearman correlation: SpearmanrResult(correlation=0.22424242424242422, pvalue=0.5334

In [82]:
X = [[] for _ in range(dataset0.shape[0])]

for i in range(dataset0.shape[0]):
    X[i].append(kmeans_jsd[i])
    X[i].append(x_cosine[i])
    
run_cv_cb(X, dataset0['COMPARE'])

Features importance: [48.24462627 51.75537373]
Spearman correlation: SpearmanrResult(correlation=0.018181818181818184, pvalue=0.9576852410216187)
Features importance: [44.23179341 55.76820659]
Spearman correlation: SpearmanrResult(correlation=-0.21276694028802262, pvalue=0.5550759959985581)
Features importance: [54.34514942 45.65485058]
Spearman correlation: SpearmanrResult(correlation=0.6565379871744698, pvalue=0.039204386332556795)
Features importance: [48.28807939 51.71192061]
Spearman correlation: SpearmanrResult(correlation=0.5272727272727272, pvalue=0.11730806555020223)
Features importance: [59.99034328 40.00965672]
Spearman correlation: SpearmanrResult(correlation=0.41818181818181815, pvalue=0.22911284098281892)
Features importance: [52.08848137 47.91151863]
Spearman correlation: SpearmanrResult(correlation=0.10303030303030303, pvalue=0.7769984634438898)
Features importance: [54.27789586 45.72210414]
Spearman correlation: SpearmanrResult(correlation=0.4545454545454545, pvalue=0.

In [84]:
run_cv_lr(X, dataset0['COMPARE'])

Spearman correlation: SpearmanrResult(correlation=-0.10909090909090911, pvalue=0.7495085962341482)
Spearman correlation: SpearmanrResult(correlation=-0.12158110873601292, pvalue=0.7379379712336098)
Spearman correlation: SpearmanrResult(correlation=0.4424242424242424, pvalue=0.20042268671194224)
Spearman correlation: SpearmanrResult(correlation=0.8060606060606059, pvalue=0.004862061101964606)
Spearman correlation: SpearmanrResult(correlation=0.5151515151515151, pvalue=0.12755286970323426)
Spearman correlation: SpearmanrResult(correlation=0.28484848484848485, pvalue=0.42503815489214547)
Spearman correlation: SpearmanrResult(correlation=0.6121212121212121, pvalue=0.05997214247725524)
Mean score is: 0.41303972549042606


## LinearRegressor cross validation

In [83]:
def run_cv_lr(X, y):
    kf = KFold(n_splits=7)
    test_scores = []

    for train_idx, test_idx in kf.split(X):
        x_train, x_test = np.array(X)[train_idx], np.array(X)[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        clf = LinearRegression()
        clf.fit(x_train, y_train)
        pred = clf.predict(x_test)
        test_scores.append(spearmanr(pred, y_test))
        print('Spearman correlation:', test_scores[-1])

    print('Mean score is:', np.mean([np.abs(score[0]) for score in test_scores]))