In [2]:
import numpy as np
import pandas as pd

In [3]:
presov = np.load('bert/presov.npz')
sov = np.load('bert/sov.npz')
postsov = np.load('bert/postsov.npz')

In [71]:
dataset0 = pd.read_csv('context-diachrony/datasets/dataset_0_testset.tsv', sep='\t')

In [72]:
dataset0 = dataset0.sample(frac=1).reset_index(drop=True)

In [73]:
dataset0.head()

Unnamed: 0,word,COMPARE,EARLIER,LATER,delta_later
0,участок,1.9,2.0,2.12,0.12
1,обвинитель,3.48,3.26,3.43,0.17
2,серединный,2.33,2.21,2.11,-0.1
3,присутствие,3.16,2.89,3.01,0.12
4,тройка,2.5,3.33,1.43,-1.9


In [131]:
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn.linear_model import LinearRegression
from scipy.spatial.distance import jensenshannon as jsd
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr
from sklearn.model_selection import KFold
from tqdm import tqdm

K = 5

## Kmeans (k = 5) + Jensenshannon

In [24]:
def calc_kmeans_jsd(words, X, period1, period2):
    for idx, word in enumerate(words):
        emb1 = period1.get(word)
        emb2 = period2.get(word)
        if emb1.shape[0] > 1000:
            rand_idxs = np.random.choice(emb1.shape[0], 1000, replace=False)
            emb1 = emb1[rand_idxs]
        if emb2.shape[0] > 1000:
            rand_idxs = np.random.choice(emb2.shape[0], 1000, replace=False)
            emb2 = emb2[rand_idxs]
        
        embs = []
        embs.extend(emb1)
        embs.extend(emb2)

        kmeans = KMeans(n_clusters=K, random_state=42).fit(embs)
        dist1 = [np.count_nonzero(kmeans.labels_[:len(emb1)] == i) / len(emb1) for i in range(K)]
        dist2 = [np.count_nonzero(kmeans.labels_[len(emb1):] == i) / len(emb2) for i in range(K)]

        X[idx].append(jsd(dist1, dist2))

## Kmeans + max(square())

In [25]:
def calc_kmeans_max_square(words, X, period1, period2):
    for idx, word in enumerate(words):
        emb1 = period1.get(word)
        emb2 = period2.get(word)
        if emb1.shape[0] > 1000:
            rand_idxs = np.random.choice(emb1.shape[0], 1000, replace=False)
            emb1 = emb1[rand_idxs]
        if emb2.shape[0] > 1000:
            rand_idxs = np.random.choice(emb2.shape[0], 1000, replace=False)
            emb2 = emb2[rand_idxs]
        
        embs = []
        embs.extend(emb1)
        embs.extend(emb2)

        kmeans = KMeans(n_clusters=K, random_state=42).fit(embs)
        dist1 = [np.count_nonzero(kmeans.labels_[:len(emb1)] == i) / len(emb1) for i in range(K)]
        dist2 = [np.count_nonzero(kmeans.labels_[len(emb1):] == i) / len(emb2) for i in range(K)]

        X[idx].append(np.max(np.square(dist1) - np.square(dist2)))

## AffinityPropagation + Jensenshannon

In [140]:
def calc_aff_jsd(words, X, period1, period2):
    for idx, word in tqdm(enumerate(words)):
        emb1 = period1.get(word)
        emb2 = period2.get(word)
        if emb1.shape[0] > 1000:
            rand_idxs = np.random.choice(emb1.shape[0], 1000, replace=False)
            emb1 = emb1[rand_idxs]
        if emb2.shape[0] > 1000:
            rand_idxs = np.random.choice(emb2.shape[0], 1000, replace=False)
            emb2 = emb2[rand_idxs]
        
        embs = []
        embs.extend(emb1)
        embs.extend(emb2)
        
        clustering = AffinityPropagation(random_state=42).fit(embs)
        dist1 = [np.count_nonzero(clustering.labels_[:len(emb1)] == i) / len(emb1) 
                 for i in range(np.max(clustering.labels_))]
        dist2 = [np.count_nonzero(clustering.labels_[len(emb1):] == i) / len(emb2) 
                 for i in range(np.max(clustering.labels_))]

        X[idx].append(jsd(dist1, dist2))

## Cosine(mean())

In [27]:
def calc_cosine(words, X, period1, period2):
    for idx, word in enumerate(words): 
        emb1 = period1.get(word)
        emb2 = period2.get(word)
        X[idx].append(cosine(np.mean(emb1, axis=0), np.mean(emb2, axis=0)))

## Freq1, Freq2, Freq1 / Freq2

In [130]:
def calc_freq(words, X, period1, period2):
    for idx, word in enumerate(words): 
        emb1 = period1.get(word)
        emb2 = period2.get(word)
        X[idx].extend([len(emb1), len(emb2), len(emb1) / len(emb2)])

## Сначала посчитаем корреляции с отдельными признаками

In [132]:
X = [[] for _ in range(dataset0.shape[0])]
y = dataset0['COMPARE']

calc_kmeans_max_square(dataset0['word'], X, presov, sov)
spearmanr(X, y)

SpearmanrResult(correlation=-0.19106870488655406, pvalue=0.11045759496058974)

In [133]:
X = [[] for _ in range(dataset0.shape[0])]
y = dataset0['COMPARE']

calc_kmeans_jsd(dataset0['word'], X, presov, sov)
spearmanr(X, y)

SpearmanrResult(correlation=-0.40459142029119627, pvalue=0.0004660609752127064)

In [135]:
X = [[] for _ in range(dataset0.shape[0])]
y = dataset0['COMPARE']

calc_cosine(dataset0['word'], X, presov, sov)
spearmanr(X, y)

SpearmanrResult(correlation=-0.5313668642917534, pvalue=1.86266882459779e-06)

In [136]:
X = [[] for _ in range(dataset0.shape[0])]
y = dataset0['COMPARE']

calc_aff_jsd(dataset0['word'], X, presov, sov)
spearmanr(X, y)

71it [06:04,  5.13s/it]


SpearmanrResult(correlation=-0.3626081084082658, pvalue=0.0018857695118743574)

In [141]:
def run_cv(X, y):
    kf = KFold(n_splits=7)
    test_scores = []

    for train_idx, test_idx in kf.split(X):
        x_train, x_test = np.array(X)[train_idx], np.array(X)[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        clf = LinearRegression()
        clf.fit(x_train, y_train)
        pred = clf.predict(x_test)
        test_scores.append(spearmanr(pred, y_test))
        print('Spearman correlation:', test_scores[-1])

    print('Mean score is:', np.mean([np.abs(score[0]) for score in test_scores]))

## Посчитаем корреляцию модели, обученной на частотах слов

In [142]:
X = [[] for _ in range(dataset0.shape[0])]
y = dataset0['COMPARE']

calc_freq(dataset0['word'], X, presov, sov)

run_cv(X, y)

Spearman correlation: SpearmanrResult(correlation=0.18181818181818182, pvalue=0.5926152128455)
Spearman correlation: SpearmanrResult(correlation=0.16363636363636364, pvalue=0.6514773427962428)
Spearman correlation: SpearmanrResult(correlation=0.2857156055296304, pvalue=0.42356778382654603)
Spearman correlation: SpearmanrResult(correlation=0.2121212121212121, pvalue=0.5563057751029299)
Spearman correlation: SpearmanrResult(correlation=-0.13939393939393938, pvalue=0.7009318849100584)
Spearman correlation: SpearmanrResult(correlation=0.7696969696969697, pvalue=0.009221952722215994)
Spearman correlation: SpearmanrResult(correlation=0.8389096502784891, pvalue=0.0024139883688156136)
Mean score is: 0.37018456035354086


## Добавим к ней признак kmeans + jensenshannon

In [143]:
calc_kmeans_jsd(dataset0['word'], X, presov, sov)
run_cv(X, y)

Spearman correlation: SpearmanrResult(correlation=0.5818181818181819, pvalue=0.060419896215648305)
Spearman correlation: SpearmanrResult(correlation=0.24848484848484845, pvalue=0.48877630451924314)
Spearman correlation: SpearmanrResult(correlation=0.33434804902403553, pvalue=0.34503632799221085)
Spearman correlation: SpearmanrResult(correlation=0.4545454545454545, pvalue=0.186904810349332)
Spearman correlation: SpearmanrResult(correlation=0.33333333333333326, pvalue=0.34659350708733405)
Spearman correlation: SpearmanrResult(correlation=0.5393939393939393, pvalue=0.1075931877824148)
Spearman correlation: SpearmanrResult(correlation=0.7841981513472833, pvalue=0.007245486403052556)
Mean score is: 0.468017422563868


## Добавим к ним cosine

In [144]:
calc_cosine(dataset0['word'], X, presov, sov)
run_cv(X, y)

Spearman correlation: SpearmanrResult(correlation=0.5636363636363637, pvalue=0.07095173447637536)
Spearman correlation: SpearmanrResult(correlation=0.24848484848484845, pvalue=0.48877630451924314)
Spearman correlation: SpearmanrResult(correlation=0.33434804902403553, pvalue=0.34503632799221085)
Spearman correlation: SpearmanrResult(correlation=0.4545454545454545, pvalue=0.186904810349332)
Spearman correlation: SpearmanrResult(correlation=0.23636363636363633, pvalue=0.5108853175152002)
Spearman correlation: SpearmanrResult(correlation=0.5393939393939393, pvalue=0.1075931877824148)
Spearman correlation: SpearmanrResult(correlation=0.4559291577600484, pvalue=0.18539676445371583)
Mean score is: 0.40467163560118946
