In [50]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr

## Pre-Soviet vs. Soviet

In [81]:
# Annotation scores for the first period pair. Per the original note, the file
# was pre-filtered to words with mean COMPARE < 3 or delta_later > 0.2.
dataset = pd.read_csv(
    "dataset_0_DUREL_filtered.tsv",
    sep="\t",
)
dataset.head()

Unnamed: 0,word,COMPARE,EARLIER,LATER,delta_later
0,влажный,2.82,3.03,2.59,-0.44
1,дума,2.25,2.38,2.3,-0.08
2,заведующий,3.31,3.02,3.38,0.36
3,заключенный,1.71,2.49,3.4,0.91
4,защитник,2.66,2.9,2.61,-0.29


In [82]:
# Rows ordered by |delta_later|, largest first; the index is renumbered so a
# row's label equals its position in this ranking.
ranked_delta = (
    dataset
    .reindex(dataset["delta_later"].abs().sort_values(ascending=False).index)
    .reset_index(drop=True)
)
ranked_delta

Unnamed: 0,word,COMPARE,EARLIER,LATER,delta_later
0,тройка,2.5,3.33,1.43,-1.9
1,заключенный,1.71,2.49,3.4,0.91
2,свалка,1.9,3.09,2.34,-0.75
3,насаждение,2.48,2.11,2.78,0.67
4,палата,1.46,2.16,1.53,-0.63
5,отдел,2.55,2.13,2.74,0.61
6,червяк,1.92,2.48,1.9,-0.58
7,трибунал,2.98,2.8,3.37,0.57
8,привет,3.24,3.03,3.55,0.52
9,особа,2.95,2.79,3.3,0.51


In [83]:
# Rows ordered by COMPARE, smallest first; the index is renumbered so a row's
# label equals its position in this ranking.
ranked_compare = (
    dataset
    .reindex(dataset["COMPARE"].sort_values(ascending=True).index)
    .reset_index(drop=True)
)
ranked_compare

Unnamed: 0,word,COMPARE,EARLIER,LATER,delta_later
0,палата,1.46,2.16,1.53,-0.63
1,управление,1.66,2.07,2.18,0.11
2,заключенный,1.71,2.49,3.4,0.91
3,пионер,1.86,2.77,2.64,-0.13
4,свалка,1.9,3.09,2.34,-0.75
5,участок,1.9,2.0,2.12,0.12
6,элемент,1.91,2.15,1.86,-0.29
7,червяк,1.92,2.48,1.9,-0.58
8,трубка,2.04,2.08,2.12,0.04
9,отделение,2.09,2.09,2.18,0.09


In [84]:
# Target words as a plain list, in the dataset's row order.
target_words = dataset["word"].tolist()

In [21]:
# Token embeddings were extracted with https://github.com/ltgoslo/simple_elmo.
# Each .npz archive is indexed by word (see the lookups further down).
array0 = np.load('presoviet_incr_embs_0.npz')
array1 = np.load('soviet_incr_embs_0.npz')

# Number of entries in each archive, one per line.
print(len(array0), len(array1), sep='\n')

71
71


In [15]:
# Peek at one word's entry in the earlier-period archive: shape first, then
# the raw array as the cell output.
sobor_tokens = array0['собор']
print(sobor_tokens.shape)
sobor_tokens

(5384, 1024)


array([[-0.72170472,  0.09754667,  0.73101729, ..., -0.84733498,
        -0.20107049,  0.36461118],
       [-0.47427469,  0.004144  ,  0.08054993, ..., -0.65336466,
        -0.59087026, -0.26150763],
       [-1.70223987, -0.26661727,  0.1770528 , ..., -0.13750337,
        -0.12923655,  0.34274617],
       ...,
       [-0.58482593, -0.21871354, -0.39251336, ..., -0.1588814 ,
        -0.10788606, -0.39430529],
       [-0.33000177,  0.23090473,  0.17835674, ..., -0.50444502,
        -0.12523156,  0.37941808],
       [-0.81413406, -0.49671683,  0.09676844, ..., -1.15526485,
         0.10718139,  0.4539662 ]])

In [10]:
# Mean over axis 0 collapses the stacked rows into a single vector for the word.
np.mean(array0['собор'], axis=0)

array([-0.80992817, -0.14719359,  0.01371795, ..., -0.46227408,
       -0.34614951,  0.10017612])

In [85]:
# For every target word, average its embeddings within each period and store
# the cosine distance between the two averaged vectors as (word, distance).
cosines = []
for word in target_words:
    centroids = [np.mean(archive[word], axis=0) for archive in (array0, array1)]
    cosines.append((word, cosine(*centroids)))

In [86]:
# Sanity check: first three (word, cosine distance) pairs.
cosines[0:3]

[('влажный', 0.17390895976294163),
 ('дума', 0.22563745040880978),
 ('заведующий', 0.19392648322273387)]

In [87]:
# Words ordered from the largest cosine distance to the smallest; only the
# words are kept, so a word's list position is its rank.
sorted_cosines = [
    ranked_word
    for ranked_word, _ in sorted(cosines, key=lambda pair: pair[1], reverse=True)
]
sorted_cosines

['заключенный',
 'отдел',
 'пионер',
 'отделение',
 'машина',
 'пакет',
 'дума',
 'элемент',
 'участок',
 'привет',
 'палата',
 'пружина',
 'трубка',
 'заведующий',
 'уклон',
 'управа',
 'управление',
 'стиль',
 'секция',
 'защитник',
 'влажный',
 'червяк',
 'собор',
 'тройка',
 'трибунал',
 'знатный',
 'насаждение',
 'молодец',
 'свалка',
 'передовой',
 'серединный',
 'особа',
 'мазь',
 'классный']

In [88]:
# Look up every target word's 0-based position in each of the three rankings.
#
# Each ranking is turned into a {word: position} map once, instead of
# rebuilding `ranked_*.word.tolist()` and running a linear `.index()` scan for
# every word inside the loop (accidentally quadratic).  The maps are filled
# back-to-front so that, if a word were repeated, its FIRST position wins --
# the same answer `list.index` gives.
#
# A word missing from a ranking raises KeyError (the old code raised
# ValueError).
#
# NOTE(review): positions are distinct even for tied scores (e.g. two words
# with COMPARE == 1.9 land on positions 4 and 5), so correlations computed
# from these lists do not use averaged ranks for ties -- confirm this is
# intended.
cos_position = {
    w: i for i, w in reversed(list(enumerate(sorted_cosines)))
}
delta_position = {
    w: i for i, w in reversed(list(enumerate(ranked_delta.word.tolist())))
}
compare_position = {
    w: i for i, w in reversed(list(enumerate(ranked_compare.word.tolist())))
}

ranks_cos = [cos_position[word] for word in target_words]
ranks_delta = [delta_position[word] for word in target_words]
ranks_compare = [compare_position[word] for word in target_words]

In [89]:
# Rank agreement between the cosine-distance ordering and the |delta_later|
# ordering.
coef, p = spearmanr(ranks_cos, ranks_delta)
print(
    'Spearmans correlation coefficient: %.3f' % coef,
    'p=%.3f' % p,
    sep='\n',
)

Spearmans correlation coefficient: 0.001
p=0.995


In [90]:
# Rank agreement between the cosine-distance ordering and the COMPARE ordering.
coef, p = spearmanr(ranks_cos, ranks_compare)
print(
    'Spearmans correlation coefficient: %.3f' % coef,
    'p=%.3f' % p,
    sep='\n',
)

Spearmans correlation coefficient: 0.346
p=0.045


In [91]:
# Hand-computed Spearman coefficient for the cosine vs. delta ranks, using the
# closed form 1 - 6 * sum(d^2) / (n^3 - n) on the per-word rank differences.
d = [rank_cos - rank_delta for rank_cos, rank_delta in zip(ranks_cos, ranks_delta)]
d_2 = [gap * gap for gap in d]
summ = sum(d_2)
n_pairs = len(d)
print(1 - 6 * summ / (n_pairs ** 3 - n_pairs))

0.0010695187165775666


## Soviet vs. Post-Soviet

In [107]:
# Annotation scores for the second period pair (same columns as the first).
dataset = pd.read_csv(
    "dataset_1_DUREL_filtered.tsv",
    sep="\t",
)
dataset.head()

Unnamed: 0,word,COMPARE,EARLIER,LATER,delta_later
0,блочный,2.15,2.57,2.63,0.06
1,боевик,2.25,3.28,2.82,-0.46
2,ботаник,2.52,3.2,2.16,-1.04
3,взлом,3.26,3.4,3.03,-0.37
4,выплеск,3.16,2.76,3.68,0.92


In [108]:
# Rows ordered by |delta_later|, largest first; the index is renumbered so a
# row's label equals its position in this ranking.
ranked_delta = (
    dataset
    .reindex(dataset["delta_later"].abs().sort_values(ascending=False).index)
    .reset_index(drop=True)
)
ranked_delta

Unnamed: 0,word,COMPARE,EARLIER,LATER,delta_later
0,провальный,1.96,1.87,3.65,1.78
1,инкубатор,2.19,3.54,2.49,-1.05
2,ботаник,2.52,3.2,2.16,-1.04
3,рыночный,2.36,2.33,3.37,1.04
4,выплеск,3.16,2.76,3.68,0.92
5,рождество,3.15,3.6,2.69,-0.91
6,удвоение,2.98,2.87,3.7,0.83
7,орбита,2.53,2.26,3.07,0.81
8,дивизион,3.58,3.79,3.01,-0.78
9,карта,1.85,2.28,1.58,-0.7


In [109]:
# Rows ordered by COMPARE, smallest first; the index is renumbered so a row's
# label equals its position in this ranking.
ranked_compare = (
    dataset
    .reindex(dataset["COMPARE"].sort_values(ascending=True).index)
    .reset_index(drop=True)
)
ranked_compare

Unnamed: 0,word,COMPARE,EARLIER,LATER,delta_later
0,подстава,1.0,3.3,2.8,-0.5
1,четырехлетка,1.07,3.07,2.8,-0.27
2,драйвер,1.29,3.0,2.8,-0.2
3,двушка,1.38,2.65,2.56,-0.09
4,классный,1.43,2.44,2.17,-0.27
5,пакет,1.65,2.78,2.23,-0.55
6,корпус,1.77,2.26,1.95,-0.31
7,рынок,1.82,2.59,3.1,0.51
8,карта,1.85,2.28,1.58,-0.7
9,завязка,1.86,2.07,1.77,-0.3


In [110]:
# Target words as a plain list, in the dataset's row order.
target_words = dataset["word"].tolist()

In [111]:
# Token embeddings were extracted with https://github.com/ltgoslo/simple_elmo.
# Each .npz archive is indexed by word (see the lookups further down).
array0 = np.load('soviet_incr_embs_1.npz')
array1 = np.load('postsoviet_incr_embs_1.npz')

# Number of entries in each archive, one per line.
print(len(array0), len(array1), sep='\n')

69
69


In [112]:
# For every target word, average its embeddings within each period and store
# the cosine distance between the two averaged vectors as (word, distance).
cosines = []
for word in target_words:
    centroids = [np.mean(archive[word], axis=0) for archive in (array0, array1)]
    cosines.append((word, cosine(*centroids)))

In [113]:
# Words ordered from the largest cosine distance to the smallest; only the
# words are kept, so a word's list position is its rank.
sorted_cosines = [
    ranked_word
    for ranked_word, _ in sorted(cosines, key=lambda pair: pair[1], reverse=True)
]
sorted_cosines

['четырехлетка',
 'рынок',
 'исполнитель',
 'орбита',
 'флакон',
 'приставка',
 'драйвер',
 'раскрутка',
 'двушка',
 'подстава',
 'ботаник',
 'блочный',
 'рождество',
 'классный',
 'донорство',
 'рыночный',
 'гермафродит',
 'дивизион',
 'стачка',
 'глянец',
 'знамя',
 'модель',
 'перестройка',
 'провальный',
 'пакет',
 'стиль',
 'корпус',
 'удвоение',
 'боевик',
 'завязка',
 'однодневка',
 'инкубатор',
 'приношение',
 'мыло',
 'погрешность',
 'ломберный',
 'взлом',
 'игра',
 'свинцовый',
 'выплеск',
 'загрузка',
 'карта',
 'высоколобый']

In [114]:
# Look up every target word's 0-based position in each of the three rankings.
#
# Each ranking is turned into a {word: position} map once, instead of
# rebuilding `ranked_*.word.tolist()` and running a linear `.index()` scan for
# every word inside the loop (accidentally quadratic).  The maps are filled
# back-to-front so that, if a word were repeated, its FIRST position wins --
# the same answer `list.index` gives.
#
# A word missing from a ranking raises KeyError (the old code raised
# ValueError).
#
# NOTE(review): positions are distinct even for tied scores (e.g. two words
# with |delta_later| == 1.04 land on positions 2 and 3), so correlations
# computed from these lists do not use averaged ranks for ties -- confirm this
# is intended.
cos_position = {
    w: i for i, w in reversed(list(enumerate(sorted_cosines)))
}
delta_position = {
    w: i for i, w in reversed(list(enumerate(ranked_delta.word.tolist())))
}
compare_position = {
    w: i for i, w in reversed(list(enumerate(ranked_compare.word.tolist())))
}

ranks_cos = [cos_position[word] for word in target_words]
ranks_delta = [delta_position[word] for word in target_words]
ranks_compare = [compare_position[word] for word in target_words]

In [115]:
# Rank agreement between the cosine-distance ordering and the |delta_later|
# ordering.
coef, p = spearmanr(ranks_cos, ranks_delta)
print(
    'Spearmans correlation coefficient: %.3f' % coef,
    'p=%.3f' % p,
    sep='\n',
)

Spearmans correlation coefficient: -0.072
p=0.645


In [116]:
# Rank agreement between the cosine-distance ordering and the COMPARE ordering.
coef, p = spearmanr(ranks_cos, ranks_compare)
print(
    'Spearmans correlation coefficient: %.3f' % coef,
    'p=%.3f' % p,
    sep='\n',
)

Spearmans correlation coefficient: 0.223
p=0.150
