## Spearman's rank correlation for German between Self-Similarities and gold standard for graded task

In [292]:
import csv 
import pandas as pd
import numpy as np
from scipy import stats

Reading the per-layer self-similarity tables for both corpora (DTA and BZND) from CSV files

In [293]:
selfsim_dta = pd.read_csv('self_sim_by_layer_dta.csv', delimiter=',')

In [294]:
selfsim_bznd = pd.read_csv('self_sim_by_layer_bznd.csv', delimiter=',')

Self-similarities by layer for a specific word from DTA corpus

In [295]:
selfsim_dta.loc[selfsim_dta['word'] == 'Truppenteil']

Unnamed: 0,word,1,2,3,4,5,6,7,8,9,10,11,12
8,Truppenteil,0.888258,0.886599,0.882489,0.851886,0.812701,0.80664,0.779088,0.803833,0.799936,0.851698,0.885289,0.836558


Self-similarities by layer for a specific word from BZND corpus

In [296]:
selfsim_bznd.loc[selfsim_bznd['word'] == 'Truppenteil']

Unnamed: 0,word,1,2,3,4,5,6,7,8,9,10,11,12
17,Truppenteil,0.899127,0.895325,0.888385,0.850931,0.805687,0.78724,0.755274,0.786764,0.783913,0.843833,0.875198,0.823644


In [297]:
def get_substraction_abs(w):
    """Return the per-layer absolute self-similarity difference for word `w`.

    The word is looked up in the module-level `selfsim_dta` and
    `selfsim_bznd` tables. From each table the first row whose 'word'
    equals `w` is taken, the 'word' column is removed, and the remaining
    values are subtracted element-wise.

    Returns a NumPy array holding the absolute differences.
    """
    def first_row_without_word(table):
        # First matching row, as an array of everything except 'word'.
        matching = table.loc[table['word'] == w]
        return np.array(matching.drop(columns=['word']).values[0])

    dta_values = first_row_without_word(selfsim_dta)
    bznd_values = first_row_without_word(selfsim_bznd)

    return np.abs(dta_values - bznd_values)

In [298]:
get_substraction_abs('Truppenteil')

array([0.01086935, 0.00872585, 0.00589665, 0.00095447, 0.00701378,
       0.01939981, 0.023814  , 0.01706849, 0.0160228 , 0.00786435,
       0.01009153, 0.01291421])

In [299]:
# rows will store one list per word: the word itself followed by its
# per-layer absolute differences from get_substraction_abs
rows = []

for word in selfsim_dta['word'].tolist():
    row = [word, *get_substraction_abs(word)]
    rows.append(row)

In [300]:
head = ['word',1,2,3,4,5,6,7,8,9,10,11,12]

In [301]:
# Write the header line followed by one line per word.
# The words contain non-ASCII characters (e.g. 'Ackergerät', 'Engpaß'), so the
# encoding is pinned to UTF-8 instead of depending on the platform default;
# pandas.read_csv decodes as UTF-8 by default when this file is read back.
with open('self_sim_by_layer_abs.csv', 'w', newline='', encoding='utf-8') as csvfile: # output path
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(head)
    writer.writerows(rows)

## Calculating Spearman's correlation between Self-Similarities from 12 layers and gold standard for graded task

In [302]:
golden = pd.read_csv('graded.tsv', delimiter='\t') 

In [303]:
golden = golden.sort_values('word')

In [304]:
golden

Unnamed: 0,word,change graded
3,Abgesang,0.578548
4,Ackergerät,0.0
5,Armenhaus,0.51967
8,Ausnahmegesetz,0.093138
11,Dynamik,0.578845
12,Einreichung,0.0
13,Eintagsfliege,0.66006
14,Engpaß,0.819957
15,Entscheidung,0.141681
16,Festspiel,0.100364


In [305]:
self_sim_sumbstracted = pd.read_csv('self_sim_by_layer_abs.csv', delimiter=',')  

In [306]:
self_sim_sumbstracted = self_sim_sumbstracted.sort_values('word')

In [307]:
# Print both sorted word columns side by side so that any row where the
# two tables disagree is visible before the gold scores are attached.
for own_word, gold_word in zip(self_sim_sumbstracted['word'], golden['word']):
    print(f'{own_word}:{gold_word}')

Abgesang:Abgesang
Ackergerät:Ackergerät
Armenhaus:Armenhaus
Ausnahmegesetz:Ausnahmegesetz
Dynamik:Dynamik
Einreichung:Einreichung
Eintagsfliege:Eintagsfliege
Engpaß:Engpaß
Entscheidung:Entscheidung
Festspiel:Festspiel
Frechheit:Frechheit
Fuß:Fuß
Gesichtsausdruck:Gesichtsausdruck
Knotenpunkt:Knotenpunkt
Kubikmeter:Kubikmeter
Lyzeum:Lyzeum
Manschette:Manschette
Mißklang:Mißklang
Mulatte:Mulatte
Naturschönheit:Naturschönheit
Ohrwurm:Ohrwurm
Pachtzins:Pachtzins
Rezeption:Rezeption
Schmiere:Schmiere
Seminar:Seminar
Sensation:Sensation
Spielball:Spielball
Tier:Tier
Titel:Titel
Tragfähigkeit:Tragfähigkeit
Truppenteil:Truppenteil
Unentschlossenheit:Unentschlossenheit
abbauen:abbauen
abdecken:abdecken
abgebrüht:abgebrüht
artikulieren:artikulieren
aufrechterhalten:aufrechterhalten
ausspannen:ausspannen
beimischen:beimischen
packen:packen
verbauen:verbauen
vergönnen:vergönnen
voranstellen:voranstellen
vorliegen:vorliegen
vorweisen:vorweisen
weitgreifend:weitgreifend
zersetzen:zersetzen
überspanne

In [308]:
golden = np.array(golden.drop(['word'], axis=1))

In [309]:
golden = list(golden.flatten())

In [310]:
self_sim_sumbstracted['golden'] = golden

In [311]:
self_sim_sumbstracted

Unnamed: 0,word,1,2,3,4,5,6,7,8,9,10,11,12,golden
32,Abgesang,0.01102,0.002561,0.011353,0.021235,0.017829,0.001356,0.023324,0.037613,0.052643,0.055707,0.047004,0.058954,0.578548
40,Ackergerät,0.007444,0.008843,0.005375,0.002555,0.002796,0.006492,0.003746,0.006393,0.010627,0.010511,0.00837,0.008637,0.0
24,Armenhaus,0.033176,0.025343,0.030105,0.036355,0.038043,0.034922,0.028176,0.022432,0.014636,0.008584,0.003985,0.008342,0.51967
17,Ausnahmegesetz,0.002391,0.005811,0.004584,0.002632,0.00146,0.005151,0.000483,0.00073,0.002316,0.000371,0.007439,0.013079,0.093138
33,Dynamik,0.00919,0.006713,0.011924,0.014094,0.003301,0.007499,0.017261,0.01702,0.016765,0.011824,0.00714,0.001908,0.578845
22,Einreichung,0.000956,0.002096,0.003,0.006522,0.014963,0.01144,0.021542,0.019075,0.020102,0.016003,0.012505,0.008955,0.0
41,Eintagsfliege,0.018112,0.015586,0.017446,0.021537,0.0299,0.04095,0.050922,0.05098,0.054668,0.04449,0.03119,0.048582,0.66006
47,Engpaß,0.02057,0.014161,0.017895,0.0234,0.031453,0.014504,0.018184,0.008112,0.007458,0.004278,0.002647,0.002393,0.819957
2,Entscheidung,0.002281,0.013102,0.020484,0.016108,0.016498,0.021139,0.029649,0.02872,0.039561,0.0342,0.0281,0.034726,0.141681
34,Festspiel,0.007283,0.008843,0.007165,0.011196,0.015295,0.016453,0.022078,0.027105,0.030338,0.023218,0.013634,0.018352,0.100364


In [312]:
stats.spearmanr(golden, list(self_sim_sumbstracted['3']))

SpearmanrResult(correlation=0.2015019948006291, pvalue=0.16963370524959254)

In [313]:
def get_spearman_by_layer(golden_labels, layers=range(1, 13), frame=None):
    """Compute Spearman's rank correlation between gold labels and each layer.

    Parameters
    ----------
    golden_labels : sequence of float
        Gold-standard scores, given in the same row order as `frame`.
    layers : iterable of int, optional
        Layer numbers to evaluate. Each one is looked up in `frame` as the
        column named ``str(layer)``. Defaults to layers 1 through 12.
    frame : pandas.DataFrame, optional
        Table with one column per layer. When omitted, the module-level
        `self_sim_sumbstracted` table is used.

    Returns
    -------
    list of [layer, correlation, pvalue]
        One entry per requested layer, in the order given by `layers`.
    """
    if frame is None:
        frame = self_sim_sumbstracted

    results = []
    for layer in layers:
        # Unpack the result as (correlation, pvalue) rather than reading a
        # named attribute, so the code does not depend on the attribute name
        # SciPy uses for the coefficient.
        correlation, pvalue = stats.spearmanr(golden_labels, frame[str(layer)])
        results.append([layer, correlation, pvalue])
    return results

In [314]:
rows = get_spearman_by_layer(golden)

In [315]:
rows

[[1, 0.1682628698483655, 0.2529539651491603],
 [2, 0.24355411196609514, 0.09528835036577149],
 [3, 0.2015019948006291, 0.16963370524959254],
 [4, 0.17685825731556296, 0.22916284831249964],
 [5, 0.13170527251952566, 0.37221932110290334],
 [6, 0.030736670373206113, 0.8357087910020948],
 [7, 0.04118169818144606, 0.7810797358869471],
 [8, -0.025242150536579886, 0.8647742622922456],
 [9, -0.16265954763873677, 0.26932909976355557],
 [10, -0.12468751946086443, 0.398450581389707],
 [11, -0.17571583239903674, 0.23223292719921998],
 [12, -0.07975213941083215, 0.590001543307526]]

In [316]:
header = ['layer', 'correlation', 'pvalue']

In [317]:
# Save the per-layer correlation table: the header line first, then one
# line per [layer, correlation, pvalue] entry.
with open('spearman_german.csv', 'w', newline='') as out_file:  # output path
    csv.writer(out_file, delimiter=',').writerows([header, *rows])

In [325]:
spearman_ranked = pd.read_csv('spearman_german.csv', delimiter=',')

### Result: correlation of self-similarities by layer

In [326]:
spearman_ranked

Unnamed: 0,layer,correlation,pvalue
0,1,0.168263,0.252954
1,2,0.243554,0.095288
2,3,0.201502,0.169634
3,4,0.176858,0.229163
4,5,0.131705,0.372219
5,6,0.030737,0.835709
6,7,0.041182,0.78108
7,8,-0.025242,0.864774
8,9,-0.16266,0.269329
9,10,-0.124688,0.398451


### Correlation for summed embeddings

In [327]:
selfsim_summed_dta = pd.read_csv('self_sim_summed_dta.csv', delimiter=',').sort_values('word')

In [328]:
selfsim_summed_bznd = pd.read_csv('self_sim_summed_bznd.csv', delimiter=',').sort_values('word')

In [333]:
def get_substraction_abs2(w):
    """Return the absolute difference of the 'self-sim' value for word `w`.

    Reads the first 'self-sim' entry whose 'word' equals `w` from each of
    the module-level tables `selfsim_summed_dta` and `selfsim_summed_bznd`
    and returns the absolute value of their difference.
    """
    in_dta = selfsim_summed_dta['word'] == w
    in_bznd = selfsim_summed_bznd['word'] == w

    dta_value = selfsim_summed_dta.loc[in_dta, 'self-sim'].values[0]
    bznd_value = selfsim_summed_bznd.loc[in_bznd, 'self-sim'].values[0]

    return np.abs(dta_value - bznd_value)

In [330]:
selfsim_summed_dta.head()

Unnamed: 0,word,self-sim
32,Abgesang,0.840302
40,Ackergerät,0.86472
24,Armenhaus,0.845617
17,Ausnahmegesetz,0.860307
33,Dynamik,0.807813


In [331]:
selfsim_summed_bznd.head()

Unnamed: 0,word,self-sim
42,Abgesang,0.828037
44,Ackergerät,0.859309
19,Armenhaus,0.87043
27,Ausnahmegesetz,0.861512
34,Dynamik,0.810633


In [334]:
get_substraction_abs2('Abgesang')

0.012264490822555985

In [335]:
# Map every word, in the row order of selfsim_summed_dta, to the absolute
# difference returned by get_substraction_abs2.
substractions = {
    word: get_substraction_abs2(word)
    for word in selfsim_summed_dta['word']
}

In [336]:
substractions

{'Abgesang': 0.012264490822555985,
 'Ackergerät': 0.005411241630272068,
 'Armenhaus': 0.024813140390429922,
 'Ausnahmegesetz': 0.001205084819232094,
 'Dynamik': 0.0028193588467209096,
 'Einreichung': 0.011282821398651999,
 'Eintagsfliege': 0.033000491422845,
 'Engpaß': 0.014588227342220983,
 'Entscheidung': 0.01829560279317799,
 'Festspiel': 0.01567330670534095,
 'Frechheit': 0.00760051581502208,
 'Fuß': 0.0050152613603590135,
 'Gesichtsausdruck': 0.009190895509325059,
 'Knotenpunkt': 0.019056711304542007,
 'Kubikmeter': 0.038669688795992974,
 'Lyzeum': 0.020611228190739017,
 'Manschette': 0.011270485815647935,
 'Mißklang': 0.030363100798239007,
 'Mulatte': 0.012277600303920999,
 'Naturschönheit': 0.0020151982298459714,
 'Ohrwurm': 0.03738758129239106,
 'Pachtzins': 0.018216159057154013,
 'Rezeption': 0.016296517612648942,
 'Schmiere': 0.004254987659014997,
 'Seminar': 0.012169475076987069,
 'Sensation': 0.030046516875091966,
 'Spielball': 0.0044275796557740055,
 'Tier': 0.033426202865

In [337]:
# Pair each summed-embedding difference with its gold score by word instead
# of by position: the two inputs come from independently sorted tables, so a
# difference in row order or vocabulary would otherwise go unnoticed.
golden_by_word = self_sim_sumbstracted.set_index('word')['golden']
assert set(substractions) == set(golden_by_word.index), \
    'summed self-similarity tables and gold standard cover different words'
summed_diffs = pd.Series(substractions).reindex(golden_by_word.index)
stats.spearmanr(golden_by_word, summed_diffs)

SpearmanrResult(correlation=0.1367645828641419, pvalue=0.35396720843360285)