## Spearman's rank correlation for Latin between Self-Similarities and gold standard for graded task

In [1]:
import csv 
import pandas as pd
import numpy as np
from scipy import stats

Reading the per-layer self-similarity tables from CSV files

In [2]:
# Per-layer self-similarity table for ISE1; the 'instances' column is not needed here.
selfsim_ise1 = (
    pd.read_csv('self_sim_by_layer_ISE1.csv', sep=',')
    .drop(columns=['instances'])
)

In [3]:
# Per-layer self-similarity table for ISE2; the 'instances' column is not needed here.
selfsim_ise2 = (
    pd.read_csv('self_sim_by_layer_ISE2.csv', sep=',')
    .drop(columns=['instances'])
)

In [4]:
# Preview the first rows of the ISE1 table: one row per word, one column per layer.
selfsim_ise1.head()

Unnamed: 0,word,1,2,3,4,5,6,7,8,9,10,11,12
0,credo,0.911239,0.890187,0.868195,0.858112,0.832529,0.812822,0.811099,0.792844,0.802818,0.791391,0.768484,0.7094
1,virtus,0.85386,0.84912,0.83009,0.831719,0.818082,0.777857,0.774065,0.770175,0.788582,0.794453,0.744757,0.750067
2,salus,0.847061,0.835808,0.81343,0.817187,0.810324,0.786888,0.784174,0.775622,0.797745,0.804519,0.759983,0.765689
3,consul,0.910307,0.888482,0.864848,0.847339,0.828357,0.800825,0.793957,0.791107,0.804972,0.830793,0.821337,0.766356
4,hostis,0.878028,0.884818,0.863559,0.86919,0.857252,0.833382,0.807957,0.796528,0.818132,0.822134,0.780174,0.806656


Self-similarities by layer for a specific word from the ISE1 corpus

In [5]:
# Select the ISE1 row(s) whose 'word' column equals 'credo'.
selfsim_ise1.loc[selfsim_ise1['word'] == 'credo']

Unnamed: 0,word,1,2,3,4,5,6,7,8,9,10,11,12
0,credo,0.911239,0.890187,0.868195,0.858112,0.832529,0.812822,0.811099,0.792844,0.802818,0.791391,0.768484,0.7094


Self-similarities by layer for a specific word from the ISE2 corpus

In [6]:
# Select the ISE2 row(s) whose 'word' column equals 'credo'.
selfsim_ise2.loc[selfsim_ise2['word'] == 'credo']

Unnamed: 0,word,1,2,3,4,5,6,7,8,9,10,11,12
17,credo,0.908242,0.886388,0.864945,0.852445,0.823757,0.802737,0.80103,0.78319,0.794621,0.782763,0.758605,0.693534


In [7]:
def get_substraction_abs(w, first=None, second=None):
    """Return the absolute per-layer self-similarity difference for word ``w``.

    Parameters
    ----------
    w : str
        Word to look up in the ``word`` column of both tables.
    first, second : pandas.DataFrame, optional
        Tables with a ``word`` column plus one numeric column per layer.
        When omitted they default to the notebook-level ``selfsim_ise1``
        and ``selfsim_ise2`` frames.

    Returns
    -------
    numpy.ndarray
        ``|row_first - row_second|``, one value per layer column.

    Raises
    ------
    IndexError
        If ``w`` does not occur in one of the tables.
    """
    if first is None:
        first = selfsim_ise1
    if second is None:
        second = selfsim_ise2

    def _layer_values(frame, label):
        # Layer values of the row for w; fails loudly when the word is missing.
        match = frame.loc[frame['word'] == w]
        if match.empty:
            raise IndexError(f"word {w!r} not found in the {label} table")
        # If the word occurs more than once, only the first matching row is used.
        return np.array(match.drop(['word'], axis=1).values[0])

    # x1 / x2 hold the per-layer self-similarities of w in the first / second table
    x1 = _layer_values(first, 'first')
    x2 = _layer_values(second, 'second')

    # element-wise absolute difference between the two rows
    return np.absolute(x1 - x2)

In [8]:
# Example call: absolute per-layer differences for 'credo'.
get_substraction_abs('credo')

array([0.00299653, 0.00379879, 0.00324961, 0.00566659, 0.0087713 ,
       0.0100847 , 0.01006929, 0.00965424, 0.0081976 , 0.00862796,
       0.00987919, 0.01586582])

In [9]:
# rows will store one record per word: [word, |diff layer 1|, ..., |diff layer 12|]
rows = []

for word in selfsim_ise1['word']:
    rows.append([word] + list(get_substraction_abs(word)))

In [10]:
# CSV header: the word column followed by the layer numbers 1..12.
head = ['word', *range(1, 13)]

In [11]:
# Persist the absolute differences: header line first, then one line per word.
with open('self_sim_by_layer_abs.csv', 'w', newline='') as out_file:  # output path
    abs_writer = csv.writer(out_file, delimiter=',')
    abs_writer.writerow(head)
    abs_writer.writerows(rows)

## Calculating Spearman's correlation between the absolute self-similarity differences from 12 layers and the gold standard for the graded task

In [12]:
# Gold-standard graded change scores, stored as a tab-separated file.
golden = pd.read_csv('graded.tsv', sep='\t')

In [13]:
# Order the gold standard alphabetically by word so it can be lined up with the difference table.
golden = golden.sort_values(by='word')

In [14]:
# Display the gold-standard table after sorting by word.
golden

Unnamed: 0,word,change_graded
0,acerbus,0.169367
1,adsumo,0.342616
2,ancilla,0.0
3,beatus,0.816392
4,civitas,0.322392
5,cohors,0.28083
6,consilium,0.102932
7,consul,0.129886
8,credo,0.370992
9,dolus,0.176682


In [15]:
# Reload the per-layer absolute differences from self_sim_by_layer_abs.csv.
self_sim_sumbstracted = pd.read_csv('self_sim_by_layer_abs.csv', sep=',')

In [16]:
# Same alphabetical ordering as the gold standard, so rows correspond by position.
self_sim_sumbstracted = self_sim_sumbstracted.sort_values(by='word')

In [17]:
# Sanity check: after sorting, both tables must list the same word at each position,
# because the gold scores are attached to the difference table by position later on.
for w1, w2 in zip(self_sim_sumbstracted['word'], golden['word']):
    print(f'{w1}:{w2}')

# zip() stops at the shorter input, so a length mismatch would go unnoticed in the
# printout above; compare the full word lists explicitly instead of relying on eyeballing.
assert list(self_sim_sumbstracted['word']) == list(golden['word']), \
    'difference table and gold standard do not contain the same words in the same order'

acerbus:acerbus
adsumo:adsumo
ancilla:ancilla
beatus:beatus
civitas:civitas
cohors:cohors
consilium:consilium
consul:consul
credo:credo
dolus:dolus
dubius:dubius
dux:dux
fidelis:fidelis
honor:honor
hostis:hostis
humanitas:humanitas
imperator:imperator
itero:itero
jus:jus
licet:licet
necessarius:necessarius
nepos:nepos
nobilitas:nobilitas
oportet:oportet
poena:poena
pontifex:pontifex
potestas:potestas
regnum:regnum
sacramentum:sacramentum
salus:salus
sanctus:sanctus
sapientia:sapientia
scriptura:scriptura
senatus:senatus
sensus:sensus
simplex:simplex
templum:templum
titulus:titulus
virtus:virtus
voluntas:voluntas


In [18]:
# Keep only the score column(s) as a 2-D array.
# Note: `golden` is rebound from a DataFrame to an ndarray here, so this cell
# cannot be re-run on its own without reloading the gold standard first.
golden = np.array(golden.drop(columns=['word']))

In [19]:
# Collapse the 2-D array into a flat list of scores (same order as the sorted words).
golden = list(golden.ravel())

In [20]:
# Attach the gold scores as a new column. `golden` is a plain list, so values are
# matched to rows by position; both tables must be in the same word order.
self_sim_sumbstracted = self_sim_sumbstracted.assign(golden=golden)

In [21]:
# Display the difference table together with the appended 'golden' column.
self_sim_sumbstracted

Unnamed: 0,word,1,2,3,4,5,6,7,8,9,10,11,12,golden
20,acerbus,0.004089,0.004875,0.007504,0.004157,0.002446,0.000507,0.001772,0.004057,0.001779,0.001483,0.003116,0.010823,0.169367
36,adsumo,0.003103,0.005267,0.002909,0.008449,0.012729,0.014625,0.020042,0.024075,0.029379,0.035259,0.037137,0.039751,0.342616
33,ancilla,0.018578,0.018695,0.027373,0.033104,0.041755,0.046438,0.061126,0.061459,0.055241,0.060693,0.068854,0.060816,0.0
23,beatus,0.000773,0.006014,0.010343,0.01427,0.021656,0.031943,0.029838,0.047785,0.061625,0.068244,0.063214,0.046782,0.816392
11,civitas,0.000744,0.000236,0.003714,0.006889,0.015565,0.021009,0.02492,0.029046,0.025454,0.02085,0.014842,0.030137,0.322392
15,cohors,0.002347,0.002062,0.005314,0.009151,0.00695,0.005535,0.002905,0.002552,0.004539,0.00682,0.006713,0.017268,0.28083
9,consilium,0.001589,0.002229,0.00305,0.00556,0.008314,0.009765,0.008217,0.009359,0.011733,0.015777,0.011544,0.017016,0.102932
3,consul,0.000979,0.002463,0.002682,0.007987,0.010379,0.012885,0.013176,0.012029,0.012409,0.014702,0.010149,0.043656,0.129886
0,credo,0.002997,0.003799,0.00325,0.005667,0.008771,0.010085,0.010069,0.009654,0.008198,0.008628,0.009879,0.015866,0.370992
6,dolus,0.004949,0.006574,0.00967,0.006173,0.005457,0.005153,0.008813,0.00938,0.010043,0.014839,0.013381,0.013061,0.176682


In [22]:
# Spot check on a single layer: Spearman correlation between gold scores and layer-3 differences.
layer3_diffs = list(self_sim_sumbstracted['3'])
stats.spearmanr(golden, layer3_diffs)

SpearmanrResult(correlation=-0.08649561433597279, pvalue=0.5956295871424422)

In [23]:
def get_spearman_by_layer(golden_labels, frame=None, layers=range(1, 13)):
    """Compute Spearman's rank correlation between gold labels and each layer column.

    Parameters
    ----------
    golden_labels : sequence of float
        Gold-standard scores, in the same row order as ``frame``.
    frame : pandas.DataFrame, optional
        Table whose columns are named after the layer numbers (as strings).
        Defaults to the notebook-level ``self_sim_sumbstracted`` frame.
    layers : iterable of int, optional
        Layer numbers to evaluate; defaults to layers 1 through 12.

    Returns
    -------
    list of list
        One ``[layer, correlation, pvalue]`` entry per requested layer,
        in the order given by ``layers``.
    """
    if frame is None:
        frame = self_sim_sumbstracted

    rows = []
    for layer in layers:
        # Column labels are strings, while layer numbers are ints.
        correlation, pvalue = stats.spearmanr(golden_labels, frame[str(layer)])
        rows.append([layer, correlation, pvalue])
    return rows

In [24]:
# Per-layer Spearman correlation against the gold scores.
# Note: this rebinds `rows`, which previously held the per-word difference rows.
rows = get_spearman_by_layer(golden)

In [25]:
# Each entry is [layer, correlation, pvalue].
rows

[[1, 0.12589708724389967, 0.43888243959497875],
 [2, 0.052816736302768647, 0.7461834023243854],
 [3, -0.08649561433597279, 0.5956295871424422],
 [4, -0.014541019763639678, 0.929039097312775],
 [5, -0.054223931763766026, 0.7396535398465283],
 [6, 0.07964726309245218, 0.6251691723704361],
 [7, 0.021952249191559258, 0.8930453770690372],
 [8, 0.021201744945693983, 0.8966819094401907],
 [9, 0.14531638460566362, 0.3709580680514446],
 [10, 0.14409681520613257, 0.3750295149420654],
 [11, 0.04005816412305899, 0.8061352940729338],
 [12, 0.09475116104049082, 0.5608581023188711]]

In [26]:
# Column names matching the [layer, correlation, pvalue] entries in `rows`.
header = ['layer', 'correlation', 'pvalue']

In [27]:
# Save the per-layer correlations: header line first, then one line per layer.
with open('spearman_latin.csv', 'w', newline='') as out_file:  # output path
    result_writer = csv.writer(out_file, delimiter=',')
    result_writer.writerow(header)
    result_writer.writerows(rows)

In [28]:
# Read the saved correlations back as a DataFrame for display.
spearman_ranked = pd.read_csv('spearman_latin.csv', sep=',')

### Result: correlation of self-similarities by layer

In [29]:
# Display the per-layer correlation and p-value table.
spearman_ranked

Unnamed: 0,layer,correlation,pvalue
0,1,0.125897,0.438882
1,2,0.052817,0.746183
2,3,-0.086496,0.59563
3,4,-0.014541,0.929039
4,5,-0.054224,0.739654
5,6,0.079647,0.625169
6,7,0.021952,0.893045
7,8,0.021202,0.896682
8,9,0.145316,0.370958
9,10,0.144097,0.37503


### Correlation for summed embeddings

In [30]:
# Self-similarities from summed embeddings (LatinISE1 file), ordered alphabetically by word.
selfsim_summed_ise1 = (
    pd.read_csv('self_sim_summed_LatinISE1.csv', sep=',')
    .sort_values(by='word')
)

In [31]:
# Self-similarities from summed embeddings (LatinISE2 file), ordered alphabetically by word.
selfsim_summed_ise12 = (
    pd.read_csv('self_sim_summed_LatinISE2.csv', sep=',')
    .sort_values(by='word')
)

In [32]:
def get_substraction_abs2(w, first=None, second=None):
    """Return the absolute difference of the summed-embedding self-similarity of ``w``.

    Parameters
    ----------
    w : str
        Word to look up in the ``word`` column of both tables.
    first, second : pandas.DataFrame, optional
        Tables with a ``word`` column and a ``self-sim`` column. When omitted
        they default to the notebook-level ``selfsim_summed_ise1`` and
        ``selfsim_summed_ise12`` frames.

    Returns
    -------
    numpy.float64
        ``|self-sim in first - self-sim in second|`` for ``w``.

    Raises
    ------
    IndexError
        If ``w`` does not occur in one of the tables.
    """
    if first is None:
        first = selfsim_summed_ise1
    if second is None:
        second = selfsim_summed_ise12

    def _self_sim(frame, label):
        # 'self-sim' value of the row for w; fails loudly when the word is missing.
        values = frame.loc[frame['word'] == w, 'self-sim'].values
        if len(values) == 0:
            raise IndexError(f"word {w!r} not found in the {label} table")
        # If the word occurs more than once, only the first match is used.
        return values[0]

    x1 = _self_sim(first, 'first')
    x2 = _self_sim(second, 'second')

    return np.absolute(x1 - x2)

In [33]:
# Preview the first rows of the sorted LatinISE1 summed-embedding table.
selfsim_summed_ise1.head()

Unnamed: 0,word,instances,self-sim
20,acerbus,148,0.846152
36,adsumo,45,0.835565
33,ancilla,96,0.891895
23,beatus,342,0.829985
11,civitas,1691,0.852423


In [34]:
# Preview the first rows of the sorted LatinISE2 summed-embedding table.
selfsim_summed_ise12.head()

Unnamed: 0,word,instances,self-sim
39,acerbus,320,0.846981
37,adsumo,264,0.819919
22,ancilla,464,0.85037
8,beatus,2627,0.802642
14,civitas,5752,0.838618


In [35]:
# Example call: absolute self-similarity difference for 'acerbus'.
get_substraction_abs2('acerbus')

0.0008286234304590856

In [36]:
# Map every word to its absolute self-similarity difference.
# Keys are inserted in the (alphabetical) row order of selfsim_summed_ise1.
substractions = {
    word: get_substraction_abs2(word)
    for word in selfsim_summed_ise1['word']
}

In [37]:
# Display the word -> absolute difference mapping.
substractions

{'acerbus': 0.0008286234304590856,
 'adsumo': 0.015645836557359405,
 'ancilla': 0.0415247700791902,
 'beatus': 0.02734298178714667,
 'civitas': 0.01380507296754041,
 'cohors': 0.003822309421332215,
 'consilium': 0.007596528051735052,
 'consul': 0.00704980244296638,
 'credo': 0.0064786744976483845,
 'dolus': 0.005949659239322558,
 'dubius': 0.006007921586975273,
 'dux': 0.006928159073935403,
 'fidelis': 0.010772407793354644,
 'honor': 0.015150345337751059,
 'hostis': 0.00489948581310895,
 'humanitas': 0.00024175187461117353,
 'imperator': 0.01106107111187049,
 'itero': 0.008211021136966368,
 'jus': 0.012787021839748336,
 'licet': 0.005635592880370743,
 'necessarius': 0.0031452078042232268,
 'nepos': 0.01600149755891167,
 'nobilitas': 0.001832793224290974,
 'oportet': 0.005552724701632705,
 'poena': 0.004482071561759726,
 'pontifex': 0.005001773341193783,
 'potestas': 0.008071186708307976,
 'regnum': 0.014113483410196581,
 'sacramentum': 0.01024790253192831,
 'salus': 0.00011971377631336

In [38]:
# Spearman correlation between the gold scores and the summed-embedding differences.
# Relies on the dict preserving insertion order (words sorted alphabetically), which
# must match the order of the gold scores.
summed_diffs = [substractions[key] for key in substractions]
stats.spearmanr(golden, summed_diffs)

SpearmanrResult(correlation=0.04184061170698901, pvalue=0.7976850288606296)