In [1]:
import csv
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr

In [111]:
def get_domain_df(data_csv, lin_glob='../compute-alignment/w=*', min_rows=600):
    """Join per-word-pair alignments from `data_csv` with linear alignments.

    Parameters
    ----------
    data_csv : str
        Path to a CSV whose columns 1, 2, 3, 4, 5 and 16 hold l1, l2,
        alignment, w1, w2 and domain. Rows whose first cell is empty are
        skipped (presumably the header of a CSV written with an unnamed
        index column -- confirm against the file).
    lin_glob : str, optional
        Glob pattern for the linear-alignment files. Each file name is
        expected to look like ``w=<l1>-<l2>.<ext>``; columns 2, 3 and 4 hold
        w1, w2 and the alignment value, and rows starting with ``'l1'`` are
        treated as headers.
    min_rows : int, optional
        A linear-alignment file is used only if it has more than this many
        rows (header included).

    Returns
    -------
    pandas.DataFrame
        Columns l1, l2, alignment, w1, w2, domain, linear_alignment, keeping
        only the rows for which a linear alignment was found. The frame is
        empty (rather than raising) when nothing can be matched.

    Raises
    ------
    ValueError
        If a numeric field cannot be parsed as float, or a file name does not
        yield exactly two language codes.
    """
    data_columns = ['l1', 'l2', 'alignment', 'w1', 'w2', 'domain']
    lin_columns = ['l1', 'l2', 'w1', 'w2', 'linear_alignment']

    data = []
    # newline='' is what the csv module expects for files it parses itself.
    with open(data_csv, 'r', newline='') as file:
        for row in csv.reader(file, delimiter=','):
            if not row or row[0] == '':
                continue
            data.append(dict(l1=row[1],
                             l2=row[2],
                             alignment=float(row[3]),
                             w1=row[4],
                             w2=row[5],
                             domain=row[16]))

    lin_data = []
    # Sorted so the result does not depend on filesystem enumeration order.
    for fn in sorted(glob.glob(lin_glob)):
        # Language codes come from the file name: 'w=en-es.csv' -> ('en', 'es').
        name = os.path.basename(fn)
        if name.startswith('w='):
            name = name[len('w='):]
        l1, l2 = name.split('.')[0].split('-')

        with open(fn, 'r', newline='') as file:
            rows = list(csv.reader(file, delimiter=','))
        if len(rows) <= min_rows:
            continue
        for row in rows:
            if not row or row[0] == 'l1':
                continue
            lin_data.append(dict(l1=l1,
                                 l2=l2,
                                 w1=row[2],
                                 w2=row[3],
                                 linear_alignment=float(row[4])))

    # Explicit column lists keep the merge keys present even when a source
    # produced no rows; without them pd.merge raises KeyError on 'l1'.
    df = pd.DataFrame(data, columns=data_columns)
    lin_df = pd.DataFrame(lin_data, columns=lin_columns)

    df = pd.merge(df, lin_df, how='left', on=['l1', 'l2', 'w1', 'w2'])
    # Keep only word pairs that actually have a linear alignment.
    return df[df['linear_alignment'].notnull()]

# Lookup from a language's short code (CSV column 8) to its full name
# (CSV column 0); the header row, which starts with 'Language', is ignored.
with open('data/distances/FAIR_languages_glotto_xdid.csv', 'r') as file:
    lang_map = {
        entry[8]: entry[0]
        for entry in csv.reader(file, delimiter=',')
        if entry[0] != 'Language'
    }

In [112]:
# Merged per-word-pair table (both alignment scores plus domain) that the
# cells below filter and aggregate.
df = get_domain_df('../alignments-nel-wiki-trl.csv')

In [113]:
# Mean alignment scores per English -> l2 pair, restricted to the
# 'The house' domain.
# NOTE(review): the result is called `df_eng_quantities` although the filter
# is 'The house'; the name is kept because the following cells read it.
df_eng = df[df['l1']=='en']
df_eng_quantities = (
    df_eng[df_eng['domain']=='The house']
    .groupby(['l1', 'l2'])
    # w1/w2/domain are strings: pandas >= 2.0 raises TypeError when asked to
    # average them, so restrict the mean to the numeric score columns.
    .mean(numeric_only=True)
    .reset_index()
)
# Replace language codes with full names for display.
df_eng_quantities['l1'] = df_eng_quantities['l1'].map(lambda x: lang_map[x])
df_eng_quantities['l2'] = df_eng_quantities['l2'].map(lambda x: lang_map[x])

In [114]:
# Languages ordered from lowest to highest mean linear_alignment.
df_eng_quantities.sort_values(by='linear_alignment')

Unnamed: 0,l1,l2,alignment,linear_alignment
12,English,Japanese,0.117587,-0.003617
27,English,Chinese,0.152764,-0.002936
19,English,Mongolian,0.134489,-0.002764
17,English,Latin,0.108495,-0.002653
10,English,Irish,0.132601,-0.002651
22,English,Sakha,0.111795,-0.002634
14,English,Kazakh,0.201921,-0.002569
5,English,Chechen,0.048203,-0.002542
2,English,Bashkir,0.138917,-0.002541
0,English,Arabic,0.196431,-0.002522


In [115]:
# Languages ordered from lowest to highest mean alignment.
df_eng_quantities.sort_values(by='alignment')

Unnamed: 0,l1,l2,alignment,linear_alignment
5,English,Chechen,0.048203,-0.002542
17,English,Latin,0.108495,-0.002653
6,English,Welsh,0.108798,-0.002471
22,English,Sakha,0.111795,-0.002634
12,English,Japanese,0.117587,-0.003617
10,English,Irish,0.132601,-0.002651
19,English,Mongolian,0.134489,-0.002764
2,English,Bashkir,0.138917,-0.002541
18,English,Malayalam,0.140736,-0.002397
24,English,Telugu,0.143263,-0.002352


In [116]:
# Mean alignment scores per English -> l2 pair across all domains.
df_eng = df[df['l1']=='en']
df_eng_quantities = (
    df_eng
    .groupby(['l1', 'l2'])
    # w1/w2/domain are strings: pandas >= 2.0 raises TypeError when asked to
    # average them, so restrict the mean to the numeric score columns.
    .mean(numeric_only=True)
    .reset_index()
)
# Replace language codes with full names for display.
df_eng_quantities['l1'] = df_eng_quantities['l1'].map(lambda x: lang_map[x])
df_eng_quantities['l2'] = df_eng_quantities['l2'].map(lambda x: lang_map[x])

In [132]:
# LaTeX table of languages by mean linear ("orthogonal") alignment, printed
# highest first. df_eng itself stays in ascending order.
column_labels = {'linear_alignment': 'Orthogonal Alignment', 'alignment': 'Nearest Neighbors Alignment', 'l2': 'Language'}
df_eng = df_eng_quantities.sort_values(by='linear_alignment').rename(columns=column_labels)
latex_rows = df_eng[['Language', 'Orthogonal Alignment']].iloc[::-1]
print(latex_rows.to_latex(index=False))

\begin{tabular}{lr}
\toprule
    Language &  Orthogonal Alignment \\
\midrule
     Spanish &             -0.002012 \\
     Italian &             -0.002064 \\
      French &             -0.002076 \\
  Portuguese &             -0.002098 \\
     Catalan &             -0.002110 \\
      Telugu &             -0.002121 \\
       Tamil &             -0.002203 \\
   Malayalam &             -0.002225 \\
     Kannada &             -0.002228 \\
      Breton &             -0.002244 \\
      Korean &             -0.002254 \\
     Turkish &             -0.002332 \\
 Azerbaijani &             -0.002333 \\
    Georgian &             -0.002338 \\
    Romanian &             -0.002360 \\
     Bashkir &             -0.002389 \\
      Basque &             -0.002395 \\
       Tatar &             -0.002422 \\
       Welsh &             -0.002422 \\
      Arabic &             -0.002466 \\
       Latin &             -0.002511 \\
     Chechen &             -0.002513 \\
       Sakha &             -0.002522 \\
  

In [131]:
# LaTeX table of languages by mean nearest-neighbors alignment, printed
# highest first. df_eng itself stays in ascending order.
column_labels = {'linear_alignment': 'Orthogonal Alignment', 'alignment': 'Nearest Neighbors Alignment', 'l2': 'Language'}
df_eng = df_eng_quantities.sort_values(by='alignment').rename(columns=column_labels)
latex_rows = df_eng[['Language', 'Nearest Neighbors Alignment']].iloc[::-1]
print(latex_rows.to_latex(index=False))

\begin{tabular}{lr}
\toprule
    Language &  Nearest Neighbors Alignment \\
\midrule
     Spanish &                     0.380316 \\
     Italian &                     0.357891 \\
  Portuguese &                     0.356459 \\
      French &                     0.353356 \\
     Catalan &                     0.345294 \\
     Turkish &                     0.324510 \\
    Romanian &                     0.298681 \\
      Basque &                     0.294299 \\
       Tamil &                     0.293017 \\
    Georgian &                     0.291078 \\
 Azerbaijani &                     0.283093 \\
     Kannada &                     0.278220 \\
      Arabic &                     0.277972 \\
   Malayalam &                     0.271179 \\
      Telugu &                     0.265991 \\
      Kazakh &                     0.264559 \\
      Korean &                     0.246613 \\
      Breton &                     0.239520 \\
     Chinese &                     0.230831 \\
       Welsh &        

In [120]:
# Show whatever df_eng currently holds.
# NOTE(review): the saved output below still has the un-renamed columns
# (l1, l2, alignment, linear_alignment), so it appears to predate the rename
# in the cells above -- re-run to refresh it.
df_eng

Unnamed: 0,l1,l2,alignment,linear_alignment
12,English,Japanese,0.22601,-0.003518
27,English,Chinese,0.230831,-0.002804
19,English,Mongolian,0.224684,-0.002575
10,English,Irish,0.199239,-0.002564
14,English,Kazakh,0.264559,-0.002527
22,English,Sakha,0.171614,-0.002522
5,English,Chechen,0.147436,-0.002513
17,English,Latin,0.192814,-0.002511
0,English,Arabic,0.277972,-0.002466
6,English,Welsh,0.226334,-0.002422


In [145]:
def get_sorted(domain):
    """Return mean alignments for English -> l2 pairs within one domain.

    Reads the notebook-level `df` and `lang_map`.

    Parameters
    ----------
    domain : str
        Value of the 'domain' column to keep.

    Returns
    -------
    pandas.DataFrame
        One row per (l1, l2) pair with full language names and the mean of
        the numeric score columns, sorted by ascending 'linear_alignment'.

    Raises
    ------
    KeyError
        If a language code is missing from `lang_map`.
    """
    df_eng = df[df['l1']=='en']
    df_eng_quantities = (
        df_eng[df_eng['domain']==domain]
        .groupby(['l1', 'l2'])
        # String columns (w1/w2/domain) cannot be averaged; pandas >= 2.0
        # raises TypeError unless the mean is limited to numeric columns.
        .mean(numeric_only=True)
        .reset_index()
    )
    df_eng_quantities['l1'] = df_eng_quantities['l1'].map(lambda x: lang_map[x])
    df_eng_quantities['l2'] = df_eng_quantities['l2'].map(lambda x: lang_map[x])
    return df_eng_quantities.sort_values(by='linear_alignment')

# Sorted unique domain labels. Only the group keys are needed, so count the
# groups rather than averaging them: averaging the string columns raises
# TypeError on pandas >= 2.0.
domains = df.groupby('domain').size().index.to_numpy()
# Per domain, the two languages with the highest mean linear_alignment
# (get_sorted is ascending, so they are the last two entries).
[(d, get_sorted(d)['l2'].to_numpy()[-2:]) for d in domains]

[('Agriculture and vegetation', array(['Spanish', 'French'], dtype=object)),
 ('Animals', array(['Italian', 'Spanish'], dtype=object)),
 ('Basic actions and technology', array(['Spanish', 'Telugu'], dtype=object)),
 ('Clothing and grooming', array(['French', 'Spanish'], dtype=object)),
 ('Cognition', array(['Catalan', 'Spanish'], dtype=object)),
 ('Emotions and values', array(['Spanish', 'Telugu'], dtype=object)),
 ('Food and drink', array(['French', 'Italian'], dtype=object)),
 ('Kinship', array(['Italian', 'Spanish'], dtype=object)),
 ('Miscellaneous function words',
  array(['Portuguese', 'Spanish'], dtype=object)),
 ('Modern world', array(['Portuguese', 'Tamil'], dtype=object)),
 ('Motion', array(['Italian', 'Telugu'], dtype=object)),
 ('Possession', array(['Spanish', 'French'], dtype=object)),
 ('Quantity', array(['French', 'Spanish'], dtype=object)),
 ('Sense perception', array(['Spanish', 'Telugu'], dtype=object)),
 ('Social and political relations',
  array(['Portuguese', 'Ital

In [143]:
# Per domain, the two languages with the lowest mean linear_alignment
# (get_sorted is ascending, so they are the first two entries).
[(d, get_sorted(d)['l2'].to_numpy()[:2]) for d in domains]

[('Agriculture and vegetation', array(['Japanese', 'Chinese'], dtype=object)),
 ('Animals', array(['Japanese', 'Chinese'], dtype=object)),
 ('Basic actions and technology',
  array(['Japanese', 'Chinese'], dtype=object)),
 ('Clothing and grooming', array(['Japanese', 'Latin'], dtype=object)),
 ('Cognition', array(['Japanese', 'Chinese'], dtype=object)),
 ('Emotions and values', array(['Japanese', 'Irish'], dtype=object)),
 ('Food and drink', array(['Japanese', 'Chinese'], dtype=object)),
 ('Kinship', array(['Japanese', 'Chinese'], dtype=object)),
 ('Miscellaneous function words', array(['Japanese', 'Kazakh'], dtype=object)),
 ('Modern world', array(['Japanese', 'Chinese'], dtype=object)),
 ('Motion', array(['Japanese', 'Sakha'], dtype=object)),
 ('Possession', array(['Japanese', 'Chinese'], dtype=object)),
 ('Quantity', array(['Japanese', 'Mongolian'], dtype=object)),
 ('Sense perception', array(['Japanese', 'Chinese'], dtype=object)),
 ('Social and political relations',
  array(['Japa

In [166]:
def get_ranks(domain):
    """Rank English -> l2 pairs by mean linear alignment.

    Reads the notebook-level `df` and `lang_map`.

    Parameters
    ----------
    domain : str or None
        Domain to restrict to; ``None`` uses every domain.

    Returns
    -------
    numpy.ndarray
        Integer ranks (0 = lowest mean 'linear_alignment'), listed in
        alphabetical order of the full l2 language name.

    Raises
    ------
    KeyError
        If a language code is missing from `lang_map`.

    Notes
    -----
    The array has one entry per language present in the selection, so arrays
    from different domains only line up position-by-position when each
    selection contains the same languages.
    """
    df_eng = df[df['l1']=='en']
    if domain is not None:
        df_eng = df_eng[df_eng['domain']==domain]
    # String columns (w1/w2/domain) cannot be averaged; pandas >= 2.0 raises
    # TypeError unless the mean is limited to numeric columns.
    df_eng_quantities = df_eng.groupby(['l1', 'l2']).mean(numeric_only=True).reset_index()
    df_eng_quantities['l1'] = df_eng_quantities['l1'].map(lambda x: lang_map[x])
    df_eng_quantities['l2'] = df_eng_quantities['l2'].map(lambda x: lang_map[x])
    # After this reset the index of each row is its rank by linear_alignment.
    by_alignment = df_eng_quantities.sort_values(by='linear_alignment').reset_index(drop=True)
    # Re-order alphabetically by language; the index labels carry the ranks.
    return by_alignment.sort_values(by='l2').index.to_numpy()
# One (domain, rank array) pair per domain, in the order of `domains`.
ranks = [(d, get_ranks(d)) for d in domains]

In [155]:
# Baseline ranking computed over all domains together.
overall_rank = get_ranks(None)

In [156]:
# Per-domain rank minus the overall rank, element by element.
# NOTE(review): the subtraction is positional, so it assumes every domain
# yields the same set of languages as the overall ranking -- confirm.
ranks_diff = [(d, rank - overall_rank) for d, rank in ranks]

In [165]:
# Inspect the per-domain rank differences.
ranks_diff

[('Agriculture and vegetation',
  array([-1,  0, -2,  1, -2,  1, -3,  0,  2,  0,  1, -1,  0,  2,  2,  1, -2,
         -1,  9, -4,  0, -3, -1, -4, -2,  1,  6,  0])),
 ('Animals',
  array([ 2, -1,  1,  0, -3, -3, -4,  0, -1, -2,  2,  0,  0,  2,  4, -1, -4,
         -1,  5, -1,  5, -1,  0,  1, -1,  3,  1, -3])),
 ('Basic actions and technology',
  array([-4, -2, -1, -2,  4,  1, -1,  0, -4,  2, -1, -1,  0,  0,  4,  1,  0,
         -3,  1, -1, -3,  1, -1, -7,  2,  5,  4,  6])),
 ('Clothing and grooming',
  array([10,  0, -5, -1,  1,  1,  0,  1,  1, -3,  0, -1,  0, -6, 12,  5, -6,
          1,  3, -1,  1, -1,  0, -4, -1, -2, -4, -1])),
 ('Cognition',
  array([ -6,   0,  -1,  -2,   4,   3,   7,   0,   0,  -2,   2,  -2,   0,
          -5,  -1,   3,   0,  -4,   2,  -1,   8,   3,   0, -15,   0,  -4,
           3,   8])),
 ('Emotions and values',
  array([-1,  0, -4, -6, -4, -2,  7,  1, -2, -2, -2, -1,  0,  0,  2,  0, -4,
          0,  2,  0, -4, 13, -1,  1,  6,  5, -5,  1])),
 ('Food and drink',

In [169]:
def get_langs():
    """Return the full names of all l2 languages paired with English.

    Reads the notebook-level `df` and `lang_map`.

    Returns
    -------
    numpy.ndarray
        Full language names, sorted alphabetically.

    Raises
    ------
    KeyError
        If a language code is missing from `lang_map`.
    """
    df_eng = df[df['l1']=='en']
    # String columns (w1/w2/domain) cannot be averaged; pandas >= 2.0 raises
    # TypeError unless the mean is limited to numeric columns.
    df_eng_quantities = df_eng.groupby(['l1', 'l2']).mean(numeric_only=True).reset_index()
    df_eng_quantities['l1'] = df_eng_quantities['l1'].map(lambda x: lang_map[x])
    df_eng_quantities['l2'] = df_eng_quantities['l2'].map(lambda x: lang_map[x])
    return df_eng_quantities.sort_values(by='l2')['l2'].to_numpy()
# Alphabetical list of language names; a position in this list is used as
# `lang_id` when indexing the rank arrays.
sorted_langs = list(get_langs())

In [184]:
# Position of the language of interest in sorted_langs.
lang_id = 7
print(f'Summary of {sorted_langs[lang_id]} differences with English')

# Each ranks_diff entry already pairs a domain name with its array of rank
# differences, so no separate index into `domains` is needed.
for domain_name, rank_delta in ranks_diff:
    print(domain_name, rank_delta[lang_id])

Summary of Chinese differences with English
Agriculture and vegetation 0
Animals 0
Basic actions and technology 0
Clothing and grooming 1
Cognition 0
Emotions and values 1
Food and drink 0
Kinship 0
Miscellaneous function words 1
Modern world 0
Motion 2
Possession 0
Quantity 1
Sense perception 0
Social and political relations 0
Spatial relations 0
Speech and language 0
The body 0
The house 0
The physical world 0
Time 0


In [182]:
# Show the position -> language mapping used to choose `lang_id`.
list(enumerate(sorted_langs))

[(0, 'Arabic'),
 (1, 'Azerbaijani'),
 (2, 'Bashkir'),
 (3, 'Basque'),
 (4, 'Breton'),
 (5, 'Catalan'),
 (6, 'Chechen'),
 (7, 'Chinese'),
 (8, 'French'),
 (9, 'Georgian'),
 (10, 'Irish'),
 (11, 'Italian'),
 (12, 'Japanese'),
 (13, 'Kannada'),
 (14, 'Kazakh'),
 (15, 'Korean'),
 (16, 'Latin'),
 (17, 'Malayalam'),
 (18, 'Mongolian'),
 (19, 'Portuguese'),
 (20, 'Romanian'),
 (21, 'Sakha'),
 (22, 'Spanish'),
 (23, 'Tamil'),
 (24, 'Tatar'),
 (25, 'Telugu'),
 (26, 'Turkish'),
 (27, 'Welsh')]