# DOE: Language Similarity

#### Description:

1) Read in language vectors and their dataset abbreviation

2) Calculate Euclidean distances

3) Filter for the closest and the more distant language pairs, together with the paths to their dataset files

4) Generate script file

In [1]:
import pandas as pd
from scipy.spatial import distance_matrix
import datetime

#### 1) Read in language vectors and corresponding Wikiann data

In [2]:
# Load the language overview table (semicolon-separated CSV, one row per language).
lan_EuropeanSprachbund_raw = pd.read_csv('../overview_EuropeanSprachbund.csv', sep=';')

print('len before removing lan < 100T lines: ' + str(len(lan_EuropeanSprachbund_raw)))

# Languages excluded because their dataset has fewer than 100T lines
# (presumably "T" means thousand -- TODO confirm).
languages_with_less_100Tlines = ["Ice","Ir","Gae","Kom","Mlt","Srd","Udm"]
print('len lan < 100T lines: ' + str(len(languages_with_less_100Tlines)))

# Row positions of every language that is NOT on the exclusion list.
indices = [
    position
    for position, language in enumerate(lan_EuropeanSprachbund_raw['language'])
    if language not in languages_with_less_100Tlines
]

# reset_index() is called without drop=True on purpose: it keeps the old row
# labels as a new leading 'index' column, which shifts every other column one
# position to the right.
lan_EuropeanSprachbund = lan_EuropeanSprachbund_raw.iloc[indices].reset_index()

print('len after removing lan < 100T lines: ' + str(len(lan_EuropeanSprachbund)))

#lan_EuropeanSprachbund[lan_EuropeanSprachbund.columns[6:18]][['5','8','10','12']]

len before removing lan < 100T lines: 38
len lan < 100T lines: 7
len after removing lan < 100T lines: 31


In [4]:
# Export the language column plus the twelve feature columns ('1' .. '12')
# of the unfiltered table as a LaTeX tabular, without the row index.
print(
    lan_EuropeanSprachbund_raw[['language'] + [str(number) for number in range(1, 13)]]
    .to_latex(index=False)
)

\begin{tabular}{lrrrrrrrrrrrr}
\toprule
language &  1 &  2 &  3 &  4 &  5 &  6 &  7 &  8 &  9 &  10 &  11 &  12 \\
\midrule
      Fr &  1 &  1 &  1 &  1 &  1 &  1 &  1 &  1 &  1 &   1 &   1 &   1 \\
     Grm &  1 &  1 &  1 &  1 &  1 &  1 &  1 &  1 &  0 &   1 &   1 &   1 \\
     Dut &  1 &  1 &  1 &  1 &  1 &  0 &  1 &  1 &  1 &   1 &   1 &   0 \\
     Eng &  1 &  1 &  1 &  1 &  1 &  1 &  0 &  1 &  1 &   1 &   1 &   0 \\
     Grk &  1 &  1 &  1 &  1 &  1 &  1 &  1 &  0 &  1 &   1 &   0 &   1 \\
     Spn &  1 &  1 &  1 &  1 &  1 &  0 &  1 &  1 &  0 &   1 &   0 &   1 \\
     Prt &  1 &  1 &  1 &  1 &  1 &  0 &  1 &  1 &  0 &   1 &   0 &   1 \\
      It &  1 &  1 &  1 &  1 &  1 &  0 &  1 &  1 &  0 &   1 &   0 &   1 \\
     Alb &  1 &  1 &  1 &  0 &  1 &  0 &  1 &  1 &  1 &   1 &   0 &   1 \\
     Srd &  1 &  1 &  1 &  0 &  1 &  0 &  1 &  1 &  0 &   1 &   0 &   1 \\
     Rom &  1 &  1 &  1 &  0 &  1 &  1 &  1 &  0 &  0 &   1 &   0 &   1 \\
     Rus &  0 &  1 &  0 &  0 &  1 &  1 &  1 &  0 & 

#### 2) Calculate Euclidean distances

In [14]:
def calculate_distances(df, feature_columns=None):
    """Compute the Euclidean distance between every ordered pair of languages.

    All metadata is read from ``df`` itself (by row position), so the function
    works for any frame with the expected columns, whatever its index.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per language. Must contain the columns 'language',
        'Member in European Sprachbund' and 'Wikiann' in addition to the
        feature columns.
    feature_columns : sequence of column labels, optional
        Columns forming the language vector. Defaults to ``df.columns[6:18]``,
        the positional slice this notebook has always used.

    Returns
    -------
    pandas.DataFrame
        One row per ordered pair (self-pairs and both orderings included),
        i.e. n * n rows, with columns DIST, LAN_1, lan1_MembES, lan1_Wiki,
        LAN_2, lan2_MembES, lan2_Wiki, sorted ascending by DIST. The index
        label of pair (i, j) is ``i * n + j``.
    """
    if feature_columns is None:
        feature_columns = df.columns[6:18]

    vectors = df[feature_columns].to_numpy()
    # two-dimensional array of distances between all language pairs
    distances = distance_matrix(vectors, vectors)

    # Plain lists give positional access that does not depend on df's index labels.
    languages = df['language'].tolist()
    memberships = df['Member in European Sprachbund'].tolist()
    wikiann = df['Wikiann'].tolist()

    count = len(distances)
    # Build all rows first and create the frame once; DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    records = [
        (distances[i][j],
         languages[i], memberships[i], wikiann[i],
         languages[j], memberships[j], wikiann[j])
        for i in range(count)
        for j in range(count)
    ]

    columns = ['DIST', 'LAN_1', 'lan1_MembES', 'lan1_Wiki', 'LAN_2', 'lan2_MembES', 'lan2_Wiki']
    return pd.DataFrame(records, columns=columns).sort_values(by=['DIST'])

#### 3) Filter for the closest and the more distant language pairs, together with the paths to their dataset files

In [15]:
def dist_filter(df, dist, max_experiments=40):
    """Select the cross-lingual language pairs that lie exactly at ``dist``.

    Parameters
    ----------
    df : pandas.DataFrame
        Pair table with at least the columns 'DIST', 'LAN_1' and 'LAN_2'.
    dist : float
        Distance to select. The comparison is an exact ``==``, so pass a value
        taken from the 'DIST' column itself rather than a rounded number.
    max_experiments : int or None, optional
        Upper bound on the number of returned pairs (default 40, the cap this
        notebook has always used). ``None`` disables the cap.

    Returns
    -------
    pandas.DataFrame
        The first ``max_experiments`` matching rows, in the order of ``df``.
    """
    # filter out monolingual results (a language paired with itself)
    is_cross_lingual = df['LAN_1'] != df['LAN_2']
    at_distance = df['DIST'] == dist

    # reduce the number of experiments to at most max_experiments
    return df[is_cross_lingual & at_distance][:max_experiments]

In [16]:
def used_datasets(df):
    """Return the Wikiann dataset abbreviations needed for the experiments in ``df``.

    The distinct values of the 'lan1_Wiki' and 'lan2_Wiki' columns are merged
    into one duplicate-free list. The list comes from a set, so its order is
    arbitrary.
    """
    first_side = set(df['lan1_Wiki'].unique())
    second_side = set(df['lan2_Wiki'].unique())

    return list(first_side.union(second_side))

#### 4) Generate script file

In [17]:
def generate_string_dataset(wikiann_abre):
    """Build the double-quoted dataset path for one Wikiann abbreviation.

    The path keeps a literal ``$j`` placeholder, which is only expanded by the
    shell when the generated script runs.
    """
    pieces = ('"languages/', wikiann_abre, '/dataset_$j.txt"')
    return ''.join(pieces)

In [18]:
def used_datasets_paths(datasets):
    """Return the server-side dataset path for every abbreviation in ``datasets``.

    Parameters
    ----------
    datasets : iterable of str
        Wikiann dataset abbreviations.

    Returns
    -------
    list of str
        One quoted path per abbreviation, in input order, as produced by
        ``generate_string_dataset``.
    """
    # The previous loop evaluated `datasets_paths + [...]` without assigning
    # the result, so the function always returned an empty list.
    return [generate_string_dataset(abbreviation) for abbreviation in datasets]

In [19]:
def generate_script_file(df_raw, d):
    """Write the bash script that runs every experiment at distance ``d``.

    The pairs are selected with ``dist_filter(df_raw, d)``. The script is
    written to ``run_transformers_<d rounded to 2 decimals>.sh`` in the current
    working directory. For each value of ``j`` in ``{1..4}`` and each selected
    pair it splits both languages' datasets with
    ``preprocessing/take_sentences.py`` and then runs ``SimpleTransformers.py``
    on the first language's train file and the second language's eval file.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Pair table with the columns DIST, LAN_1, LAN_2, lan1_Wiki and lan2_Wiki.
    d : float
        Distance of the design point; must be a value present in ``DIST``.

    Returns
    -------
    pandas.DataFrame
        The filtered pairs the script was generated for.

    Raises
    ------
    KeyError
        If no language pair lies at distance ``d``. No file is written then.
    """
    df = dist_filter(df_raw, d)
    if df.empty:
        # Same exception type the former value_counts()[d] lookup produced;
        # an empty selection would also yield a `for` loop without a body,
        # which bash rejects as a syntax error.
        raise KeyError('no language pairs at distance ' + str(d))

    # Every remaining row has DIST == d, so the row count is the pair count.
    number_of_experiments = len(df)
    distance_tag = str(round(d, 2))

    header_lines = [
        # The shebang only takes effect on the very first line, and the
        # `{1..4}` brace expansion below needs bash.
        '#!/bin/bash\n',
        '# Experiments for distance: ' + str(d) + '\n',
        '\n',
        '# Experiment design: \n',
        '# (Finetuned language, Evaluated language) - Split(80/20)\n',
        '# Number of language pairs: ' + str(number_of_experiments) + '\n',
        '\n',
        '# @author: jopo, phze \n',
        '# @date: ' + str(datetime.datetime.now()) + '\n',
        '\n',
        '\n',
        '# Cross-validation on different subset-reduction of datasets (preprocessed):\n',
        ' for j in {1..4}\n',
        ' do\n',
        '\n',
        '\n',
        '# Experiments:\n',
    ]

    # `with` guarantees the file is flushed and closed (the former bare
    # `file.closed` only read an attribute). newline='\n' keeps LF line
    # endings on every platform; CRLF would break the shell script.
    with open('run_transformers_' + distance_tag + '.sh', 'w', newline='\n') as file:
        file.writelines(header_lines)

        # running experiments for language pairs
        for index, row in df.iterrows():
            train_1 = '"languages/' + row.lan1_Wiki + '/dataset_train.txt"'
            eval_1 = '"languages/' + row.lan1_Wiki + '/dataset_eval.txt"'
            train_2 = '"languages/' + row.lan2_Wiki + '/dataset_train.txt"'
            eval_2 = '"languages/' + row.lan2_Wiki + '/dataset_eval.txt"'
            output_dir = ('"results_filt/results_' + distance_tag + '/output_' + distance_tag
                          + '/' + row.LAN_1 + '_' + row.LAN_2 + '_$j"')

            file.write('# ' + str(index) + ' - Language pair: (' + row.LAN_1 + ',' + row.LAN_2 + ')\n')

            # creates test-set '4350' 20% -> dataset is of '17003' sentences = 80% training set
            # python3 take_sentences.py wikiann-en.bio train-en.bio test-en.bio 0
            file.write('    python3 preprocessing/take_sentences.py ' + generate_string_dataset(row.lan1_Wiki)
                       + ' ' + train_1 + ' ' + eval_1 + ' "0"\n')
            file.write('    python3 preprocessing/take_sentences.py ' + generate_string_dataset(row.lan2_Wiki)
                       + ' ' + train_2 + ' ' + eval_2 + ' "0"\n')

            # runs language pair: fine-tune on language 1, evaluate on language 2
            file.write('    python3 SimpleTransformers.py ' + train_1 + ' ' + eval_2 + ' ' + output_dir + '\n')
            file.write('\n')

        file.write(' done')

    return df

#### Execution : ___main___

In [20]:
# Distances between all ordered pairs of the remaining languages.
df_euclidean_dist = calculate_distances(lan_EuropeanSprachbund)

# Number of ordered pairs per distinct distance, listed by ascending distance.
number_of_experiments = (
    df_euclidean_dist['DIST']
    .value_counts()
    .sort_index()
)
print(number_of_experiments)

0.000000     49
1.000000     42
1.414214     80
1.732051    100
2.000000    132
2.236068    154
2.449490    126
2.645751    102
2.828427     72
3.000000     58
3.162278     32
3.316625     12
3.464102      2
Name: DIST, dtype: int64


In [21]:
# Inspect the distance at position 3 of the ascending list of distinct distances.
number_of_experiments.index[3]

1.7320508075688772

In [22]:
# Number of features not in common:
#d1 = number_of_experiments.index[5]
#print('d1 ' + str(round(d1,2)) + ' :' + str(d1 ** 2))

#d2 = number_of_experiments.index[0]
#print('d2 ' + str(round(d2,2)) + ' :' + str(d2 ** 2))

#d3 = number_of_experiments.index[2]
#print('d3 ' + str(round(d3,2)) + ' :' + str(d3 ** 2))

#d4 = number_of_experiments.index[-3]
#print('d4 ' + str(round(d4,2)) + ' :' + str(d4 ** 2))

#d5 = number_of_experiments.index[-2]
#print('d5 ' + str(round(d5,2)) + ' :' + str(d5 ** 2))

#d6 = number_of_experiments.index[-1]
#print('d6 ' + str(round(d6,2)) + ' :' + str(d6 ** 2))

# d7: the distance at position 7 of the ascending list of distinct distances.
d7 = number_of_experiments.index[7]
# Show the rounded distance next to its square; assuming the feature vectors
# are binary, the square is the number of features the two languages differ in.
print('d7 {} :{}'.format(str(round(d7, 2)), str(d7 ** 2)))

d7 2.65 :7.000000000000001


In [23]:
# design points of experiments: distances {d1 .. d7}

#print(d1)
#df_d1 = generate_script_file(df_euclidean_dist, d1)

#print(d2)
#df_d2 = generate_script_file(df_euclidean_dist, d2)

#print(d3)
#df_d3 = generate_script_file(df_euclidean_dist, d3)

#print(d4)
#generate_script_file(df_euclidean_dist, d4)

#print(d5)
#generate_script_file(df_euclidean_dist, d5)

#print(d6)
#df_d6 = generate_script_file(df_euclidean_dist, d6)

# Generate the run script for design point d7; df_d7 holds the language
# pairs the script was generated for.
print(d7)
df_d7 = generate_script_file(df_euclidean_dist, d7)

#pd.concat[[df_d2,df_d3,df_d4]].to_csv('Designpoints_LanSim_filt.csv', index=False)

2.6457513110645907
