# DOE: Features

##### Note: This notebook only generates candidate experiment groups of different sizes (k); the final experiment groups were chosen manually (see the Excel sheet).

#### Description:

1) Read in language vectors and their dataset abbreviation

2) Remove investigated feature from the dataframe and find k closest vectors for feature

3) Create overview over all features for k

4) Generate a CSV file for each k -> the experiment groups are then chosen manually in Excel

5) Create scripts per feature from manual choice (csv-file)

In [1]:
import datetime

import pandas as pd
import sklearn.neighbors as nei

#### 1) Read in language vectors and corresponding Wikiann data

In [2]:
# Load the language overview (one row per language, semicolon-separated).
lan_EuropeanSprachbund_raw = pd.read_csv('overview_EuropeanSprachbund.csv', sep=';')

# remove empty lines at the end of the dataframe
# NOTE(review): the row labels 38 and 39 are hard-coded; drop() raises KeyError
# if the csv no longer contains these rows.
lan_EuropeanSprachbund_raw = lan_EuropeanSprachbund_raw.drop([38,39])

# remove languages with dataset < 100T lines
print('len before removing lan < 100T lines: ' + str(len(lan_EuropeanSprachbund_raw)))

languages_with_less_100Tlines = ["Ice","Ir","Gae","Kom","Mlt","Srd","Udm"]
print('len lan < 100T lines: ' + str(len(languages_with_less_100Tlines)))

# Row positions to keep. They are taken from enumerate() so that they are true
# positions, matching the positional .iloc selection below (the previous
# version looked the language up by row label and then selected by position).
indices = [
    position
    for position, language in enumerate(lan_EuropeanSprachbund_raw['language'])
    if language not in languages_with_less_100Tlines
]

# reset_index() is intentionally used without drop=True: remove_feature_from_df
# selects the feature columns by position (columns[6:18]), so the extra column
# created here must stay in place.
lan_EuropeanSprachbund = lan_EuropeanSprachbund_raw.iloc[indices].reset_index()

print('len after removing lan < 100T lines: ' + str(len(lan_EuropeanSprachbund)))

lan_EuropeanSprachbund.head()

len before removing lan < 100T lines: 38
len lan < 100T lines: 7
len after removing lan < 100T lines: 31


Unnamed: 0,level_0,index,language,Wikiann,script,Member in European Sprachbund,1,2,3,4,5,6,7,8,9,10,11,12
0,0,0.0,Fr,fr,Latin,True,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1,1.0,Grm,de,Latin,True,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
2,2,2.0,Dut,nl,Latin,True,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
3,3,3.0,Eng,en,Latin,True,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
4,4,4.0,Grk,el,Greek,True,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0


#### 2) Remove investigated feature from dataframe and find k closest vectors for feature

In [3]:
def remove_feature_from_df(df_raw, featurenumber):
    """Return the feature columns of ``df_raw`` without the investigated feature.

    The feature columns are selected by position (column positions 6 to 17);
    from those, the column whose name is ``str(featurenumber)`` is dropped.
    ``df_raw`` itself is not modified.
    """
    feature_columns = df_raw.iloc[:, 6:18]
    return feature_columns.drop(columns=[str(featurenumber)])

In [4]:
# calculate average distance of several distances
def average(a, n):
    """Return the arithmetic mean of the first ``n`` elements of ``a``."""
    total = sum(a[position] for position in range(n))
    return total / n


def average_dist(dis):
    """Return the mean of the first row of ``dis`` (``dis[0]``)."""
    first_row = dis[0]
    return average(first_row, len(first_row))


# calculates closest k vectors for a feature
def get_k_closest_languages_per_feature(df_raw, featurenumber, k):
    """Find the group of ``k`` languages that lie closest together for one feature.

    The investigated feature column is removed, a KD-tree with Euclidean
    distance is built over the remaining feature vectors, and for every
    language the average distance to its ``k`` nearest vectors is computed
    (the query point is itself a row of the tree). The language with the
    smallest average is used as the centre; its ``k`` nearest vectors form the
    group.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Language overview with a ``language`` column and the feature columns
        at the positions expected by ``remove_feature_from_df``.
    featurenumber : int
        Number of the investigated feature; ``str(featurenumber)`` must be a
        column of ``df_raw``.
    k : int
        Number of nearest vectors requested from the tree.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``Filter_df_raw``: the rows of ``df_raw`` whose language is in the
        group. ``df_feat_added``: one row per such language with the columns
        ``feature``, ``average_dist_over_feat``, ``language``, ``index_org``,
        ``Hasfeature`` and ``dist``.
    """
    df = remove_feature_from_df(df_raw, featurenumber)
    matrix = df.to_numpy()

    # The metric is passed by name, so the code does not depend on where the
    # DistanceMetric class lives in the installed scikit-learn version.
    tree = nei.KDTree(matrix, metric='euclidean')

    # calculate for each vector average distance to k closest vectors
    avg_distances_per_lan = {}
    for position in range(len(matrix)):
        row_dist, _ = tree.query(matrix[position:(position + 1)], k)
        avg_distances_per_lan[position] = average_dist(row_dist)

    # language with k closest neighbors
    # (selection via sort_values is kept as-is so that ties between languages
    # are broken exactly as before)
    lan_with_closest_k_neighbors = (
        pd.DataFrame.from_dict(avg_distances_per_lan, orient='index', columns=['average_dist'])
        .sort_values(by=['average_dist'])
        .index[0]
    )
    vec_lan_with_k_closest_neighbors = matrix[lan_with_closest_k_neighbors:(lan_with_closest_k_neighbors + 1)]

    dist, ind = tree.query(vec_lan_with_k_closest_neighbors, k)
    group_average_dist = average_dist(dist)

    # distance to the centre language, keyed by row position in df_raw
    distance_by_position = {int(pos): d for pos, d in zip(ind[0], dist[0])}

    # k closest languages
    closest_languages = [df_raw['language'].iloc[pos] for pos in ind[0]]

    # filter df for closest vectors
    in_group = df_raw['language'].isin(closest_languages)
    Filter_df_raw = df_raw[in_group]
    row_positions = in_group.to_numpy().nonzero()[0]

    # create overview for feature with needed attributes
    overview_columns = ['feature', 'average_dist_over_feat', 'language', 'index_org', 'Hasfeature', 'dist']
    feature_column = str(featurenumber)
    records = []

    for position, lan in zip(row_positions, Filter_df_raw.index):
        position = int(position)
        records.append({
            'feature': featurenumber,
            'average_dist_over_feat': group_average_dist,
            'language': df_raw['language'].iloc[position],
            'index_org': lan,
            'Hasfeature': bool(df_raw[feature_column].iloc[position] == 1),
            # Looked up by row position (what the tree returns), not by index
            # label. NaN marks a row that matched by language name only and
            # was not itself returned by the tree.
            'dist': distance_by_position.get(position, float('nan')),
        })

    # DataFrame.append was removed in pandas 2.0; build the frame in one step.
    df_feat_added = pd.DataFrame(records, columns=overview_columns)

    return Filter_df_raw, df_feat_added

#### 3) Create overview over all features for k

In [5]:
# creating dataframe and csv for all features
def overview_groups_all_features(df_raw, k):
    """Concatenate the per-feature overviews for group size ``k``.

    For every feature number from 1 up to the number of feature columns
    (column positions 6 to 17 of ``df_raw``), the overview frame returned by
    ``get_k_closest_languages_per_feature`` is collected; all frames are then
    stacked into one DataFrame.
    """
    number_of_features = len(df_raw.columns[6:18])

    frames_per_feature = [
        get_k_closest_languages_per_feature(df_raw, feat, k)[1]
        for feat in range(1, number_of_features + 1)
    ]

    return pd.concat(frames_per_feature)

#### 4) Generate for each k a csv-file

In [6]:
def to_csv(df_raw, k):
    """Write the overview of all features for group size ``k`` to a csv file.

    The file is named ``DOE_typFeatures_<k>.csv`` and is written without the
    index column.
    """
    target_file = 'DOE_typFeatures_' + str(k) + '.csv'
    overview = overview_groups_all_features(df_raw, k)
    overview.to_csv(target_file, index=False)

#### Execution: `__main__` for step 4)

In [8]:
# Group sizes for which a csv overview is generated.
k1, k2, k3, k4 = 10, 15, 20, 25

for group_size in (k1, k2, k3, k4):
    to_csv(lan_EuropeanSprachbund, group_size)

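#### 5) Generate experiment script files: Not done yet (phze). The cells below were taken from the language-similarity notebook and call `dist_filter` and `calculate_distances`, which are not defined in the cells above.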

In [12]:
# from language similarity
def generate_string_dataset(wikiann_abre):
    """Return the dataset path ``languages/<wikiann_abre>/dataset``."""
    path_parts = ('languages/', wikiann_abre, '/dataset')
    return ''.join(path_parts)

In [57]:
# from language similarity

def generate_script_file(df_raw, d):
    """Write ``run_transformers.sh`` with one experiment per language pair.

    For every row of the filtered frame two commands are written: one that
    creates the evaluation set of the second language, and one that runs
    ``SimpleTransformers.py`` for the first language against that evaluation
    set.

    NOTE(review): ``dist_filter`` is not defined in the visible part of this
    notebook. It is assumed to return the rows of ``df_raw`` for distance
    ``d``, with the columns ``DIST``, ``LAN_1``, ``LAN_2``, ``lan1_Wiki`` and
    ``lan2_Wiki`` that are read below -- TODO confirm.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Frame of language pairs that is passed to ``dist_filter``.
    d : float
        Distance the script is generated for; also used in the output folder
        name ``results/output_<d>``.
    """
    df = dist_filter(df_raw, d)
    # raises KeyError if no row of df has DIST == d
    number_of_experiments = df['DIST'].value_counts().sort_index()[d]

    header_lines = [
        '# Experiments for distance: ' + str(d) + '\n',
        '\n',
        '# Experiment design: \n',
        '# (Finetuned language, Evaluated language) - Split(80/20)\n',
        '# Number of language pairs: ' + str(number_of_experiments) + '\n',
        '\n',
        '# @author: jopo, phze \n',
        '# @date: ' + str(datetime.datetime.now()) + '\n',
        '\n',
        '\n',
    ]

    # The context manager closes the file even if writing fails; the previous
    # version ended with `file.closed`, which only reads an attribute and
    # never closed the file.
    with open('run_transformers.sh', 'w') as file:
        file.writelines(header_lines)

        for index, row in df.iterrows():
            eval_file = 'languages/' + row['lan2_Wiki'] + '/dataset_eval.txt'
            output_dir = 'results/output_' + str(d) + '/' + row['LAN_1'] + '_' + row['LAN_2']

            file.write('# ' + str(index) + ' - Language pair: ' + '(' + row['LAN_1'] + ',' + row['LAN_2'] + ')\n')

            #creates test-set '4350' 20% -> dataset is of '17003' sentences = 80% training set
            file.write('python3 preprocessing/take_sentences.py \'' + generate_string_dataset(row['lan2_Wiki']) + '\' \'' + eval_file + '\' \'4350\'\n')
            #runs language pair
            # The evaluation set is the file created by the command above
            # (previously the path was built from the distance d, pointing to
            # a file that the generated script never creates).
            file.write('python3 SimpleTransformers.py \'' + generate_string_dataset(row['lan1_Wiki']) + '\' \'' + eval_file + '\' \'' + output_dir + '\'\n')
            file.write('\n')
            file.write('\n')

#### Execution : ___main___

In [58]:
# NOTE(review): calculate_distances is not defined in the visible part of this
# notebook -- it has to be available in the kernel before this cell runs.
df_euclidean_dist = calculate_distances(lan_EuropeanSprachbund)

# number of language pairs per distance value, ordered by distance
number_of_experiments = df_euclidean_dist['DIST'].value_counts().sort_index()

print(df_euclidean_dist.head())
print()
print(number_of_experiments)

     DIST LAN_1 lan1_MembES lan1_Wiki LAN_2 lan2_MembES lan2_Wiki
0     0.0    Fr        True        fr    Fr        True        fr
448   0.0    Cz        True        cs    Cz        True        cs
416   0.0   Hng        True        hu   Hng        True        hu
384   0.0   Swd        True        sv   Swd        True        sv
383   0.0   Swd        True        sv   Nor        True        no

0.000000     49
1.000000     42
1.414214     80
1.732051    100
2.000000    132
2.236068    154
2.449490    126
2.645751    102
2.828427     72
3.000000     58
3.162278     32
3.316625     12
3.464102      2
Name: DIST, dtype: int64


In [59]:
# Example: generate the experiment script (run_transformers.sh) for distance d1 = 0.0
d1 = 0.0

generate_script_file(df_euclidean_dist, d1)