## Compute the mean distances between Swadesh words for each language

(These means let us normalize the distances that the Numberbatch embeddings encode.)

In [13]:
# Auto-reload imported modules before each cell executes, so edits to local
# modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [14]:
import os

import joblib
import pandas as pd

import trglobals

In [15]:
# Directory where most of the Swadesh lists are saved as .csv files (one file
# per language); the path is relative to the notebook's working directory
swadesh_path = "../data01_Raw/Swadesh_Lists_wiki"

In [16]:
# Per language code: the CSV file holding that language's Swadesh list
# ('fname') and the column of that file containing the words ('colname')
lang_data = dict(
    de=dict(fname="deu_Latn.csv", colname="deu_Latn"),
    el=dict(fname="ell_Grek.csv", colname="ell"),
    en=dict(fname="eng_Latn.csv", colname="eng"),
    it=dict(fname="ita_Latn.csv", colname="ita"),
    ru=dict(fname="rus_Cyrl.csv", colname="rus_Cyrl"),
    vi=dict(fname="vie_Latn.csv", colname="vie"),
    zh=dict(fname="cmn_Hans.csv", colname="cmn"),
)

In [17]:
# Language code -> pd.Series holding that language's Swadesh words
swadesh_cols = dict()

In [18]:
# Read each language's CSV and keep only the column that holds its word list
for code, info in lang_data.items():
    csv_path = os.path.join(swadesh_path, info['fname'])
    swadesh_cols[code] = pd.read_csv(csv_path)[info['colname']]

In [19]:
# Sanity check: there should be one entry per language code in lang_data
swadesh_cols.keys()

dict_keys(['de', 'el', 'en', 'it', 'ru', 'vi', 'zh'])

### And combine them into one final df

In [20]:
# Place the per-language Series side by side; because a dict is passed, its
# keys (the language codes) become the column labels
final_df = pd.concat(swadesh_cols, axis="columns")

In [21]:
# Reverse lookup: CSV column name -> language code
col_to_langcode = dict(
    (info['colname'], code) for code, info in lang_data.items()
)

In [22]:
# Ensure columns are labelled by language code. NOTE(review): pd.concat on a
# dict already uses the dict keys as column labels, so this is expected to be
# a no-op kept as a safeguard - confirm and drop if so.
final_df = final_df.rename(columns=col_to_langcode)

In [23]:
# Preview the combined table: one column per language code, one row per list entry
final_df

Unnamed: 0,de,el,en,it,ru,vi,zh
0,ich,εγώ,I,io,я,tôi,我
1,du,εσύ,you,tu,ты,tôi,你
2,er,αυτός,he,lui,он,anh ấy,他
3,wir,εμείς,we,noi,мы,chúng ta,我们
4,ihr,εσείς,you,voi,вы,các bạn,你们
...,...,...,...,...,...,...,...
202,mit,με,with,con,с,với,和
203,und,και,and,e,и,và,和
204,wenn,εάν,if,se,если,nếu,如果
205,weil,επειδή,because,perché,потому что,vì,因为


In [24]:
# Save the combined table as CSV (without the row index) and as a pickle that
# sits next to it with a .pkl extension.
final_fpath = "./lang_stats/swadesh_full.csv"
# The output directory is not guaranteed to exist at this point, so create it
# first; otherwise to_csv fails on a fresh Restart & Run All.
os.makedirs(os.path.dirname(final_fpath), exist_ok=True)
final_df.to_csv(final_fpath, index=False)
# splitext swaps only the extension; str.replace would also rewrite a ".csv"
# occurring earlier in the path.
final_df.to_pickle(os.path.splitext(final_fpath)[0] + ".pkl")
print(f"Saved to {final_fpath}")

Saved to ./lang_stats/swadesh_full.csv


In [25]:
def gen_pair_map(final_df, lang1, lang2):
    """Build a word lookup from `lang1` to `lang2` and pickle it with joblib.

    Parameters
    ----------
    final_df : pandas.DataFrame
        Table with one column per language; each row pairs up the words of
        the different languages.
    lang1, lang2 : str
        Column names of the source (dict keys) and target (dict values)
        language.

    Returns
    -------
    str
        Path of the written pickle,
        ``./lang_stats/swadesh_{lang1}_{lang2}.pkl``.

    Notes
    -----
    * Words are stripped of surrounding whitespace on both sides.
    * Rows where either word is missing are skipped; a missing value is a
      float NaN (or None) and has no ``.strip()``.
    * If the same `lang1` word occurs in several rows, the last row wins.
    """
    # Keep only rows that have a word on both sides
    complete = final_df[lang1].notna() & final_df[lang2].notna()
    src_words = final_df.loc[complete, lang1]
    tgt_words = final_df.loc[complete, lang2]
    pair_dict = {src.strip(): tgt.strip() for src, tgt in zip(src_words, tgt_words)}

    pkl_fpath = f"./lang_stats/swadesh_{lang1}_{lang2}.pkl"
    # Make sure the output directory exists before writing into it
    os.makedirs(os.path.dirname(pkl_fpath), exist_ok=True)
    joblib.dump(pair_dict, pkl_fpath)
    return pkl_fpath

In [74]:
# Write the de -> en lookup; the returned pickle path is shown as the cell output
gen_pair_map(final_df, "de", "en")

'./lang_stats/swadesh_de_en.pkl'

In [75]:
# Write the de -> zh lookup
gen_pair_map(final_df, "de", "zh")

'./lang_stats/swadesh_de_zh.pkl'

In [76]:
# NOTE(review): same ("de", "en") call as an earlier cell; re-running it only
# rewrites the same pickle path, so this cell looks redundant
gen_pair_map(final_df, "de", "en")

'./lang_stats/swadesh_de_en.pkl'

In [77]:
# Write the de -> vi lookup
gen_pair_map(final_df, "de", "vi")

'./lang_stats/swadesh_de_vi.pkl'

In [78]:
# Write the de -> el lookup
gen_pair_map(final_df, "de", "el")

'./lang_stats/swadesh_de_el.pkl'

Testing the trglobals implementation

In [24]:
# NOTE(review): none of the gen_pair_map calls in this notebook writes a
# "de_ru" pickle - confirm where get_swadesh_map("de_ru") gets its data from
result = trglobals.get_swadesh_map("de_ru")

In [None]:
# NOTE(review): no visible cell adds a 'de_en_dist' column to final_df, so this
# raises KeyError on a fresh run - the distance computation appears to be missing
en_mean = final_df['de_en_dist'].mean()

In [None]:
# Ratio between the en and zh mean distances.
# NOTE(review): zh_mean is not defined in any visible cell (NameError on a fresh run)
en_zh_ratio = en_mean / zh_mean
en_zh_ratio

In [None]:
# Sanity check: multiplying the ratio back by zh_mean should give en_mean again
en_zh_ratio * zh_mean

In [None]:
# Output directory for statistics that other files load; create it if missing.
# Earlier cells also write into this directory, so it has to exist before they run.
stat_path = os.path.join(".", "lang_stats")
os.makedirs(stat_path, exist_ok=True)

In [None]:
# Persist the ratio inside stat_path so it can be loaded elsewhere with joblib
ratio_fpath = os.path.join(stat_path, "en_zh_ratio.pkl")
joblib.dump(en_zh_ratio, ratio_fpath)

In [None]:
# Inspect the table again
final_df

### Scratchwork for tricky languages like Kazakh

In [6]:
# Tab-separated Kazakh list, stored outside the main Swadesh CSV directory.
# NOTE(review): this path spells the data folder "data01_raw" while
# swadesh_path uses "data01_Raw" - on a case-sensitive filesystem they are
# different directories; confirm which spelling is correct
kk_fpath = "../data01_raw/swadesh_lists_other/Kazakh_github.tsv"

In [7]:
import pandas as pd

In [8]:
# The Kazakh file is tab-separated rather than comma-separated
kk_df = pd.read_csv(kk_fpath, sep="\t")

In [11]:
# 'kk_final' is the column used as the Kazakh word list below
kk_col = kk_df['kk_final']

In [12]:
# Flag entries containing a comma - presumably to spot cells that list several
# alternative words; confirm intent
kk_col.str.contains(",")

0      False
1      False
2      False
3      False
4      False
       ...  
202    False
203    False
204    False
205    False
206    False
Name: kk_final, Length: 207, dtype: bool

In [27]:
# Attach the Kazakh words as a new 'kk' column (pandas aligns on the row index).
# NOTE(review): kk_col has 207 entries while the last index shown for final_df
# is 205, so the trailing Kazakh entry appears to be dropped by the alignment -
# confirm the lists are row-aligned. This column is also added after
# swadesh_full.csv/.pkl were written, so those files do not contain it.
final_df['kk'] = kk_col

In [28]:
# Preview the table with the added 'kk' column
final_df

Unnamed: 0,de,el,en,it,ru,vi,zh,kk
0,ich,εγώ,I,io,я,tôi,我,мен
1,du,εσύ,you,tu,ты,tôi,你,сен
2,er,αυτός,he,lui,он,anh ấy,他,ол
3,wir,εμείς,we,noi,мы,chúng ta,我们,бiз
4,ihr,εσείς,you,voi,вы,các bạn,你们,сендер
...,...,...,...,...,...,...,...,...
202,mit,με,with,con,с,với,和,мен
203,und,και,and,e,и,và,和,және
204,wenn,εάν,if,se,если,nếu,如果,егер
205,weil,επειδή,because,perché,потому что,vì,因为,өйткенi


In [29]:
# Write the de -> kk lookup
gen_pair_map(final_df, 'de', 'kk')

'./lang_stats/swadesh_de_kk.pkl'