Notebook for interactively exploring the ielex.tsv data using the functions in process_data.py.

In [12]:
import process_data

# Relative path to the ielex.tsv data file this notebook explores.
# NOTE(review): none of the process_data calls visible in this notebook receive
# dataset_path -- confirm whether process_data locates the file on its own.
dataset_path = 'data/ielex.tsv'

In [3]:
# Parent language of the Nordic subfamily.
old_norse_iso_code = 'non'  # Old Norse

# Daughter languages whose cognates are compared against the parent.
nordic_iso_codes = {
    'isl',  # Icelandic
    'fao',  # Faroese
    'swe',  # Swedish
    'dan',  # Danish
    'nor',  # Norwegian
}

In [4]:
# Parent language of the Romance subfamily.
latin_iso_code = 'lat'  # Latin

# Daughter languages whose cognates are compared against the parent.
romance_iso_codes = {
    'spa',  # Spanish
    'por',  # Portuguese
    'fra',  # French
    'ita',  # Italian
}

In [6]:
# Collect the Latin -> daughter cognate pairs, grouped per daughter language.
romance_cognate_pair_dicts = process_data.filter_subfamily(latin_iso_code, romance_iso_codes)

# Report how many cognates are attested for each daughter language.
# The two best-attested daughters are the ones to use: train the model on
# parent -> daughter_1 and benchmark it on parent -> daughter_2.
for lang, pairs in romance_cognate_pair_dicts.items():
    print(lang, ':', len(pairs))

por : 131
spa : 134
ita : 147
fra : 135


In [7]:
# Peek at the first ten Latin/Italian entries to see what a record looks like.
# Each value prints as a (Latin row, Italian row) pair.
italian_pairs = romance_cognate_pair_dicts['ita']
for k in list(italian_pairs)[:10]:
    print(italian_pairs[k])

(['LATIN', 'lat', 'come', '1446', 'come', 'weˈniːre', 'come:B', 'w e ˈn iː r e', ''], ['ITALIAN', 'ita', 'come', '1446', 'come', 'venire', 'come:B', 'v e n i r e', ''])
(['LATIN', 'lat', 'snow', '784', 'snow', 'niks', 'snow:B', 'n i k s', ''], ['ITALIAN', 'ita', 'snow', '784', 'snow', 'neve', 'snow:B', 'n e v e', ''])
(['LATIN', 'lat', 'sing', '1261', 'sing', 'ˈkanere', 'sing:D', 'ˈk a n e r e', ''], ['ITALIAN', 'ita', 'sing', '1261', 'sing', 'kantare', 'sing:D', 'k a n t a r e', ''])
(['LATIN', 'lat', 'tail', '1220', 'tail', 'ˈkau̯da', 'tail:F', 'ˈk a u̯ d a', ''], ['ITALIAN', 'ita', 'tail', '1220', 'tail', 'koda', 'tail:F', 'k o d a', ''])
(['LATIN', 'lat', 'die', '1494', 'die', 'ˈmoriː', 'die:A', 'ˈm o r iː', ''], ['ITALIAN', 'ita', 'die', '1494', 'die', 'morire', 'die:A', 'm o r i r e', ''])
(['LATIN', 'lat', 'cold', '1287', 'cold', 'ˈfriːgidus', 'cold:E', 'ˈf r iː g i d u s', ''], ['ITALIAN', 'ita', 'cold', '1287', 'cold', 'freddo', 'cold:E', 'f r e dd o', ''])
(['LATIN', 'lat', '

In [8]:
# Collect the Old Norse -> daughter cognate pairs, grouped per daughter language.
nordic_cognate_pair_dicts = process_data.filter_subfamily(old_norse_iso_code, nordic_iso_codes)

# Report how many cognates are attested for each daughter language.
for lang, pairs in nordic_cognate_pair_dicts.items():
    print(lang, ':', len(pairs))

fao : 203
dan : 194
isl : 205
swe : 195
nor : 199


In [9]:
# Peek at the first ten Old Norse/Swedish entries.
# Each value prints as an (Old Norse row, Swedish row) pair.
swedish_pairs = nordic_cognate_pair_dicts['swe']
for k in list(swedish_pairs)[:10]:
    print(swedish_pairs[k])

(['OLD_NORSE', 'non', 'sharp', '1396', 'sharp', 'skarpr', 'sharp:B', 's k a r p r', ''], ['SWEDISH', 'swe', 'sharp', '1396', 'sharp', 'skarp', 'sharp:B', 's k a r p', ''])
(['OLD_NORSE', 'non', 'come', '1446', 'come', 'ˈkoma', 'come:B', 'ˈk o m a', ''], ['SWEDISH', 'swe', 'come', '1446', 'come', 'ˈkɔ̀mːa', 'come:B', 'ˈk ɔ̀ mː a', ''])
(['OLD_NORSE', 'non', 'yellow', '1424', 'yellow', 'ɡulr', 'yellow:A', 'ɡ u l r', ''], ['SWEDISH', 'swe', 'yellow', '1424', 'yellow', 'ɡʉːl', 'yellow:A', 'ɡ ʉː l', ''])
(['OLD_NORSE', 'non', 'snow', '784', 'snow', 'snjoːr', 'snow:B', 's n j oː r', ''], ['SWEDISH', 'swe', 'snow', '784', 'snow', 'snøː', 'snow:B', 's n øː', ''])
(['OLD_NORSE', 'non', 'sing', '1261', 'sing', 'ˈsynɣva', 'sing:B', 'ˈs y n ɣ v a', ''], ['SWEDISH', 'swe', 'sing', '1261', 'sing', 'ˈɧɵ̀ŋːa', 'sing:B', 'ˈɧ ɵ̀ ŋː a', ''])
(['OLD_NORSE', 'non', 'tail', '1220', 'tail', 'stertr', 'tail:B', 's t e r t r', ''], ['SWEDISH', 'swe', 'tail', '1220', 'tail', 'ɧæʈː', 'tail:B', 'ɧ æ ʈː', ''])
(['

In [11]:
# Destination handed to process_data.save_dataset for the Old Norse -> Icelandic pairs.
# NOTE(review): "Cognateses" looks like a typo and the name has no file extension;
# left unchanged because other code may load the file by this exact name -- confirm.
file_path = 'Old_Norse-Icelandic Cognateses'

# Persist only the Icelandic subset of the Nordic cognate pairs.
icelandic_pairs = nordic_cognate_pair_dicts['isl']
process_data.save_dataset(icelandic_pairs, file_path)