Code to parse the IELex data and build a dataset of cognate pairs between an ancestral language and its descendants.

In [1]:
import csv
import os
# path to the ielex dataset, a tab-separated file, relative to the working directory
dataset_path = 'data/ielex.tsv'

In [2]:
# iso code of the parent (ancestral) language for the nordic subfamily
old_norse_iso_code = 'non'
# non: Old Norse, isl: Icelandic, fao: Faroese, swe: Swedish, dan: Danish, nor: Norwegian
# iso codes of the daughter languages that get paired with the parent above
nordic_iso_codes = {'isl', 'fao', 'swe', 'dan', 'nor'}

In [3]:
# iso code of the parent (ancestral) language for the romance subfamily
latin_iso_code = 'lat'
# lat: Latin, spa: Spanish, por: Portuguese, fra: French, ita: Italian
# iso codes of the daughter languages that get paired with the parent above
romance_iso_codes = {'spa', 'por', 'fra', 'ita'}

In [4]:
def filter_subfamily(parent_lang_iso, daughter_iso_codes, path=None):
    '''
    Collects, for every daughter language, the words it shares a cognate class with in the parent language.

    parent_lang_iso: str, the iso code of the common ancestral language
    daughter_iso_codes: a set of str, the iso codes of the daughter languages of that parent
    path: str, optional. The tsv file to read. Defaults to the module-level dataset_path.

    returns a dictionary of the form {lang_iso_code: {global_id: (parent_line, daughter_line)}},
        which maps a daughter lang's iso code to a dict containing all cognate pairs in the dataset
        between the ancestor and that daughter lang. Every daughter iso code is present as a key,
        even when no pairs were found for it. Each line is the list of column values of one row.

    Blank rows and rows too short to contain a cognate class are skipped.
    '''

    if path is None:
        path = dataset_path

    # positions of the only columns this function needs. The full row layout is:
    # language, iso_code, gloss, global_id, local_id, transcription, cognate_class, tokens, notes
    iso_code_col, global_id_col, cognate_class_col = 1, 3, 6

    # I assume that any given language, parent or daughter, only has one word per particular cognate class.
    parent_dict = {} # cognates in the parent lang. {global_id: {cognate class: line}}
    daughter_dict = {} # cognates in the daughter langs. {global_id: {cognate class: {lang_iso_code: line}}}

    # the transcriptions are IPA, so decode explicitly as utf-8 instead of relying on the platform default.
    # newline='' is what the csv module asks for when it is handed a file object.
    with open(path, encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter='\t') # the file is a tsv (tab separated values)
        next(reader, None) # burn the header row (a completely empty file simply yields no rows)

        # go through the file once and populate the above dicts
        for line in reader:
            if len(line) <= cognate_class_col:
                continue # blank or truncated row: nothing usable on it

            iso_code = line[iso_code_col]
            global_id = line[global_id_col]
            cognate_class = line[cognate_class_col]

            if iso_code == parent_lang_iso:
                # the first parent word seen for a cognate class is the one that is kept
                parent_dict.setdefault(global_id, {}).setdefault(cognate_class, line)
            elif iso_code in daughter_iso_codes:
                # for a daughter, a later word of the same cognate class replaces an earlier one
                daughter_dict.setdefault(global_id, {}).setdefault(cognate_class, {})[iso_code] = line

    print('identified', len(parent_dict), 'parent cognates')
    print('identified', len(daughter_dict), 'daughter cognates')

    # map a language to a dictionary with particular cognates {lang_iso_code: {global_id: (parent_line, daughter_line)}}
    cognate_pair_dicts = {iso_code: {} for iso_code in daughter_iso_codes}

    # identify cognates where they exist both in the parent lang and at least one daughter lang
    for global_id, parent_classes in parent_dict.items():
        daughter_classes = daughter_dict.get(global_id)
        if daughter_classes is None:
            continue # no daughter language has a word for this meaning

        for cognate_class, parent_line in parent_classes.items():
            for lang_iso_code, daughter_line in daughter_classes.get(cognate_class, {}).items():
                cognate_pair_dicts[lang_iso_code][global_id] = (parent_line, daughter_line)

    return cognate_pair_dicts

In [5]:
romance_cognate_pair_dicts = filter_subfamily(latin_iso_code, romance_iso_codes)

# report how many cognates are attested for each daughter lang. The two best-attested ones get used:
# the model is trained on parent -> daughter_1 and benchmarked on parent -> daughter_2
for lang, pairs in romance_cognate_pair_dicts.items():
    print(lang, ':', len(pairs))

identified 202 parent cognates
identified 203 daughter cognates
fra : 135
ita : 147
por : 131
spa : 134


In [6]:
# sample some data to examine: the first ten Latin/Italian cognate pairs
ita_pairs = romance_cognate_pair_dicts['ita']
for k in list(ita_pairs)[:10]:
    print(ita_pairs[k])

(['LATIN', 'lat', 'come', '1446', 'come', 'weˈniːre', 'come:B', 'w e ˈn iː r e', ''], ['ITALIAN', 'ita', 'come', '1446', 'come', 'venire', 'come:B', 'v e n i r e', ''])
(['LATIN', 'lat', 'snow', '784', 'snow', 'niks', 'snow:B', 'n i k s', ''], ['ITALIAN', 'ita', 'snow', '784', 'snow', 'neve', 'snow:B', 'n e v e', ''])
(['LATIN', 'lat', 'sing', '1261', 'sing', 'ˈkanere', 'sing:D', 'ˈk a n e r e', ''], ['ITALIAN', 'ita', 'sing', '1261', 'sing', 'kantare', 'sing:D', 'k a n t a r e', ''])
(['LATIN', 'lat', 'tail', '1220', 'tail', 'ˈkau̯da', 'tail:F', 'ˈk a u̯ d a', ''], ['ITALIAN', 'ita', 'tail', '1220', 'tail', 'koda', 'tail:F', 'k o d a', ''])
(['LATIN', 'lat', 'die', '1494', 'die', 'ˈmoriː', 'die:A', 'ˈm o r iː', ''], ['ITALIAN', 'ita', 'die', '1494', 'die', 'morire', 'die:A', 'm o r i r e', ''])
(['LATIN', 'lat', 'cold', '1287', 'cold', 'ˈfriːgidus', 'cold:E', 'ˈf r iː g i d u s', ''], ['ITALIAN', 'ita', 'cold', '1287', 'cold', 'freddo', 'cold:E', 'f r e dd o', ''])
(['LATIN', 'lat', '

In [6]:
nordic_cognate_pair_dicts = filter_subfamily(old_norse_iso_code, nordic_iso_codes)

# report how many cognates are attested for each daughter lang
for lang, pairs in nordic_cognate_pair_dicts.items():
    print(lang, ':', len(pairs))

identified 207 parent cognates
identified 207 daughter cognates
nor : 199
isl : 205
swe : 195
fao : 203
dan : 194


In [9]:
# sample some data to examine: the first ten Old Norse/Swedish cognate pairs
swe_pairs = nordic_cognate_pair_dicts['swe']
for k in list(swe_pairs)[:10]:
    print(swe_pairs[k])

(['OLD_NORSE', 'non', 'sharp', '1396', 'sharp', 'skarpr', 'sharp:B', 's k a r p r', ''], ['SWEDISH', 'swe', 'sharp', '1396', 'sharp', 'skarp', 'sharp:B', 's k a r p', ''])
(['OLD_NORSE', 'non', 'come', '1446', 'come', 'ˈkoma', 'come:B', 'ˈk o m a', ''], ['SWEDISH', 'swe', 'come', '1446', 'come', 'ˈkɔ̀mːa', 'come:B', 'ˈk ɔ̀ mː a', ''])
(['OLD_NORSE', 'non', 'yellow', '1424', 'yellow', 'ɡulr', 'yellow:A', 'ɡ u l r', ''], ['SWEDISH', 'swe', 'yellow', '1424', 'yellow', 'ɡʉːl', 'yellow:A', 'ɡ ʉː l', ''])
(['OLD_NORSE', 'non', 'snow', '784', 'snow', 'snjoːr', 'snow:B', 's n j oː r', ''], ['SWEDISH', 'swe', 'snow', '784', 'snow', 'snøː', 'snow:B', 's n øː', ''])
(['OLD_NORSE', 'non', 'sing', '1261', 'sing', 'ˈsynɣva', 'sing:B', 'ˈs y n ɣ v a', ''], ['SWEDISH', 'swe', 'sing', '1261', 'sing', 'ˈɧɵ̀ŋːa', 'sing:B', 'ˈɧ ɵ̀ ŋː a', ''])
(['OLD_NORSE', 'non', 'tail', '1220', 'tail', 'stertr', 'tail:B', 's t e r t r', ''], ['SWEDISH', 'swe', 'tail', '1220', 'tail', 'ɧæʈː', 'tail:B', 'ɧ æ ʈː', ''])
(['

In [7]:
# save the custom dataset to a file
def save_dataset(cognate_pair_dict, output_dir=None):
    '''
    Saves a cognate pair dict for a particular daughter language as two .tsv files,
        one containing the parent lang's cognates, the other the daughter's.
        The files are named after the iso codes of the two languages and row i of one file
        is the cognate of row i of the other.

    cognate_pair_dict: the cognate pair dictionary for the daughter language, formatted like one of the dictionaries in the output of filter_subfamily()
    output_dir: name of the directory the files will be saved in, under data/ (relative to the working directory).
        Default folder name is 'ParentLang-DaughterLang Cognates'

    raises ValueError if cognate_pair_dict is empty, since the languages cannot be identified without at least one pair.
    '''

    if not cognate_pair_dict:
        raise ValueError('cognate_pair_dict is empty: there are no cognate pairs to save')

    # we extract an arbitrary pair from the dictionary to obtain the parent and daughter names and iso codes
    first_parent_line, first_daughter_line = next(iter(cognate_pair_dict.values()))

    if output_dir is None: # generate default directory name
        parent_language, daughter_language = first_parent_line[0], first_daughter_line[0]
        # title() formats the name to make the folder easier to read
        output_dir = (parent_language + '-' + daughter_language + ' Cognates').title()

    output_dir = os.path.join('data', output_dir)
    # makedirs also creates data/ itself when it is missing, and tolerates an existing folder
    os.makedirs(output_dir, exist_ok=True)

    parent_iso, daughter_iso = first_parent_line[1], first_daughter_line[1]
    parent_file_path = os.path.join(output_dir, parent_iso + '.tsv')
    daughter_file_path = os.path.join(output_dir, daughter_iso + '.tsv')

    header = ['language', 'iso_code', 'gloss', 'global_id',
        'local_id', 'transcription', 'cognate_class', 'tokens', 'notes']

    # utf-8 because the transcriptions are IPA; newline='' so the csv module controls the line endings
    # (without it, an extra \r is written on platforms that translate \n on output)
    with open(parent_file_path, 'w', encoding='utf-8', newline='') as f_p, \
            open(daughter_file_path, 'w', encoding='utf-8', newline='') as f_d:
        writer_p = csv.writer(f_p, delimiter='\t')
        writer_d = csv.writer(f_d, delimiter='\t')

        writer_p.writerow(header)
        writer_d.writerow(header)

        for parent_line, daughter_line in cognate_pair_dict.values():
            writer_p.writerow(parent_line)
            writer_d.writerow(daughter_line)

In [8]:
# name of the folder (created under data/) that receives the Old Norse / Icelandic files
file_path = 'Old_Norse-Icelandic Cognates'

save_dataset(nordic_cognate_pair_dicts['isl'], output_dir=file_path)