In [3]:


# Load the lang_status dictionary from Glottolog,
# keeping only rows where level == 'language'.
# Columns read: id, name, status (only status is stored, keyed by id).
# Uses the languoid.csv file, which codes status as a string of <31 characters.

import csv
from collections import Counter
from collections import defaultdict

# Sentinel returned by lang_status for an id that is not a known language.
NOMATCH = "*****"
# Maps Glottolog id -> status string; indexing a missing id yields NOMATCH.
lang_status = defaultdict(lambda: NOMATCH)

filename = "data/languoid.csv"

# newline='' hands line-ending handling to the csv module, as its
# documentation requires, so quoted fields with embedded newlines parse
# correctly. The explicit encoding keeps non-ASCII names readable whatever
# the OS locale is (assumes the Glottolog export is UTF-8 -- confirm).
with open(filename, newline='', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    count_level = Counter()
    count_status = Counter()
    count_lang_status = Counter()
    for row in csv_reader:
        if not row:
            # Blank line: csv.reader yields [], which has no columns to index.
            continue

        if line_count == 0:
            # Header row: report it and keep it out of the tallies and the dict.
            print(f'Column names are {", ".join(row)}')
            line_count += 1
            continue

        line_count += 1
        lang_id = row[0]
        name = row[3]  # read for reference; only the status is stored below
        level = row[5]
        status = row[6]

        # Tallies over every languoid (families, languages and dialects).
        count_level[level] += 1
        count_status[status] += 1

        # Retain only the 'language' level.
        if level == 'language':
            count_lang_status[status] += 1
            lang_status[lang_id] = status

    # line_count includes the header row.
    print(f'Processed {line_count} lines.')
    print("Level:", count_level)
    print("Status:", count_status)
    print("Language Status:", count_lang_status)

    # lang_status now maps each language-level Glottolog id to its status.

Column names are id, family_id, parent_id, name, bookkeeping, level, status, latitude, longitude, iso639P3code, description, markup_description, child_family_count, child_language_count, child_dialect_count, country_ids
Processed 24338 lines.
Level: Counter({'dialect': 11467, 'language': 8496, 'family': 4374})
Status: Counter({'safe': 19301, 'definitely endangered': 1800, 'vulnerable': 1447, 'extinct': 906, 'critically endangered': 460, 'severely endangered': 423})
Language Status: Counter({'safe': 3554, 'definitely endangered': 1791, 'vulnerable': 1433, 'extinct': 848, 'critically endangered': 451, 'severely endangered': 419})


In [5]:
# Service function to print column names
def print_parameters(col_names):
    """Print every entry of col_names on its own line, prefixed by its 0-based position."""
    position = 0
    for column in col_names:
        print(position, column)
        position += 1
        
# Load WALS database from .csv file.

WALS_info = []

infilename = "data/language.csv"
outfilename = "data/wals_language_withstatus.orig.csv"
outfilenamematch = "data/wals_language_withstatus.csv"

# Copies every WALS row with a "Status from Glotto" column inserted at
# index 10. outfilename receives all rows; outfilenamematch receives only the
# rows whose glottocode was found in lang_status.
#
# newline='' is required by the csv module on both readers and writers;
# without it the writers emit "\r\r\n" line endings on Windows. The explicit
# encoding makes input and output independent of the OS locale (assumes the
# WALS export is UTF-8 -- confirm).
with open(infilename, newline='', encoding='utf-8') as csv_file, \
        open(outfilename, mode='w', newline='', encoding='utf-8') as csv_outfile, \
        open(outfilenamematch, mode='w', newline='', encoding='utf-8') as csv_outfilematch:
    csv_reader = csv.reader(csv_file, delimiter=',')
    csv_writer = csv.writer(csv_outfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writermatch = csv.writer(csv_outfilematch, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    line_count = 0
    count_family = Counter()
    count_macroarea = Counter()
    glottomatch_count = 0
    glottonomatch_count = 0
    glottocodenone_count = 0

    for row in csv_reader:
        if not row:
            # Blank line: csv.reader yields [], which has no columns to index.
            continue

        if line_count == 0:
            # Header row: add the new column name and echo the final layout.
            WALS_names = row
            WALS_names.insert(10, "Status from Glotto")
            print_parameters(WALS_names)
            csv_writer.writerow(WALS_names)
            csv_writermatch.writerow(WALS_names)
        else:
            wals_code = row[0]
            # An empty glottocode cell means WALS has no Glottolog link.
            glotto_code = row[2] if len(row[2]) > 0 else None
            family = row[7]
            macroarea = row[8]
            count_family[family] += 1
            count_macroarea[macroarea] += 1
            WALS_row = row

            if glotto_code is None:
                status = None
                glottocodenone_count += 1
            else:
                # .get() instead of indexing: indexing a defaultdict inserts
                # the missing key, which would fill lang_status with NOMATCH
                # entries each time this cell runs.
                status = lang_status.get(glotto_code, NOMATCH)
                if status == NOMATCH:
                    glottonomatch_count += 1
                else:
                    glottomatch_count += 1

            # csv.writer renders None as an empty field.
            WALS_row.insert(10, status)
            csv_writer.writerow(WALS_row)
            if status is not None and status != NOMATCH:
                csv_writermatch.writerow(WALS_row)

        line_count += 1

    # line_count includes the header row.
    print(f'Processed {line_count} lines.')
    print("Macroarea:", count_macroarea)
    print("Glotto match:", glottomatch_count, "nomatch:",
          glottonomatch_count, "glottonone:", glottocodenone_count)

0 wals_code
1 iso_code
2 glottocode
3 Name
4 latitude
5 longitude
6 genus
7 family
8 macroarea
9 countrycodes
10 Status from Glotto
11 1A Consonant Inventories
12 2A Vowel Quality Inventories
13 3A Consonant-Vowel Ratio
14 4A Voicing in Plosives and Fricatives
15 5A Voicing and Gaps in Plosive Systems
16 6A Uvular Consonants
17 7A Glottalized Consonants
18 8A Lateral Consonants
19 9A The Velar Nasal
20 10A Vowel Nasalization
21 11A Front Rounded Vowels
22 12A Syllable Structure
23 13A Tone
24 14A Fixed Stress Locations
25 15A Weight-Sensitive Stress
26 16A Weight Factors in Weight-Sensitive Stress Systems
27 17A Rhythm Types
28 18A Absence of Common Consonants
29 19A Presence of Uncommon Consonants
30 20A Fusion of Selected Inflectional Formatives
31 21A Exponence of Selected Inflectional Formatives
32 22A Inflectional Synthesis of the Verb
33 23A Locus of Marking in the Clause
34 24A Locus of Marking in Possessive Noun Phrases
35 25A Locus of Marking: Whole-language Typology
36 26A Pr