In [None]:
# Laura Burdick (lburdick@umich.edu)
# Filter WALS values and languages for final regression model

In [1]:
import pandas as pd
import copy
import pickle

In [None]:
# SET THESE VARIABLES

# Location of binary WALS path (created with Making Wals Binary.ipynb)
# Should be formatted as a csv file, with a separate column for each
# binary WALS value, as well as a column called "language" with the
# Bible language codes
binary_wals_path = '~/embedding-spaces/multilingual_thesis/regression/data/wals_bible_binary_values.csv'

# Location of output WALS labels for final regression model
# Formatted as a pickle file which contains a list of WALS values
wals_label_path = '~/embedding-spaces/multilingual_thesis/regression/data/allLanguages_wals_values.pkl'

# Location of output WALS features for each language for regression model
# For each language, formatted as a pickle file with name
# {wals_features_path}{language}.pkl, where pickle file contains a list
# of WALS feature values for that language
wals_features_path = '~/embedding-spaces/multilingual_thesis/regression/data/allLanguages_language_features_small_wals_'

In [2]:
# List of all languages in either Wikipedia or the Bible
all_languages = ['afr', 'aln', 'arb', 'arz', 'ayr', 'azb', 'azj', 'bba', 'ben', 'bqc', 'bul', 'cac', 'cak', 'ceb',\
                 'ces', 'che', 'cme', 'cmn', 'cnh', 'crh', 'cym', 'dan', 'deu', 'dyu', 'ell', 'eng', 'epo', 'fin',\
                 'fra', 'gub', 'guj', 'gur', 'hat', 'hmo', 'hrv', 'hui', 'hun', 'ifa', 'ifb', 'ify', 'ind', 'ita',\
                 'kac', 'kaz', 'kek', 'kjb', 'kor', 'lat', 'lit', 'lnd', 'lsi', 'mad', 'mah', 'mam', 'may', 'mdy',\
                 'mlg', 'mps', 'mri', 'mrw', 'mya', 'nhe', 'nld', 'nor', 'pis', 'plt', 'poh', 'por', 'prs', 'pxm',\
                 'qub', 'quh', 'quy', 'quz', 'qxr', 'ron', 'rug', 'rus', 'som', 'suz', 'swe', 'tat', 'tbz', 'tcw',\
                 'tgl', 'tlh', 'tpi', 'tpm', 'tur', 'tzo', 'ukr', 'vie', 'wal', 'wbm', 'xho', 'yua', 'zom', 'cat',\
                 'spa', 'est', 'fas', 'heb', 'hin', 'jpn', 'lav', 'pol', 'slk', 'slv', 'srp', 'tha','mnd']

In [5]:
wals = pd.read_csv(binary_wals_path)
wals = wals.drop('Unnamed: 0',axis=1)

In [6]:
# List of all WALS properties
wals_numbers = list(set([i.split(':')[0] for i in wals.columns.values][:-1]))

In [7]:
# Calculate the percentage of unknown WALS properties for each language
all_unknown = []
for language in all_languages:
    unknown = 0
    for number in wals_numbers:
        relevant_columns = [i for i in wals.columns.values[:-1] if i.split(':')[0]==number]
        null_column = [i for i in relevant_columns if i.split('__')[-1]==''][0]
        if list(wals.loc[wals.language==language][null_column])[0]==1:
            unknown += 1
    all_unknown.append(copy.copy(unknown)/len(wals_numbers))

In [8]:
# Sorted by number of unknown WALS properties per langugage
(all_unknown,languages_sorted)=zip(*sorted(zip(all_unknown,all_languages)))

In [9]:
# Only include languages that have at least 25% of all WALS properties
all_languages = [languages_sorted[i] for i in range(len(languages_sorted)) if all_unknown[i]<0.75]
print(all_languages)
print(len(all_languages)) # Should have 37 languages


In [11]:
# Read in mapping between Bible language names and Wikipedia language names
language_mapping = pd.read_csv('multilingual_corpora.csv')

In [None]:
# Get full language name for each language
long_languages = []
for language in all_languages:
    _df = language_mapping.loc[language_mapping['ISO 639-3 Code']==language]
    if len(_df) >= 1:
        print(list(_df['Language'])[0])
        long_languages.append(list(_df['Language'])[0])
    else:
        if language == 'mnd': # Some additional manual mappings
            long_languages.append('Mandarin')
        elif language == 'prs':
            long_languages.append('Persian')
        elif language == 'may':
            long_languages.append('Maybrat')
        elif language == 'hmo':
            long_languages.append('Hmong Njua')
        elif language=='cmn':
            long_languages.append('Comanche')
        elif language=='poh':
            long_languages.append('Pohnpeian')
        elif language=='ben':
            long_languages.append('Bengali')
        elif language=='che':
            long_languages.append('Cherokee')
        elif language=='lnd':
            long_languages.append('Linda')
        elif language=='mad':
            long_languages.append("Ma'di")
        else:
            print(language,'not in language mapping')

In [128]:
# Only take subset of WALS dataset for languages that we need
wals = wals.loc[wals.language.isin(all_languages)]

In [129]:
# Only consider WALS properties where at list 25% of filtered languages
# have the property, and at least 5 languages have the property
good_wals = []
good_wals_columns = []
ratios = []
for number in wals_numbers:
    relevant_columns = [i for i in wals.columns.values[:-1] if i.split(':')[0]==number]
    null_column = [i for i in relevant_columns if i.split('__')[-1]==''][0]
    relevant_columns = [i for i in relevant_columns if i != null_column]
    null_column_count = wals[null_column].sum()
    ratio = null_column_count / len(all_languages) #percent of languages that don't have this property
    ratios.append(ratio)
    if ratio > 0.75:
        continue #Greater than 75% of languages don't have this property
    
    good_columns = []
    for column in relevant_columns:
        column_count = wals[column].sum()
        if column_count >= 5: #At least 5 languages have this property
            good_columns.append(column)
            
    if len(good_columns) < 2: #There aren't two non-unknown features with greater than 5 languages
        continue
        
    good_wals.append(number)
    good_wals_columns += good_columns

In [130]:
len(good_wals)/len(ratios) #percentage of wals properties that we're keeping

0.5359116022099447

In [131]:
len(good_wals) # Should be 97

97

In [136]:
# Read in Spearman's correlations between all WALS properties
# (Includes manually grouped sets of properties)
correlations = pd.read_csv('correlations.csv')

In [139]:
correlations_mapping1 = {} #mapping from WALS property to correlation group
correlations_mapping2 = {} #mapping from correlation group to WALS property
for it,row in correlations.iterrows():
    correlations_mapping1[row.property1] = row.groupNum
    correlations_mapping1[row.property2] = row.groupNum
    if row.groupNum not in correlations_mapping2:
        correlations_mapping2[row.groupNum] = set()
    correlations_mapping2[row.groupNum].add(row.property1)
    correlations_mapping2[row.groupNum].add(row.property2)

In [151]:
# Replace correlated WALS properties with their correlation group
all_languages_wals_values = [i for i in wals.columns.values if \
                             i in good_wals_columns]
all_languages_wals_values = [i if i not in correlations_mapping1 \
                             else 'correlations_'+\
                             str(correlations_mapping1[i])\
                             for i in all_languages_wals_values ]
all_languages_wals_values = list(set(all_languages_wals_values))

In [152]:
# Final list of WALS properties to use in the regression model
print(all_languages_wals_values)
print(len(all_languages_wals_values))

['correlations_4', 'correlations_11', '54A: Distributive Numerals__No distributive numerals', '9A: The Velar Nasal__No initial velar nasal', 'correlations_3', '67A: The Future Tense__No inflectional future', '47A: Intensifiers and Reflexive Pronouns__Differentiated', '52A: Comitatives and Instrumentals__Identity', '14A: Fixed Stress Locations__Initial', '13A: Tone__No tones', '50A: Asymmetrical Case-Marking__Additive-quantitatively asymmetrical', '46A: Indefinite Pronouns__Interrogative-based', '57A: Position of Pronominal Possessive Affixes__No possessive affixes', '110A: Periphrastic Causative Constructions__Purposive but no sequential', '129A: Hand and Arm__Different', '143E: Preverbal Negative Morphemes__None', '143E: Preverbal Negative Morphemes__NegV', '92A: Position of Polar Question Particles__Initial', '106A: Reciprocal Constructions__Distinct from reflexive', 'correlations_0', '121A: Comparative Constructions__Locational', 'correlations_5', '86A: Order of Genitive and Noun__N

In [153]:
# Save final list of WALS labels for regression model
with open(wals_labels_path,'wb') as pickleFile:
    pickle.dump(all_languages_wals_values,pickleFile)

In [177]:
# For each language, generate list of binary features for regression model.
# Save list of binary features
for language in all_languages:
    language_wals = wals.loc[wals.language==language]
    if len(language_wals) < 1:
        print('ERROR: no wals information for language '+language)
        continue
    for it,row in language_wals.iterrows(): 
        language_wals_array = []
        for wals_property in all_languages_wals_values:
            
            if wals_property[:13] == 'correlations_': #handle correlation groups
                correlation_num = int(wals_property.split('_')[1])
                relevant_columns = correlations_mapping2[correlation_num]
                found = False
                for column in list(relevant_columns): #if any of the correlation categories are 1, make it 1
                    if row[column] == 1:
                        found = True
                        break
                if found: #one of the columns had a 1 value
                    language_wals_array.append(1)
                else: #none of the columns had a 1 value
                    language_wals_array.append(0)
                continue
                
            #not a correlation group
            language_wals_array.append(row[wals_property])
                    
        break #take only first row (only one row per language)
        
    # Save features for this language
    with open(wals_features_path+language+'.pkl','wb') as pickleFile:
        pickle.dump(language_wals_array,pickleFile) #WALS features

In [182]:
# Here are all the correlation groupings
for i in range(22):
    print(i,correlations_mapping2[i])
    print('\n')

0 {'4A: Voicing in Plosives and Fricatives__In both plosives and fricatives', '5A: Voicing and Gaps in Plosive Systems__None missing in /p t k b d g/'}


1 {'32A: Systems of Gender Assignment__No gender', '31A: Sex-based and Non-sex-based Gender Systems__No gender', '44A: Gender Distinctions in Independent Personal Pronouns__No gender distinctions', '30A: Number of Genders__None'}


2 {'32A: Systems of Gender Assignment__Semantic and formal', '31A: Sex-based and Non-sex-based Gender Systems__Sex-based'}


3 {'38A: Indefinite Articles__No definite or indefinite article', '37A: Definite Articles__No definite or indefinite article'}


4 {'100A: Alignment of Verbal Person Marking__Neutral', '103A: Third Person Zero of Verbal Person Marking__No person marking', '29A: Syncretism in Verbal Person/Number Marking__No subject person/number marking', '40A: Inclusive/Exclusive Distinction in Verbal Inflection__No person marking', '101A: Expression of Pronominal Subjects__Optional pronouns in subje