In [None]:
# Laura Burdick (lburdick@umich.edu)
# Read in and format WALS data

In [None]:
import pandas as pd
from tqdm import tqdm_notebook

In [None]:
# SET THESE VARIABLES

# Folder location of the WALS data, which can be downloaded from
# https://wals.info.
# In this folder, you should have languages.csv, parameters.csv, and
# values.csv, as downloaded.
# The path must end with a path separator: the file names are appended to
# it by plain string concatenation when the data is read.
wals_path = '~/embedding-spaces/embedding_datasets/wals/wals_dataset/'

# Location to save the formatted WALS values.
# Written as a csv file with a separate column for each WALS parameter
# (headed "<parameter ID>: <parameter name>"), as well as a column called
# "language" with the language codes.
wals_output_path = '~/embedding-spaces/multilingual_thesis/regression/data/wals_values_bible.csv'

In [None]:
# Load the three WALS tables found under wals_path, one DataFrame per file
languages, parameters, values = (
    pd.read_csv(wals_path + table_file)
    for table_file in ('languages.csv', 'parameters.csv', 'values.csv')
)

In [None]:
# IDs of every WALS parameter, in the order they appear in parameters.csv
parameter_ids = parameters['ID'].tolist()

In [None]:
# List of all languages in either Wikipedia or the Bible.
# The order of this list fixes the row order of the output table: each
# per-parameter value list is built by iterating over it, and the list itself
# becomes the "language" column.
# NOTE(review): these strings are compared verbatim against
# values.Language_ID. Most entries look like ISO 639-3 codes, while the final
# 'mnd' looks like a WALS-specific code -- confirm that the code scheme used
# here matches the one in values.csv, since a non-matching code silently
# yields empty values for that language.
all_languages = ['afr', 'aln', 'arb', 'arz', 'ayr', 'azb', 'azj', 'bba', 'ben', 'bqc', 'bul', 'cac', 'cak', 'ceb',\
                 'ces', 'che', 'cme', 'cmn', 'cnh', 'crh', 'cym', 'dan', 'deu', 'dyu', 'ell', 'eng', 'epo', 'fin',\
                 'fra', 'gub', 'guj', 'gur', 'hat', 'hmo', 'hrv', 'hui', 'hun', 'ifa', 'ifb', 'ify', 'ind', 'ita',\
                 'kac', 'kaz', 'kek', 'kjb', 'kor', 'lat', 'lit', 'lnd', 'lsi', 'mad', 'mah', 'mam', 'may', 'mdy',\
                 'mlg', 'mps', 'mri', 'mrw', 'mya', 'nhe', 'nld', 'nor', 'pis', 'plt', 'poh', 'por', 'prs', 'pxm',\
                 'qub', 'quh', 'quy', 'quz', 'qxr', 'ron', 'rug', 'rus', 'som', 'suz', 'swe', 'tat', 'tbz', 'tcw',\
                 'tgl', 'tlh', 'tpi', 'tpm', 'tur', 'tzo', 'ukr', 'vie', 'wal', 'wbm', 'xho', 'yua', 'zom', 'cat',\
                 'spa', 'est', 'fas', 'heb', 'hin', 'jpn', 'lav', 'pol', 'slk', 'slv', 'srp', 'tha','mnd']

In [None]:
# Get all WALS properties for all languages.
#
# Builds `all_data`:
#   key:   WALS parameter ID
#   value: list with one entry per language in `all_languages` (same order),
#          holding the Value recorded for that language and parameter, or ''
#          when values.csv has no row for the pair.
#
# The (Language_ID, Parameter_ID) -> Value table is built once up front, so
# every lookup below is a dictionary access instead of a boolean-mask scan of
# the whole `values` frame for each (parameter, language) pair.
# keep='first' preserves the original behaviour of taking the first matching
# row when a pair occurs more than once.
# NOTE(review): Language_ID is matched verbatim against the strings in
# `all_languages`; confirm both use the same language-code scheme.
first_values = values.drop_duplicates(subset=['Language_ID', 'Parameter_ID'], keep='first')
value_lookup = first_values.set_index(['Language_ID', 'Parameter_ID'])['Value'].to_dict()

all_data = {}
for parameter_id in tqdm_notebook(parameter_ids):
    all_data[parameter_id] = [
        value_lookup.get((language, parameter_id), '') for language in all_languages
    ]

In [None]:
# Save formatted results: one row per language, one column per WALS parameter
results = pd.DataFrame(all_data)

# Relabel every column as "<parameter ID>: <parameter name>", taking the
# first name listed for that ID in the parameters table
labelled_columns = []
for param_id in results.columns:
    matching_names = parameters.loc[parameters['ID'] == param_id, 'Name']
    labelled_columns.append(param_id + ': ' + str(matching_names.iloc[0]))
results.columns = labelled_columns

# Rows follow the order of all_languages, so the codes can be attached directly
results['language'] = all_languages

# The DataFrame index is written too (pandas default), as the first csv column
results.to_csv(wals_output_path)