In [65]:
import newick
import yaml
import requests
import geopy

INPUTS = {'tree-glottolog': 'glottolog-tree.json', 'info': 'info.yml'}

with open(INPUTS['info'], 'r') as f:
    info = yaml.load(f)

In [66]:
import json

with open(INPUTS['tree-glottolog'], 'r') as f:
    tree_glottolog = json.load(f)

In [67]:
import zipfile
import io
import pandas as pd
import csv

r = requests.get(info['languoids_csv'])
z = zipfile.ZipFile(io.BytesIO(r.content), 'r')


In [4]:
with io.TextIOWrapper(z.open('languoid.csv', 'r')) as f:
    languoids = pd.read_csv(f)
languoids = languoids.loc[:, ('id', 'hid', 'macroarea', 'latitude', 'longitude', 'level', 'status', 'bookkeeping')]
languoids.rename(index=str, columns={"id": "glottocode", "hid": "iso_639_3"}, inplace=True)
languoids.set_index('glottocode', inplace=True)

If indexes are used, then I don't need a separate lookup data-structure

In [5]:
lang_geo = pd.read_csv(info['lang_geo'], index_col="glottocode")

In [6]:
resourcemap = requests.get(info['resourcemap']).json()

In [7]:
import re
def is_wals_lang_id(x):
    return x['type'] == "wals" and re.match("[a-z]{2,3}$", x['identifier'])
 
def is_iso_lang_id(x):
    return x['type'] == "iso639-3" and re.match("[a-z]{3}$", x['identifier'])
    
glotto2wals = {}
glotto2iso = {}
for x in resourcemap['resources']:
    for id_ in x['identifiers']:
        if is_iso_lang_id(id_):
            glotto2iso[x["id"]] = id_['identifier']
        elif is_wals_lang_id(id_):
            glotto2wals[x["id"]] = id_["identifier"]
            
# Latitude Lookup
latitude_lookup = languoids['latitude'].dropna().to_dict()
longitude_lookup = languoids['longitude'].dropna().to_dict()

Add all languoids to the data

In [37]:
newdata = dict((x['glottocode'], x) for x in languoids.reset_index().to_dict('records'))
for k, v in newdata.items():
    for j in ('longitude', 'latitude'):
        if pd.isnull(v[j]):
            v[j] = None
    for j in ('macroarea', 'iso_639_3'):
        if pd.isnull(v[j]):
            v[j] = set()
        else:
            v[j] = set((v[j],))
    try:
        v['wals_codes'] = {glotto2wals[k]}
    except KeyError:
        v['wals_codes'] = set()

In [45]:
import functools
from itertools import chain
import numpy as np

def walk_tree(x, depth=1, ancestors=[], family=None):
    glottocode = x["glottocode"]
    data = newdata[glottocode]
    if depth == 1:
        data['family'] = glottocode
    else:
        data['family'] = family
    f = functools.partial(walk_tree, depth=depth + 1,
                          family=data['family'], ancestors = ancestors + [glottocode])
    children_data = [f(child) for child in x["children"]]
    data['descendants'] = set()
    for c in children_data:
        data['descendants'].update(c['glottocodes'])
        data['wals_codes'].update(c['wals_codes'])
        data['iso_639_3'].update(c['iso_639_3'])
        data['macroarea'].update(c['macroarea'])
        
    # this isn't the right way to average over geographic points. Need to find the
    # python equivalent of geosphere
    # This is problematic and doesn't handle the 180 question
    if not data['latitude']:
        child_lat = [c['latitude'] for c in children_data 
                     if c['latitude'] and not pd.isnull(c['latitude'])]
        data['latitude'] = np.mean(child_lat) if len(child_lat) else None
    if not data['longitude']:
        child_long = [c['longitude'] for c in children_data 
                      if c['longitude'] and not pd.isnull(c['longitude'])]
        data['longitude'] = np.mean(child_long) if len(child_long) else None
    if len(children_data):
        subtree_depth = max(c['subtree_depth'] for  c in children_data) + 1
    else:
        subtree_depth = 0
    
    #geo_lookup[[glottocode]]
    data.update({
        'parent': ancestors[-1] if len(ancestors) else None,
        'children': [child['glottocode'] for child in x["children"]],
        'ancestors': ancestors,
        'depth': depth,
        'subtree_depth': subtree_depth,
        'family': family
    })

    return {
        'glottocodes': set([glottocode] + list(data['descendants'])),
        'iso_639_3': data['iso_639_3'],
        'wals_codes': data['wals_codes'],
        'macroarea': data['macroarea'],
        'latitude': data['latitude'],
        'longitude': data['longitude'],
        'subtree_depth': subtree_depth,
        'family': data['family']
    }

foo = [walk_tree(subtree) for subtree in tree_glottolog]

def topdown_fill(x):
    data = newdata[x['glottocode']]
    if data['parent']:
        parent = newdata[data['parent']]
        for i in ('wals_codes', 'iso_639_3', 'macroarea'):
            if not len(data[i]):
                data[i].update(parent[i])
        for i in ('longitude', 'latitude'):
            if not data[i]:
                data[i] = parent[i]
    for child in x['children']:
        topdown_fill(child)
    
foo = [topdown_fill(subtree) for subtree in tree_glottolog]


In [46]:
list(newdata.items())[:5]

[('aala1237',
  {'ancestors': ['aust1307',
    'nucl1752',
    'mala1545',
    'cent2237',
    'east2712',
    'ocea1241',
    'west2818',
    'meso1253',
    'newi1242',
    'stge1234',
    'kand1307',
    'ramo1244'],
   'bookkeeping': False,
   'children': [],
   'depth': 13,
   'descendants': set(),
   'family': 'aust1307',
   'glottocode': 'aala1237',
   'iso_639_3': {'rai'},
   'latitude': -4.1730599999999995,
   'level': 'dialect',
   'longitude': 152.451,
   'macroarea': set(),
   'parent': 'ramo1244',
   'status': 'safe',
   'subtree_depth': 0,
   'wals_codes': {'bnn',
    'ckh',
    'hal',
    'hoa',
    'kkt',
    'moa',
    'neh',
    'ptp',
    'rov',
    'sir',
    'sis',
    'sur',
    'taf',
    'teo',
    'tga',
    'tla'}}),
 ('aant1238',
  {'ancestors': ['nucl1709', 'kain1273', 'kain1274', 'tair1260', 'nort2920'],
   'bookkeeping': False,
   'children': [],
   'depth': 6,
   'descendants': set(),
   'family': 'nucl1709',
   'glottocode': 'aant1238',
   'iso_639_3': {

In [49]:
lang2

{'bookkeeping': True,
 'glottocode': 'aari1240',
 'iso_639_3': {'aay'},
 'latitude': None,
 'level': 'language',
 'longitude': None,
 'macroarea': set(),
 'status': 'safe',
 'wals_codes': set()}

In [69]:
import itertools
from geopy.distance import vincenty

language_dists = []
langsdata = [x for x in newdata.values() if x['level'] in ("language", "dialect") and "family" in x]
for lang1, lang2 in itertools.product(langsdata, langsdata):
    if (lang1["glottocode"] != lang2["glottocode"] and 
            lang1["family"] == lang2["family"]):
        shared = len(set(lang1['ancestors']) & set(lang2['ancestors']))
        geo = vincenty((lang1['latitude'], lang2['longitude']), (lang2['latitude'], lang2['longitude']))
        language_dists.append({'glottocode_from': lang1["glottocode"],
                               'glottocode_to': lang2["glottocode"],
                               'shared': shared,
                               'geo': geo
                              })
                               
        
        

In [62]:
len(language_dists)

4715884

In [None]:
len(newdata)