Code to get the Ethnologue hierarchy and turn it into distances between languages

In [226]:
import json
import re
import sys

import requests
import lxml.html
from lxml import etree

def get_family(path):
    """Download one Ethnologue family page and parse it.

    Parameters:
        path: site-relative path of the family page; it is appended to
            ``https://www.ethnologue.com``.

    Returns:
        A ``(path, parsed)`` tuple where ``parsed`` is whatever
        ``parse_family`` returns for the downloaded page.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    print("getting " + path)
    url = "https://www.ethnologue.com" + path
    r = requests.get(url)
    # Fail loudly on an error response instead of trying to parse an error page.
    r.raise_for_status()
    # The second positional argument of lxml.html.fromstring is ``base_url``
    # (not a parser name), so hand it the real page URL by keyword.
    html = lxml.html.fromstring(r.text, base_url=url)
    return path, parse_family(html)

def parse_family(html, path=None):
    """Parse the family-tree container of a family page.

    Parameters:
        html: parsed page object exposing ``cssselect``.
        path: optional page path, used only in the diagnostic message. It is
            optional so that callers passing just ``html`` keep working.

    Returns:
        The result of ``parse_family1`` when the container has exactly one
        direct ``div`` child, the result of ``parse_family2`` when it has
        exactly two, and ``None`` (after printing a diagnostic) otherwise.
    """
    ethn_tree = html.cssselect("div.view-family.view-id-family.ethn-tree")[0]
    divs = ethn_tree.findall("div")
    if len(divs) == 1:
        # flat tree
        return parse_family1(divs[0])
    if len(divs) == 2:
        return parse_family2(divs)
    # Unexpected layout: report it without relying on any name that is not
    # defined in this function, then signal "nothing parsed" explicitly.
    where = path if path is not None else "<unknown page>"
    print("Unknown number of divs in ", where, "(found %d)" % len(divs))
    return None

def parse_subgroup_text(x):
    """Split a heading of the form ``"<name> (<digits>)"`` into its parts.

    Surrounding whitespace is ignored, so text taken straight from an
    element's ``.text`` can be passed without stripping it first.

    Parameters:
        x: heading text, e.g. ``"Some Group (12)"``.

    Returns:
        ``{'name': <name>, 'number': <digits>}``; the number is kept as the
        string of digits found in the text.

    Raises:
        ValueError: if the text does not end in a parenthesised number.
    """
    text = (x or "").strip()
    m = re.match(r"(.*) \((\d+)\)$", text)
    if m is None:
        # Without this check the failure would surface as an AttributeError
        # on None, hiding which heading was malformed.
        raise ValueError("cannot parse subgroup text: %r" % (x,))
    name, number = m.groups()
    return {'name': name, 'number': number}

def parse_lang(el):
    """Extract one language entry from a language list element.

    Returns a ``(path, data)`` tuple: ``path`` is the ``href`` of the first
    link containing ``/language/``; ``data`` holds the language ``name``,
    its ``iso_code`` and a ``country`` dict with the country link's ``path``
    and ``name``.
    """
    language_href = el.xpath('.//a[contains(@href, "/language/")]')[0].get("href")
    raw_name = el.xpath(".//span[contains(@class, 'field-content')]/text()[1]")[0]
    wrapped_code = el.xpath(".//a[contains(@href, '/language/')]/text()")[0]
    country_href = el.xpath(".//a[contains(@href, '/country/')]")[0].get("href")
    country_name = el.xpath(".//a[contains(@href, '/country/')]/text()")[0]

    info = {
        "name": raw_name.strip(),
        # The link text carries one extra character on each side of the code
        # (presumably brackets); drop both.
        "iso_code": wrapped_code[1:-1],
        "country": {"path": country_href, "name": country_name},
    }
    return (language_href, info)

def parse_family1(el):
    """ Parse a language family that has no subgroups.

    Returns the parsed heading (``name``/``number``) extended with a
    ``languages`` dict mapping each language path to its data.
    """
    lang_items = el.cssselect("li.lang-indent")
    heading = el.cssselect("div.views-field-name-1 > span.field-content")[0].text
    family = parse_subgroup_text(heading)

    languages = {}
    for item in lang_items:
        lang_path, lang_data = parse_lang(item)
        languages[lang_path] = lang_data
    family['languages'] = languages
    return family

def parse_family2(divs):
    """ Parse a language family that has subgroups.

    ``divs[0]`` supplies the family heading and ``divs[1]`` the nested item
    list. Returns the parsed heading (``name``/``number``) extended with a
    ``subgroups`` dict built by ``parse_item_list``.
    """
    heading_spans = divs[0].cssselect("div.views-field-name-1 > span.field-content")
    family = parse_subgroup_text(heading_spans[0].text)
    top_level_lists = divs[1].xpath("div[@class='item-list']/ul")
    family['subgroups'] = parse_item_list(top_level_lists[0])
    return family

def parse_item_list(el):
    """Parse every direct ``li`` child of ``el`` into a path -> data dict."""
    items = {}
    for li in el.findall("li"):
        item_path, item_data = parse_item(li)
        items[item_path] = item_data
    return items

def parse_item(el):
    """Parse one subgroup list item, recursing into nested subgroups.

    Returns a ``(path, data)`` tuple. ``data`` contains the subgroup's
    ``name`` and ``number`` plus ``languages`` and ``subgroups``; each of
    those two is a dict when entries exist and an empty list otherwise.
    """
    anchor = el.find("a")
    path = anchor.get("href")
    data = parse_subgroup_text(anchor.text.strip())
    print(path)

    # Languages listed directly under this subgroup.
    lang_items = el.xpath("div[contains(@class, 'view-id-language')]//li[contains(@class, 'lang-indent')]")
    if lang_items:
        languages = {}
        for item in lang_items:
            lang_path, lang_data = parse_lang(item)
            languages[lang_path] = lang_data
        data['languages'] = languages
        print(str(len(languages)) + " languages")
    else:
        data['languages'] = []

    # Nested subgroups, if any, live in a direct child item-list div.
    nested_lists = el.xpath("div[@class='item-list']")
    if nested_lists:
        data['subgroups'] = dict(parse_item_list(nested_lists[0].find("ul")))
    else:
        data['subgroups'] = []
    return (path, data)

def get_all_families():
    """Download and parse every family linked from the browse-families page.

    Returns:
        A dict mapping each family path (every ``href`` containing
        ``/subgroups/``) to the parsed result returned by ``get_family``.

    Raises:
        requests.HTTPError: if the index page answers with an error status.
    """
    r = requests.get("https://www.ethnologue.com/browse/families")
    # Fail loudly on an error response instead of returning an empty result.
    r.raise_for_status()
    html = lxml.html.fromstring(r.text)

    # Keep the first occurrence of each link: the result is keyed by path, so
    # fetching the same family again would only repeat a download.
    seen = set()
    families = []
    for a in html.xpath("//a[contains(@href, '/subgroups/')]"):
        href = a.get("href")
        if href not in seen:
            seen.add(href)
            families.append(href)
    return dict(get_family(x) for x in families)


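The structure produced by the original parser is awkward to work with: each node keeps its `languages` and `subgroups` under separate keys. `better_tree` fixes some of this by converting every node into a uniform shape with a `path`, a `language` flag, and a single `children` list.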

In [227]:
def better_tree(key, tree):
    """Convert one parsed node into a uniform node with a ``children`` list.

    The returned dict copies every field of ``tree`` except ``languages``
    and ``subgroups``, then adds ``path`` (set to ``key``), ``language``
    (True when ``tree`` has an ``iso_code``) and ``children``: the converted
    languages followed by the converted subgroups.
    """
    node = {}
    for field, value in tree.items():
        if field != 'subgroups' and field != 'languages':
            node[field] = value
    node['path'] = key
    node['language'] = 'iso_code' in tree

    children = []
    for section in ('languages', 'subgroups'):
        # Empty sections are skipped before ``.items()`` is called on them.
        if section not in tree or not len(tree[section]):
            continue
        children.extend(
            better_tree(child_key, child)
            for child_key, child in tree[section].items()
        )
    node['children'] = children
    return node


In [228]:
import json
# Load the cached hierarchy from disk. The encoding is given explicitly so
# that decoding does not depend on the platform's default locale encoding.
with open('../data-raw/ethnologue-tree.json', 'r', encoding='utf-8') as f:
    ethnologue = json.load(f)
# Alternative: rebuild the hierarchy by scraping the website.
# ethnologue = get_all_families()

In [229]:
# One uniform tree per top-level family entry of the loaded hierarchy.
ethnologue_tree = [
    better_tree(family_path, family)
    for family_path, family in ethnologue.items()
]

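Generate the path for each Ethnologue language: the list of group names leading from the top-level family down to the group that directly contains the language.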

In [254]:
def build_paths(node, root = None):
    """List every language below ``node`` with its chain of group names.

    Returns a list of ``{'iso_code': ..., 'path': [...]}`` dicts. A language
    node yields a single entry with an empty path; a group node yields the
    entries of all its children with its own ``name`` put in front of each
    path. ``root`` is accepted but not used.
    """
    if node['language']:
        return [{'iso_code': node['iso_code'], 'path': []}]

    group_name = node['name']
    collected = []
    for child in node['children']:
        for entry in build_paths(child):
            collected.append({
                'iso_code': entry['iso_code'],
                'path': [group_name] + entry['path'],
            })
    return collected
        
# Flatten all families into one list; the first path element is the family.
lang_paths = [
    {'iso_code': entry['iso_code'], 'path': entry['path'], 'family': entry['path'][0]}
    for family_tree in ethnologue_tree
    for entry in build_paths(family_tree)
]

In [261]:
# Spot-check a single entry of lang_paths (index 5000 is presumably arbitrary).
lang_paths[5000]

{'family': 'Niger-Congo',
 'iso_code': 'gbs',
 'path': ['Niger-Congo',
  'Atlantic-Congo',
  'Volta-Congo',
  'Kwa',
  'Left Bank',
  'Gbe']}

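Generate the distances between every pair of distinct languages that belong to the same Ethnologue family. The distance is directed: for a pair (`from`, `to`) it is the number of steps up the tree from the `from` language to its closest ancestor shared with the `to` language, so it need not equal the distance in the opposite direction.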

In [273]:
def seqdist(x, y):
    """Count the elements of ``x`` that lie past its common prefix with ``y``.

    Returns ``len(x)`` minus the number of leading positions at which ``x``
    and ``y`` hold equal elements; 0 when all of ``x`` is a prefix of ``y``.
    """
    shared = 0
    for left, right in zip(x, y):
        if left != right:
            break
        shared += 1
    return len(x) - shared

# Only pairs within the same family are emitted, so bucket the languages by
# family first instead of scanning every language for each source language.
langs_by_family = {}
for lang in lang_paths:
    langs_by_family.setdefault(lang['family'], []).append(lang)

lang_dists = []
for members in langs_by_family.values():
    for x in members:
        for w in members:
            if x['iso_code'] != w['iso_code']:
                # seqdist counts the groups on x's path below the shared
                # prefix; the +1 presumably accounts for the step from the
                # language itself up to its immediate group.
                d = {'from': x['iso_code'], 'to': w['iso_code'],
                     'distance': seqdist(x['path'], w['path']) + 1}
                lang_dists.append(d)

# The sort key covers every field of a record, so the final order does not
# depend on the order in which the pairs were generated.
lang_dists = sorted(lang_dists, key = lambda rec: (rec['from'], rec['distance'], rec['to']))

In [274]:
import csv
# newline='' lets the csv module control line endings itself (otherwise
# extra carriage returns can appear on some platforms); the encoding is
# explicit so the output does not depend on the platform default.
with open("../data-raw/ethnologue-distances.csv", 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=('from', 'to', 'distance'))
    writer.writeheader()
    writer.writerows(lang_dists)

In [277]:
# NOTE(review): scratch arithmetic; what the figures stand for is not stated
# anywhere in the notebook — confirm the intent or remove this cell.
1000 ** 2 + 150 * 40

1006000