Code to get and save the Ethnologue language phylogeny

In [63]:
import json
import re
import sys
import urllib
import time

import requests
import lxml.html
from lxml import etree

def get_family(path):
    """Download one Ethnologue family page and parse it into a nested tree.

    Args:
        path: Site-relative path of the family page, e.g.
            ``/subgroups/afro-asiatic``.

    Returns:
        A ``(path, tree)`` tuple where ``tree`` is the value returned by
        ``parse_family`` for the downloaded page.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer in time.
    """
    print("getting " + path)
    r = requests.get("https://www.ethnologue.com" + path, timeout=60)
    # Fail on an error page here instead of with an opaque IndexError later
    # when the expected markup is missing.
    r.raise_for_status()
    # The second positional argument of lxml.html.fromstring is ``base_url``
    # (not a parser name), so pass the real URL of the page explicitly.
    html = lxml.html.fromstring(r.text, base_url=r.url)
    return path, parse_family(html)

def parse_family(html):
    """Parse the family tree container of an Ethnologue family page.

    Args:
        html: Parsed page element; it must support ``cssselect``.

    Returns:
        The nested dict built by ``parse_family1`` (one child ``div``: a flat
        family without subgroups) or ``parse_family2`` (two child ``div``
        elements: a family with subgroups).

    Raises:
        IndexError: If the page has no family tree container. The element is
            printed first to help with debugging.
        ValueError: If the container has an unexpected number of child divs.
    """
    try:
        ethn_tree = html.cssselect("div.view-family.view-id-family.ethn-tree")[0]
    except IndexError:
        print(html)
        raise
    divs = ethn_tree.findall("div")
    # flat tree
    if len(divs) == 1:
        return parse_family1(divs[0])
    if len(divs) == 2:
        return parse_family2(divs)
    # The previous fallback referenced an undefined name (``path``) and would
    # have returned None; report the unexpected layout explicitly instead.
    raise ValueError(
        "Unknown number of divs in family tree: " + str(len(divs))
    )

def parse_subgroup_text(x):
    """Build the base record for a subgroup from its heading text.

    The heading normally looks like ``"Berber (26)"``; the trailing count in
    parentheses is dropped and only the name is kept.

    Args:
        x: Heading text of a family or subgroup.

    Returns:
        A dict with ``name`` (the text before the trailing count) and
        ``is_language`` set to ``False``. If the text carries no trailing
        count, the whole stripped text is used as the name instead of
        failing on the missing regex match.
    """
    match = re.match(r"(.*) \((\d+)\)$", x)
    name = match.group(1) if match else x.strip()
    return {"name": name, "is_language": False}
                    
def parse_lang(el):
    """Turn one language list item into a ``(path, record)`` pair.

    Args:
        el: Element for a single language entry; it must support ``xpath``.

    Returns:
        A tuple of the language link's ``href`` and a dict with the keys
        ``name``, ``is_language`` (always ``True``), ``id`` (link text with
        its first and last character removed), ``country`` (two-letter code
        taken from the country link) and an empty ``children`` dict.
    """
    lang_href = el.xpath('.//a[contains(@href, "/language/")]')[0].get("href")
    display_name = el.xpath(".//span[contains(@class, 'field-content')]/text()[1]")[0].strip()
    wrapped_code = el.xpath(".//a[contains(@href, '/language/')]/text()")[0]
    country_href = el.xpath(".//a[contains(@href, '/country/')]")[0].get("href")
    record = {
        "name": display_name,
        "is_language": True,
        # Drop the first and last character surrounding the code.
        "id": wrapped_code[1:-1],
        "country": re.search('/country/([A-Z]{2})', country_href).group(1),
        "children": {},
    }
    return (lang_href, record)

def parse_family1(el):
    """Parse a flat language family, i.e. one without subgroups.

    Args:
        el: The single ``div`` holding the family heading and its languages.

    Returns:
        The family record from ``parse_subgroup_text`` with ``children``
        mapping each language path to its ``parse_lang`` record.
    """
    language_items = el.cssselect("li.lang-indent")
    heading = el.cssselect("div.views-field-name-1 > span.field-content")[0]
    family = parse_subgroup_text(heading.text)
    family['children'] = {
        href: record for href, record in map(parse_lang, language_items)
    }
    return family

def parse_family2(divs):
    """Parse a language family that is organised into subgroups.

    Args:
        divs: The two child ``div`` elements of the family tree container:
            the first holds the family heading, the second the nested
            item list.

    Returns:
        The family record from ``parse_subgroup_text`` with ``children``
        filled in by ``parse_item_list``.
    """
    heading = divs[0].cssselect("div.views-field-name-1 > span.field-content")[0]
    family = parse_subgroup_text(heading.text)
    top_level_list = divs[1].xpath("div[@class='item-list']/ul")[0]
    family['children'] = parse_item_list(top_level_list)
    return family

def parse_item_list(el):
    """Parse every direct ``li`` child of a list element.

    Args:
        el: A ``ul`` element whose ``li`` children are subgroup entries.

    Returns:
        A dict mapping each subgroup path to its ``parse_item`` record.
    """
    subgroups = {}
    for entry in el.findall("li"):
        subgroup_path, subgroup = parse_item(entry)
        subgroups[subgroup_path] = subgroup
    return subgroups

def parse_item(el):
    """Parse one subgroup list item, including everything nested below it.

    Args:
        el: An ``li`` element whose first ``a`` child links to the subgroup.

    Returns:
        A ``(path, data)`` tuple. ``path`` is the subgroup link's ``href``;
        ``data`` is the record from ``parse_subgroup_text`` extended with
        ``id`` (the URL-decoded third ``/``-separated piece of the path) and
        ``children``, which maps the paths of the directly listed languages
        and of the nested subgroups to their records.
    """
    # ``import urllib`` alone does not guarantee that the ``parse`` submodule
    # is loaded; import the function explicitly instead of relying on some
    # other library having imported it already.
    from urllib.parse import unquote

    link = el.find("a")
    path = link.get("href")
    data = parse_subgroup_text(link.text.strip())
    data['id'] = unquote(path.split('/')[2])
    print(path)
    children = {}
    langs = el.xpath("div[contains(@class, 'view-id-language')]//li[contains(@class, 'lang-indent')]")
    if langs:
        lang_data = dict(parse_lang(x) for x in langs)
        print(str(len(lang_data)) + " languages")
        children.update(lang_data)
    item_list = el.xpath("div[@class='item-list']")
    if item_list:
        # parse_item_list already returns a dict; recurse into the nested list.
        children.update(parse_item_list(item_list[0].find("ul")))
    data['children'] = children
    return (path, data)

def get_all_families(delay=10):
    """Download and parse every language family linked from the index page.

    Args:
        delay: Seconds to wait between two consecutive family requests, to
            avoid hammering the server. Defaults to 10.

    Returns:
        A dict mapping each family path (e.g. ``/subgroups/afro-asiatic``)
        to the tree returned by ``get_family`` for it.

    Raises:
        requests.HTTPError: If the index page answers with a 4xx/5xx status.
        requests.Timeout: If the index page does not answer in time.
    """
    r = requests.get("https://www.ethnologue.com/browse/families", timeout=60)
    r.raise_for_status()
    html = lxml.html.fromstring(r.text)
    hrefs = [a.get("href") for a in html.xpath("//a[contains(@href, '/subgroups/')]")]
    # The result is keyed by path, so fetching a repeated link twice would
    # only waste a request; keep the first occurrence of each path in order.
    families = list(dict.fromkeys(hrefs))
    out = {}
    for index, family in enumerate(families):
        if index:
            # Throttle between requests only; no need to wait after the last.
            time.sleep(delay)
        k, v = get_family(family)
        out[k] = v
    return out


In [64]:
# Scrape every family linked from the index page. get_all_families fetches
# one page per family and sleeps between requests, so this takes a while.
data = get_all_families()

getting /subgroups/afro-asiatic
/subgroups/berber
/subgroups/eastern-3
/subgroups/awjila-sokna
2 languages
/subgroups/siwa
1 languages
/subgroups/northern-6
1 languages
/subgroups/atlas
4 languages
/subgroups/kabyle
1 languages
/subgroups/zenati
/subgroups/east-7
3 languages
/subgroups/ghomara
1 languages
/subgroups/mzab-wargla
4 languages
/subgroups/riff
2 languages
/subgroups/shawiya
1 languages
/subgroups/tidikelt
1 languages
/subgroups/tamasheq
/subgroups/northern-12
1 languages
/subgroups/southern-16
3 languages
/subgroups/zenaga
1 languages
/subgroups/chadic
/subgroups/biu-mandara
/subgroups/a
/subgroups/a1-0
/subgroups/eastern-31
3 languages
/subgroups/western-30
2 languages
/subgroups/a2-1
1 languages
/subgroups/1-2
4 languages
/subgroups/2-2
3 languages
/subgroups/a3
5 languages
/subgroups/a4
/subgroups/lamang
3 languages
/subgroups/mandara-proper
/subgroups/glavda
5 languages
/subgroups/mandara
1 languages
/subgroups/podoko
1 languages
/subgroups/a5
18 languages
/subgroups/a6

getting /subgroups/miwok-costanoan
/subgroups/costanoan
2 languages
/subgroups/miwokan
/subgroups/eastern-miwokan
1 languages
/subgroups/sierra-miwok
3 languages
/subgroups/western-miwokan
2 languages
getting /subgroups/tacanan
/subgroups/chama
2 languages
/subgroups/tacana
3 languages
getting /subgroups/arauan
/subgroups/jamamadi
1 languages
getting /subgroups/haida
getting /subgroups/mixe-zoquean
/subgroups/mixean
2 languages
/subgroups/oaxaca-mixean
3 languages
/subgroups/lowland-mixe
3 languages
/subgroups/midland-mixe
1 languages
/subgroups/south-highland-mixe
1 languages
/subgroups/zoquean
1 languages
/subgroups/chiapas-zoquean
2 languages
/subgroups/northeast-zoque
1 languages
/subgroups/gulf-zoquean
3 languages
getting /subgroups/tai-kadai
/subgroups/hlai
2 languages
/subgroups/kam-tai
/subgroups/kam-sui
12 languages
/subgroups/lakkja
2 languages
/subgroups/tai
4 languages
/subgroups/central-9
10 languages
/subgroups/northern-18
13 languages
/subgroups/southwestern
31 languages

getting /subgroups/hmong-mien
/subgroups/hmongic
/subgroups/bunu
4 languages
/subgroups/chuanqiandian
22 languages
/subgroups/pa-hng
1 languages
/subgroups/qiandong
3 languages
/subgroups/xiangxi
2 languages
/subgroups/ho-nte
1 languages
/subgroups/mienic
/subgroups/biao-jiao
1 languages
/subgroups/mian-jin
3 languages
/subgroups/zaomin
1 languages
getting /subgroups/mongol-langam
getting /subgroups/tarascan
getting /subgroups/austronesian
/subgroups/atayalic
2 languages
/subgroups/bunun
1 languages
/subgroups/east-formosan
/subgroups/central-6
2 languages
/subgroups/northern-9
2 languages
/subgroups/southwest-0
1 languages
/subgroups/malayo-polynesian
1 languages
/subgroups/bali-sasak-sumbawa
1 languages
/subgroups/sasak-sumbawa
2 languages
/subgroups/bashiic
/subgroups/ivatan
2 languages
/subgroups/yami
1 languages
/subgroups/bilic
2 languages
/subgroups/blaan
2 languages
/subgroups/tboli
1 languages
/subgroups/celebic
/subgroups/eastern-13
/subgroups/saluan-banggai
/subgroups/easter

9 languages
/subgroups/peripheral-2
/subgroups/central-papuan
/subgroups/oumic
1 languages
/subgroups/magoric
3 languages
/subgroups/sinagoro-keapara
4 languages
/subgroups/west-central-papuan
/subgroups/gabadi
1 languages
/subgroups/nuclear-2
5 languages
/subgroups/kilivila-louisiades
/subgroups/kilivila
3 languages
/subgroups/misima
1 languages
/subgroups/nimoa-sudest
2 languages
/subgroups/yapese
1 languages
/subgroups/south-halmahera-west-new-guinea
/subgroups/south-halmahera
1 languages
/subgroups/east-makian-gane
2 languages
/subgroups/southeast-0
4 languages
/subgroups/west-new-guinea
/subgroups/bomberai
2 languages
/subgroups/cenderawasih-bay
/subgroups/biakic
3 languages
/subgroups/iresim
1 languages
/subgroups/mor-0
1 languages
/subgroups/raja-ampat
10 languages
/subgroups/tandia
1 languages
/subgroups/waropen
1 languages
/subgroups/yapen-0
/subgroups/central-western
11 languages
/subgroups/east-15
2 languages
/subgroups/yaur
1 languages
/subgroups/yeretuar
1 languages
/subgr

getting /subgroups/huavean
getting /subgroups/mongolic
/subgroups/eastern-0
/subgroups/dagur
1 languages
/subgroups/mongour
5 languages
/subgroups/oirat-khalkha
/subgroups/khalkha-buriat
/subgroups/buriat
3 languages
/subgroups/mongolian-proper
2 languages
/subgroups/oirat-kalmyk-darkhat
1 languages
/subgroups/western
1 languages
getting /subgroups/tequistlatecan
getting /subgroups/aymaran
/subgroups/aymara
2 languages
/subgroups/tupe
1 languages
getting /subgroups/indo-european
/subgroups/albanian
/subgroups/gheg
1 languages
/subgroups/tosk
3 languages
/subgroups/armenian
1 languages
/subgroups/balto-slavic
/subgroups/baltic
/subgroups/eastern-18
4 languages
/subgroups/western-17
1 languages
/subgroups/slavic
/subgroups/east-4
4 languages
/subgroups/south-3
/subgroups/eastern-27
3 languages
/subgroups/western-25
5 languages
/subgroups/west-4
/subgroups/czech-slovak
2 languages
/subgroups/lechitic
3 languages
/subgroups/sorbian
2 languages
/subgroups/celtic
/subgroups/insular
/subgroup

2 languages
/subgroups/sotho-tswana-s31
1 languages
/subgroups/sotho-tswana-s311
1 languages
/subgroups/sotho-tswana-s32
3 languages
/subgroups/sotho-tswana-s33
1 languages
/subgroups/tswa-rhonga-s51
1 languages
/subgroups/tswa-rhonga-s53
1 languages
/subgroups/tswa-rhonga-s54
1 languages
/subgroups/venda-s21
1 languages
/subgroups/northwest-0
/subgroups/a-4
/subgroups/bafia-a501
1 languages
/subgroups/bafia-a51
1 languages
/subgroups/bafia-a52
1 languages
/subgroups/bafia-a53
1 languages
/subgroups/bafia-a54
1 languages
/subgroups/basaa-a41
1 languages
/subgroups/basaa-a42
1 languages
/subgroups/basaa-a43
2 languages
/subgroups/basaa-a44
1 languages
/subgroups/basaa-a45
1 languages
/subgroups/basaa-a46
1 languages
/subgroups/basaa-a461
1 languages
/subgroups/basaa-a462
1 languages
/subgroups/bubi-benga-a31
2 languages
/subgroups/bubi-benga-a32
1 languages
/subgroups/bubi-benga-a33
2 languages
/subgroups/bubi-benga-a34
1 languages
/subgroups/duala-a21
1 languages
/subgroups/duala-a22
1

2 languages
/subgroups/bete
/subgroups/eastern-35
2 languages
/subgroups/western-35
3 languages
/subgroups/dida
4 languages
/subgroups/kwadia
1 languages
/subgroups/kuwaa
1 languages
/subgroups/seme
1 languages
/subgroups/western-20
/subgroups/bassa
3 languages
/subgroups/grebo
/subgroups/glio-oubi
1 languages
/subgroups/ivorian
3 languages
/subgroups/liberian
5 languages
/subgroups/klao
2 languages
/subgroups/wee
/subgroups/guere-krahn
6 languages
/subgroups/konobo
1 languages
/subgroups/nyabwa
1 languages
/subgroups/wobe
1 languages
/subgroups/kwa
/subgroups/left-bank
/subgroups/avatime-nyangbo
3 languages
/subgroups/gbe
12 languages
/subgroups/aja
6 languages
/subgroups/fon
2 languages
/subgroups/mina
1 languages
/subgroups/kebu-animere
2 languages
/subgroups/kposo-ahlo-bowili
4 languages
/subgroups/nyo
/subgroups/agneby
3 languages
/subgroups/attie
1 languages
/subgroups/avikam-alladian
2 languages
/subgroups/ga-dangme
2 languages
/subgroups/potou-tano
/subgroups/basila-adele
2 lan

getting /subgroups/botocudoan
getting /subgroups/jicaquean
getting /subgroups/nilo-saharan
/subgroups/kuliak
/subgroups/ik
1 languages
/subgroups/ngangea-so
2 languages
/subgroups/saharan
/subgroups/eastern-6
2 languages
/subgroups/western-4
/subgroups/kanuri
6 languages
/subgroups/tebu
2 languages
/subgroups/satellite-core
/subgroups/core
/subgroups/b%E2%80%99aga
2 languages
/subgroups/eastern-sudanic
/subgroups/northern-k-languages
/subgroups/nara
1 languages
/subgroups/nubian
/subgroups/central-29
1 languages
/subgroups/birked
1 languages
/subgroups/dongolawi
1 languages
/subgroups/hill
/subgroups/kadaru-ghulfan
2 languages
/subgroups/unclassified-34
5 languages
/subgroups/northern-36
1 languages
/subgroups/western-38
1 languages
/subgroups/nyimang
2 languages
/subgroups/tama-0
/subgroups/mararit
1 languages
/subgroups/tama-sungor
2 languages
/subgroups/southern-n-languages
/subgroups/daju
/subgroups/eastern-daju
2 languages
/subgroups/western-daju
5 languages
/subgroups/eastern-jeb

getting /subgroups/turkic
/subgroups/bolgar
1 languages
/subgroups/eastern
7 languages
/subgroups/northern-1
8 languages
/subgroups/southern
4 languages
/subgroups/azerbaijani
3 languages
/subgroups/turkish
4 languages
/subgroups/turkmenian
1 languages
/subgroups/western-1
/subgroups/aralo-caspian
4 languages
/subgroups/ponto-caspian
4 languages
/subgroups/uralian
4 languages
getting /subgroups/chapacuran
/subgroups/itene
2 languages
/subgroups/wari
2 languages
getting /subgroups/katukinan
getting /subgroups/paezan
/subgroups/coconuco
6 languages
/subgroups/paezan-0
1 languages
getting /subgroups/tuu
/subgroups/ui
5 languages
/subgroups/taa
1 languages
getting /subgroups/chibchan
/subgroups/chibchan-0
2 languages
/subgroups/guaymi%C3%ADc
2 languages
/subgroups/viceitic
2 languages
/subgroups/chibchan-b
1 languages
/subgroups/eastern-chibchan
/subgroups/colombian
/subgroups/northern-colombian
1 languages
/subgroups/arhuacan
1 languages
/subgroups/southern-and-eastern-arhuacan
1 language

getting /subgroups/maiduan
/subgroups/maidu
2 languages
getting /subgroups/s%C3%A1livan
/subgroups/piaroa-maco
2 languages
getting /subgroups/yuat
getting /subgroups/dravidian
/subgroups/central
/subgroups/kolami-naiki
2 languages
/subgroups/parji-gadaba
3 languages
/subgroups/northern-0
6 languages
/subgroups/south-central
/subgroups/gondi-kui
/subgroups/gondi
11 languages
/subgroups/konda-kui
/subgroups/konda
2 languages
/subgroups/manda-kui
/subgroups/kui-kuvi
3 languages
/subgroups/manda-pengo
2 languages
/subgroups/telugu
4 languages
/subgroups/southern-1
7 languages
/subgroups/tamil-kannada
/subgroups/kannada
4 languages
/subgroups/tamil-kodagu
/subgroups/kodagu
5 languages
/subgroups/tamil-malayalam
1 languages
/subgroups/malayalam
9 languages
/subgroups/tamil
9 languages
/subgroups/toda-kota
2 languages
/subgroups/unclassified-13
1 languages
/subgroups/tulu
3 languages
/subgroups/koraga
2 languages
/subgroups/unclassified-7
4 languages
/subgroups/unclassified-2
6 languages
gett

In [65]:
import gzip

# Persist the scraped tree as gzip-compressed JSON.
with gzip.open("ethnologue-tree.json.gz", "wt") as archive:
    archive.write(json.dumps(data))

In [66]:
# Read the compressed JSON tree back into ``data2``.
with gzip.open("ethnologue-tree.json.gz", "rt") as archive:
    data2 = json.loads(archive.read())

In [None]:
# Flat list of node records; flatten_tree appends one record per tree node.
newdata = []


def flatten_tree(node, data, parent=None, ancestors=None, depth=0):
    """Flatten one node of the nested tree (and its subtree) into ``newdata``.

    Each node's record is appended to the module-level ``newdata`` list after
    the records of all its descendants (post-order).

    Args:
        node: Path of this node, e.g. ``/subgroups/berber``.
        data: Nested record of this node; must provide ``name``,
            ``is_language`` and ``children`` and may provide ``country`` and
            ``id``.
        parent: Path of the parent node, or ``None`` for a root.
        ancestors: Ancestor paths of the parent, nearest first. Ignored when
            ``parent`` is ``None``.
        depth: Depth of the parent node; roots end up with depth 1.

    Returns:
        The list of paths of all descendants of ``node``, each child being
        followed directly by its own descendants.
    """
    depth += 1
    out = {
        'depth': depth,
        'name': data['name'],
        'is_language': data['is_language'],
    }
    if 'country' in data:
        out['country'] = data['country']
    # Nodes without an explicit id fall back to the third "/"-separated
    # piece of their path.
    out['id'] = data['id'] if 'id' in data else node.split("/")[2]
    out['path'] = node
    out['parent'] = parent
    if parent is not None:
        # Build a fresh list so no caller-owned (or default) list is shared.
        out['ancestors'] = [parent] + list(ancestors or [])
    else:
        out['ancestors'] = []
    out['children'] = list(data['children'])
    descendants = []
    for child_path, child_data in data['children'].items():
        descendants.append(child_path)
        descendants.extend(
            flatten_tree(
                child_path,
                child_data,
                parent=node,
                ancestors=out['ancestors'],
                depth=depth,
            )
        )
    out['descendants'] = descendants
    # add this to the external list
    newdata.append(out)
    # return descendants
    return descendants
    
# Flatten every family tree; the records accumulate in ``newdata``.
for family_path, family_tree in data.items():
    flatten_tree(family_path, family_tree)

In [111]:
newdata[5]

{'ancestors': ['/subgroups/berber', '/subgroups/afro-asiatic'],
 'children': ['/subgroups/awjila-sokna', '/subgroups/siwa'],
 'depth': 3,
 'descendants': ['/subgroups/awjila-sokna',
  '/language/auj/20',
  '/language/swn/20',
  '/subgroups/siwa',
  '/language/siz/20'],
 'id': 'eastern-3',
 'is_language': False,
 'name': 'Eastern',
 'parent': '/subgroups/berber',
 'path': '/subgroups/eastern-3'}

In [112]:
import csv
import gzip

fields = ('path', 'id', 'name', 'is_language', 'country', 'depth', 'parent', 'ancestors',
          'children', 'descendants')
# The csv module expects files opened with newline="" so that it controls the
# line endings itself; the encoding is explicit because names are not
# guaranteed to be ASCII.
with gzip.open("ethnologue.csv.gz", "wt", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fields)
    writer.writeheader()
    for row in newdata:
        r = row.copy()
        # List-valued columns are stored as space-separated paths.
        for i in ('ancestors', 'children', 'descendants'):
            r[i] = ' '.join(row[i])
        r['is_language'] = int(row['is_language'])
        # Records without a 'country' key get DictWriter's default empty cell.
        writer.writerow(r)