Code to get and save the Ethnologue language phlyogen

In [5]:
import json
import re
import sys
import urllib
import time
import gzip

import requests
import lxml.html
from lxml import etree

def get_family(path):
    print("getting " + path)
    r = requests.get("https://www.ethnologue.com" + path)
    html = lxml.html.fromstring(r.text, "lxml")
    return path, parse_family(html)

def parse_family(html):
    try:
        ethn_tree = html.cssselect("div.view-family.view-id-family.ethn-tree")[0]
    except IndexError as e:
        print(html)
        raise e
    divs = ethn_tree.findall("div")
    # flat tree
    if len(divs) == 1:
        return parse_family1(divs[0])
    elif len(divs) == 2:
        return parse_family2(divs)
    else:
        print("Unknown number of divs in ", path)

def parse_subgroup_text(x):
    data = dict(zip(('name', ), re.match(r"(.*) \((\d+)\)$", x).groups()))
    data["is_language"] = False
    return data
                    
def parse_lang(el):
    data = {}
    path = el.xpath('.//a[contains(@href, "/language/")]')[0].get("href")
    data["name"] = el.xpath(".//span[contains(@class, 'field-content')]/text()[1]")[0].strip()
    data["is_language"] = True
    data["id"] = el.xpath(".//a[contains(@href, '/language/')]/text()")[0][1:-1]
    country = el.xpath(".//a[contains(@href, '/country/')]")[0].get("href")
    data['country'] = re.search('/country/([A-Z]{2})', country).group(1)
    data['children'] = {}
    return (path, data)

def parse_family1(el):
    """ Parse language family with no-subgroups """
    langs = el.cssselect("li.lang-indent")
    data = parse_subgroup_text(el.cssselect("div.views-field-name-1 > span.field-content")[0].text)
    data['children'] = dict(parse_lang(x) for x in langs)
    return data

def parse_family2(divs):
    """ Parse language family with subgroups """
    data = parse_subgroup_text(divs[0].
      cssselect("div.views-field-name-1 > span.field-content")[0].text)
    item_list = divs[1].xpath("div[@class='item-list']/ul")[0]
    data['children'] = parse_item_list(item_list)
    return data

def parse_item_list(el):
    return dict(parse_item(li) for li in el.findall("li"))

def parse_item(el):
    path = el.find("a").get("href")
    data = parse_subgroup_text(el.find("a").text.strip())
    data['id'] = urllib.parse.unquote(path.split('/')[2])
    print(path)
    langs = el.xpath("div[contains(@class, 'view-id-language')]//li[contains(@class, 'lang-indent')]")
    data['children'] = {}
    if len(langs):
        lang_data = dict(parse_lang(x) for x in langs)
        print(str(len(lang_data)) + " languages")
        data['children'].update(lang_data)
    item_list = el.xpath("div[@class='item-list']")
    if len(item_list):
        subgroup_data = dict(parse_item_list(item_list[0].find("ul")))
        data['children'].update(subgroup_data)
    return (path, data)

def get_all_families():
    r = requests.get("https://www.ethnologue.com/browse/families")
    html = lxml.html.fromstring(r.text)
    families = [a.get("href") for a in html.xpath("//a[contains(@href, '/subgroups/')]")]
    out = {}
    for family in families:
        k, v = get_family(family)
        out[k] = v
        time.sleep(10)
    return out


In [64]:
data = get_all_families()

getting /subgroups/afro-asiatic
/subgroups/berber
/subgroups/eastern-3
/subgroups/awjila-sokna
2 languages
/subgroups/siwa
1 languages
/subgroups/northern-6
1 languages
/subgroups/atlas
4 languages
/subgroups/kabyle
1 languages
/subgroups/zenati
/subgroups/east-7
3 languages
/subgroups/ghomara
1 languages
/subgroups/mzab-wargla
4 languages
/subgroups/riff
2 languages
/subgroups/shawiya
1 languages
/subgroups/tidikelt
1 languages
/subgroups/tamasheq
/subgroups/northern-12
1 languages
/subgroups/southern-16
3 languages
/subgroups/zenaga
1 languages
/subgroups/chadic
/subgroups/biu-mandara
/subgroups/a
/subgroups/a1-0
/subgroups/eastern-31
3 languages
/subgroups/western-30
2 languages
/subgroups/a2-1
1 languages
/subgroups/1-2
4 languages
/subgroups/2-2
3 languages
/subgroups/a3
5 languages
/subgroups/a4
/subgroups/lamang
3 languages
/subgroups/mandara-proper
/subgroups/glavda
5 languages
/subgroups/mandara
1 languages
/subgroups/podoko
1 languages
/subgroups/a5
18 languages
/subgroups/a6

getting /subgroups/miwok-costanoan
/subgroups/costanoan
2 languages
/subgroups/miwokan
/subgroups/eastern-miwokan
1 languages
/subgroups/sierra-miwok
3 languages
/subgroups/western-miwokan
2 languages
getting /subgroups/tacanan
/subgroups/chama
2 languages
/subgroups/tacana
3 languages
getting /subgroups/arauan
/subgroups/jamamadi
1 languages
getting /subgroups/haida
getting /subgroups/mixe-zoquean
/subgroups/mixean
2 languages
/subgroups/oaxaca-mixean
3 languages
/subgroups/lowland-mixe
3 languages
/subgroups/midland-mixe
1 languages
/subgroups/south-highland-mixe
1 languages
/subgroups/zoquean
1 languages
/subgroups/chiapas-zoquean
2 languages
/subgroups/northeast-zoque
1 languages
/subgroups/gulf-zoquean
3 languages
getting /subgroups/tai-kadai
/subgroups/hlai
2 languages
/subgroups/kam-tai
/subgroups/kam-sui
12 languages
/subgroups/lakkja
2 languages
/subgroups/tai
4 languages
/subgroups/central-9
10 languages
/subgroups/northern-18
13 languages
/subgroups/southwestern
31 languages

getting /subgroups/hmong-mien
/subgroups/hmongic
/subgroups/bunu
4 languages
/subgroups/chuanqiandian
22 languages
/subgroups/pa-hng
1 languages
/subgroups/qiandong
3 languages
/subgroups/xiangxi
2 languages
/subgroups/ho-nte
1 languages
/subgroups/mienic
/subgroups/biao-jiao
1 languages
/subgroups/mian-jin
3 languages
/subgroups/zaomin
1 languages
getting /subgroups/mongol-langam
getting /subgroups/tarascan
getting /subgroups/austronesian
/subgroups/atayalic
2 languages
/subgroups/bunun
1 languages
/subgroups/east-formosan
/subgroups/central-6
2 languages
/subgroups/northern-9
2 languages
/subgroups/southwest-0
1 languages
/subgroups/malayo-polynesian
1 languages
/subgroups/bali-sasak-sumbawa
1 languages
/subgroups/sasak-sumbawa
2 languages
/subgroups/bashiic
/subgroups/ivatan
2 languages
/subgroups/yami
1 languages
/subgroups/bilic
2 languages
/subgroups/blaan
2 languages
/subgroups/tboli
1 languages
/subgroups/celebic
/subgroups/eastern-13
/subgroups/saluan-banggai
/subgroups/easter

9 languages
/subgroups/peripheral-2
/subgroups/central-papuan
/subgroups/oumic
1 languages
/subgroups/magoric
3 languages
/subgroups/sinagoro-keapara
4 languages
/subgroups/west-central-papuan
/subgroups/gabadi
1 languages
/subgroups/nuclear-2
5 languages
/subgroups/kilivila-louisiades
/subgroups/kilivila
3 languages
/subgroups/misima
1 languages
/subgroups/nimoa-sudest
2 languages
/subgroups/yapese
1 languages
/subgroups/south-halmahera-west-new-guinea
/subgroups/south-halmahera
1 languages
/subgroups/east-makian-gane
2 languages
/subgroups/southeast-0
4 languages
/subgroups/west-new-guinea
/subgroups/bomberai
2 languages
/subgroups/cenderawasih-bay
/subgroups/biakic
3 languages
/subgroups/iresim
1 languages
/subgroups/mor-0
1 languages
/subgroups/raja-ampat
10 languages
/subgroups/tandia
1 languages
/subgroups/waropen
1 languages
/subgroups/yapen-0
/subgroups/central-western
11 languages
/subgroups/east-15
2 languages
/subgroups/yaur
1 languages
/subgroups/yeretuar
1 languages
/subgr

getting /subgroups/huavean
getting /subgroups/mongolic
/subgroups/eastern-0
/subgroups/dagur
1 languages
/subgroups/mongour
5 languages
/subgroups/oirat-khalkha
/subgroups/khalkha-buriat
/subgroups/buriat
3 languages
/subgroups/mongolian-proper
2 languages
/subgroups/oirat-kalmyk-darkhat
1 languages
/subgroups/western
1 languages
getting /subgroups/tequistlatecan
getting /subgroups/aymaran
/subgroups/aymara
2 languages
/subgroups/tupe
1 languages
getting /subgroups/indo-european
/subgroups/albanian
/subgroups/gheg
1 languages
/subgroups/tosk
3 languages
/subgroups/armenian
1 languages
/subgroups/balto-slavic
/subgroups/baltic
/subgroups/eastern-18
4 languages
/subgroups/western-17
1 languages
/subgroups/slavic
/subgroups/east-4
4 languages
/subgroups/south-3
/subgroups/eastern-27
3 languages
/subgroups/western-25
5 languages
/subgroups/west-4
/subgroups/czech-slovak
2 languages
/subgroups/lechitic
3 languages
/subgroups/sorbian
2 languages
/subgroups/celtic
/subgroups/insular
/subgroup

2 languages
/subgroups/sotho-tswana-s31
1 languages
/subgroups/sotho-tswana-s311
1 languages
/subgroups/sotho-tswana-s32
3 languages
/subgroups/sotho-tswana-s33
1 languages
/subgroups/tswa-rhonga-s51
1 languages
/subgroups/tswa-rhonga-s53
1 languages
/subgroups/tswa-rhonga-s54
1 languages
/subgroups/venda-s21
1 languages
/subgroups/northwest-0
/subgroups/a-4
/subgroups/bafia-a501
1 languages
/subgroups/bafia-a51
1 languages
/subgroups/bafia-a52
1 languages
/subgroups/bafia-a53
1 languages
/subgroups/bafia-a54
1 languages
/subgroups/basaa-a41
1 languages
/subgroups/basaa-a42
1 languages
/subgroups/basaa-a43
2 languages
/subgroups/basaa-a44
1 languages
/subgroups/basaa-a45
1 languages
/subgroups/basaa-a46
1 languages
/subgroups/basaa-a461
1 languages
/subgroups/basaa-a462
1 languages
/subgroups/bubi-benga-a31
2 languages
/subgroups/bubi-benga-a32
1 languages
/subgroups/bubi-benga-a33
2 languages
/subgroups/bubi-benga-a34
1 languages
/subgroups/duala-a21
1 languages
/subgroups/duala-a22
1

2 languages
/subgroups/bete
/subgroups/eastern-35
2 languages
/subgroups/western-35
3 languages
/subgroups/dida
4 languages
/subgroups/kwadia
1 languages
/subgroups/kuwaa
1 languages
/subgroups/seme
1 languages
/subgroups/western-20
/subgroups/bassa
3 languages
/subgroups/grebo
/subgroups/glio-oubi
1 languages
/subgroups/ivorian
3 languages
/subgroups/liberian
5 languages
/subgroups/klao
2 languages
/subgroups/wee
/subgroups/guere-krahn
6 languages
/subgroups/konobo
1 languages
/subgroups/nyabwa
1 languages
/subgroups/wobe
1 languages
/subgroups/kwa
/subgroups/left-bank
/subgroups/avatime-nyangbo
3 languages
/subgroups/gbe
12 languages
/subgroups/aja
6 languages
/subgroups/fon
2 languages
/subgroups/mina
1 languages
/subgroups/kebu-animere
2 languages
/subgroups/kposo-ahlo-bowili
4 languages
/subgroups/nyo
/subgroups/agneby
3 languages
/subgroups/attie
1 languages
/subgroups/avikam-alladian
2 languages
/subgroups/ga-dangme
2 languages
/subgroups/potou-tano
/subgroups/basila-adele
2 lan

getting /subgroups/botocudoan
getting /subgroups/jicaquean
getting /subgroups/nilo-saharan
/subgroups/kuliak
/subgroups/ik
1 languages
/subgroups/ngangea-so
2 languages
/subgroups/saharan
/subgroups/eastern-6
2 languages
/subgroups/western-4
/subgroups/kanuri
6 languages
/subgroups/tebu
2 languages
/subgroups/satellite-core
/subgroups/core
/subgroups/b%E2%80%99aga
2 languages
/subgroups/eastern-sudanic
/subgroups/northern-k-languages
/subgroups/nara
1 languages
/subgroups/nubian
/subgroups/central-29
1 languages
/subgroups/birked
1 languages
/subgroups/dongolawi
1 languages
/subgroups/hill
/subgroups/kadaru-ghulfan
2 languages
/subgroups/unclassified-34
5 languages
/subgroups/northern-36
1 languages
/subgroups/western-38
1 languages
/subgroups/nyimang
2 languages
/subgroups/tama-0
/subgroups/mararit
1 languages
/subgroups/tama-sungor
2 languages
/subgroups/southern-n-languages
/subgroups/daju
/subgroups/eastern-daju
2 languages
/subgroups/western-daju
5 languages
/subgroups/eastern-jeb

getting /subgroups/turkic
/subgroups/bolgar
1 languages
/subgroups/eastern
7 languages
/subgroups/northern-1
8 languages
/subgroups/southern
4 languages
/subgroups/azerbaijani
3 languages
/subgroups/turkish
4 languages
/subgroups/turkmenian
1 languages
/subgroups/western-1
/subgroups/aralo-caspian
4 languages
/subgroups/ponto-caspian
4 languages
/subgroups/uralian
4 languages
getting /subgroups/chapacuran
/subgroups/itene
2 languages
/subgroups/wari
2 languages
getting /subgroups/katukinan
getting /subgroups/paezan
/subgroups/coconuco
6 languages
/subgroups/paezan-0
1 languages
getting /subgroups/tuu
/subgroups/ui
5 languages
/subgroups/taa
1 languages
getting /subgroups/chibchan
/subgroups/chibchan-0
2 languages
/subgroups/guaymi%C3%ADc
2 languages
/subgroups/viceitic
2 languages
/subgroups/chibchan-b
1 languages
/subgroups/eastern-chibchan
/subgroups/colombian
/subgroups/northern-colombian
1 languages
/subgroups/arhuacan
1 languages
/subgroups/southern-and-eastern-arhuacan
1 language

getting /subgroups/maiduan
/subgroups/maidu
2 languages
getting /subgroups/s%C3%A1livan
/subgroups/piaroa-maco
2 languages
getting /subgroups/yuat
getting /subgroups/dravidian
/subgroups/central
/subgroups/kolami-naiki
2 languages
/subgroups/parji-gadaba
3 languages
/subgroups/northern-0
6 languages
/subgroups/south-central
/subgroups/gondi-kui
/subgroups/gondi
11 languages
/subgroups/konda-kui
/subgroups/konda
2 languages
/subgroups/manda-kui
/subgroups/kui-kuvi
3 languages
/subgroups/manda-pengo
2 languages
/subgroups/telugu
4 languages
/subgroups/southern-1
7 languages
/subgroups/tamil-kannada
/subgroups/kannada
4 languages
/subgroups/tamil-kodagu
/subgroups/kodagu
5 languages
/subgroups/tamil-malayalam
1 languages
/subgroups/malayalam
9 languages
/subgroups/tamil
9 languages
/subgroups/toda-kota
2 languages
/subgroups/unclassified-13
1 languages
/subgroups/tulu
3 languages
/subgroups/koraga
2 languages
/subgroups/unclassified-7
4 languages
/subgroups/unclassified-2
6 languages
gett

In [65]:

with gzip.open("ethnologue-tree.json.gz", "wt") as f:
    json.dump(data, f)

In [7]:
with gzip.open("ethnologue-tree.json.gz", "rt") as f:
    data = json.load(f)

In [13]:
newdata = []
def flatten_tree(node, data, parent = None, ancestors = [], family = None, depth = 0):
    out = {}
    depth += 1

    out['depth'] = depth
    out['name'] = data['name']
    out['is_language'] = data['is_language']
    try:
        out['country'] = data['country']
    except KeyError:
        pass
    try:
        out['id'] = data['id']
    except KeyError as e:
        out['id'] = node.split("/")[2]

    out['path'] = node
    out['parent'] = parent
    if parent is not None:
        out['ancestors'] = [parent] + ancestors 
    else:
        out['ancestors'] = []
    if family is None:
        out['family'] = out['path']
    else:
        out['family'] = family        
    out['children'] = [k for k in data['children'].keys()]
    out['descendants'] = []
    for k, v in data['children'].items():
        out['descendants'].append(k)
        desc = flatten_tree(k, v, 
                             parent = node,
                             ancestors = out['ancestors'],
                             depth = depth,
                             family = out['family'])
        if not (isinstance(desc, list)):
            print(type(desc))
            raise TypeError
        for d in desc:
            print(d)
            out['descendants'].append(d)
    # add this to the external dict
    newdata.append(out)
    # return descendants
    return out['descendants']
    
for k, v in data.items():
    flatten_tree(k, v)

/language/auj/20
/language/swn/20
/language/siz/20
/subgroups/awjila-sokna
/language/auj/20
/language/swn/20
/subgroups/siwa
/language/siz/20
/language/jbe/20
/language/shi/20
/language/tzm/20
/language/zgh/20
/language/kab/20
/language/gha/20
/language/jbn/20
/language/sds/20
/language/gho/20
/language/oua/20
/language/tjo/20
/language/grr/20
/language/mzb/20
/language/sjs/20
/language/rif/20
/language/shy/20
/language/tia/20
/subgroups/east-7
/language/gha/20
/language/jbn/20
/language/sds/20
/subgroups/ghomara
/language/gho/20
/subgroups/mzab-wargla
/language/oua/20
/language/tjo/20
/language/grr/20
/language/mzb/20
/subgroups/riff
/language/sjs/20
/language/rif/20
/subgroups/shawiya
/language/shy/20
/subgroups/tidikelt
/language/tia/20
/language/cnu/20
/subgroups/atlas
/language/jbe/20
/language/shi/20
/language/tzm/20
/language/zgh/20
/subgroups/kabyle
/language/kab/20
/subgroups/zenati
/subgroups/east-7
/language/gha/20
/language/jbn/20
/language/sds/20
/subgroups/ghomara
/langua

/language/myz/20
/subgroups/western-23
/language/sam/20
/language/amw/20
/language/aao/20
/language/arq/20
/language/abv/20
/language/shu/20
/language/acy/20
/language/adf/20
/language/avl/20
/language/arz/20
/language/afb/20
/language/ayh/20
/language/acw/20
/language/yhd/20
/language/aju/20
/language/yud/20
/language/ajt/20
/language/jye/20
/language/ayl/20
/language/acm/20
/language/ary/20
/language/ars/20
/language/apc/20
/language/ayp/20
/language/acx/20
/language/ayn/20
/language/aec/20
/language/ssh/20
/language/ajp/20
/language/arb/20
/language/apd/20
/language/abh/20
/language/acq/20
/language/aeb/20
/language/auz/20
/language/mey/20
/language/mlt/20
/language/heb/20
/language/hbo/20
/language/smp/20
/subgroups/arabic
/language/aao/20
/language/arq/20
/language/abv/20
/language/shu/20
/language/acy/20
/language/adf/20
/language/avl/20
/language/arz/20
/language/afb/20
/language/ayh/20
/language/acw/20
/language/yhd/20
/language/aju/20
/language/yud/20
/language/ajt/20
/languag

/language/mnq/20
/language/mzt/20
/subgroups/tonga
/language/tnz/20
/subgroups/western-22
/language/kns/20
/language/knq/20
/subgroups/senoic
/language/lnh/20
/language/sbo/20
/language/sea/20
/language/ssm/20
/language/tea/20
/subgroups/south-aslian
/language/mhe/20
/language/szc/20
/language/sza/20
/language/tmo/20
/language/alk/20
/language/bdq/20
/language/rmx/20
/language/tpu/20
/language/cua/20
/language/kxy/20
/language/tkz/20
/language/hld/20
/language/hal/20
/language/jeh/20
/language/ren/20
/language/hre/20
/language/sed/20
/language/moo/20
/language/tdr/20
/subgroups/sedang
/language/hre/20
/language/sed/20
/subgroups/todrah-monom
/language/moo/20
/language/tdr/20
/language/tdf/20
/language/stg/20
/subgroups/duan
/language/hld/20
/subgroups/jeh-halang
/language/hal/20
/language/jeh/20
/subgroups/rengao
/language/ren/20
/subgroups/sedang-todrah
/subgroups/sedang
/language/hre/20
/language/sed/20
/subgroups/todrah-monom
/language/moo/20
/language/tdr/20
/language/xkk/20
/langu

/language/sky/20
/language/nho/20
/language/tvl/20
/subgroups/futunic
/language/aud/20
/language/mmw/20
/language/uve/20
/language/fud/20
/language/fut/20
/language/mxe/20
/language/mnv/20
/language/tkp/20
/language/piv/20
/subgroups/pukapuka
/language/pkp/20
/subgroups/samoan
/language/smo/20
/subgroups/tokelauan
/language/tkl/20
/subgroups/tongic
/language/niu/20
/language/ton/20
/subgroups/west-fijian-rotuman
/subgroups/rotuman
/language/rtm/20
/subgroups/west-fijian
/language/wyy/20
/language/bwb/20
/language/dhv/20
/language/iai/20
/language/nen/20
/language/gil/20
/language/kos/20
/language/mah/20
/language/cal/20
/language/chk/20
/language/mpy/20
/language/mrl/20
/language/nmt/20
/language/pfa/20
/language/puw/20
/language/stw/20
/language/sov/20
/language/tpv/20
/language/tox/20
/language/uli/20
/language/woe/20
/language/mkj/20
/language/pif/20
/language/pon/20
/subgroups/chuukic
/language/cal/20
/language/chk/20
/language/mpy/20
/language/mrl/20
/language/nmt/20
/language/pfa

/language/tbo/20
/language/wag/20
/language/wed/20
/language/ykk/20
/subgroups/bwaidoga
/language/bwd/20
/language/ddi/20
/language/yml/20
/language/viv/20
/language/klx/20
/language/mzz/20
/language/mox/20
/subgroups/dobu-duau
/language/bwf/20
/language/bdd/20
/language/dob/20
/language/dva/20
/language/gar/20
/language/mwa/20
/language/sew/20
/subgroups/gumawana
/language/gvs/20
/subgroups/kakabai
/language/dww/20
/language/kqf/20
/subgroups/suauic
/language/bxh/20
/language/tte/20
/language/oyy/20
/language/sbe/20
/language/swp/20
/language/unu/20
/language/wgb/20
/language/ylb/20
/language/kud/20
/language/bmn/20
/language/zgr/20
/language/yob/20
/language/oum/20
/subgroups/magoric
/language/bmn/20
/language/zgr/20
/language/yob/20
/language/hul/20
/language/khz/20
/language/meu/20
/language/snc/20
/language/kbt/20
/language/kse/20
/language/nrz/20
/language/mek/20
/language/don/20
/language/rro/20
/subgroups/gabadi
/language/kbt/20
/subgroups/nuclear-2
/language/kse/20
/language/n

/language/amq/20
/language/nul/20
/subgroups/saparua
/language/ltu/20
/language/spr/20
/subgroups/kamarian
/language/kzx/20
/subgroups/west-12
/subgroups/asilulu
/language/asl/20
/language/hik/20
/subgroups/hoamoal
/language/bzn/20
/language/alo/20
/subgroups/three-rivers
/language/jal/20
/subgroups/amalumute
/subgroups/northwest-seram
/language/lcs/20
/language/lcq/20
/subgroups/hulung
/language/huk/20
/subgroups/loun
/language/lox/20
/subgroups/ulat-inai
/language/alp/20
/language/nae/20
/subgroups/wemale
/language/weo/20
/subgroups/sawai-nuaulu
/language/nni/20
/language/nxl/20
/language/sau/20
/subgroups/sula
/language/mqc/20
/language/szn/20
/subgroups/taliabo
/language/kzd/20
/language/tlv/20
/subgroups/eastern-malayo-polynesian
/subgroups/oceanic
/subgroups/admiralty-islands
/subgroups/eastern-40
/subgroups/manus
/subgroups/east-16
/language/anx/20
/language/elu/20
/language/twp/20
/language/sbc/20
/language/kxr/20
/language/ktm/20
/language/lek/20
/language/lle/20
/language/nss

/language/kve/20
/language/sbr/20
/subgroups/tidung
/language/srk/20
/language/ntd/20
/language/itd/20
/subgroups/unclassified-27
/language/gnq/20
/language/dpp/20
/subgroups/paitanic
/language/abf/20
/language/txa/20
/subgroups/upper-kinabatangan
/language/dmg/20
/language/ruu/20
/language/low/20
/subgroups/unclassified-19
/language/txx/20
/language/alj/20
/language/iry/20
/language/tdy/20
/language/atz/20
/language/ilo/20
/language/aqn/20
/language/agy/20
/language/inn/20
/language/itb/20
/language/iti/20
/language/itt/20
/language/tis/20
/language/ity/20
/language/bjx/20
/language/kyb/20
/language/kmk/20
/language/knb/20
/language/kkg/20
/language/kmd/20
/language/ksc/20
/language/kml/20
/subgroups/itneg
/language/itb/20
/language/iti/20
/language/itt/20
/language/tis/20
/language/ity/20
/subgroups/kalinga
/language/bjx/20
/language/kyb/20
/language/kmk/20
/language/knb/20
/language/kkg/20
/language/kmd/20
/language/ksc/20
/language/kml/20
/language/blw/20
/language/lbk/20
/language

/language/ltz/20
/subgroups/east-middle-german
/language/deu/20
/language/sxu/20
/language/sli/20
/language/wym/20
/subgroups/west-middle-german
/language/pdc/20
/language/pfl/20
/language/ksh/20
/subgroups/moselle-franconian
/language/ltz/20
/language/gct/20
/language/gsw/20
/language/swg/20
/language/wae/20
/language/bar/20
/language/cim/20
/language/geh/20
/language/mhn/20
/language/vmf/20
/subgroups/alemannic
/language/gct/20
/language/gsw/20
/language/swg/20
/language/wae/20
/subgroups/bavarian-austrian
/language/bar/20
/language/cim/20
/language/geh/20
/language/mhn/20
/language/hrx/20
/subgroups/middle-german
/subgroups/east-middle-german
/language/deu/20
/language/sxu/20
/language/sli/20
/language/wym/20
/subgroups/west-middle-german
/language/pdc/20
/language/pfl/20
/language/ksh/20
/subgroups/moselle-franconian
/language/ltz/20
/subgroups/upper-german
/language/vmf/20
/subgroups/alemannic
/language/gct/20
/language/gsw/20
/language/swg/20
/language/wae/20
/subgroups/bavarian-

/language/egl/20
/language/lij/20
/language/lmo/20
/language/pms/20
/language/rgn/20
/language/vec/20
/subgroups/gallo-rhaetian
/subgroups/o%C3%AFl
/subgroups/french
/language/fra/20
/language/frc/20
/language/nrf/20
/language/pcd/20
/language/wln/20
/language/zrp/20
/subgroups/southeastern-4
/language/frp/20
/subgroups/rhaetian
/language/fur/20
/language/lld/20
/language/roh/20
/subgroups/ibero-romance
/subgroups/east-iberian
/language/cat/20
/subgroups/oc
/language/oci/20
/language/sdt/20
/subgroups/west-iberian
/subgroups/asturo-leonese
/language/ast/20
/language/mwl/20
/subgroups/castilian
/language/ext/20
/language/lad/20
/language/spa/20
/language/spq/20
/subgroups/portuguese-galician
/language/fax/20
/language/glg/20
/language/drc/20
/language/por/20
/subgroups/pyrenean-mozarabic
/subgroups/pyrenean
/language/arg/20
/subgroups/southern-12
/subgroups/corsican
/language/cos/20
/subgroups/sardinian
/language/sro/20
/language/sdn/20
/language/src/20
/language/sdc/20
/language/kwi/20

/subgroups/kikongo-h16
/language/kwy/20
/language/kng/20
/language/ldi/20
/language/yom/20
/subgroups/kimbundu-h21
/language/kmb/20
/subgroups/kimbundu-h22
/language/smd/20
/subgroups/kimbundu-h23
/language/blv/20
/subgroups/kimbundu-h24
/language/nsx/20
/subgroups/mbala-hunganna-h41
/language/mdp/20
/subgroups/mbala-hunganna-h42
/language/hum/20
/subgroups/yaka-h31
/language/lnz/20
/language/noq/20
/language/ppp/20
/language/yaf/20
/subgroups/yaka-h32
/language/sub/20
/subgroups/yaka-h321
/language/shc/20
/subgroups/yaka-h34
/language/mxg/20
/subgroups/j
/subgroups/haya-jita-e21
/language/now/20
/subgroups/haya-jita-e22
/language/hay/20
/subgroups/haya-jita-e23
/language/zin/20
/subgroups/haya-jita-e24
/language/ked/20
/subgroups/haya-jita-e25
/language/jit/20
/subgroups/haya-jita-e251
/language/kya/20
/subgroups/haya-jita-e252
/language/reg/20
/subgroups/konzo-ndandi-d41
/language/koo/20
/subgroups/konzo-ndandi-d42
/language/nnb/20
/subgroups/logooli-kuria-e401
/language/ngq/20
/subg

/language/bxs/20
/subgroups/menchum
/language/bby/20
/subgroups/narrow-grassfields
/language/fum/20
/subgroups/mbam-nkam
/subgroups/bamileke
/language/fmp/20
/language/bbj/20
/language/bko/20
/language/xmg/20
/language/nnz/20
/language/nnh/20
/language/jgo/20
/language/nla/20
/language/nwe/20
/language/ybb/20
/subgroups/ngemba
/language/azo/20
/language/bfd/20
/language/baw/20
/language/bqt/20
/language/bfp/20
/language/koc/20
/language/mfd/20
/language/nge/20
/language/pny/20
/subgroups/nkambe
/language/add/20
/language/kdz/20
/language/lmp/20
/language/mtk/20
/language/nfu/20
/language/ncp/20
/language/yam/20
/subgroups/nun-0
/language/bbw/20
/language/bfj/20
/language/bbq/20
/language/bmo/20
/language/bce/20
/language/bax/20
/language/bgj/20
/language/byv/20
/language/mhk/20
/subgroups/momo
/language/mea/20
/language/mgo/20
/language/mnf/20
/language/nbv/20
/language/ngj/20
/language/nsh/20
/language/ngn/20
/language/njj/20
/subgroups/ring
/subgroups/center
/language/bbk/20
/languag

/language/alf/20
/language/bkv/20
/language/btt/20
/language/bky/20
/language/byp/20
/language/bzy/20
/language/uba/20
/language/ukp/20
/language/afe/20
/language/abn/20
/language/odu/20
/language/kes/20
/language/mgj/20
/language/obu/20
/language/ogb/20
/language/ogg/20
/language/ogu/20
/language/xoc/20
/subgroups/abua-odual
/language/abn/20
/language/odu/20
/subgroups/kugbo
/language/kes/20
/language/ebg/20
/language/efa/20
/language/anw/20
/language/efi/20
/language/ibb/20
/language/ukq/20
/language/eke/20
/language/etb/20
/language/enw/20
/language/uda/20
/language/ibn/20
/language/ibr/20
/language/itw/20
/language/itm/20
/language/nkz/20
/language/iki/20
/language/ilv/20
/language/okb/20
/language/orx/20
/language/usk/20
/language/eki/20
/language/ide/20
/language/ann/20
/subgroups/ebughu
/language/ebg/20
/subgroups/efai
/language/efa/20
/subgroups/efik
/language/anw/20
/language/efi/20
/language/ibb/20
/language/ukq/20
/subgroups/ekit
/language/eke/20
/language/etb/20
/subgroups/

/language/nqg/20
/language/ulb/20
/language/yor/20
/subgroups/igala
/language/igl/20
/subgroups/edoid
/subgroups/delta
/language/deg/20
/language/enn/20
/language/epi/20
/subgroups/north-central-0
/language/ihi/20
/subgroups/edo-esan-ora
/language/bin/20
/language/ema/20
/language/ish/20
/subgroups/ghotuo-uneme-yekhee
/language/env/20
/language/aaa/20
/language/igw/20
/language/ikp/20
/language/atg/20
/language/oso/20
/language/sxs/20
/language/une/20
/language/ets/20
/subgroups/northwestern-3
/language/adu/20
/subgroups/osse
/language/ehu/20
/language/iya/20
/language/uha/20
/language/uku/20
/subgroups/southern-35
/language/ayk/20
/language/ids/20
/language/opa/20
/language/okx/20
/language/olm/20
/subgroups/southwestern-2
/language/erh/20
/language/iso/20
/language/oke/20
/language/urh/20
/language/evh/20
/subgroups/idomoid
/subgroups/akweya
/subgroups/eloyi
/language/afo/20
/subgroups/etulo-idoma
/subgroups/etulo
/language/utr/20
/subgroups/idoma
/language/agc/20
/language/ala/20
/l

/language/kus/20
/language/maw/20
/subgroups/nootre
/language/bly/20
/subgroups/northwest-2
/language/gur/20
/language/mos/20
/language/saf/20
/language/wlx/20
/subgroups/dagaari-birifor
/subgroups/birifor
/language/bfo/20
/language/biv/20
/subgroups/dagaari
/language/dga/20
/language/dgd/20
/language/dgi/20
/subgroups/southeast-2
/language/dag/20
/language/hag/20
/language/jmr/20
/language/xkt/20
/language/kus/20
/language/maw/20
/language/nmz/20
/language/pil/20
/subgroups/buli-koma
/language/bwu/20
/language/kma/20
/subgroups/eastern-43
/language/beh/20
/language/tbz/20
/language/mql/20
/language/wwa/20
/subgroups/gurma
/language/gux/20
/language/xon/20
/language/soy/20
/language/ntm/20
/language/gng/20
/subgroups/moba
/language/bim/20
/language/mfq/20
/subgroups/ntcham
/language/aks/20
/language/bud/20
/subgroups/western-42
/subgroups/nootre
/language/bly/20
/subgroups/northwest-2
/language/gur/20
/language/mos/20
/language/saf/20
/language/wlx/20
/subgroups/dagaari-birifor
/subgro

/language/kef/20
/language/wud/20
/subgroups/aja
/language/ajg/20
/language/ayb/20
/language/gbh/20
/language/tfi/20
/language/wem/20
/language/guw/20
/subgroups/fon
/language/fon/20
/language/mxl/20
/subgroups/mina
/language/gej/20
/subgroups/kebu-animere
/language/keu/20
/language/anf/20
/subgroups/kposo-ahlo-bowili
/language/adq/20
/language/ahl/20
/language/kpo/20
/language/bov/20
/subgroups/nyo
/subgroups/agneby
/language/aba/20
/language/abi/20
/language/adj/20
/subgroups/attie
/language/ati/20
/subgroups/avikam-alladian
/language/ald/20
/language/avi/20
/subgroups/ga-dangme
/language/ada/20
/language/gaa/20
/subgroups/potou-tano
/subgroups/basila-adele
/language/ade/20
/language/blo/20
/subgroups/ega
/language/ega/20
/subgroups/lelemi
/subgroups/lelemi-akpafu
/language/lef/20
/language/akp/20
/subgroups/likpe-santrokofi
/language/lip/20
/language/snw/20
/subgroups/logba
/language/lgq/20
/subgroups/potou
/language/ebr/20
/language/gwa/20
/subgroups/tano
/subgroups/central-31
/sub

/language/ldk/20
/language/ldo/20
/language/gmd/20
/language/pbl/20
/language/mko/20
/language/gwg/20
/language/thy/20
/subgroups/longuda
/language/lnu/20
/subgroups/waja
/subgroups/awak
/language/awo/20
/language/kcq/20
/subgroups/cham-mona
/language/cfa/20
/language/ldp/20
/subgroups/dadiya
/language/dbd/20
/subgroups/tula
/language/bsj/20
/language/tul/20
/language/wja/20
/subgroups/yungur
/subgroups/libo
/language/ldl/20
/subgroups/mboi
/language/moi/20
/subgroups/yungur-roba
/language/yun/20
/language/lla/20
/language/vor/20
/subgroups/ubangi
/subgroups/banda
/subgroups/central-32
/subgroups/central-core
/subgroups/banda-bambari
/language/liy/20
/subgroups/banda-banda
/language/bpd/20
/subgroups/banda-mbres
/language/bqk/20
/subgroups/banda-ndele
/language/bfl/20
/subgroups/mid-southern
/language/bjo/20
/language/gox/20
/language/kuw/20
/language/mnh/20
/language/nue/20
/subgroups/togbo-vara
/language/tor/20
/subgroups/western-44
/language/yaj/20
/subgroups/south-central-1
/langua

/language/hmu/20
/language/klz/20
/language/kpu/20
/language/woi/20
/language/kyo/20
/language/kvd/20
/subgroups/pantar
/language/beu/20
/language/jka/20
/language/nec/20
/language/lev/20
/language/ret/20
/language/twe/20
/language/twg/20
/language/tpg/20
/language/swt/20
/language/adb/20
/language/bfn/20
/language/ddg/20
/language/mjb/20
/language/mkz/20
/language/oia/20
/language/kvw/20
/subgroups/alor-pantar
/subgroups/alor
/language/abz/20
/language/adn/20
/language/hmu/20
/language/klz/20
/language/kpu/20
/language/woi/20
/language/kyo/20
/language/kvd/20
/subgroups/pantar
/language/beu/20
/language/jka/20
/language/nec/20
/language/lev/20
/language/ret/20
/language/twe/20
/language/twg/20
/subgroups/tanglapui
/language/tpg/20
/language/swt/20
/subgroups/timor
/language/adb/20
/language/bfn/20
/language/ddg/20
/language/mjb/20
/language/mkz/20
/language/kgv/20
/language/bdw/20
/language/ihp/20
/subgroups/karas
/language/kgv/20
/subgroups/west-bomberai-proper
/language/bdw/20
/lang

/language/tpj/20
/language/pta/20
/subgroups/bolivian-guaran%C3%AD
/language/gui/20
/language/gnw/20
/language/guq/20
/language/xet/20
/subgroups/guaran%C3%AD-0
/language/nhd/20
/language/gun/20
/language/gug/20
/language/kgk/20
/language/tpj/20
/language/pta/20
/subgroups/bolivian-guaran%C3%AD
/language/gui/20
/language/gnw/20
/language/jor/20
/language/srq/20
/language/yuq/20
/language/gyr/20
/language/psm/20
/subgroups/sirion%C3%B3
/language/jor/20
/language/srq/20
/language/yuq/20
/language/adw/20
/language/jua/20
/language/kuq/20
/language/xmo/20
/language/paf/20
/language/pah/20
/language/tkf/20
/language/wir/20
/language/api/20
/language/kgm/20
/language/urz/20
/language/urp/20
/subgroups/parintintin
/language/adw/20
/language/jua/20
/language/kuq/20
/language/xmo/20
/language/paf/20
/language/pah/20
/language/tkf/20
/language/wir/20
/language/asn/20
/language/kyz/20
/language/asu/20
/language/pak/20
/language/mdz/20
/language/gub/20
/language/tqb/20
/language/avv/20
/language/t

/language/kwj/20
/language/kmo/20
/language/sim/20
/language/kmn/20
/language/xrw/20
/language/bye/20
/language/amp/20
/language/knr/20
/language/bjh/20
/language/bit/20
/language/bnw/20
/language/dju/20
/language/mbx/20
/language/siv/20
/language/wtk/20
/language/ppe/20
/language/seo/20
/language/bic/20
/language/ham/20
/language/gbe/20
/language/pin/20
/language/sny/20
/subgroups/alamblak
/language/amp/20
/language/knr/20
/subgroups/bahinemo
/language/bjh/20
/language/bit/20
/language/bnw/20
/language/dju/20
/language/mbx/20
/language/siv/20
/language/wtk/20
/subgroups/papi
/language/ppe/20
/language/seo/20
/subgroups/sanio
/language/bic/20
/language/ham/20
/language/gbe/20
/language/pin/20
/language/sny/20
/language/ayq/20
/language/ywa/20
/language/nux/20
/language/lgt/20
/language/psq/20
/language/yss/20
/language/cjn/20
/language/wog/20
/language/akq/20
/language/aww/20
/language/nnm/20
/language/yra/20
/language/yuk/20
/language/bvz/20
/language/dei/20
/language/ckr/20
/language

In [14]:
newdata[5]

{'ancestors': ['/subgroups/berber', '/subgroups/afro-asiatic'],
 'children': ['/subgroups/awjila-sokna', '/subgroups/siwa'],
 'depth': 3,
 'descendants': ['/subgroups/awjila-sokna',
  '/language/auj/20',
  '/language/swn/20',
  '/subgroups/siwa',
  '/language/siz/20'],
 'family': '/subgroups/afro-asiatic',
 'id': 'eastern-3',
 'is_language': False,
 'name': 'Eastern',
 'parent': '/subgroups/berber',
 'path': '/subgroups/eastern-3'}

In [15]:
import csv
fields = ('path', 'id', 'name', 'is_language', 'country', 'depth', 'family', 'parent', 'ancestors',
          'children', 'descendants')
with gzip.open("ethnologue.csv.gz", "wt") as f:
    writer = csv.DictWriter(f, fields)
    writer.writeheader()
    for row in newdata:
        r = row.copy()
        # print(row)
        for i in ('ancestors', 'children', 'descendants'):
            r[i] = ' '.join(row[i])
        r['is_language'] = int(row['is_language'])
        writer.writerow(r)