In [69]:
import yaml
def dump_yaml(data, filename):
    """Write ``data`` to ``filename`` as block-style, UTF-8 encoded YAML.

    Objects that occur more than once in ``data`` are written out in full
    each time instead of being emitted as YAML anchors/aliases.

    Parameters
    ----------
    data : object
        Any structure the PyYAML safe dumper can represent.
    filename : str
        Path of the file to create or overwrite.
    """
    # Subclass instead of assigning to yaml.SafeDumper.ignore_aliases: the
    # assignment would change alias handling for every other user of
    # SafeDumper in the process, not just for this call.
    class _NoAliasDumper(yaml.SafeDumper):
        def ignore_aliases(self, data):
            return True

    with open(filename, 'w', encoding='utf8') as f:
        yaml.dump(data, f, Dumper=_NoAliasDumper,
                  default_flow_style=False, allow_unicode=True)


In [107]:
def _mapping_key(row):
    """Return the ``(lang_id, sorted questions)`` tuple that identifies a row.

    The offending row is printed before the ``KeyError`` propagates when
    either field is missing, so the bad entry can be located in the file.
    """
    try:
        return (row['lang_id'], tuple(sorted(row['question'])))
    except KeyError:
        print(row)
        raise


def update_mappings(r):
    """Merge the ISO and WALS mapping files of round ``r`` into one file.

    Reads ``afrobarometer_to_iso/r<r>.yml`` and ``afrobarometer_to_wals/r<r>.yml``
    and writes the combined rows to ``afrobarometer-mappings/r<r>.yml``.
    Every ISO row is kept and gets a ``wals_code`` field (``None`` unless a
    WALS row with the same ``lang_id`` and the same set of questions exists).
    Rows are written sorted by ``(lang_id, questions)``, with their
    ``question``, ``iso_639_3`` and ``wals_code`` lists sorted and ``note``
    stripped of surrounding whitespace.

    Parameters
    ----------
    r : int
        Round number, substituted into the file names.

    Raises
    ------
    KeyError
        If a row lacks ``lang_id`` or ``question`` (the row is printed first),
        or if a merged row lacks ``iso_639_3``.
    """
    to_iso = "../data-raw/afrobarometer_to_iso/r%d.yml" % r
    to_wals = "../data-raw/afrobarometer_to_wals/r%d.yml" % r
    out = "../data-raw/afrobarometer-mappings/r%d.yml" % r

    # safe_load: yaml.load() without an explicit Loader is deprecated and is
    # an error in PyYAML >= 6. The files are read as UTF-8 to match what
    # dump_yaml writes. An empty file loads as None, hence the `or []`.
    # NOTE(review): assumes the inputs hold only plain YAML data (no
    # python-specific tags) - confirm.
    with open(to_wals, 'r', encoding='utf8') as f:
        data_wals = yaml.safe_load(f) or []

    with open(to_iso, 'r', encoding='utf8') as f:
        data_iso = yaml.safe_load(f) or []

    newdata = {}
    for row in data_iso:
        key = _mapping_key(row)
        row['wals_code'] = None
        newdata[key] = row

    for row in data_wals:
        key = _mapping_key(row)
        try:
            newdata[key]['wals_code'] = row['wals_code']
        except KeyError:
            # Deliberately best-effort: a WALS row without a matching ISO row
            # (or without a 'wals_code' field) is reported and skipped.
            print(row)

    rows = sorted(newdata.values(), key=_mapping_key)
    for row in rows:
        row['question'] = sorted(row['question'])
        row['iso_639_3'] = sorted(row['iso_639_3'])
        if row['wals_code'] is not None:
            row['wals_code'] = sorted(row['wals_code'])
        if row.get('note') is not None:
            row['note'] = row['note'].strip()

    dump_yaml(rows, out)

Add WALS mappings

In [7]:
import yaml
from yaml.constructor import ConstructorError

try:
    from yaml import CLoader as Loader
except ImportError:
    from yaml import Loader


def no_duplicates_constructor(loader, node, deep=False):
    """Construct a YAML mapping, refusing keys that occur more than once.

    Each key/value pair of ``node`` is constructed through ``loader``; a
    ``ConstructorError`` pointing at the repeated key is raised as soon as a
    key is seen for the second time. When no key repeats, the result of
    ``loader.construct_mapping(node, deep)`` is returned.
    """
    seen_keys = set()
    for pair in node.value:
        key_node, value_node = pair
        key = loader.construct_object(key_node, deep=deep)
        # The value is constructed before the duplicate check, so an error in
        # the value surfaces first.
        loader.construct_object(value_node, deep=deep)
        if key in seen_keys:
            raise ConstructorError("while constructing a mapping", node.start_mark,
                                   "found duplicate key (%s)" % key, key_node.start_mark)
        seen_keys.add(key)

    # The check passed: let the loader build the actual mapping.
    return loader.construct_mapping(node, deep)

# Route every plain YAML mapping through the duplicate-key check. Without a
# Loader argument, add_constructor registers the constructor on yaml.Loader.
yaml.add_constructor(yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, no_duplicates_constructor)


# Re-read the generated mapping files r1.yml .. r6.yml and report the ones
# that contain a duplicated key.
for r in range(1, 7):
    try:
        with open("../data-raw/afrobarometer-mappings/r%d.yml" % r, 'r', encoding='utf8') as f:
            # yaml.load() needs an explicit Loader (it is required in
            # PyYAML >= 6). yaml.Loader is named because that is where the
            # duplicate-key constructor above is registered.
            # NOTE(review): yaml.Loader can build arbitrary Python objects;
            # only use it on trusted, project-generated files like these.
            yaml.load(f, Loader=yaml.Loader)
    except ConstructorError as e:
        print(r)
        print(e)

In [154]:
import json
# Load the raw Ethnologue tree. JSON text is UTF-8, so the encoding is named
# explicitly instead of relying on the platform's default.
with open("../data-raw/ethnologue-tree.json", 'r', encoding='utf8') as f:
    ethnologue = json.load(f)

In [167]:
def better_tree(key, tree):
    """Convert one raw tree entry into a uniform node with a ``children`` list.

    The returned dict carries every field of ``tree`` except ``subgroups``
    and ``languages``, plus:

    * ``path``     - the ``key`` under which the entry was stored,
    * ``language`` - ``True`` when the entry has an ``iso_code`` field,
    * ``children`` - the converted entries of ``languages`` followed by those
      of ``subgroups``.
    """
    node = {}
    for field, value in tree.items():
        if field in ('subgroups', 'languages'):
            continue
        node[field] = value
    node.update(path=key, language='iso_code' in tree, children=[])

    kids = node['children']
    for section in ('languages', 'subgroups'):
        if section not in tree or not len(tree[section]):
            continue
        # Each section maps a child's key to its raw entry; recurse into it.
        kids.extend(better_tree(child_key, child)
                    for child_key, child in tree[section].items())
    return node


In [168]:
# Convert every top-level entry of the loaded Ethnologue JSON with better_tree,
# passing the entry's key as its path.
ethnologue_tree = [better_tree(k, v) for k, v in ethnologue.items()]

Generate paths for each ethnologue language

In [188]:
def build_paths(node, root = None):
    """Return one ``{'lang': iso_code, 'path': [...]}`` record per language under ``node``.

    ``path`` lists the names of the groups containing the language, ordered
    from ``node`` itself down to the language's immediate parent. A node that
    is itself a language yields a single record with an empty path.

    Parameters
    ----------
    node : dict
        A node with ``language`` and ``children`` fields, plus ``iso_code``
        (language nodes) or ``name`` (group nodes).
    root : object, optional
        Unused; kept so existing calls that pass it keep working.
    """
    if node['language']:
        return [{'lang': node['iso_code'], 'path': []}]

    name = node['name']
    out = []
    for chld in node['children']:
        for x in build_paths(chld):
            # Prepend rather than overwrite: overwriting dropped every
            # intermediate group and left only the top-level name, which made
            # all within-family distances identical.
            x['path'] = [name] + x['path']
            out.append(x)
    return out


lang_paths = {}
for family in [build_paths(x) for x in ethnologue_tree]:
    for x in family:
        # Paths are root-first, so the family is the first element.
        # A language sitting directly at the top level has an empty path and
        # raises IndexError here.
        lang_paths[x['lang']] = {'path': x['path'], 'family': x['path'][0]}

Generate the distances between every pair of languages that belong to the same Ethnologue family. The distance is directed: it is the number of steps from the first language up to the closest ancestor that it shares with the second language on the tree.

In [212]:
def seqdist(x, y):
    """Return how many elements of ``x`` lie beyond its common prefix with ``y``.

    The result is ``len(x)`` minus the length of the longest shared leading
    run, so it is 0 when ``x`` is a prefix of ``y`` (or equal to it).
    """
    shared = 0
    for left, right in zip(x, y):
        if left != right:
            break
        shared += 1
    return len(x) - shared

# Bucket the languages by family once, so each language is only paired with
# members of its own family instead of being compared against every language.
# Buckets keep the insertion order of lang_paths, so the records come out in
# the same order as a full pairwise scan would produce.
family_members = {}
for code, info in lang_paths.items():
    family_members.setdefault(info['family'], []).append((code, info['path']))

lang_dists = []
for j, x in lang_paths.items():
    for k, path_k in family_members[x['family']]:
        if j != k:
            # +1 accounts for the step from the language itself to its parent group.
            lang_dists.append({'from': j, 'to': k, 'distance': seqdist(x['path'], path_k) + 1})
            

bfm
khw
xok
yrw
snq
wci
tru
sod
hax
kej
mkk
mii
dym
dol
dnn
nji
ggk
okk
knl
dmr
gof
ndc
gez
gbe
omb
tri
uzs
tml
zgm
mqf
nyb
pek
oml
ggd
kgo
muz
kie
abb
klp
byw
haz
xmo
aoj
dop
lbm
hke
see
gqr
kqe
cuv
djb
cdf
kzb
ano
chl
lln
ynk
xpt
xwt
obo
yxm
npy
xyk
kbd
rxd
amg
mff
dob
nsg
gir
kxz
aem
toy
drc
eto
xkj
zkr
nyd
tio
kbt
ptu
gcc
ley
ksu
rmd
abv
nsu
ivv
itr
bfu
hac
jhi
sky
mnv
crx
bff
aah
cbr
tsn
raw
csw
vam
cou
cet
luv
gww
cwt
tlj
sxm
sin
atx
mwp
yoy
koa
xog
bhx
tae
tnp
nml
tga
gan
niu
ant
vmm
dhm
mqb
ttc
blq
yww
kpv
bmm
emx
auw
huh
bee
bre
fla
byk
eke
mtp
mvi
los
mhe
jnd
unr
mpv
mue
tog
quq
mmj
vav
leq
ctn
bub
kcp
cak
kbg
jak
ltn
nnc
yro
nnj
gbg
bpw
vrt
scu
kgd
xet
cuu
myh
tqm
nrx
szp
pnr
bwh
gma
gll
svk
tpk
aey
esu
nnt
sos
btw
ktq
ilv
sys
waj
ndk
caz
ebo
ona
bku
ins
tyx
bbo
tqo
ayd
lke
llb
mbd
qxp
tjo
mhi
pib
nsq
ott
hid
kdc
ssk
acv
xar
teg
luc
sth
khv
tni
con
ymp
isi
mzw
mkz
sno
tpr
qua
chq
nev
huy
kyx
abj
tse
mkm
twm
tlk
tmb
tgi
knx
jye
tlo
nbj
gdg
ozm
tcx
tad
gsn
dmk
hed
mvk
mhj
laf


wlw
rmy
pgz
sdn
ckl
swr
slf
lja
fuc
dii
urz
gaw
xsi
ruf
kbz
beu
dhw
zyp
hap
kka
hka
alx
wep
ijc
sun
sgc
etb
rtm
knw
soe
ypp
xgu
ukl
khk
ebr
meo
itv
kkv
ikv
cuh
sau
xrb
huq
uma
tzh
mzn
ign
duz
msq
bqb
suj
gvp
zos
snb
fgr
gbu
mma
auc
mbv
pmd
lew
ktk
kyz
kyk
ole
zne
gds
jad
sxs
kqa
bha
jng
svc
kbo
gox
liz
iko
bnj
ekm
lnn
mpb
kfp
sed
mqo
agc
pog
aja
usi
tdy
wma
twe
oia
mll
xug
taq
wir
jeb
acy
kpo
dhl
atu
rsm
spb
bfc
tgx
dbj
wai
ndv
met
hvk
etz
ots
cua
osa
sep
tlc
how
kvz
tiq
lil
hsb
tak
myr
yiu
kxk
jnl
akz
kqb
pgi
trn
kqq
bhb
ylm
gad
xwl
fum
wsv
mkr
pwg
sbn
bei
mne
kfw
aix
lib
vel
efe
dde
yil
gix
mhp
pux
lml
bkd
pab
neq
akt
kcx
nuz
sts
bny
bzz
bio
enb
daq
snp
guq
ttz
gye
mrl
kzg
haa
bxe
swv
ogo
tab
ndl
bwm
pcl
mlr
kpl
bvb
rmt
lcd
zmx
usa
jku
ikp
alo
she
cpg
sib
aec
bem
agm
wuu
bot
kyo
bwt
arj
gnh
zul
xtm
gbp
ngs
bvw
plr
dox
sri
tcd
nsp
ecs
bbg
neo
seu
mar
bsi
miy
smp
xru
ijj
zpy
gaf
gal
nem
mem
brg
vmu
bgl
aae
lpa
len
rie
gpn
ind
wbp
mdb
bkf
cce
dua
otr
xwe
kco
gnr
dnu
sms
emu
anr
kwu
ddo


hme
mko
gyd
nio
bzv
nlv
cmr
dzo
avu
tsb
nux
xmp
dne
tsg
csi
lul
mep
zim
fal
hrt
tyl
frt
aak
xam
kee
khj
msm
sia
iff
gec
yit
srz
gip
nkg
aez
wmx
psc
xaj
chv
nzb
pho
cpi
wro
zpe
haf
ewo
xsq
kbs
asr
mrf
nbw
pug
lji
gtu
mdm
kvr
byi
geb
hoa
kgf
syo
tar
lcq
kmx
bic
sag
dga
wrx
mpj
zml
biq
fad
mas
bvq
koh
kyi
hop
jog
ktd
ybb
ljx
xaw
xrw
wag
ukq
bfn
wti
wed
ldh
wux
rav
ino
kow
bmb
rxw
bck
dtb
ttt
lhm
ymr
mir
pas
yey
xke
ria
wmi
zts
cho
aik
gro
sfw
mmu
aic
zpt
clw
api
sea
smh
kld
trc
dbi
mkv
ycn
aag
mya
mzq
dbg
anu
urw
chx
anm
tmo
ras
iwo
nyh
yaq
mqx
jjr
mdz
krf
min
wae
yag
moy
rgs
ssv
wya
tem
dnd
plh
rab
izh
axe
abq
hul
kit
yhl
byc
pub
zps
thm
eja
otq
qux
jas
mji
wll
bve
vrs
dme
dui
dzl
cow
xtp
asb
nkz
shl
lby
kuk
psr
mip
otd
kxl
ify
ury
jya
sej
sbs
raq
nla
wsr
itt
gsl
tuc
sse
mse
bva
duh
ndp
kmb
zgb
chc
shb
fpe
naq
dok
par
zln
vmh
ukr
bwa
xmf
bov
tyu
qxq
dbp
zlq
mze
moq
chf
afs
lax
fer
nke
soo
bfb
myb
jvd
mgz
mei
tql
sqt
msp
nle
ess
njd
brw
gax
nyi
puy
bmi
kif
bit
tif
boz
dak
sjp
brs
afk
ahp


KeyboardInterrupt: 

In [211]:
import csv
# Write the pairwise distances as CSV. newline='' is what the csv module
# requires of files it writes to (otherwise rows get an extra blank line on
# platforms that translate line endings); the encoding is fixed so the output
# does not depend on the platform default.
with open("../data-raw/ethnologue_dists.csv", 'w', newline='', encoding='utf8') as f:
    writer = csv.DictWriter(f, fieldnames=('from', 'to', 'distance'))
    writer.writeheader()
    writer.writerows(lang_dists)