In [1]:
# Download location of the zipped, tab-separated ASJP dataset.
URL = "http://asjp.clld.org/static/download/asjp-dataset.tab.zip"

In [2]:
import zipfile
import requests

In [3]:
# Fetch the whole archive into memory. `stream=True` was dropped because the
# body is read in full through `r.content` anyway, so streaming bought nothing.
# The timeout keeps a stalled connection from hanging the notebook forever.
r = requests.get(URL, timeout=60)
# Fail right here with a clear HTTP error instead of a confusing BadZipFile
# when an error page is later handed to zipfile.
r.raise_for_status()

In [4]:
import io
# Wrap the downloaded bytes in an in-memory file object so the zip archive can
# be opened without first writing it to disk.
asjp_dataset = zipfile.ZipFile(io.BytesIO(r.content))

In [5]:
import sqlite3

In [6]:
DBNAME = "ASJP.db"
import os
# Delete any database file left over from a previous run so that the schema is
# always created from scratch; a missing file is not an error.
try:
    os.remove(DBNAME)
except FileNotFoundError:
    pass

# Opening the connection creates a fresh, empty database file.
conn = sqlite3.connect(DBNAME)

# Single cursor used for the schema creation below.
c = conn.cursor()
# NOTE(review): SQLite only enforces FOREIGN KEY clauses when
# `PRAGMA foreign_keys = ON` has been issued on the connection; no such pragma
# is visible here, so the references below are presumably declarative only.

# meanings: one row per meaning, with a numeric id and an `in_forty` flag.
c.execute("""
CREATE TABLE meanings (
    meaning TEXT PRIMARY KEY,
    num INTEGER UNIQUE,
    in_forty BOOLEAN NOT NULL    
)""")
# languages: one row per language with its classification and metadata columns.
c.execute("""
CREATE TABLE languages (
    language TEXT PRIMARY KEY,
    wls_fam TEXT NOT NULL,
    wls_gen TEXT NOT NULL,
    e TEXT,
    hh TEXT,
    lat NUMERIC CHECK (lat BETWEEN -90 AND 90),
    lon NUMERIC CHECK (lon BETWEEN -180 AND 180),
    pop INTEGER CHECK (pop >= 0),
    wcode CHAR(3),
    iso CHAR(3)
)""")
# wordlists: one row per (language, meaning, word); synonyms are numbered from 1.
c.execute("""
CREATE TABLE wordlists (
    language TEXT NOT NULL,
    meaning TEXT NOT NULL,
    word TEXT NOT NULL,
    synonym_num INTEGER CHECK (synonym_num >= 1),
    loanword BOOLEAN NOT NULL,    
    PRIMARY KEY (language, meaning, word),
    FOREIGN KEY (language) REFERENCES languages (language),
    FOREIGN KEY (meaning) REFERENCES meanings (meaning)    
)""")
# distances: one row per language pair holding ldn, ldnd and the number of
# words the two languages have in common.
c.execute("""
CREATE TABLE distances (
    language_1 TEXT NOT NULL,
    language_2 TEXT NOT NULL,
    ldn NUMERIC NOT NULL CHECK (ldn BETWEEN 0 AND 1),
    ldnd NUMERIC NOT NULL CHECK (ldnd >= 0),
    common_words INTEGER NOT NULL CHECK (common_words >= 1),
    PRIMARY KEY (language_1, language_2),
    FOREIGN KEY (language_2) REFERENCES languages (language),
    FOREIGN KEY (language_1) REFERENCES languages (language)
)""")
conn.commit()

import json

# Load the meaning metadata (a mapping of meaning -> {'id': ..., 'in_forty': ...})
# and bulk-insert one row per meaning.
with open('ASJP_meanings.json', 'r') as meanings_file:
    meanings = json.load(meanings_file)
c.executemany(
    "INSERT INTO meanings VALUES (?, ?, ?)",
    ((name, info['id'], info['in_forty']) for name, info in meanings.items()),
)
conn.commit()

import pandas as pd

# Read the tab-separated table straight out of the downloaded zip archive.
with asjp_dataset.open('dataset.tab', 'r') as f:
    data = pd.read_csv(f, delimiter='\t', encoding='CP1252')
# The dataset calls the language column 'names'; the database calls it 'language'.
# Re-bind instead of mutating in place so the statement reads as a plain transform.
data = data.rename(index=str, columns={'names': 'language'})
lang_variables = ('language', 'wls_fam', 'wls_gen', 'e', 'hh', 'lat', 'lon', 'pop', 'wcode', 'iso')
# use list comprehension instead of set diff in order to keep the order
# (the loop variable is deliberately not called `c`, which is the DB cursor)
word_variables = [col for col in data.columns if col not in lang_variables]

# Per-language metadata goes into the `languages` table. A list is passed to
# `.loc` because pandas label indexing may read a tuple as one multi-level key.
data.loc[:, list(lang_variables)].to_sql('languages', conn, index=False, if_exists='append')

# Reshape the wide table (one column per meaning) into long form:
# one row per (language, meaning) holding the raw word cell.
wordlists = data.loc[:, ['language'] + word_variables].melt(id_vars='language', var_name='meaning', value_name='word')
# Drop (language, meaning) combinations with no recorded word.
wordlists = wordlists.dropna(axis=0)

In [7]:
import re


def _parse_word_cell(cell):
    """Split one comma-separated word cell into distinct words.

    A leading '%' on a token marks the word as a loanword; the marker is
    stripped before the word is stored. Empty tokens are skipped.

    Words are de-duplicated *after* the marker is stripped, so a cell holding
    both '%foo' and 'foo' yields a single row (flagged as a loanword) rather
    than two rows that collide on the (language, meaning, word) primary key.
    First-occurrence order is kept, so synonym numbers are reproducible
    between runs and contiguous (1, 2, 3, ...).

    Returns a list of (word, synonym_num, loanword) tuples.
    """
    loan_by_word = {}
    for token in re.split(r',\s*', cell):
        is_loan = token.startswith('%')
        word = token[1:] if is_loan else token
        if word == '':
            continue
        # A word counts as a loanword if any of its occurrences is marked.
        loan_by_word[word] = loan_by_word.get(word, False) or is_loan
    return [(word, num, is_loan)
            for num, (word, is_loan) in enumerate(loan_by_word.items(), start=1)]


for row in wordlists.itertuples(False):
    if pd.notnull(row.word) and len(row.word) > 0:
        c.executemany("INSERT INTO wordlists VALUES (?, ?, ?, ?, ?)",
                      [(row.language, row.meaning, word, num, is_loan)
                       for word, num, is_loan in _parse_word_cell(row.word)])
conn.commit()

Create the lookup structures needed to compute distances: the set of language pairs to compare and a dictionary of word lists per language.

In [8]:
# Every pair of distinct languages that share a `wls_fam` value; the `<`
# condition keeps each pair once and drops self-pairs.
c.execute("""
SELECT a.language as language_1,
b.language as language_2
FROM languages as a
INNER JOIN languages as b
ON a.wls_fam = b.wls_fam
WHERE a.language < b.language
""")
langpairs = set(c.fetchall())

In [9]:
import collections

# Words used for the distance computation: only meanings flagged `in_forty`,
# and no loanwords.
res = c.execute("""
SELECT language, wordlists.meaning, word
FROM wordlists
INNER JOIN meanings
ON meanings.meaning = wordlists.meaning
WHERE in_forty AND NOT loanword
""")

# Group as language -> meaning -> [words]; the defaultdicts are only a
# convenience while the structure is being filled.
wordlist_dict = collections.defaultdict(lambda: collections.defaultdict(list))
for language, meaning, word in res:
    wordlist_dict[language][meaning].append(word)

# Freeze BOTH levels into plain dicts. Converting only the outer level would
# leave inner defaultdicts that silently insert an empty word list on any
# lookup of a missing meaning, which would then count as a shared meaning.
wordlist_dict = {lang: dict(words_by_meaning)
                 for lang, words_by_meaning in wordlist_dict.items()}

Iterate over all pairs of languages within the same family; the `a.language < b.language` comparison in the pair query removes duplicate and self-comparisons.

- Calculate LD, LDN, and LDND for all pairs
- Use futures for multiprocessing (TODO: the loop below still runs serially)
- Save the results to the database

In [10]:
import Levenshtein

In [11]:
def lexidists(words1, words2, min_words = 2):
    """Compute normalised Levenshtein distances between two word lists.

    Parameters
    ----------
    words1, words2 : mapping
        Meaning -> list of word strings for each of the two languages.
    min_words : int
        The lists are compared only when they share strictly MORE than this
        many meanings.

    Returns
    -------
    dict or None
        ``{'ldn': ..., 'ldnd': ..., 'M': ...}`` where ``M`` is the number of
        shared meanings, ``ldn`` is the mean normalised distance between words
        of the same meaning, and ``ldnd`` is ``ldn`` divided by the mean
        normalised distance between words of different meanings.
        ``None`` is returned when too few meanings are shared, or when every
        different-meaning comparison has distance 0, in which case ``ldnd`` is
        undefined (this previously raised ``ZeroDivisionError``).
    """
    common_words = set(words1.keys()) & set(words2.keys())
    M = len(common_words)
    if M <= min_words:
        return None

    ldn_sum = 0.      # sum of same-meaning distances
    ldnd_denom = 0.   # sum of different-meaning distances
    for meaning1, meaning2 in itertools.product(common_words, common_words):
        # Only pairs with meaning1 <= meaning2 are used, i.e. M same-meaning
        # pairs plus M * (M - 1) / 2 different-meaning pairs.
        if meaning1 > meaning2:
            continue
        # Average over every combination of synonyms for the two meanings.
        wcomp = list(itertools.product(words1[meaning1], words2[meaning2]))
        d = 0
        for w1, w2 in wcomp:
            d += Levenshtein.distance(w1, w2) / max(len(w1), len(w2))
        d /= len(wcomp)
        if meaning1 == meaning2:
            ldn_sum += d
        else:
            ldnd_denom += d

    if ldnd_denom == 0:
        # No contrast between different meanings: ldnd cannot be computed.
        return None
    return {'ldn': ldn_sum / M,
            # The  (M * (M - 1) / 2) / M = (M - 1) / 2
            'ldnd': 0.5 * (M - 1) * ldn_sum / ldnd_denom,
            'M': M}

def compare_langs(lang1, lang2):
    """Compare two languages and return one row for the distances table.

    Each argument is a ``(name, wordlist)`` pair, where ``wordlist`` is the
    meaning -> words mapping handed to ``lexidists``.

    Returns ``(name1, name2, ldn, ldnd, M)``, or ``None`` when ``lexidists``
    produced no result for the pair (e.g. too little overlap).
    """
    name1, wordlist1 = lang1[0], lang1[1]
    name2, wordlist2 = lang2[0], lang2[1]
    dists = lexidists(wordlist1, wordlist2)
    if not dists:
        return None
    return (name1, name2, dists['ldn'], dists['ldnd'], dists['M'])


In [12]:
import itertools  # keep: lexidists() looks up `itertools` when it is called
n = None

# Pairs to compare: the same-family pairs in `langpairs` for which both
# languages have a word list. Walking the pair set directly selects exactly the
# pairs the full languages x languages product would, without scanning it;
# sorting makes the processing order reproducible.
wordlist_iter = (((lang_a, wordlist_dict[lang_a]), (lang_b, wordlist_dict[lang_b]))
                 for lang_a, lang_b in sorted(langpairs)
                 if lang_a in wordlist_dict and lang_b in wordlist_dict)

# Start from an empty table so re-running the cell does not hit the primary key.
c.execute("DELETE FROM distances")
update_intvl = 10000
results = (compare_langs(*x) for x in wordlist_iter)
batch = []
for i, res in enumerate(results):
    # compare_langs returns None for pairs without a usable result
    if res:
        batch.append(res)
    # Write and commit in chunks to bound memory use and show progress.
    if ((i + 1) % update_intvl == 0):
        c.executemany("INSERT INTO distances VALUES (?, ?, ?, ?, ?)", batch)
        batch = []
        print("Processed %d" % (i + 1))
        conn.commit()
# Flush the final partial batch; without this, every result computed after the
# last full multiple of `update_intvl` would be silently dropped.
if batch:
    c.executemany("INSERT INTO distances VALUES (?, ?, ?, ?, ?)", batch)
conn.commit()

defaultdict(<class 'list'>, {'I': ['mo'], 'you': ['mo'], 'we': ['b3n'], 'one': ['win'], 'two': ['yo'], 'fish': ['jiNi'], 'dog': ['s3gan'], 'tree': ['tiya'], 'leaf': ['laN3n'], 'ear': ['lol'], 'eye': ['nu'], 'nose': ['jowol'], 'tooth': ['nuNun'], 'tongue': ['bEn'], 'drink': ['no'], 'sun': ['koko'], 'star': ['ninyami'], 'water': ['mo*E*'], 'stone': ['tEl'], 'fire': ['k3la'], 'mountain': ['y3m'], 'night': ['kum']}) defaultdict(<class 'list'>, {'person': ['ot'], 'louse': ['5ino'], 'leaf': ['ka'], 'bone': ['woho'], 'horn': ['la'], 'tongue': ['jEm'], 'knee': ['boN'], 'breast': ['bEl'], 'liver': ['sEk'], 'path': ['sehe']})
Processed 10000
Processed 20000
Processed 30000
Processed 40000
Processed 50000
Processed 60000
Processed 70000
defaultdict(<class 'list'>, {'I': ['i5C~e'], 'you': ['ayE'], 'we': ['intw$e'], 'one': ['bo', 'o'], 'two': ['ba'], 'person': ['nt~o'], 'tree': ['te'], 'blood': ['5iNa'], 'ear': ['to'], 'eye': ['yiCo'], 'tooth': ['yino'], 'hand': ['boko'], 'drink': ['5u'], 'see': ['

Processed 1530000
Processed 1540000
Processed 1550000
Processed 1560000
Processed 1570000
Processed 1580000
Processed 1590000
Processed 1600000
Processed 1610000
Processed 1620000
Processed 1630000
Processed 1640000
Processed 1650000
Processed 1660000
Processed 1670000
Processed 1680000
Processed 1690000
Processed 1700000
Processed 1710000
Processed 1720000
Processed 1730000
Processed 1740000
Processed 1750000
Processed 1760000
Processed 1770000
Processed 1780000
Processed 1790000
Processed 1800000
Processed 1810000
Processed 1820000
Processed 1830000
Processed 1840000
Processed 1850000
Processed 1860000
Processed 1870000
Processed 1880000
Processed 1890000
Processed 1900000
Processed 1910000
Processed 1920000
Processed 1930000
Processed 1940000
Processed 1950000
Processed 1960000
Processed 1970000
Processed 1980000
defaultdict(<class 'list'>, {'one': ['gbo'], 'two': ['so*'], 'blood': ['5imo'], 'bone': ['gba'], 'eye': ['gire'], 'nose': ['mala*'], 'tongue': ['mila*'], 'drink': ['na'], '

defaultdict(<class 'list'>, {'person': ['ur', 'ar'], 'eye': ['is'], 'die': ['kpa'], 'come': ['ya'], 'water': ['nca'], 'fire': ['mba'], 'new': ['mpa'], 'name': ['zin']}) defaultdict(<class 'list'>, {'two': ['pedi'], 'dog': ['tja'], 'leaf': ['tlha*re*'], 'tongue': ['leme'], 'breast': ['huba', 'cwele'], 'liver': ['bete'], 'come': ['tLaya'], 'sun': ['caci'], 'fire': ['ollo'], 'full': ['tlece']})
Processed 2100000
Processed 2110000
defaultdict(<class 'list'>, {'fish': ['e8u'], 'skin': ['iji'], 'blood': ['ese'], 'bone': ['igbegbe'], 'ear': ['endi'], 'eye': ['ewu'], 'nose': ['owu*'], 'tooth': ['e5i*'], 'tongue': ['irE*'], 'knee': ['egu'], 'hand': ['o'], 'breast': ['Emu*'], 'liver': ['Ejo'], 'water': ['o5i*']}) defaultdict(<class 'list'>, {'I': ['min'], 'you': ['Ng~E'], 'we': ['bEt'], 'one': ['moko', 'mos'], 'two': ['ol'], 'person': ['mut'], 'dog': ['mbw$a'], 'tree': ['muti'], 'leaf': ['kay'], 'hand': ['mok'], 'drink': ['nw~'], 'see': ['mon'], 'hear': ['wa'], 'die': ['fa'], 'come': ['is'], 'wa