In [14]:
import dill
import json

import numpy as np
import scipy
import networkx as nx

from scipy.sparse import csr_matrix

In [46]:
# Directory that holds the processed artefacts, relative to the notebook.
processed_data_path = '../data/processed/'

# Resolve every file name against that directory in one pass; the targets on
# the left line up one-to-one with the names in the tuple on the right.
(
    lexicon_fn,
    tes_fn,
    dirtes_fn,
    colex_fn,
    fields_fn,
) = (
    processed_data_path + file_name
    for file_name in (
        'lexicon.json',
        'tes.pkl',
        'dirtes.json',
        'colex.pkl',
        'fields.pkl',
    )
)

In [16]:
# Load the processed inputs. Every file is read inside a `with` block so its
# handle is closed as soon as the content is parsed, instead of staying open
# for the lifetime of the kernel.
# NOTE(review): dill, like pickle, can run arbitrary code while loading; only
# point tes_fn / colex_fn at files produced by this project.
with open(lexicon_fn, 'r') as lexicon_file:
    lexicon = json.load(lexicon_file)
with open(tes_fn, 'rb') as tes_file:
    tes = dill.load(tes_file)
with open(dirtes_fn, 'r') as dirtes_file:
    dirtes = json.load(dirtes_file)
with open(colex_fn, 'rb') as colex_file:
    colex = dill.load(colex_file)

In [17]:
### COLEX FUNCTIONS

def print_colex(lexicon, colex):
    """Print the colexification partners of every term that has more than one.

    Args:
        lexicon: sequence indexed by row/column number; ``lexicon[i]`` is the
            label printed for index ``i``.
        colex: sparse matrix supporting ``nonzero()``, ``getrow()`` and
            ``colex[i, j]`` item access.

    For each row (in ascending order) with more than one nonzero column, one
    line is printed: the row's label, a colon, and a list of
    ``(partner label, int(value))`` tuples. Rows with a single nonzero column
    are skipped. Nothing is returned.
    """
    for row in sorted(set(colex.nonzero()[0])):
        partners = colex.getrow(row).nonzero()[1]
        if len(partners) <= 1:
            continue
        entries = [(lexicon[col], int(colex[row, col])) for col in partners]
        print(lexicon[row], ":", entries)

def filter_colex(colex, min_clx_count=None, max_clx_size=None):
    """Return a filtered copy of a colexification matrix; the input is untouched.

    Args:
        colex: sparse matrix exposing ``copy()``, ``data``, ``eliminate_zeros()``,
            ``nonzero()``, ``getrow()`` and ``toarray()``.
        min_clx_count: when truthy, stored values below this threshold are
            removed (minimum number of languages that colexify a pair).
        max_clx_size: when truthy, every row with more than this many nonzero
            columns is cleared, together with the column of the same index
            (maximum number of terms in a row).

    Returns:
        The filtered matrix. When ``max_clx_size`` is applied the result is a
        new ``csr_matrix`` rebuilt from a dense array; otherwise it is the
        (possibly thresholded) copy of ``colex``.

    A falsy threshold (``None`` or ``0``) disables the corresponding filter.
    """
    filtered = colex.copy()

    # Threshold on the stored counts first, so the size filter below sees
    # only the surviving entries.
    if min_clx_count:
        too_rare = filtered.data < min_clx_count
        filtered.data[too_rare] = 0
        filtered.eliminate_zeros()

    if not max_clx_size:
        return filtered

    # Indices of rows that still have too many partners.
    oversized = [
        row
        for row in set(filtered.nonzero()[0])
        if len(filtered.getrow(row).nonzero()[1]) > max_clx_size
    ]

    # Clear those indices on both axes via a dense round-trip.
    dense = filtered.toarray()
    dense[oversized, :] = 0
    dense[:, oversized] = 0

    filtered = csr_matrix(dense)
    filtered.eliminate_zeros()
    return filtered

In [57]:
### LEXICAL FIELDS

def idx2lex(lexicon, selected):
    """Return the lexicon entries at the positions in ``selected``.

    ``lexicon`` is converted to a NumPy array and indexed with ``selected``,
    so the result is whatever NumPy indexing yields for that selector (an
    array of entries when ``selected`` is a list of indices).
    """
    vocabulary = np.array(lexicon)
    return vocabulary[selected]

def lex2idx(lexicon, selected):
    """Return, for each entry of ``selected``, its position in ``lexicon``.

    Positions come from ``lexicon.index``, so the first occurrence wins and an
    entry that is not present raises ``ValueError``.
    """
    return list(map(lexicon.index, selected))
    
def generate_arc_fields(colex):
    """Return every distinct pair of indices linked by a nonzero entry.

    Args:
        colex: sparse matrix exposing ``nonzero()``.

    Returns:
        A list of two-element lists ``[i, j]`` with ``i <= j``. An entry at
        ``(i, j)`` and its mirror at ``(j, i)`` produce a single pair, and a
        diagonal entry produces ``[i, i]``. The order of the pairs is whatever
        the intermediate set yields.
    """
    # Use the matrix passed in, not a notebook-level global.
    rows, cols = colex.nonzero()
    # Sorting each coordinate pair makes (i, j) and (j, i) identical, so the
    # set keeps one copy of every undirected arc.
    unique_pairs = {tuple(sorted(pair)) for pair in zip(rows, cols)}
    return [list(pair) for pair in unique_pairs]

def generate_clique_fields(colex, k):
    """Return the k-clique communities of the graph built from ``colex``.

    Args:
        colex: matrix handed to ``nx.Graph`` to build the graph.
        k: clique size passed to ``nx.community.k_clique_communities``.

    Returns:
        A list with one list of graph nodes per community.
    """
    graph = nx.Graph(colex)
    communities = nx.community.k_clique_communities(graph, k)
    return [list(members) for members in communities]

def generate_arclic_fields(colex, k):
    """Return clique fields plus every arc not contained in any of them.

    Args:
        colex: matrix passed to ``generate_arc_fields`` and
            ``generate_clique_fields``.
        k: clique size passed to ``generate_clique_fields``.

    Returns:
        The clique fields first, followed by the arcs whose two endpoints do
        not both appear in any single clique field.
    """
    arcs = generate_arc_fields(colex)
    cliques = generate_clique_fields(colex, k)

    # An arc is kept only when no clique contains both of its endpoints.
    isolated = [
        arc
        for arc in arcs
        if not any(arc[0] in clique and arc[1] in clique for clique in cliques)
    ]

    return cliques + isolated

def expand_field(field, colex):
    """Return the sorted, de-duplicated neighbours of the members of ``field``.

    Args:
        field: iterable of row indices into ``colex``.
        colex: sparse matrix where ``colex[i].nonzero()[1]`` gives the nonzero
            column indices of row ``i``.

    Returns:
        A sorted list of the column indices that are nonzero in at least one
        of the given rows. A member of ``field`` appears in the result only if
        it is itself a neighbour of some member.
    """
    neighbours = set()
    for member in field:
        neighbours.update(colex[member].nonzero()[1])
    return sorted(neighbours)

def term_count(langtes, term):
    """Sum the counts recorded for ``term`` across a nested mapping.

    Args:
        langtes: three-level mapping; ``langtes[te][stem]`` maps terms to
            numeric counts.
        term: key to look up in each innermost mapping.

    Returns:
        The total of ``langtes[te][stem][term]`` over every ``te`` and
        ``stem`` whose innermost mapping contains ``term``; ``0`` when there
        is no match.
    """
    return sum(
        langtes[te][stem][term]
        for te in langtes
        for stem in langtes[te]
        if term in langtes[te][stem]
    )

def structure_fields(fields, tes, dirtes, lexicon):
    """Pair each field's words with per-language term counts.

    Args:
        fields: iterable of index selections into ``lexicon``.
        tes: mapping where ``tes[lang][1]`` is the nested mapping handed to
            ``term_count``.
        dirtes: mapping ``lang -> word -> iterable of terms``.
        lexicon: sequence of words, indexed through ``idx2lex``.

    Returns:
        A list with one ``(words, per_language)`` tuple per field, where
        ``words`` is the result of ``idx2lex`` and ``per_language[lang]`` maps
        each term listed in ``dirtes[lang]`` for one of those words to its
        ``term_count``. Languages with no matching word get an empty dict.
    """
    structured = []
    for indices in fields:
        words = idx2lex(lexicon, indices)
        per_language = {}
        for lang in dirtes:
            translations = dirtes[lang]
            counts = {}
            for word in words:
                if word not in translations:
                    continue
                for term in translations[word]:
                    # tes[lang] is looked up only when a term actually needs
                    # counting, so languages without matches never touch it.
                    counts[term] = term_count(tes[lang][1], term)
            per_language[lang] = counts
        structured.append((words, per_language))
    return structured

In [18]:
# Drop entries with a count below 2, then clear rows with more than 25 partners.
clx = filter_colex(colex, min_clx_count=2, max_clx_size=25)

In [85]:
# Build the pairwise ("arc") fields from the filtered matrix and print each
# pair as lexicon words.
arcs = generate_arc_fields(clx)
for a in arcs:
    print(idx2lex(lexicon, a))

['birth' 'grow']
['morning' 'next']
['path' 'walk']
['pray' 'prayer']
['field' 'work']
['bit' 'wait']
['game' 'play']
['family' 'relative']
['cut' 'head']
['brother' 'law']
['chicken' 'egg']
['curse' 'scold']
['plan' 'thought']
['brother' 'young']
['healer' 'traditional']
['bad' 'boy']
['human' 'person']
['head' 'top']
['speak' 'throw']
['eye' 'red']
['arrive' 'walk']
['harvest' 'season']
['home' 'village']
['feel' 'hear']
['begin' 'beginning']
['birth' 'child']
['rope' 'twist']
['keep' 'silent']
['farm' 'farmer']
['few' 'small']
['language' 'love']
['think' 'thought']
['follow' 'track']
['clean' 'wash']
['begin' 'talk']
['cord' 'umbilical']
['cut' 'kill']
['case' 'die']
['burst' 'egg']
['river' 'small']
['marry' 'wedding']
['cultivate' 'grow']
['earth' 'land']
['full' 'stomach']
['dig' 'remove']
['left' 'side']
['order' 'send']
['meat' 'piece']
['mother' 'sister']
['red' 'reddish']
['mother' 'wife']
['run' 'work']
['ask' 'help']
['arrive' 'return']
['hole' 'stand']
['hunger' 'hungry']

In [54]:
# Build the k-clique community fields (k=3) from the filtered matrix and
# print each community as lexicon words.
cliques = generate_clique_fields(clx, 3)
for c in cliques:
    print(idx2lex(lexicon, list(c)))

['sleep' 'lie' 'fall' 'asleep']
['bad' 'hear' 'voice' 'listen' 'feel']
['fruit' 'birth' 'child' 'produce' 'bear']
['beat' 'dead' 'die' 'death' 'hit' 'kill']
['begin' 'catch' 'fish' 'trap']
['brother' 'elder' 'uncle' 'big' 'grow']
['different' 'change' 'bit']
['cook' 'boil' 'food' 'prepare']
['finished' 'parent' 'boy' 'child' 'law' 'guy' 'mother' 'brother' 'son'
 'small' 'sibling' 'girl' 'grandmother' 'young' 'sister']
['branch' 'tree' 'wood']
['build' 'fire' 'prepare']
['burn' 'fire' 'candle' 'light']
['keep' 'story' 'call' 'word' 'language' 'help' 'ask' 'speak' 'talk']
['home' 'village' 'call']
['hold' 'catch' 'grab']
['world' 'country' 'land' 'earth' 'ground']
['cut' 'ground' 'fall' 'throw']
['doctor' 'traditional' 'healer']
['finish' 'end' 'story']
['farm' 'farmer' 'field']
['pity' 'feel' 'sorry']
['fight' 'fighting' 'war']
['fish' 'line' 'fishing']
['happen' 'pass' 'many']
['thought' 'keep' 'heart' 'think' 'plan']
['marry' 'marriage' 'wedding']
['marry' 'wife' 'married']
['next' 'm

In [61]:
# Combine the k=3 clique fields with the arcs that no clique contains, and
# print each resulting field as lexicon words.
arclics = generate_arclic_fields(clx, 3)
for ac in arclics:
    print(idx2lex(lexicon, list(ac)))

['sleep' 'lie' 'fall' 'asleep']
['bad' 'hear' 'voice' 'listen' 'feel']
['fruit' 'birth' 'child' 'produce' 'bear']
['beat' 'dead' 'die' 'death' 'hit' 'kill']
['begin' 'catch' 'fish' 'trap']
['brother' 'elder' 'uncle' 'big' 'grow']
['different' 'change' 'bit']
['cook' 'boil' 'food' 'prepare']
['finished' 'parent' 'boy' 'child' 'law' 'guy' 'mother' 'brother' 'son'
 'small' 'sibling' 'girl' 'grandmother' 'young' 'sister']
['branch' 'tree' 'wood']
['build' 'fire' 'prepare']
['burn' 'fire' 'candle' 'light']
['keep' 'story' 'call' 'word' 'language' 'help' 'ask' 'speak' 'talk']
['home' 'village' 'call']
['hold' 'catch' 'grab']
['world' 'country' 'land' 'earth' 'ground']
['cut' 'ground' 'fall' 'throw']
['doctor' 'traditional' 'healer']
['finish' 'end' 'story']
['farm' 'farmer' 'field']
['pity' 'feel' 'sorry']
['fight' 'fighting' 'war']
['fish' 'line' 'fishing']
['happen' 'pass' 'many']
['thought' 'keep' 'heart' 'think' 'plan']
['marry' 'marriage' 'wedding']
['marry' 'wife' 'married']
['next' 'm

In [62]:
# Attach per-language term counts to every arclic field.
fields = structure_fields(
    fields=arclics,
    tes=tes,
    dirtes=dirtes,
    lexicon=lexicon,
)

In [63]:
# Persist the structured fields. The `with` block closes the file as soon as
# the dump finishes, so the data is flushed to disk deterministically rather
# than whenever the handle happens to be garbage-collected.
with open(fields_fn, 'wb') as fields_file:
    dill.dump(fields, fields_file)