In [1]:
import dill
import json

import numpy as np
import scipy
import networkx as nx
import pandas as pd

from scipy.sparse import csr_matrix

In [2]:
# Directory holding every artefact this notebook reads or writes.
processed_data_path = '../data/processed/'

# Inputs loaded by the next cell.
lexicon_fn = f'{processed_data_path}lexicon.json'
tes_fn = f'{processed_data_path}tes.pkl'
dirtes_fn = f'{processed_data_path}dirtes.json'
colex_fn = f'{processed_data_path}colex.pkl'
merged_tes_fn = f'{processed_data_path}merged_tes.pkl'

# Outputs written further down (structured fields and the saturation table).
fields_fn = f'{processed_data_path}fields.pkl'
saturation_fn = f'{processed_data_path}saturation.csv'

In [3]:
# Load every input artefact. Each file is opened in a `with` block so the
# handle is closed deterministically instead of lingering until garbage
# collection.
# NOTE(security): dill, like pickle, can execute arbitrary code while loading.
# Only open .pkl files that were produced by a trusted pipeline.
with open(lexicon_fn, 'r') as fh:
    lexicon = json.load(fh)
with open(tes_fn, 'rb') as fh:
    tes = dill.load(fh)
with open(dirtes_fn, 'r') as fh:
    dirtes = json.load(fh)
with open(colex_fn, 'rb') as fh:
    colex = dill.load(fh)
with open(merged_tes_fn, 'rb') as fh:
    merged_tes = dill.load(fh)

In [4]:
### COLEX FUNCTIONS

def print_colex(lexicon, colex):
    """Print every row of ``colex`` that has more than one non-zero column.

    Parameters
    ----------
    lexicon : sequence
        Indexed by row/column number to obtain the label that is printed.
    colex : scipy.sparse matrix
        Matrix whose non-zero entries are listed; each value is cast to
        ``int`` for display.

    Each printed line has the form ``<label> : [(<label>, <value>), ...]``.
    Rows with zero or one non-zero column are skipped.
    """
    row_ids = sorted(set(colex.nonzero()[0]))
    for row in row_ids:
        neighbours = colex.getrow(row).nonzero()[1]
        if len(neighbours) <= 1:
            continue
        pairs = [(lexicon[col], int(colex[row, col])) for col in neighbours]
        print(lexicon[row], ":", pairs)

def filter_colex(colex, min_clx_count=None, max_clx_size=None):
    """Return a filtered copy of a sparse colexification matrix.

    Parameters
    ----------
    colex : scipy.sparse matrix
        Input matrix; it is copied and never modified. The size filter
        clears row ``i`` together with column ``i``, so the matrix is
        expected to be square.
    min_clx_count : number, optional
        Entries strictly below this value are removed. A falsy value
        (``None`` or ``0``) disables the filter.
    max_clx_size : int, optional
        Any row with more than this many non-zero entries is cleared,
        together with the column of the same index. A falsy value
        (``None`` or ``0``) disables the filter.

    Returns
    -------
    scipy.sparse matrix
        The filtered matrix. When the size filter runs the result is always
        a ``csr_matrix``; otherwise it keeps the format of ``colex``.
    """
    clx = colex.copy()

    # filter minimum colex count (min nlangs that colex)
    if min_clx_count:
        clx.data[clx.data < min_clx_count] = 0
        clx.eliminate_zeros()

    # filter maximum colex array size (max nterms in a row)
    if max_clx_size:
        # Work on the coordinate triplets so the matrix is never densified:
        # memory stays proportional to the number of stored entries.
        coo = clx.tocoo()
        stored = coo.data != 0  # ignore explicitly stored zeros
        rows, cols, vals = coo.row[stored], coo.col[stored], coo.data[stored]

        # Number of non-zero entries per row; an index is "oversized" when
        # its row exceeds the limit.
        size = max(clx.shape)
        oversized = np.bincount(rows, minlength=size) > max_clx_size

        # Drop every entry that sits on an oversized row or column.
        kept = ~(oversized[rows] | oversized[cols])
        clx = csr_matrix((vals[kept], (rows[kept], cols[kept])), shape=clx.shape)
        clx.eliminate_zeros()

    return clx

In [5]:
### LEXICAL FIELDS

def idx2lex(lexicon, selected):
    """Return the lexicon entries at the positions given by ``selected``.

    ``lexicon`` is converted to a NumPy array and indexed with ``selected``,
    so the result is a NumPy array when ``selected`` is a list of indices.
    """
    vocabulary = np.array(lexicon)
    return vocabulary[selected]

def lex2idx(lexicon, selected):
    """Return the position in ``lexicon`` of each entry of ``selected``.

    Uses ``lexicon.index``, so the first occurrence wins and a ``ValueError``
    is raised for an entry that is not in ``lexicon``.
    """
    return list(map(lexicon.index, selected))
    
def generate_arc_fields(colex):
    """Return the unordered index pairs of the non-zero entries of ``colex``.

    Parameters
    ----------
    colex : scipy.sparse matrix
        Matrix whose non-zero entries define the arcs.

    Returns
    -------
    list of list
        One ``[i, j]`` pair with ``i <= j`` per distinct unordered pair, so
        the symmetric entries ``(i, j)`` and ``(j, i)`` yield a single arc.
        A non-zero diagonal entry yields ``[i, i]``. The order of the pairs
        is unspecified because they are deduplicated through a set.
    """
    # Read the matrix that was passed in: the previous body used the notebook
    # global ``clx`` and ignored this parameter.
    rows, cols = colex.nonzero()
    unique_arcs = {tuple(sorted(pair)) for pair in zip(rows, cols)}
    return [list(arc) for arc in unique_arcs]

def generate_clique_fields(colex, k):
    """Return the k-clique communities of the graph built from ``colex``.

    ``colex`` is passed to ``nx.Graph`` and the communities produced by
    ``nx.community.k_clique_communities`` for clique size ``k`` are each
    converted to a list of nodes.
    """
    graph = nx.Graph(colex)
    communities = nx.community.k_clique_communities(graph, k)
    return [list(community) for community in communities]

def generate_arclic_fields(colex, k):
    """Return the k-clique fields plus every arc not covered by one of them.

    An arc ``[i, j]`` from ``generate_arc_fields`` counts as covered when
    some clique field from ``generate_clique_fields`` contains both ``i``
    and ``j``. The result lists all clique fields first, followed by the
    uncovered ("isolated") arcs.
    """
    arcs = generate_arc_fields(colex)
    cliques = generate_clique_fields(colex, k)

    def covered(arc):
        # True when both endpoints of the arc belong to the same clique field.
        return any(arc[0] in clique and arc[1] in clique for clique in cliques)

    isolated = [arc for arc in arcs if not covered(arc)]
    return cliques + isolated

def expand_field(field, colex):
    """Return the sorted, deduplicated column indices linked to ``field``.

    For each row index in ``field`` the non-zero columns of that row of
    ``colex`` are collected; the union over all rows is returned as a
    sorted list. Members of ``field`` appear in the result only if they are
    themselves a non-zero column of some row in ``field``.
    """
    linked = {col for row in field for col in colex[row].nonzero()[1]}
    return sorted(linked)

def term_count(langtes, term):
    """Sum the values stored for ``term`` across a two-level mapping.

    ``langtes`` is walked as ``langtes[te][stem]``; every innermost container
    that contains ``term`` contributes ``langtes[te][stem][term]`` to the
    total. Returns ``0`` when ``term`` never occurs.
    """
    return sum(
        occurrences[term]
        for stems in langtes.values()
        for occurrences in stems.values()
        if term in occurrences
    )

def structure_fields(fields, tes, dirtes, lexicon, saturation_threshold=0.5):
    """Attach per-language term counts to each lexical field.

    Parameters
    ----------
    fields : iterable
        Fields given as collections of lexicon indices.
    tes : mapping
        Indexed as ``tes[lang]``; that value is handed to
        ``saturation_ratio`` and its element ``[1]`` to ``term_count``.
    dirtes : mapping
        Indexed as ``dirtes[lang][word]``, yielding an iterable of terms.
        Its keys define which languages are processed.
    lexicon : sequence
        Used by ``idx2lex`` to turn indices into words.
    saturation_threshold : float, optional
        A language contributes terms to a field only if its saturation
        ratio for that field is at least this value.

    Returns
    -------
    list of tuple
        One ``(words, per_language)`` pair per field, where ``words`` is the
        result of ``idx2lex`` and ``per_language`` maps every language in
        ``dirtes`` to a ``{term: count}`` dict. The dict is empty for
        languages below the threshold.
    """
    structured = []
    for indices in fields:
        words = idx2lex(lexicon, indices)
        per_language = {}
        for lang in dirtes:
            counts = per_language[lang] = {}
            # saturation selection here
            if saturation_ratio(words, tes[lang]) >= saturation_threshold:
                translations = dirtes[lang]
                for word in words:
                    if word not in translations:
                        continue
                    for term in translations[word]:
                        counts[term] = term_count(tes[lang][1], term)
            # TODO: add count of under saturated fields
        structured.append((words, per_language))
    return structured

def saturation_ratio(field, langtes):
    """Return the fraction of ``field`` terms that are found in ``langtes[0]``.

    Raises ``ZeroDivisionError`` when ``field`` is empty.
    """
    # Open questions kept from the original: should the langtes keys be
    # cleaned first, and should this look terms up in dirtes instead?
    hits = sum(1 for term in field if term in langtes[0])
    return hits / len(field)

def generate_saturation_dataframe(lexicon, fields, tes):
    """Tabulate the saturation ratio of every field for every language.

    Parameters
    ----------
    lexicon : sequence
        Used by ``idx2lex`` to turn field indices into words.
    fields : iterable
        Fields given as collections of lexicon indices.
    tes : mapping
        ``tes[lang]`` is passed to ``saturation_ratio`` for each language.

    Returns
    -------
    pandas.DataFrame
        Columns ``lang``, ``field`` (the word array of the field) and
        ``ratio``, with one row per (language, field) pair: languages in the
        iteration order of ``tes``, fields in the order given.
    """
    # Resolve indices to words once; the result does not depend on the language.
    field_words = [idx2lex(lexicon, field) for field in fields]

    # Collect plain Python columns and build the frame once. Concatenating a
    # one-row frame per iteration copies the whole table every time.
    langs, words_col, ratios = [], [], []
    for lang in tes:
        langtes = tes[lang]
        for words in field_words:
            langs.append(lang)
            words_col.append(words)
            ratios.append(saturation_ratio(words, langtes))

    return pd.DataFrame(
        {"lang": langs, "field": words_col, "ratio": ratios},
        columns=['lang', 'field', 'ratio'],
    )

In [6]:
### PARAMETERS
# Passed to filter_colex as min_clx_count: matrix entries below this value are dropped.
MIN_CLX_COUNT = 2
# Passed to filter_colex as max_clx_size: rows with more non-zero entries than this are cleared.
MAX_CLX_SIZE = 25
# Passed to structure_fields as saturation_threshold: minimum saturation ratio
# a language needs for a field before its terms are collected.
SATURATION_THRESHOLD = 1.0

In [7]:
# Filtered colexification matrix used by all the field generators below.
clx = filter_colex(colex, min_clx_count=MIN_CLX_COUNT, max_clx_size=MAX_CLX_SIZE)

In [None]:
# Preview the arc fields: print the lexicon entries of each index pair.
arcs = generate_arc_fields(clx)
for a in arcs:
    print(idx2lex(lexicon, a))

In [None]:
# Preview the clique fields (k-clique communities with k=3) as lexicon entries.
cliques = generate_clique_fields(clx, 3)
for c in cliques:
    print(idx2lex(lexicon, list(c)))

In [8]:
# Combined fields: 3-clique communities plus the arcs not covered by any of them.
# `arclics` is reused below for the saturation table and the structured fields.
arclics = generate_arclic_fields(clx, 3)
for ac in arclics:
    print(idx2lex(lexicon, list(ac)))

['birth' 'produce' 'bear' 'child']
['kill' 'beat' 'hit']
['grow' 'elder' 'big']
['prepare' 'boil' 'cook']
['young' 'guy' 'boy' 'son' 'small' 'brother' 'girl' 'sister' 'child']
['village' 'home' 'call']
['woman' 'speak' 'word' 'help' 'talk' 'keep' 'ask' 'story' 'call'
 'language']
['catch' 'grab' 'hold']
['ground' 'country' 'earth' 'land' 'world']
['cut' 'throw' 'fall']
['day' 'next' 'morning' 'tomorrow']
['dead' 'die' 'kill']
['sleep' 'fall' 'lie']
['feel' 'hear' 'listen']
['fishing' 'fish' 'line']
['wife' 'woman' 'grandmother' 'sister' 'mother' 'husband' 'marry']
['pass' 'many' 'happen']
['marry' 'wedding' 'marriage']
['plan' 'think' 'thought']
['sit' 'stay' 'settle']
['childhood' 'small']
['dear' 'friend']
['plant' 'planting']
['day' 'stop']
['face' 'side']
['animal' 'hunt']
['mind' 'thought']
['shill' 'shilling']
['curse' 'scold']
['dad' 'finished']
['hunger' 'hungry']
['bit' 'bite']
['bring' 'carry']
['keep' 'think']
['damage' 'hurt']
['ask' 'find']
['arm' 'hand']
['return' 'turn']

In [None]:
# Saturation ratio of every combined field for every language in `tes`
# (note: the structured fields below are built from `merged_tes` instead).
saturation_df = generate_saturation_dataframe(lexicon, arclics, tes)
# Persist the table as CSV, then display it as the cell output.
saturation_df.to_csv(saturation_fn)
saturation_df

In [9]:
# Build the per-language term counts for every combined field, keeping only
# languages whose saturation ratio reaches SATURATION_THRESHOLD.
fields = structure_fields(
    arclics,
    merged_tes,
    dirtes,
    lexicon,
    saturation_threshold=SATURATION_THRESHOLD,
)

In [10]:
# Serialise the structured fields. The `with` block closes the handle, which
# guarantees the pickle is fully flushed to disk before the cell finishes.
with open(fields_fn, 'wb') as fh:
    dill.dump(fields, fh)