In [1]:
!python -m spacy download en_core_web_sm
import dill
import json

import string
import spacy
import hunspell

import numpy as np
import pandas as pd
import scipy

import seaborn as sns

from scipy.sparse import csr_matrix

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m41.5 MB/s[0m  [33m0:00:00[0m.8 MB/s[0m eta [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
spc = spacy.load('en_core_web_sm')
hsl = hunspell.HunSpell('/usr/share/hunspell/en_US.dic', '/usr/share/hunspell/en_US.aff')

In [3]:
processed_data_path = '../data/processed/'

corpus_fn = processed_data_path + 'corpus.pkl'
tes_fn = processed_data_path + 'tes.pkl'
lexicon_fn = processed_data_path + 'lexicon.json'
dirtes_fn = processed_data_path + 'dirtes.json'
colex_fn = processed_data_path + 'colex.pkl'
lgclx_fn = processed_data_path + 'lgclx.pkl'

In [4]:
SELECTED_POS = {'NOUN', 'VERB', 'ADJ'}

corpus = dill.load(open(corpus_fn, 'rb'))
tes = dill.load(open(tes_fn, 'rb'))

In [5]:
### SELECTING & CLEANING LEMMAS

def clean(text):
    text = text.lower()
    text = text.encode("ascii", "ignore").decode("ascii")
    text = text.translate(str.maketrans('', '', string.punctuation))

    return text
    
def select(entry):
    if len(entry) <= 1:
        #print("remove:", entry)
        return None

    # cleaning
    entry = clean(entry)

    if not entry.isalpha():
        return None

    # for now: just remove typos, revisit later
    if not hsl.spell(entry):
        return None

    # for now: remove other pos, revisit later
    try:   
        if spc(entry)[0].pos_ not in SELECTED_POS:
            print("wrong pos:", lemma)
            return None
    except:
        return None
        
    return entry

In [6]:
### EXTRACTING LEXICON

def extract_language_lexicon(langcorp):    
    lexicon = set()
    
    for document in langcorp:
        for sentence in langcorp[document]:
            for word in langcorp[document][sentence]['spc']:
                if word['pos'] in SELECTED_POS:
                    lexicon.add(select(word['lemma']))

    return sorted(lexicon - {None})

def extract_lexicon(corpus):    
    return sorted(set(sum([extract_language_lexicon(corpus[lang]) for lang in corpus], [])))

In [7]:
### EXTRACTING DIRECT TES
# add counts ?

def extract_language_dirtes(langtes, min_stem_freq=1):    
    dirtes = dict()
    ltes = langtes[1]

    for te in ltes:
        lemma = select(te)
        if lemma:
            eqs = ltes[te]
            terms = set(sum([list(map(str, eqs[e].keys())) for e in eqs if sum(eqs[e].values()) >= min_stem_freq], []))
            
            if lemma in dirtes:
                dirtes[lemma] = sorted(set(dirtes[lemma]) | terms)
            else:
                dirtes[lemma] = sorted(terms)
                
    return dirtes

def extract_dirtes(tes, min_stem_freq=1):
    return {lang: extract_language_dirtes(tes[lang], min_stem_freq) for lang in tes}

In [57]:
### COMPUTING COLEXIFICATION

def unique_targets(dt):
    return sorted({target for k in dt for target in dt[k]})

def compute_language_colex(lexicon, langdirtes, langtes, synlex_threshold=0.5):
    dt = langdirtes
    targets = unique_targets(dt)
    #print(len(targets))
    
    row = []
    col = []
    data = []
    
    for lemma in dt:
        for target in dt[lemma]:
            if lemma not in lexicon:
                #print("NOT IN LEXICON:", lemma)
                continue
            i = lexicon.index(lemma)
            j = targets.index(target)
            row.append(i)
            col.append(j)
            data.append(1)
    
    mx = csr_matrix((data, (row, col)), shape=(len(lexicon), len(targets)))
    
    colex = mx * mx.transpose()
    colex = (colex / colex.max()).ceil()
    colex.setdiag(0)
    colex.eliminate_zeros()
    
    clx = remove_synlex(colex, lexicon, langtes, synlex_threshold)
    print(colex.nnz, clx.nnz)
    
    return clx

def compute_colex(lexicon, dirtes, tes, synlex_threshold=0.5):
    return np.sum([compute_language_colex(lexicon, dirtes[lang], tes[lang], synlex_threshold) for lang in dirtes])

def print_colex(lexicon, colex):
    rows = sorted(set(colex.nonzero()[0]))
    for i in rows:
        cols = colex.getrow(i).nonzero()[1]
        if len(cols) > 0:
            print(lexicon[i], ":", [(lexicon[j], int(colex[i,j])) for j in cols])

def filter_colex(colex, min_clx_count=None, max_clx_size=None):
    clx = colex.copy()
    
    # filter minimum colex count (min nlangs that colex)
    if min_clx_count:
        clx.data[clx.data < min_clx_count] = 0
        clx.eliminate_zeros()

    # filter maximum colex array size (max nterms in a row)
    if max_clx_size:
        larger = [i for i in set(clx.nonzero()[0]) if len(clx.getrow(i).nonzero()[1]) > max_clx_size]
        tmp = clx.toarray()
        tmp[larger, :] = 0
        tmp[:, larger] = 0

        clx = csr_matrix(tmp)
        clx.eliminate_zeros()
        
    return clx

def synlex_ratio(w1, w2, langtes):
    A = {lines for stem in langtes[0][w1] for lines in langtes[0][w1][stem]}
    B = {lines for stem in langtes[0][w2] for lines in langtes[0][w2][stem]}

    return len(A.intersection(B)) / min(len(A), len(B))
                
def remove_synlex(colex, lexicon, langtes, synlex_threshold=0.5):
    tmp = colex.toarray()
    
    count = 0
    total = 0
    rows = sorted(set(colex.nonzero()[0]))
    for i in rows:
        cols = colex.getrow(i).nonzero()[1]
        if len(cols) > 0:
            for j in cols:
                if synlex_ratio(lexicon[i], lexicon[j], langtes) > synlex_threshold:
                    tmp[i,j] = 0
                    count += 1
                total += 1
    print("removed", count, "synlex from", total)
    
    clx = csr_matrix(tmp)
    clx.eliminate_zeros()
    
    return clx

def generate_synlex_dataframe(lexicon, lgclx, tes):
    synlex_df = pd.DataFrame(columns=['lang','w1','w2','ratio'])
    for lang in tes:
        langtes = tes[lang]
        colex = lgclx[lang]
        
        rows = sorted(set(colex.nonzero()[0]))
        for i in rows:
            cols = colex.getrow(i).nonzero()[1]
            if len(cols) > 0:
                for j in cols:
                    ratio = synlex_ratio(lexicon[i], lexicon[j], langtes)
                    new_row = pd.DataFrame({"lang": [lang], "w1": [lexicon[i]], "w2": [lexicon[j]], "ratio": [ratio]})
                    synlex_df = pd.concat([synlex_df, new_row], ignore_index=True)
    return synlex_df

In [61]:
### PARAMETERS
MIN_STEM_FREQ = 2 # visualize in func of min stem freq
SYNLEX_THRESHOLD = 0.25

In [10]:
### LEXICON

#lexicon = extract_lexicon(corpus)
#json.dump(lexicon, open(lexicon_fn, 'w'))
lexicon = json.load(open(lexicon_fn, 'r'))
len(lexicon)

6130

In [11]:
### DIRECT TES DICTIONARIES

#dirtes = extract_dirtes(tes, MIN_STEM_FREQ)
#json.dump(dirtes, open(dirtes_fn, 'w'))
dirtes = json.load(open(dirtes_fn, 'r'))

In [58]:
### SYNLEXIFICATIONS
slclx = {lang: compute_language_colex(lexicon, dirtes[lang], tes[lang], 2) for lang in dirtes}
synlex_df = generate_synlex_dataframe(lexicon, slclx, tes)
synlex_df.to_csv(processed_data_path + 'synlex.csv')
synlex_df

removed 0 synlex from 980
980 980
removed 0 synlex from 440
440 440
removed 0 synlex from 1564
1564 1564
removed 0 synlex from 706
706 706
removed 0 synlex from 892
892 892
removed 0 synlex from 2556
2556 2556
removed 0 synlex from 242
242 242
removed 0 synlex from 792
792 792
removed 0 synlex from 964
964 964
removed 0 synlex from 570
570 570
removed 0 synlex from 668
668 668
removed 0 synlex from 722
722 722
removed 0 synlex from 506
506 506
removed 0 synlex from 290
290 290
removed 0 synlex from 666
666 666
removed 0 synlex from 1776
1776 1776
removed 0 synlex from 962
962 962
removed 0 synlex from 916
916 916
removed 0 synlex from 1118
1118 1118
removed 0 synlex from 246
246 246
removed 0 synlex from 432
432 432
removed 0 synlex from 150
150 150
removed 0 synlex from 616
616 616
removed 0 synlex from 496
496 496
removed 0 synlex from 730
730 730
removed 0 synlex from 524
524 524
removed 0 synlex from 458
458 458
removed 0 synlex from 802
802 802
removed 0 synlex from 296
296 296
re

  synlex_df = pd.concat([synlex_df, new_row], ignore_index=True)


Unnamed: 0,lang,w1,w2,ratio
0,anal1239,adult,old,0.000000
1,anal1239,advice,disobey,1.000000
2,anal1239,agree,drop,0.250000
3,anal1239,agree,say,0.250000
4,anal1239,alive,death,0.000000
...,...,...,...,...
32827,yura1255,young,child,0.062500
32828,yura1255,young,people,0.833333
32829,yura1255,young,son,0.000000
32830,yura1255,young,youth,0.000000


In [62]:
### COMPUTING COLEXIFICATION MATRIX

colex = compute_colex(lexicon, dirtes, tes, SYNLEX_THRESHOLD)
dill.dump(colex, open(colex_fn, 'wb'))
#colex = dill.load(open(colex_fn, 'rb'))

lgclx = {lang: compute_language_colex(lexicon, dirtes[lang], tes[lang], SYNLEX_THRESHOLD) for lang in dirtes}
dill.dump(lgclx, open(lgclx_fn, 'wb'))
#lgclx = dill.load(open(lgclx_fn, 'rb'))

removed 260 synlex from 980
980 720
removed 90 synlex from 440
440 350
removed 444 synlex from 1564
1564 1120
removed 226 synlex from 706
706 480
removed 266 synlex from 892
892 626
removed 572 synlex from 2556
2556 1984
removed 82 synlex from 242
242 160
removed 246 synlex from 792
792 546
removed 322 synlex from 964
964 642
removed 184 synlex from 570
570 386
removed 218 synlex from 668
668 450
removed 170 synlex from 722
722 552
removed 220 synlex from 506
506 286
removed 130 synlex from 290
290 160
removed 244 synlex from 666
666 422
removed 314 synlex from 1776
1776 1462
removed 164 synlex from 962
962 798
removed 278 synlex from 916
916 638
removed 188 synlex from 1118
1118 930
removed 84 synlex from 246
246 162
removed 152 synlex from 432
432 280
removed 66 synlex from 150
150 84
removed 128 synlex from 616
616 488
removed 118 synlex from 496
496 378
removed 302 synlex from 730
730 428
removed 136 synlex from 524
524 388
removed 96 synlex from 458
458 362
removed 298 synlex from

In [60]:
print_colex(lexicon, colex)

able : [('align', 1), ('bear', 1), ('collect', 1), ('cure', 1), ('do', 1), ('drop', 1), ('eat', 1), ('find', 2), ('fruit', 1), ('get', 2), ('give', 1), ('go', 3), ('good', 1), ('grandmother', 1), ('grow', 1), ('harvest', 1), ('hear', 1), ('land', 1), ('live', 1), ('look', 1), ('need', 1), ('ointment', 1), ('pick', 1), ('remove', 1), ('say', 1), ('see', 2), ('send', 1), ('soil', 1), ('start', 1), ('story', 1), ('thing', 2), ('tie', 1)]
absorb : [('flower', 1)]
accept : [('agree', 2), ('allow', 1), ('receive', 1), ('want', 1)]
accident : [('explain', 1)]
accompany : [('come', 1), ('put', 1)]
accord : [('agreement', 1), ('day', 1), ('fall', 1), ('go', 1), ('history', 1), ('know', 1), ('people', 1), ('say', 1)]
account : [('take', 1)]
accustomed : [('plant', 1), ('use', 1)]
acquire : [('go', 1)]
act : [('do', 1), ('eat', 1), ('go', 1), ('people', 1), ('person', 1), ('respectful', 1), ('serious', 1), ('way', 1)]
ad : [('come', 1), ('daytime', 1), ('midday', 1), ('use', 1)]
add : [('continue

In [None]:
### PLOT DECREASE OF COLEXES AS WE INCREASE MIN COLEX COUNT

rng = range(1,15)
nb_colexed = [filter_colex(colex, i).nnz for i in rng]
print(nb_colexed)
sns.scatterplot(x=rng, y=nb_colexed)
# plot y on log scale

In [None]:
### PLOT N OF ITEMS IN FUNCTION OF MAX COLEX SIZE

rng = range(172, 0, -2)
nb_items = [filter_colex(colex, 2, i).nnz for i in rng]
sns.scatterplot(x=rng, y=nb_items)

In [None]:
c = filter_colex(colex, 3)

In [None]:
from collections import Counter
lexicon = json.load(open(lexicon_fn, 'r'))

In [None]:
for i, n in Counter(c.nonzero()[0]).most_common():
    print(lexicon[i], n)