In [4]:
import dill
import json
import csv
from collections import defaultdict, Counter

In [2]:
raw_data_path = '../data/raw/'
processed_data_path = '../data/processed/'

raw_corpus_fn = raw_data_path + 'doreco_corpus.p'
raw_tes_fn = raw_data_path + 'tes/tes_'
metadata_fn = raw_data_path + 'doreco_files_metadata.csv'

languages_fn = processed_data_path + 'languages.json'
corpus_fn = processed_data_path + 'corpus.pkl'
corpus_narr_fn = processed_data_path + 'corpus_narr.pkl'
corpus_else_fn = processed_data_path + 'corpus_else.pkl'
tes_fn = processed_data_path + 'tes.pkl'
merged_tes_fn = processed_data_path + 'merged_tes.pkl'

In [5]:
def select_genres(corpus, files_metadata, genres):
    metadata = csv.DictReader(open(files_metadata), delimiter='\t')
    metadata = {(x['doculect'][:4],x['file']): (x['genre'],int(x['tokens'])) for x in metadata}
    genre_counts = Counter()
    new_corpus = defaultdict(lambda : {})
    for f in corpus:
        for fi in corpus[f]:
            fx = '_'.join(fi.split('_')[2:])
            genre = metadata[f[:4],fx][0]
            genre_counts[genre] += 1
            if genre in genres:
                new_corpus[f][fi] = corpus[f][fi]
    #print(genre_counts.most_common(10))
    return new_corpus

In [8]:
EXCLUDED_LANGS = {'ligh1234.csv', 'stan1290.csv', 'warl1254.csv'} # languages without TES files (2 not enough data & 1 english)
GENRES = {'personal narrative', 'traditional narrative', 'conversation', 'procedural'}
NARR_GENRES = {'personal narrative', 'traditional narrative'}
ELSE_GENRES = {'conversation', 'procedural'}

raw_corpus = dill.load(open(raw_corpus_fn, 'rb'))
raw_corpus = select_genres(raw_corpus, metadata_fn, GENRES)
corpus = {lang.split('.')[0]: entry for lang, entry in raw_corpus.items() if lang not in EXCLUDED_LANGS}
corpus_narr = select_genres(corpus, metadata_fn, NARR_GENRES)
corpus_else = select_genres(corpus, metadata_fn, ELSE_GENRES)
languages = sorted(list(corpus.keys()))
tes = {lang.split('.')[0]: dill.load(open(raw_tes_fn + lang + '.csv.p', 'rb')) for lang in languages}

In [9]:
json.dump(languages, open(languages_fn, 'w'))
dill.dump(corpus, open(corpus_fn, 'wb'))
dill.dump(corpus_narr, open(corpus_narr_fn, 'wb'))
dill.dump(corpus_else, open(corpus_else_fn, 'wb'))
dill.dump(tes, open(tes_fn, 'wb'))

In [7]:
import networkx as nx
from itertools import combinations, chain
from editdistance import eval as ed
from scipy.stats import entropy
from numpy.linalg import norm
import numpy as np
from collections import Counter

import gensim.downloader
wv = gensim.downloader.load('word2vec-google-news-300')
# train w2v on TES

def JSD(P, Q):
    _P = P / norm(P, ord=1)
    _Q = Q / norm(Q, ord=1)
    _M = 0.5 * (_P + _Q)
    return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))

def merge_similar_tes_within(tes, te_words):
    merge = nx.Graph()
    merge.add_nodes_from(tes)
    tes_new, te_words_new = {}, {}
    for t1, t2 in combinations(tes,2):
        if t1 in t2 or t2 in t1 or ed(t1,t2) <= 1:
            merge.add_edge(t1,t2)
    for c in nx.connected_components(merge):
        best_c = max(c, key= lambda k : len(tes[k]))
        tes_new[best_c] = list(chain(*[tes[ci] for ci in c]))
        te_words_new[best_c] = sum([te_words[ci] for ci in c], Counter())
        #print('>>> MERGED', c)
    return tes_new, te_words_new

def merge_similar_tes_across(tes, te_words):
    merge_markers = nx.Graph()
    te_marker_pairs = [(te, marker) for te in tes for marker in tes[te] if len(tes[te][marker]) >= 5]
    for (tei,mrki),(tej,mrkj) in combinations(te_marker_pairs,2):
        if tei==tej: continue
        if set(te_words[tei][mrki]) & set(te_words[tej][mrkj]) != set():
            mrkic, mrkjc = mrki.strip('^$'), mrkj.strip('^$')
            minlen = min(len(mrkic),len(mrkjc))
            wi = te_words[tei][mrki]
            wj = te_words[tej][mrkj]
            jsd = (JSD(*zip(*[[wi[x],wj[x]] for x in sorted(set(wi)|set(wj))])))
            cos = (wv.similarity(tei,tej))
            jac = len(set(wi)&set(wj))/len(set(wi)|set(wj))
            form = mrkic[:minlen] == mrkjc[:minlen] or mrkic[-minlen:] == mrkjc[-minlen:]
            if cos >= 2/3 and jsd <= 2/3 and form:
                #print((tei,mrki),(tej,mrkj),'JSD=%.2f COS=%.2f JAC=%.2f FORM=%d' % (jsd, cos, jac,form))
                merge_markers.add_edge((tei,mrki), (tej,mrkj))
    for c in nx.connected_components(merge_markers):
        new_mrk = min([mrk for te,mrk in c],key = len)
        for te,mrk in c:
            tes[te][new_mrk] = tes[te][mrk]
            te_words[te][new_mrk] = te_words[te][mrk]
            if mrk != new_mrk: del tes[te][mrk], te_words[te][mrk]
    return tes, te_words

def merge_similar_tes(langtes):
    tes, te_words = langtes
    for k,v in tes.items():
        tes[k], te_words[k] = merge_similar_tes_within(v, te_words[k])
    #
    tes, te_words = merge_similar_tes_across(tes, te_words)
    return tes, te_words

In [9]:
merged_tes = dict()
for lang in tes:
    merged_tes[lang] = merge_similar_tes(tes[lang])

KeyError: "Key 'coque' not present"

In [10]:
merged_tes

{'anal1239': ({'say': {'do': [np.int32(43),
     np.int32(46),
     np.int32(74),
     np.int32(123),
     np.int32(130),
     np.int32(154),
     np.int32(184),
     np.int32(343),
     np.int32(354),
     np.int32(372),
     np.int32(517),
     np.int32(561),
     np.int32(652),
     np.int32(703),
     np.int32(747),
     np.int32(753),
     np.int32(815),
     np.int32(830),
     np.int32(835),
     np.int32(837),
     np.int32(840),
     np.int32(846),
     np.int32(884),
     np.int32(903),
     np.int32(909),
     np.int32(910),
     np.int32(920),
     np.int32(921),
     np.int32(926),
     np.int32(941),
     np.int32(942),
     np.int32(945),
     np.int32(950),
     np.int32(984),
     np.int32(1224),
     np.int32(1233),
     np.int32(1239),
     np.int32(1295),
     np.int32(1392),
     np.int32(1493),
     np.int32(1534),
     np.int32(1535),
     np.int32(1632),
     np.int32(1697),
     np.int32(1698),
     np.int32(1887),
     np.int32(1960),
     np.int32(1965),
    

In [None]:
dill.dump(merged_tes, open(merged_tes_path, 'wb'))