# Data Prep DoReCo

This script creates a pickle file containing a prepared version of the DoReCo corpus, removing some irregularities and characteristics that would make subsequent processing more difficult.

In [43]:
import csv
import os
from collections import defaultdict, Counter
import dill as pickle
import numpy as np
import time

from langdetect import detect
import re
from googletrans import Translator
translator = Translator()#
import spacy
nlp = spacy.load('en_core_web_lg')
#
# Manual token-level corrections: every line of the CSV is "original,replacement".
# The file is read inside a `with` block so the handle is closed right after loading.
with open('manual_corrections.csv') as corrections_file:
    REP = dict(line.strip('\n').split(',') for line in corrections_file)
dn = '/home/barend/Google Drive/data/doreco/' # path to corpus files

## 1. get raw corpus and preprocess

For every CSV file whose name starts with an 8-character language code, this step extracts an initial, unprocessed corpus. Each line consists of the free translation (ft), the text (tx) and the glosses, where each gloss consists of the word w, the word index wi, the morpheme array M, the gloss array G, the part-of-speech array P, the start time and the end time.

In [44]:
# Characters stripped from both edges of every transcribed word (see clean_wr).
punct = '.,?\"§«»…*()[]–. \t —/’`'
# Placeholder tokens; rows or words equal to one of these are discarded.
exclusion_markers = {'<p:>', '', '[xxx]', '****', 'xxx', '***', 'X', '.', "'.'"}
# Regex alternation of the openers that mark a bracketed span as removable (used by clean_ft).
# Written as a raw string so that \*, \? and \d are not invalid string escapes.
parenthetical_content = r'(=|i.e.|\*|\?|…|[Ll]it.|alt:|\d+|sic|[A-Z]{2,}|noise|laughter|inaudible|unclear|or|---)'

def clean_wr(wr,lg):
    """Normalise a single transcribed word.

    A `<<label>content>` wrapper is reduced to its content, surrounding
    punctuation (the module-level `punct` plus '-') and trailing '!' are
    removed, and the result is lower-cased. Edge apostrophes are stripped as
    well, except for the language codes listed below.

    wr -- the raw word string
    lg -- language code; only its first four characters are inspected
    """
    unwrapped = re.sub('<<.*?>(.*)>', r'\1', wr)
    cleaned = unwrapped.strip(punct + '-').rstrip('!').lower()
    # for these languages the edge apostrophe is kept
    if lg[:4] in {'arap', 'goem', 'vera', 'trip'}:
        return cleaned
    return cleaned.strip("'")
    
def clean_tx(tx,lg):
    """Clean a transcription line word by word.

    Removes every '(?)' and '#', splits on whitespace, normalises each word
    with clean_wr, and drops words that are exclusion markers or that still
    start with '<' after cleaning.

    tx -- the raw transcription text
    lg -- language code, forwarded to clean_wr
    Returns the surviving words joined by single spaces.
    """
    # raw string: '\(' and '\?' are regex escapes, not string escapes
    tx = re.sub(r'\(\?\)', '', tx)
    tx = tx.replace('#', '')
    cleaned_words = (clean_wr(w, lg) for w in tx.split())
    # '' is an exclusion marker, so w[0] is only evaluated for non-empty words
    return ' '.join(w for w in cleaned_words
                    if w not in exclusion_markers and w[0] != '<')

def clean_gl(gl):
    """Normalise one gloss string.

    The whole words 'said' and 'men' are replaced by 'say' and 'man', then
    any of the characters " . - = ? are stripped from both ends.
    """
    for inflected, base in ((r'\bsaid\b', 'say'), (r'\bmen\b', 'man')):
        gl = re.sub(inflected, base, gl)
    return gl.strip('".-=?')

In [52]:
# corpus[csv name][file][ref] -> {'gloss': [word dicts], 'ft': free translation, 'tx': cleaned text}
corpus = defaultdict(lambda : defaultdict(lambda : defaultdict(lambda : {})))
# only files named <8 characters>.csv are read
for d in filter(lambda d: d[8:12] == '.csv', sorted(os.listdir(dn))):
    # `with` closes the file again; newline='' is what the csv module expects
    with open(dn + d, newline='') as csv_file:
        for l in csv.DictReader(csv_file):
            lg,fi,li,wi = l['lang'], l['file'], l['ref'], l['wd_ID']
            tx = clean_tx(l['tx'],lg)
            ft = l['ft']
            wr = clean_wr(l['wd'],lg)
            # weed out bad FT, TX and WR
            # FT must contain at least one lowercase Latin or Cyrillic letter
            if ft in exclusion_markers or re.match('.*[a-zЁёА-я]', ft) is None: continue
            # membership test: the first character must not be one of * ? . #
            if tx in exclusion_markers or tx[0] in '*?.#': continue
            if wr in exclusion_markers or wr[0] == '<': continue
            # morpheme / gloss tiers that start with '*' are treated as empty
            M = ([] if re.match(r'\*+', l['mb']) else
                 [m for m in l['mb'].lower().split() if m not in exclusion_markers])
            G = ([] if re.match(r'\*+', l['gl']) else
                 [clean_gl(g) for g in l['gl'].split() if g not in exclusion_markers])
            # a word keeps its segmentation only when both tiers are present
            if not M or not G:
                M,G = [], []
            entry = corpus[d][fi][li]
            entry.setdefault('gloss', []).append({'w':wr,'wid':wi,'M':M,'G':G})
            entry['ft'] = ft
            entry['tx'] = tx
    # per-file summary; indexing corpus[d] also registers files without any surviving row
    nlines = sum(len(lines) for lines in corpus[d].values())
    glosses = Counter(gi for f in corpus[d] for elt in corpus[d][f].values()
                      for g in elt['gloss'] for gi in g['G'])
    print(d, nlines, len(glosses))

anal1239.csv 2848 0
apah1238.csv 4411 641
arap1274.csv 3158 2135
bain1259.csv 2926 1029
beja1238.csv 6308 2036
bora1263.csv 3820 1681
cabe1245.csv 2052 1949
cash1254.csv 1764 1129
dolg1241.csv 2428 1982
even1259.csv 2311 1667
goem1240.csv 2533 856
goro1270.csv 4339 1465
hoch1243.csv 971 1094
jeha1242.csv 1358 469
jeju1234.csv 1958 1173
kaka1265.csv 4606 1179
kama1351.csv 9284 1473
kark1256.csv 2187 0
komn1238.csv 7524 4348
ligh1234.csv 1437 0
lowe1385.csv 1149 0
movi1243.csv 2178 1130
ngal1292.csv 1677 564
nisv1234.csv 1675 1054
nngg1234.csv 6224 1147
nort2641.csv 834 1655
nort2875.csv 2693 878
orko1234.csv 2911 647
pnar1238.csv 930 1175
port1286.csv 1301 719
resi1247.csv 1248 0
ruul1235.csv 2924 1833
sadu1234.csv 1456 0
sanz1248.csv 699 758
savo1255.csv 779 751
sout2856.csv 1548 1173
sout3282.csv 2271 0
stan1290.csv 0 0
sumi1235.csv 2472 565
svan1243.csv 1299 0
taba1259.csv 632 1108
teop1238.csv 1970 1034
texi1237.csv 2818 726
trin1278.csv 1843 1334
tsim1256.csv 1591 0
urum1249.csv 22

### 2. gloss-morph alignment

Simple segmentation of glosses and morphemes leaves several languages with imperfect gloss-morph alignments (per word). This code probabilistically resolves this.

In [54]:
import networkx as nx
from itertools import product

def get_dtw(A, B, P):
    """Align sequences A and B with a dynamic-programming pass and group the aligned items.

    A, B -- indexable sequences; in this notebook the morphemes and gloss parts of one word
    P    -- mapping indexed by an (A item, B item) tuple; every pair that is
            visited must be present (in this notebook the dict returned by
            get_pjoint_MG is passed in)

    Returns a list of (tuple of A items, tuple of B items) pairs, one per
    connected component of the bipartite graph built from the final path.
    """
    # dtw[i,j]: product of the P values along the path chosen to reach cell (i,j)
    dtw = np.ones((len(A),len(B)))
    # path[i][j]: that path, as a list of (i,j) index pairs
    path = [[[] for j in range(len(B))] for i in range(len(A))]
    #
    for i in range(len(A)):
        for j in range(len(B)):
            p = P[A[i],B[j]]
            # extend the predecessor with the largest accumulated value;
            # on ties the diagonal wins, then the cell above, then the cell to the left
            if i > 0 and j > 0 and dtw[i-1,j-1] >= dtw[i,j-1] and dtw[i-1,j-1] >= dtw[i-1,j]:
                path[i][j] = path[i-1][j-1] + [(i,j)]
                dtw[i,j] = dtw[i-1,j-1] * p
            elif i > 0 and (j == 0 or dtw[i-1,j] >= dtw[i,j-1]):
                path[i][j] = path[i-1][j] + [(i,j)]
                dtw[i,j] = dtw[i-1,j] * p
            else:
                # at (0,0) the index j-1 is -1, i.e. the last column of row 0, which
                # still holds its initial values (empty path, 1.0) at that point
                path[i][j] = path[i][j-1] + [(i,j)]
                dtw[i,j] = dtw[i,j-1] * p
            # debug: print(i,j,dtw[i,j],path[i][j])
    # nodes are (0, index into A) and (1, index into B); one edge per step of the final path
    G = nx.Graph()
    for i,j in path[-1][-1]:
        G.add_edge((0,i),(1,j))
    # each component becomes one group, with its A items and B items in index order
    # NOTE(review): sorted() is applied to sets here, and `<` on sets is the
    # proper-subset test rather than a total order, so the resulting order may
    # simply be the order in which networkx yields the components -- confirm
    clusters = [(tuple(map(lambda x : A[x[1]], sorted(filter(lambda x : x[0] == 0, c), key = lambda x : x[1]))),
                 tuple(map(lambda x : B[x[1]], sorted(filter(lambda x : x[0] == 1, c), key = lambda x : x[1]))))
                for c in sorted(nx.connected_components(G))]
    return clusters

def get_pjoint_MG(corpus, doc):
    """Score every morpheme / gloss-part pair that co-occurs within a word of `doc`.

    For each word, every morpheme in 'M' is paired with every part that
    split_g yields for the glosses in 'G', and the pair count is incremented.
    The score of a pair (m, g) is
        count(m, g) / total count of g  *  count(m, g) / total count of m.

    corpus -- nested mapping corpus[doc][file][line]['gloss'] -> list of word dicts
    doc    -- key of the document to process
    Returns a dict {(m, g): score} containing exactly the pairs that were seen.
    """
    MG,GM = defaultdict(Counter),defaultdict(Counter)
    for f in corpus[doc]:
        for l in corpus[doc][f]:
            for word in corpus[doc][f][l]['gloss']:
                morphs = word['M']
                gloss_parts = [part for gloss in word['G'] for part in split_g(gloss)]
                for m,g in product(morphs, gloss_parts):
                    MG[m][g] += 1
                    GM[g][m] += 1
    # marginal totals are computed once instead of once per pair
    g_totals = {g : sum(m_counts.values()) for g,m_counts in GM.items()}
    m_totals = {m : sum(g_counts.values()) for m,g_counts in MG.items()}
    pjoint = {(m,g) : (v / g_totals[g]) * (MG[m][g] / m_totals[m])
              for g,m_counts in GM.items() for m,v in m_counts.items()}
    return pjoint

def split_g(g):
    """Split one gloss string into its non-empty parts.

    Leading/trailing '-' and '=' are removed, a '/' that touches an uppercase
    letter on either side is turned into '.', and the string is then split on
    the separator characters of the pattern below. Each part is stripped of
    the characters in `edge_chars`; parts that end up empty are dropped.
    """
    edge_chars = '?!/;,.\\*%"\'`'
    # raw strings: the backslashes are regex escapes, not string escapes
    gn = re.sub(r'([A-Z])/(\w)', r'\1.\2', g.strip('-='))
    gn = re.sub(r'(\w)/([A-Z])', r'\1.\2', gn)
    parts = (part.strip(edge_chars) for part in re.split(r'[/=\-._:,()\\\[\]~\<\>+\sː]', gn))
    return [part for part in parts if part != '']

In [55]:
def morph_gloss_alignment(corpus, verbose=1):
    """Rewrite the 'M' and 'G' lists of every word so that they have equal length.

    Per document a pair-score table is built with get_pjoint_MG. For each word
    the morphemes (minus a few placeholder tokens) and the gloss parts (taken
    from 'Gt' when the word has that key, otherwise from 'G') are paired one
    to one when their counts match, grouped by get_dtw when both are non-empty
    but differ in length, and emptied otherwise. Items grouped together are
    joined with '.'.

    corpus  -- nested mapping corpus[doc][file][line]['gloss']; modified in place
    verbose -- > 0 prints a header per document, > 1 also prints the first few get_dtw cases
    Returns the same corpus object.
    """
    placeholders = {'<p:>','****', '***'}
    for f in corpus:
        if verbose > 0:
            print('='*9,f.upper())
        P = get_pjoint_MG(corpus, f)
        n_dtw = 0
        for fi in corpus[f]:
            for li in corpus[f][fi]:
                for word in corpus[f][fi][li]['gloss']:
                    M = [m for m in word['M'] if m not in placeholders]
                    raw_glosses = word['Gt'] if 'Gt' in word else word['G']
                    G = [part for gloss in raw_glosses for part in split_g(gloss)]
                    #
                    if len(M) == len(G):
                        MGa = [((a,),(b,)) for a,b in zip(M,G)]
                    elif len(G) > 0 and len(M) > 0:
                        MGa = get_dtw(M,G,P)
                        n_dtw += 1
                        if verbose > 1 and n_dtw < 10:
                            print(f[:4], '\t', M, '\n\t', G, '\n\t', ['.'.join(m) for m,g in MGa], ['.'.join(g) for m,g in MGa])
                    else:
                        MGa = []
                    # `word` is the dict stored in the corpus, so this updates the corpus itself
                    word['M'] = ['.'.join(m) for m,g in MGa]
                    word['G'] = ['.'.join(g) for m,g in MGa]
    return corpus

# Run the alignment over every document without progress output (verbose = 0).
corpus = morph_gloss_alignment(corpus, verbose = 0)

### 3. add English translations for non-English free translations and glosses

Free Translations in languages other than English are detected and translated, with Google Translate, into English.

In [56]:
def detect_ft_language(corpus, f, fi, verbose=False):
    """Find the most frequently detected language among the free translations of one file.

    corpus  -- nested mapping corpus[f][fi][line]['ft']
    f, fi   -- document key and file key
    verbose -- when true, print the three most common detected languages
    Returns (language code, share of the detected lines with that language),
    or (None, 0) when no line could be classified.
    """
    ft_lg = Counter()
    for li in corpus[f][fi]:
        ft = corpus[f][fi][li]['ft']
        # best effort: a line that cannot be classified is skipped, but
        # KeyboardInterrupt / SystemExit are no longer swallowed
        try:
            detected = detect(ft)
        except Exception:
            continue
        ft_lg[detected] += 1
    if verbose: print(ft_lg.most_common(3))
    if len(ft_lg) == 0: return None, 0
    total = sum(ft_lg.values())
    top_lg, certainty = next(((lg,c) for lg,c in ft_lg.most_common() if lg is not None),
                             (None,total))
    return top_lg, certainty/total

def _translate_ft_batch(corpus, doc, f, keys, text, src_lg, verbose):
    """Translate one newline-separated batch of free translations and store the results."""
    if verbose > 0:
        print('  translating', doc, f, src_lg, '\n', len(text), text[:80].replace('\n', ' | '))
    tr = translator.translate(text, 'en', src_lg).text
    # results are matched to lines by position, so a differing line count means misalignment
    n_received = len(tr.rstrip('\n').split('\n'))
    if n_received != len(keys):
        print('  WARNING: sent', len(keys), 'lines but received', n_received, 'for', doc, f)
    for tri, lx in zip(tr.split('\n'), keys):
        corpus[doc][f][lx]['ft'] = tri
    # short pause between requests
    time.sleep(0.5)

def add_ft_translation(corpus, verbose = 1, max_chars = 5000):
    """Replace non-English free translations by their English machine translation.

    For each file the dominant language is taken from detect_ft_language;
    files detected as English are left alone. The remaining free translations
    are sent to the translator in newline-separated batches of at most
    `max_chars` characters and written back line by line.

    corpus    -- nested mapping corpus[doc][file][line]['ft']; modified in place
    verbose   -- > 0 prints the document names and one line per translated batch
    max_chars -- upper bound on the size of one request
    Returns the same corpus object.
    """
    for doc in corpus:
        if verbose > 0: print(doc)
        for f in corpus[doc]:
            top_lg, certainty = detect_ft_language(corpus, doc, f, False)
            if top_lg == 'en': continue
            # NOTE(review): 'auto' is assumed to be the translator's own
            # language-detection setting; it is only used when nothing was detected
            src_lg = top_lg if top_lg is not None else 'auto'
            ft_buffer, ft_keys = '', []
            for l in corpus[doc][f]:
                ft = corpus[doc][f][l]['ft']
                # flush before the buffer would exceed the limit, but never send an empty batch
                if ft_keys and len(ft_buffer) + len(ft) + 1 > max_chars:
                    _translate_ft_batch(corpus, doc, f, ft_keys, ft_buffer, src_lg, verbose)
                    ft_buffer, ft_keys = '', []
                ft_buffer += ft + '\n'
                ft_keys.append(l)
            # translate whatever is left for this file
            if ft_keys:
                _translate_ft_batch(corpus, doc, f, ft_keys, ft_buffer, src_lg, verbose)
    return corpus

# Translate the non-English free translations of the whole corpus (verbose = 1).
corpus = add_ft_translation(corpus, verbose = 1)

anal1239.csv
apah1238.csv
  translating apah1238.csv doreco_apah1238_interview_IP_Ware id 
 4965 saya sedang bertanya kepada Kristian | apakah saya harus mengulang dari apa yang s
  translating apah1238.csv doreco_apah1238_interview_IP_Ware id 
 4989 karena ada salah satu mereka yang mengganggu dia | kami turun ke Kulet | mereka dapa
  translating apah1238.csv doreco_apah1238_interview_IP_Ware id 
 4995 ya, mereka sedang bawa itu | mereka masak makanan sambil menanti mereka turun | bapa
  translating apah1238.csv doreco_apah1238_interview_IP_Ware id 
 4976 mereka di sana juga | karena kami dengar dari radio | bapak Rot yang memberitahukann
  translating apah1238.csv doreco_apah1238_interview_IP_Ware id 
 4968 karena Awanon sudah duluan ke sana | di sana mereka sudah mendengarnya | karena mere
  translating apah1238.csv doreco_apah1238_interview_IP_Ware id 
 4994 mereka berjaga di bawah, dari mana mereka akan turun ke sini bersama-sama | tempat
  translating apah1238.csv doreco_apah1238

    FT: y se fue a colgarse en un palo and went to hang on a stick ab
    FT: y cuando amaneció se despertó  and when dawn woke up the pucu
    FT: ¿qué voy a hacer ahora, cómo h What am I going to do now, how
    FT: y otra vez comenzó a llorar di and again he began to cry sayi
    FT: al decir así levanta la mirada Saying this, he looks up and s
    FT: sí, el útáácajɨ yes, the útáácajɨ
    FT: y dijo: éste me chupo anoche y and said: this one sucked me l
    FT: al momento le tiró un pucunazo at the moment he threw a pucun
    FT: y éste lo recibió por el hueco and he received it through the
    FT: después se fue otra vez con fu then he left again strong to w
    FT: caminaba, caminaba sin direcci He walked, he walked without d
    FT: unos golpeaban el manguaré tat Some beat the manguaré tatíjta
    FT: vaya dijo, ellos están en una  wow he said, they are at a par
    FT: después oye cantos típicos then listen to typical songs
    FT: sí Yeah
    FT: y decían: la mejilla del pate,

    FT: luego de decirle así emprendie After telling him so, they too
    FT: luego se van volando otra vez  then they go flying up and dow
    FT: como pago de este favor que ll as payment for this favor that
    FT: ¿y qué haré con esta hacha? le and what shall I do with this 
    FT: y éste le dijo: (con esta hach and he told him: (with this ax
    FT: muy bien gracias very well thank you
    FT: luego el pucunero le arregló l then the pucunero fixed the cr
    FT: y cuando veas a la gente camin and when you see people walkin
    FT: este era el pago del favor this was the payment of the fa
    FT: después de haberlo pagado se f After having paid for it, he w
    FT: pero en ese momento su marido  but at that moment her husband
    FT: sí Yeah
    FT: al verlo sola le llevó robándo seeing him alone took him stea
    FT: sí Yeah
    FT: pues, él era loco well he was crazy
    FT: robándole a ella se fueron y p stealing from her, they went a
    FT: llegando a un lugar el pucuner Arri

    FT: en seguida sacando su macana q immediately taking out his clu
    FT: listo, dijo, para nada por qué Done, he said, not at all why 
    FT: mientras ellos decían a su hij while they said to their son: 
    FT: después de comer, comenzó a ta After eating, he began to carv
    FT: y de este hueso de aquí él inv And from this bone here he inv
    FT: mientras que de los huesos de  while from the bones of his le
    FT: y toda la caja torácica que er and the entire ribcage, which 
    FT: mientras que su cuero, después while its skin, after eating, 
    FT: de esta forma dió origen a los in this way he gave rise to fi
    FT: y así como dice su cuero se tr and just as its skin says, it 
    FT: ellos dijeron; listo, él, el e they said; That's it, he, the 
    FT: las generaciones futuras en tu future generations will carry 
    FT: después él cuando se fué le di Then, when he left, he told hi
    FT: por esta razón cuando eres una For this reason, when you are 
    FT: por esta raz

    FT: Por eso recién ahora, envejeci That is why only now, getting 
    FT: No se hace de acuerdo a la med It is not done according to th
    FT: Más acacito de lo que se jala, More acacito than what is pull
    FT: Así la viga en el medio, está  Thus the beam in the middle is
    FT: Y así poco a poco voy sabiendo And so little by little I know
    FT: Ahora yo creo que estoy hacien Now I think I'm doing much bet
    FT: Pero no, pensamos sí lo he ach But no, we think yes I have sh
    FT: Yo pienso que está del mismo t I think it is the same size.
    FT: Y ahora a mí, falta arreglarme And now I need to fix myself.
    FT: Ahora está todavía un poco des Now it's still a bit messy.
    FT: Allá todavía cuando hago mi ta There, when I make my stage, I
    FT: Con eso ya se va a quedar la c With that, the house will be l
    FT: De esa forma nosotros hemos ar In this way we have arranged t
    FT: Y en ese momento también tu es And at that moment you are als
    FT: Y así mismo tambié

    FT: No, dijo él, acaso aquí vamos  No, he said, maybe we're going
    FT: A ti te voy a llevar muy lejos I'm going to take you very far
    FT: No, seguía diciendo ella. No, she kept saying.
    FT: Pero le insistió a ella, hasta But he insisted on her, until 
    FT: Ella ya, pues, le ha creído. She already, then, has believe
    FT: Entonces así como tu dices, ya So, just as you say, well, don
    FT: No, dice él. No, he says.
    FT: A ella abrazando de su misma c To her hugging her own cocamer
    FT: De allá fueron a aparecer. From there they went to appear
    FT: Por el corazón de huacrapona f For the heart of huacrapona th
    FT: Otra vez por el corazón de un  Again through the heart of a t
    FT: Otra vez debajo de la tierra,  Under the ground again, under 
    FT: De allá se fueron a aparecer e From there these two went to a
    FT: Dentro de palo, jm, muy lejos. Within stick, jm, very far.
    FT: Ya está, le dijo, hasta aquí e That's it, he told her, he's n
    FT: El

    FT: Y así de verdad aunque estabe  And really, even though he was
    FT: Y ya se ha podrido su barriga, And his belly has already rott
    FT: Ya le mató already killed him
    FT: Nada mas el pensaba en matarlo Nothing else he thought of kil
    FT: El decía, hoja de barbaso yo e He used to say, barbasco leaf 
    FT: Yo estoy hechado el dice lo qu I am cast, he says what he wil
    FT: Para eso no mas el hacía mɨ́ɨ́ For that, he did mɨ́ɨ́vajɨ, fo
    FT: Después de eso ya pues el, a s After that, he, his wife from 
    FT: Despues de que murió. After he died.
    FT: Despues de que mató. After he killed
    FT: Y después de todo, recién, ell And after all, recently, their
    FT: Por eso se rabiaron contra el, That's why they raged against 
    FT: Un dia ellos, cuando le rodear One day they, when their house
    FT: Dizque era su escopeta, tarde  They say it was his shotgun, w
    FT: Cuando estaba este todo parado When everything was stopped we
    FT: Así mismo el hablaba lik

    FT: Las comidas también les nombró The meals also named them as i
    FT: Sí, su agua también él como po Yes, his water also he as itch
    FT: A él dice, al espíritu trabaja He says to him, to the hardwor
    FT: Sólo ya, para que cuide la tie Only now, to take care of the 
    FT: Ahí pues esas cositas creó él. That's where he created those 
    FT: Esto lo que llamamos enfermeda This is what we call disease.
    FT: Ahí pues es esto, lo que es pe There, then, is this, what is 
    FT: Esto es un mal peligroso. This is a dangerous evil.
    FT: En el viento anda. In the wind it walks
    FT: Unas veces, como dijo él, entr Sometimes, as he said, it goes
    FT: Estando adentro, a veces se co Being inside, sometimes it eat
    FT: El polvo de su manteca, el pol The powder of his lard, the po
    FT: Y lo principal, estas cositas, And the main thing, these litt
    FT: Y esto es parte de la alimenta And this is part of the diet, 
    FT: Y eso nos hace daño and that hurts us
    FT:

    FT: Y los padres sin decir, ni hac And the parents without saying
    FT: Y vivían tranquilos y él suegr And they lived in peace and hi
    FT: Comían, comían y comían, ya se They ate, ate and ate, their c
    FT: Ya está, ya no ya demasiado es That's it, not too much anymor
    FT: Ya esta, mujer, le dice, cómo  Now here, woman, he tells her,
    FT: Entonces, muy bien, de lo que  Then, very well, from what was
    FT: Y se fue, llegó, se levantó, l And he left, arrived, got up, 
    FT: Y su hija, dentro de la hamaca And her daughter, inside her c
    FT: Y debajo de él, su hija. And below him, his daughter.
    FT: ¡qué! el dijo. that! he said he.
    FT: Ya, padre, le dice, has venido Now, father, he tells him, you
    FT: Sí. Yeah.
    FT: Suegro, has venido. Father-in-law, you have come.
    FT: Sí ááh. Yes ááh.
    FT: Siéntate ahí, siéntate ahí. Sit there, sit there.
    FT: Y él dice pensaba que su hija  And he says he thought his dau
    FT: Cuando el padre se sentó, el c

    FT: Ahora este pleito recién voy a Now I'm just going to do this 
    FT: No suegro no suegro, te voy a  No father-in-law, no father-in
    FT: Sacando el envase del alimento Taking out his wife's food con
    FT: Toma, le dijo, ahora te voy a  Here, he said, now I'm going t
    FT: Él empezó a tomar, a tomar le  He began to drink, to drink he
    FT: Ahora te haré ver tu sangre re Now I'll make you see your fre
    FT: El intentó terminar, y tomó, t He tried to finish, and took, 
    FT: El antecesor de las charcos, l The ancestor of the puddles, n
    FT: Por eso el charco que está baj That's why the puddle that is 
    FT: Y a ese le nombró. And he named that one.
    FT: Entonces, ya cuando estaba en  Then, when he was in the middl
    FT: Cojdocodo (ganas de vomitar) a Cojdocodo (want to vomit) befo
    FT: Sí, ya está. Yes, that's it.
    FT: Le dió el envase con la sobra. He gave her the container with
    FT: Es vergonzoso, ante tus parien It's embarrassing, in front of
   

    FT: Abuelo no mires, le dijo, aquí Grandpa, don't look, she told 
    FT: Y mientras él cerraba los ojos And while he closed his eyes h
    FT: En aquellas épocas esas casas  In those times those houses we
    FT: Y también cerraron la puerta. And they also closed the door.
    FT: Y cuando lo hubo tirado al sue And when he had thrown it to t
    FT: Y sus huesos andaba buscando s And his bones were looking for
    FT: Y él iba cantando o tal vez ll And he was singing or maybe cr
    FT: Diciendo: mis nietos, vino a d Saying: my grandchildren, he c
    FT: Y les advirtieron: ustedes no  And they warned them: you do n
    FT: ¿A cuantos ya comieron ustedes How many have you already eate
    FT: Solamente comimos callampas, a We only ate mushrooms, grandfa
    FT: Y donde están los huesos, les  And where are the bones, I tol
    FT: No abuelo, le dijeron, como es No grandfather, they told him,
    FT: Entonces otra vez, ya, les dec Then again, already, I told th
    FT: Entonces voy 

    FT: Allí ella la sachaperro se mur There she the sachaperro died.
    FT: El yangunturo la mató. The yangunturo killed her.
    FT: El la mató, le despedazó. He killed her, he tore her to 
    FT: Entonces si corrió la mujer ve So the deer woman ran and, arr
    FT: Así fue que la mujer venado, l So it was that the woman, deer
    FT: Así es la historia de ese asun This is the history of that ma
[('es', 68), ('pt', 2), ('tl', 1)]
   doreco_bora1263_ovehe_1 bora1263.csv
    FT: Había dice una mujer perezosa There was a lazy woman says
    FT: Dice que su marido mandó tumba She says that her husband orde
    FT: no quería preparar cahuana no  I didn't want to prepare cahua
    FT: y ella se fue a cazar, se fue  and she went hunting, she went
    FT: y algunos de los que se iban a and some of those who went to 
    FT: Pero mientras tu marido está h But while your husband is layi
    FT: Por eso ella les contestó: seg That's why she answered them: 
    FT: Ella estaba confiada en su m

    FT: mamá, mamá pero yo te dejé par mom, mom but I left you to cul
    FT: Que pues hijo, qué hombre quiz So son, what man perhaps tells
    FT: Por lo que me dice le doy vuel From what he tells me, his hea
    FT: Y por eso me desvelo. And that's why I stay awake.
    FT: Que sería, es el papaso de sur What would it be, is the papas
    FT: Ahora cuando viene darás vuelt Now when it comes you will tur
    FT: Y así lo hizo ella, dándole vu And so she did, turning around
    FT: Ahora cuando ella durmió bien  Now when she slept well that n
    FT: Trajo muchas aves. He brought many birds.
    FT: Y le dijo: abuela cocina mis p And he said: Grandma cooks my 
    FT: Entonces ella los cocino. So she cooked them.
    FT: Échale bastante ají. Add enough chili.
    FT: Entonces ella puso ají y picab So she put chili pepper and it
    FT: Ella lo bajo del fuego. She lowered it from the fire.
    FT: Ya, hijo, ven a comer. Now, son, come eat.
    FT: Ella pensó que comería los dos She thou

    FT: Que seria es el venado de los  How serious is the deer of the
    FT: Ahora torceré para ti fibra de Now I will twist chambira fibe
    FT: La fibra de chambira de nuestr I will make the chambira fiber
    FT: Y lo traerás. And you will bring it.
    FT: Cuando tienes que pasar esta n When you have to pass this our
    FT: Abuelo, abuelo, cierra duro lo Grandfather, grandfather, clos
    FT: Aquí le dirás cuando te agache Here you will tell him when yo
    FT: así se fue el llevando la cuer so he left carrying the rope.
    FT: Yendo lo ató cuando el hubo ve Going tied him when he had com
    FT: Y mientras lo cargaba, el dorm And while carrying him, he sle
    FT: El, entonces lo ato ahí con la He, then I tie it there with t
    FT: Y después cargándolo vino, vin And after loading it, he came,
    FT: Y le dijo: abuelo, nuestro cam And he told him: grandfather, 
    FT: Entonces el cerro quizás muy d Then the hill maybe very hard 
    FT: Y le vino a botar en media cas And he 

    FT: Pues, es nuestro creador, es e Well, he is our creator, he is
    FT: Espero que a cuál abuelo, él l I hope which grandfather, he h
    FT: Espero que nos indica I hope it tells us
    FT: Esta nuestra ofrenda de mitaya This is our mitayar offering, 
    FT: Así mismo también para que él  Likewise also for him to call 
    FT: De nuestra cochita, él para qu Of our puss, he to come down.
    FT: Nosotros, los nietos de él, la We, his grandsons, his grandda
    FT: Nosotros comiendo recién los n We just eating his grandsons, 
    FT: Y de esa mitayada, espero que  And from that mitayada, I hope
    FT: No nos va a pasar ningún accid No accident will happen to us.
    FT: Ese abuelo, también de este la That grandfather, also on this
    FT: Con su calor ese veneno de raí With its heat that barbasco ro
    FT: Le hace visitar, la casa de ag He makes him visit the water h
    FT: Con esa calor, esos peces que  With that heat, those fish tha
    FT: que haga sol de sequía para no mak

    FT: Dice que él cuando él colaba s He says that when he strained 
    FT: Y a él también se lo esconde c And he also hides it when he d
    FT: Estos fueron los que hicieron  These were the ones that made 
    FT: Y a todos ellos se los esconde And all of them are hidden
    FT: Cuando todos estos están en me When all these are in our mids
    FT: Por eso se debe tener la estra That is why those of us who wo
    FT: “áh, dice, “de esta forma será “ah, he says, “this is how it 
    FT: el dijo dentro de su corazón,  he said inside his heart, it s
    FT: A todos ellos se los esconde They are all hidden
    FT: Después vino el hijo de la lag Then came the son of the sky s
    FT: “abuelo”, le dice "Grandpa", he says
    FT: “hijo, ¿me estás visitando?” “son, are you visiting me?”
    FT: “sí, ¿es este tu coca abuelo?” "Yes, is this your coca grandp
    FT: “si, nieto”, le dice, “no toda "Yes, grandson," he says, "I h
    FT: “yo voy a tostar para ti abuel "I'm going to toast for you,

    FT: Queriamos tocar o rabo do tatú We wanted to touch the armadil
    FT: Aí eles sempre brincavam e imi There they always played and i
    FT: Aí a criança inteligente estav Then the smart child was learn
    FT: De lá mudaram para uma outra a From there they moved to anoth
    FT: Aí vivia a minha avó. There my grandmother lived.
    FT: Meu avô materno e minha avó vi My maternal grandfather and my
    FT: Aí vivia o irmão mais velho da My mother's older brother live
    FT: O nome da aldeia era Juruparí. The name of the village was Ju
    FT: Fez o roçado em Juduparí. He made the swidden in Judupar
    FT: Ele fez o roçado. eles foram. He did the scraping. they went
    FT: Aí minha avó aí mesmo no braço Then my grandmother right ther
    FT: Ela nos abandonou, morreu. She abandoned us, died.
    FT: Ela morreu, (eles) não queriam She died, (they) didn't want t
    FT: Na beira onde ela morreu (o lu On the brink where she died (t
    FT: Aí atravessamos o igarapé cham There we c

In [57]:
def add_gl_translation(corpus):
    """Translate the glosses of the documents whose glosses are not in English.

    Only documents whose first eight characters (upper-cased) appear in
    `gloss_lg` are processed. Their distinct glosses are sent to the
    translator in newline-separated batches, most frequent first, and every
    'G' entry is replaced by its translation. A gloss without a translation
    is kept unchanged.

    corpus -- nested mapping corpus[doc][file][line]['gloss']; modified in place
    Returns the same corpus object.
    """
    gloss_lg = {'EVEN1259':'ru', 'NISV1234':'fr', 'TEXI1237':'es'}
    max_chars = 5000  # upper bound on the size of one request
    for f in corpus:
        code = f[:8].upper()
        if code not in gloss_lg: continue
        glossct = Counter(gi for fi in corpus[f] for li in corpus[f][fi]
                          for g in corpus[f][fi][li]['gloss'] for gi in g['G'])
        print(f, len(glossct), glossct.most_common(10))
        pending = [w for w,c in glossct.most_common()]
        dic = {}
        start = 0
        while start < len(pending):
            # grow the batch while it stays under the limit; always take at
            # least one gloss so that an oversized entry cannot stall the loop
            end, size = start, 0
            while end < len(pending) and (end == start or size + len(pending[end]) + 1 < max_chars):
                size += len(pending[end]) + 1
                end += 1
            batch = pending[start:end]
            start = end
            print(len(pending) - start)
            # no trailing newline is sent, so the reply should have one line per gloss
            translated = translator.translate('\n'.join(batch), 'en', gloss_lg[code]).text.split('\n')
            if len(translated) != len(batch):
                print('  WARNING: sent', len(batch), 'glosses but received', len(translated), 'lines for', f)
            # pairing per batch keeps a mismatch from shifting the following batches
            dic.update(zip(batch, translated))
        for fi in corpus[f]:
            for li in corpus[f][fi]:
                for g in corpus[f][fi][li]['gloss']:
                    g['G'] = [dic.get(gx, gx) for gx in g['G']]
    return corpus
        
# Translate the glosses of the documents listed inside add_gl_translation.
corpus = add_gl_translation(corpus)

even1259.csv 1727 [('PL', 1062), ('3SG', 830), ('NFUT', 734), ('IPFV', 685), ('1SG', 651), ('ACC', 591), ('PST', 424), ('быть', 390), ('PANT', 390), ('FOC', 372)]
1063
527
13
0
nisv1234.csv 1107 [('3SG', 1499), ('3PL', 566), ('INTR', 467), ('aller', 436), ('COO.VB', 398), ('DET', 361), ('1SG', 263), ('dire', 254), ('NOM', 243), ('ASP.A', 243)]
471
0
texi1237.csv 742 [('3.A', 1649), ('decir', 1276), ('PFV', 829), ('3.3', 498), ('FUT', 426), ('1.3', 334), ('no', 317), ('3.PL', 294), ('APL', 278), ('ya', 245)]
0


## clean FT

needs to happen after translation

In [58]:
def clean_ft(ft, doc):
    """Normalise one free translation.

    ft  -- the free-translation string
    doc -- document name; only its first four characters are inspected
    Returns the cleaned string.

    All patterns are raw strings so that the backslashes reach the regex
    engine unchanged instead of being read as (invalid) string escapes.
    """
    # both are inserted into character classes below
    punct = r""".,;:'"!\?"""
    brackets = r'\[\]\(\)\<\>'
    # tokens listed in REP are replaced as a whole; all others lose '-' and '…' at their edges
    ft = ' '.join(REP.get(w, w.strip('-…')) for w in ft.split(' '))
    # split hyphenated and underscore-linked words
    ft = re.sub(r'(\w)[-_](\w)', r'\1 \2', ft)
    # normalize quote chars
    ft = re.sub("(''|``|’’|“|”)", '"', ft)
    # normalize apostrophes
    ft = re.sub('[`‘’´]', "'", ft)
    # normalize colons
    ft = re.sub('ː', ':', ft)
    # remove comments and other parentheticals [that aren't textual completions]
    ft = re.sub(r'\(' + parenthetical_content + r'.*?\)', '', ft)
    ft = re.sub(r'\[' + parenthetical_content + r'.*?\]', '', ft)
    ft = re.sub(r'\<' + parenthetical_content + r'.*?\>', '', ft)
    # one space before every opening bracket, one after every closing bracket
    ft = re.sub(r'([\[\(\<])\s*', r' \1', ft)
    ft = re.sub(r'\s*([\]\)\>])', r'\1 ', ft)
    ft = re.sub('â', '', ft)
    # no whitespace in front of punctuation
    ft = re.sub(r'\s+([%s])' % punct, r'\1', ft)
    # runs of one, two or three punctuation/bracket chars between word chars become separate tokens
    ft = re.sub(r'(\w)([%s])(\w)' % (punct+brackets), r'\1 \2 \3', ft)
    ft = re.sub(r'(\w)([%s])([%s])(\w)' % (punct+brackets,punct+brackets), r'\1 \2 \3 \4', ft)
    ft = re.sub(r'(\w)([%s])([%s])([%s])(\w)' % (punct+brackets,punct+brackets,punct+brackets), r'\1 \2 \3 \4 \5', ft)
    # drop the remaining angle brackets, together with a directly following '?'
    ft = re.sub(r'[<>]\??', '', ft)
    # remove duplicate spaces
    ft = re.sub(r'\s+', ' ', ft)
    ft = re.sub('#', '', ft)
    ft = ft.strip(' _/')
    # re-attach the contraction endings that the token splitting above separated
    # NOTE(review): the pattern has no trailing word boundary, so " ' s" at the
    # start of a longer word is joined as well -- confirm this is intended
    ft = re.sub(" ' (ve|t|m|s|d|ll|re|clock)", r"'\1", ft)
    # 'bain' documents: drop everything up to and including each '|'
    if doc[:4] == 'bain': ft = re.sub(r'.*?\|', '', ft)
    return ft

In [59]:
# Clean the free translation of every line in place; `elt` is the dict stored
# in the corpus, so assigning to it updates the corpus entry.
for doc, files in corpus.items():
    for lines in files.values():
        for elt in lines.values():
            elt['ft'] = clean_ft(elt['ft'], doc)

## finding further errors

### 4. add spacy data

In [60]:
def get_features(w):
    """Copy the attributes of one parsed token into a plain dict.

    w -- token object exposing text, i, idx, lemma_, pos_, tag_, head.i and dep_
    """
    return dict((
        ('text', w.text),
        ('i', w.i),
        ('idx', w.idx),
        ('lemma', w.lemma_),
        ('pos', w.pos_),
        ('tag', w.tag_),
        ('head.i', w.head.i),
        ('dep', w.dep_),
    ))

# Parse every cleaned free translation with the spaCy pipeline and store the
# per-token feature dicts under the 'spc' key of the line entry.
for doc, files in corpus.items():
    print(doc)
    for lines in files.values():
        for e in lines.values():
            e['spc'] = [get_features(w) for w in nlp(e['ft'])]

anal1239.csv
apah1238.csv
arap1274.csv
bain1259.csv
beja1238.csv
bora1263.csv
cabe1245.csv
cash1254.csv
dolg1241.csv
even1259.csv
goem1240.csv
goro1270.csv
hoch1243.csv
jeha1242.csv
jeju1234.csv
kaka1265.csv
kama1351.csv
kark1256.csv
komn1238.csv
ligh1234.csv
lowe1385.csv
movi1243.csv
ngal1292.csv
nisv1234.csv
nngg1234.csv
nort2641.csv
nort2875.csv
orko1234.csv
pnar1238.csv
port1286.csv
resi1247.csv
ruul1235.csv
sadu1234.csv
sanz1248.csv
savo1255.csv
sout2856.csv
sout3282.csv
stan1290.csv
sumi1235.csv
svan1243.csv
taba1259.csv
teop1238.csv
texi1237.csv
trin1278.csv
tsim1256.csv
urum1249.csv
vera1241.csv
warl1254.csv
yong1270.csv
yuca1254.csv
yura1255.csv


In [62]:
# Write the prepared corpus to disk; the `with` block closes the file so that
# the data is fully flushed before the notebook moves on.
with open('corpus_doreco.p', 'wb') as out_file:
    pickle.dump(corpus, out_file)