Let's try to set up a simple alignment between the orthography of the gold standard Bribri data we have and an external dataset we found. It's from https://github.com/rolandocoto/bribri-coling2020

In [1]:
import pandas as pd

In [102]:
coling_data = pd.read_csv("../data/extra_monolingual/spa-bribri-coling2020.txt", sep=';')

In [103]:
coling_data.iloc[2]

Order                                                                  1224
Bribri (Nasal as line)                            pà mâ̱tk wö́kir mâ̱tk
Training sentence                                     pà mâxtk wóqkir mâxtk
Spanish                                            lapa roja de cabeza roja
Source                    CorpusSofia B14h54m43s24oct2015.html Sobre la ...
Dialect                                                              amubri
Name: 2, dtype: object

In [104]:
coling_data['Dialect'].unique()

array(['amubri', 'salitre', 'coroma'], dtype=object)

In [105]:
# we only care about the training text
coling_data_bribri = coling_data['Training sentence']

In [25]:
all_text_coling = " ".join(coling_data_bribri)

In [14]:
# Load the gold standard data

gold_data = pd.read_csv("../data/yoyodyne/bribri-train.tsv", sep='\t')

In [17]:
all_text_gold = " ".join(gold_data.iloc[:,0] + gold_data.iloc[:,1])

In [21]:
gold_character_set = set([x for x in all_text_gold.lower()])

In [22]:
gold_character_set

{' ',
 '!',
 "'",
 'a',
 'b',
 'c',
 'd',
 'e',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'r',
 's',
 't',
 'u',
 'w',
 'y',
 'à',
 'á',
 'â',
 'è',
 'é',
 'ë',
 'ì',
 'í',
 'ñ',
 'ò',
 'ó',
 'ö',
 'ù',
 'ú',
 '̀',
 '́',
 '̠'}

In [26]:
character_set_coling = set([x for x in all_text_coling.lower()])

In [84]:
only_in_coling = character_set_coling - gold_character_set
only_in_gold = gold_character_set - character_set_coling

In [85]:
only_in_gold

{'́', '̠'}

In [29]:
only_in_coling

{'"',
 '*',
 ',',
 '.',
 '0',
 ':',
 '?',
 '[',
 ']',
 'g',
 'q',
 'v',
 'x',
 'z',
 'ã',
 'ê',
 'î',
 'ô',
 'õ',
 'û',
 'ĩ',
 'ũ',
 '̈',
 '̱',
 'ẽ'}

In [30]:
only_in_gold =  gold_character_set - character_set_coling

In [31]:
only_in_gold

{'́', '̠'}

In [32]:
def build_ngrams(full_text, n=3):
    n_grams = []
    split_text = full_text.split()
    for word in split_text:
        if len(word) >= n:
            n_grams.extend([word[i:i+n] for i in range(len(word) - n + 1)])
    
    return n_grams
        

In [33]:
all_text_coling = all_text_coling.lower()
all_text_gold = all_text_gold.lower()

In [34]:
coling_ngrams = build_ngrams(all_text_coling)

In [35]:
gold_ngrams = build_ngrams(all_text_gold)

In [36]:
from collections import Counter

In [38]:
gold_count_ngrams = Counter(gold_ngrams)
coling_count_ngrams = Counter(coling_ngrams)

In [51]:
most_common_gold = [x[0] for x in gold_count_ngrams.most_common(50)]
most_common_coling = [x[0] for x in coling_count_ngrams.most_common(1000)]

In [52]:
only_gold = set(most_common_gold) - set(most_common_coling)

In [53]:
len(only_gold)

25

In [54]:
only_gold

{"'be",
 "'rë",
 "'ye",
 "bi'",
 "e̠'",
 'ku̠',
 'kàs',
 'kë̀',
 "kö'",
 'kö̀',
 'kö́',
 'mi̠',
 'ne̠',
 "o't",
 "po'",
 'ros',
 'skö',
 'sík',
 'ta̠',
 'tó̠',
 'wa̠',
 'àrr',
 'è̠k',
 'ñè̠',
 'ö́k'}

In [55]:
all_text_gold = all_text_gold.replace(".","")
all_text_coling = all_text_coling.replace(".","")

In [80]:
ye_words = list(set([word for word in all_text_gold.split(" ") if 'kím' in word]))

In [81]:
ye_words

['kímu̠k', "kíme̠'i", "kíme̠'ye'", 'kímè̠ke']

In [82]:
shka_words = [word for word in all_text_coling.split(" ") if "kím" in word]

In [83]:
sorted(list(set(shka_words)))

['kímèxkẽ', 'kímũk', 'kímũkdak']

In [90]:
def contains_foreign_character(text):
    for char in text:
        if char not in gold_character_set:
            return True
        
    return False

def not_foreign(text):
    return not contains_foreign_character(text)

In [106]:
import regex
def remove_non_alphabetic(row):
    text = row['Training sentence']
    row['Training sentence'] =  regex.sub(r'\P{L}+', ' ', text)
    return row

In [107]:
coling_data = coling_data.apply(remove_non_alphabetic, axis=1)

In [116]:
coling_data_train = coling_data['Training sentence']

In [122]:
all_coling_train_words = " ".join(coling_data_train).lower().split(" ")

In [127]:
words_of_interest = list(set([word for word in all_coling_train_words if 'ñé' in word]))
words_of_interest

['ñéx', 'ñéxse', 'ñéxjkë', 'ñéxes', 'ñéxe', 'ñéxẽ', 'nãñéxwe']

In [None]:
# ã > a̠?

In [112]:
coling_data_filtered = coling_data[coling_data['Bribri (Nasal as line)'].apply(not_foreign)]

In [113]:
coling_data_filtered['Dialect'].unique()

array(['amubri', 'salitre', 'coroma'], dtype=object)

In [114]:
len(coling_data_filtered)

54

In [111]:
coling_data_filtered

Unnamed: 0,Order,Bribri (Nasal as line),Training sentence,Spanish,Source,Dialect
43,1265,"chkè kuá, dikó kuá, tsiru' kuá, kápi kua...",chkè kuá dikó kuá tsiru kuá kápi kuá chkè kuá,"plantas comestibles , pejibaye , cacao , café...",CorpusSofia B15h53m37s18jul2014.html Sobre el ...,salitre
61,1283,i' dör sa' ùsulë,i dör sa ùsulë,esta es nuestra casa cónica,CorpusSofia B17h34m15s06apr2012.html Sobre el ...,amubri
78,1300,e' dör,e dör,eso es,CorpusSofia B17h34m15s06apr2012.html Sobre el ...,amubri
140,1362,"té kókó ""entonces ya""",té kókó entonces ya,"cortado "" kó kó "" entonces ya",CorpusSofia B10h27m47s25oct2015.html Sobre la ...,amubri
157,1379,e's,e s,así,CorpusSofia B10h27m47s25oct2015.html Sobre la ...,amubri
...,...,...,...,...,...,...
1623,2845,"e', e' tsikír",e e tsikír,"eso , eso se está cosechando",CorpusSofia B12h41m01s14jul2016.html Conversac...,amubri
1632,2854,e's,e s,así,CorpusSofia B12h41m01s14jul2016.html Conversac...,amubri
1636,2858,wé,wé,molido,CorpusSofia B22h25m38s07apr2012.html Conversac...,amubri
1650,2872,"ya yé, yétke",ya yé yétke,"ya se toma , ya se toma",CorpusSofia B22h25m38s07apr2012.html Conversac...,amubri


In [129]:
# what about the other shared task data?

bribri_st2_data = []
with open("../data/extra_monolingual/train.bzd", encoding='utf-8') as infile:
    bribri_st2_data = infile.readlines()

In [130]:
bribri_st2_data[:10]

["Ye' shkèxnã bua'ë.\n",
 "Ye' dör bikâkala.\n",
 "Ìs a' shkèxnã?\n",
 "Sa' shkèxnã bua'ë.\n",
 "Ìs be' shkèxnã?\n",
 "Ye' shkèxnã bua'ë.\n",
 "Ye' dör Juan.\n",
 "Íxmã be' kie?\n",
 "Ye' kie Juan.\n",
 'Àxã!\n']

In [131]:
len(bribri_st2_data)

7508

In [137]:
# Seems pretty reasonable, but let's at least see what might be out-of-domain for the alphabet
bribri_st2_data = [x.strip().lower() for x in bribri_st2_data]
cleaned_bribri_st2 = [x for x in bribri_st2_data if not contains_foreign_character(x)]
len(cleaned_bribri_st2)

557

In [135]:
cleaned_bribri_st2[:10]

["ye' ù",
 'aláköl ù',
 'dìwö kúl',
 "yi ya'",
 'ya ù',
 'awá kàl',
 'awá kél',
 'dalì we',
 "stsa'wö we",
 'dakarò we']

In [None]:
# What characters are in this set that aren't present in our gold standard data?

all_bribri_st2 = " ".join(bribri_st2_data)
foreign_chars = [x for x in all_bribri_st2 if x not in gold_ch]