In [1]:
import numpy as np
from collections import defaultdict
import random
import pandas as pd

In [2]:
# Location of the sentence export: tab-separated, read without a header row.
path_to_data = '/home/hamza/3A_projets/data_all/glose/sentences.csv'
# Columns are addressed by position downstream (1 is compared to 'eng', 2 is the text).
# NOTE(review): default CSV quoting is used; if the export contains unbalanced
# double quotes, rows could be merged — confirm against the raw file.
df_sents = pd.read_csv(
    path_to_data,
    header=None,
    sep='\t',
)


In [46]:
## Select the English sentences and draw a fixed-size random subset.
N_SAMPLES = 200000  # subset size, kept small enough to have a trainable model
SAMPLE_SEED = 0     # fixed seed so that re-running the notebook draws the same subset

# Column 1 is compared against the language code, column 2 holds the sentence text.
# Selecting with .loc and a non-inplace rename yields a fresh single-column frame.
df_sent_en = (
    df_sents.loc[df_sents[1] == 'eng', [2]]
    .rename(columns={2: 'sentence'})
)
# Never request more rows than exist: `sample` raises a ValueError otherwise.
df_sent_en = df_sent_en.sample(
    min(N_SAMPLES, len(df_sent_en)),
    random_state=SAMPLE_SEED,
)

Many Tatoeba examples are in fact composed of more than one sentence, so we use spaCy's sentence segmentation to split them. This is appropriate because spaCy performs well when the separators are explicit (periods, colons, etc.). The model that I will implement is mainly intended to separate sentences that do not contain such separators.

In [49]:
import spacy
from tqdm import tqdm
#!python3 -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

AND_KEEP_PROB = 0.15  # share of texts containing 'and' that are also kept unsplit

all_sentences = df_sent_en.sentence.values
sentences = []
# nlp.pipe streams the texts through the pipeline in batches, which is much
# faster than calling nlp(text) once per text and yields the docs in input order.
docs = nlp.pipe(all_sentences)
for text, doc in tqdm(zip(all_sentences, docs), total=len(all_sentences)):
    # We take 15% of the examples that contain 'and' as they are.
    # NOTE(review): such a text is appended again by the segmentation step
    # below, so it appears more than once in `sentences` — confirm this
    # oversampling is intended.
    if 'and' in text.split() and random.random() < AND_KEEP_PROB:
        sentences.append(text)
    # Iterate over the segmented sentences only once.
    spans = list(doc.sents)
    if len(spans) <= 1:
        # Zero or one detected sentence: keep the original text untouched.
        sentences.append(text)
    else:
        sentences.extend(span.text for span in spans)


100%|██████████| 200000/200000 [25:27<00:00, 130.90it/s]


In [50]:
#import pickle
#pickle.dump(sentences,open('data/sentences.pkl','wb'))

# Tokenize 

In [52]:
import nltk
# One row per segmented sentence, with its NLTK word-level tokenisation alongside.
df_sentences = pd.DataFrame({'sentence': sentences})
df_sentences['tokenized'] = df_sentences['sentence'].apply(nltk.word_tokenize)

A period ends the majority of the sentences. We change this by randomly removing periods, question marks and exclamation marks. We also randomly add commas, colons, semicolons and the 'and' connector.

In [53]:
## Build the perturbed sentences.
# Probability of altering the final punctuation mark, per mark.
probs = {'.':0.85, '?':0.7 , '!' : 0.35}
# How an altered ending is replaced when it is not simply removed.
conjunction = {'and':0.3 , ',' :0.3,';':0.2,':':0.2}
REPLACE_PROB = 0.4  # chance that a removed mark is replaced by a connector

# Work on copies of the token lists: the 'and' insertion below mutates the
# *next* sentence, and must not modify the lists stored in df_sentences
# (otherwise re-running this cell would keep prepending 'and').
tokenized_sents = [list(tokens) for tokens in df_sentences['tokenized']]
n = len(tokenized_sents)

# Cumulative thresholds used to pick a replacement from one uniform draw.
and_cut = conjunction['and']
comma_cut = and_cut + conjunction[',']
colon_cut = comma_cut + conjunction[':']

change_sents = []
for j, sent in enumerate(tokenized_sents):
    if not sent:
        # Nothing to perturb, and an empty sentence cannot carry a boundary label.
        continue
    last = sent[-1]
    if last not in probs:
        # Sentence does not end with '.', '?' or '!': keep it unchanged.
        change_sents.append(sent)
        continue
    if np.random.binomial(n=1, p=probs[last]) == 0:
        # The final mark survives.
        change_sents.append(sent)
        continue
    body = sent[:-1]
    if not body:
        # The sentence was only a punctuation mark: drop it.
        continue
    if np.random.binomial(n=1, p=REPLACE_PROB) == 0:
        # Plain removal of the final mark.
        change_sents.append(body)
        continue
    p = random.random()
    if p < and_cut:
        # Link to the following sentence with 'and'. The last sentence has no
        # successor, so it just loses its mark instead of being dropped.
        if j != n - 1:
            tokenized_sents[j + 1].insert(0, 'and')
        change_sents.append(body)
    elif p < comma_cut:
        change_sents.append(body + [','])
    elif p < colon_cut:
        change_sents.append(body + [':'])
    else:
        change_sents.append(body + [';'])

In [54]:
## Tally how often each token closes a sentence.
final_chars = defaultdict(int)
for tokens in change_sents:
    closing_token = tokens[-1]
    final_chars[closing_token] = final_chars[closing_token] + 1
dict(final_chars)

{',': 20368,
 'lawyer': 50,
 'thinking': 29,
 ':': 13908,
 'anymore': 433,
 'U.S': 3,
 ';': 13559,
 'dramatically': 3,
 '.': 27062,
 'well': 420,
 'Islamophobic': 3,
 'today': 739,
 'attended': 3,
 'delivered': 3,
 'Australia': 554,
 'taxi': 18,
 'that': 5960,
 'them': 732,
 'bad': 74,
 'stay': 63,
 'required': 5,
 'boss': 33,
 'company': 67,
 'suit': 16,
 'floor': 66,
 'register': 8,
 'up': 431,
 'diner': 1,
 'mad': 59,
 'weather': 34,
 'coins': 10,
 'adopt': 1,
 'sentence': 38,
 'home': 522,
 'blaze': 1,
 'skate': 3,
 'border': 7,
 'you': 2123,
 'now': 853,
 'fainted': 3,
 'Tatoeba': 28,
 '?': 7016,
 'choice': 74,
 'future': 64,
 'wine': 77,
 'me': 1841,
 'mouth': 32,
 'married': 148,
 'teachers': 26,
 'meeting': 119,
 'disappointed': 37,
 'Yoshkar-Ola': 2,
 'wrong': 224,
 'Tom': 1666,
 'great': 39,
 'fast': 64,
 'secrets': 10,
 'morning': 277,
 'computer': 61,
 'hometown': 5,
 'alike': 9,
 'mosque': 58,
 'assignment': 3,
 '2': 3,
 'says': 43,
 'failure': 26,
 'try': 44,
 'ones': 23,

## Create labels 

In [57]:
def _boundary_labels(tokens):
    """Return one label per token: 0 everywhere, 1 on the last token."""
    marks = [0 for _ in tokens]
    marks[-1] = 1
    return marks

# One label list per perturbed sentence, aligned token by token.
tab_labels = [_boundary_labels(tokens) for tokens in change_sents]

## Randomly concatenate sentences

In [58]:
MAX_TOKENS = 400  # sentences with more tokens than this are left out

# K drives where groups are closed: a group ends whenever the running sentence
# index is a multiple of K, then K is redrawn. randrange(1, 6) excludes the
# upper bound, so K is in 1..5 and a group holds at most 5 sentences.
K = random.randrange(1,6)

grouped_sent , grouped_labels = []  , []
start_new_group = True
for i, (sent,label) in enumerate(zip(change_sents,tab_labels),1) :
    if len(sent) <= MAX_TOKENS :
        if start_new_group :
            # Copy, so that extending the group never mutates the lists held
            # by change_sents / tab_labels (keeps this cell re-runnable).
            grouped_sent.append(list(sent))
            grouped_labels.append(list(label))
            start_new_group = False
        else :
            grouped_sent[-1].extend(sent)
            grouped_labels[-1].extend(label)
    # The boundary check also runs for skipped (too long) sentences; tracking
    # "start a new group next" instead of a numeric group index means a skipped
    # sentence can no longer leave an index pointing past the end of the list.
    if i%K ==0 :
        start_new_group = True
        K = random.randrange(1,6)


In [60]:
# Number of concatenated groups (grouped_labels holds one label list per group).
len(grouped_labels)

94381

In [61]:
import pickle
# Persist the concatenated token groups and their boundary labels.
# Context managers make sure both files are flushed and closed.
with open('/home/hamza/3A_projets/data_all/glose/tatoeba_sentences.pkl','wb') as sent_file:
    pickle.dump(grouped_sent, sent_file)
with open('/home/hamza/3A_projets/data_all/glose/tatoeba_grouped_labels.pkl','wb') as label_file:
    pickle.dump(grouped_labels, label_file)

In [42]:
# Print one concatenated example and record the token count of the longest group.
SAMPLE_INDEX = 3644
if SAMPLE_INDEX < len(grouped_sent) :
    print(grouped_sent[SAMPLE_INDEX])
max_ = max(map(len, grouped_sent), default=0)

['Is', 'she', 'the', 'lady', 'who', 'was', 'dressed', 'in', 'blue', '?', 'At', 'nightfall', ',', 'when', 'the', 'last', 'rays', 'of', 'the', 'sun', 'color', 'the', 'prairie', 'sky', 'red', ',', 'the', 'howls', 'of', 'coyotes', 'resonate', 'through', 'the', 'hills', '.', 'I', 'just', 'want', 'to', 'see', 'if', 'I', 'can', 'do', 'that']


In [39]:
# Number of concatenated token groups.
# NOTE(review): this cell's execution count (39) is lower than that of the cell
# that builds grouped_sent (58), so the displayed value may come from an
# earlier run — re-run after rebuilding the groups.
len(grouped_sent)

89102