# Sequence Data Analysis

In [1]:
%load_ext autoreload
%autoreload 2

In [79]:
import pandas as pd
import numpy as np
import re
from file import EncodeSequences
from sklearn.model_selection import train_test_split

In [80]:
# Load the sentence pairs from 'fra.txt' through the project-local EncodeSequences helper.
obj = EncodeSequences('fra.txt')
# As displayed further down, `text` is a 2-column array of lower-cased [source, target] pairs.
text = obj.load_text()
# Hold out 15% of the pairs for testing; random_state pins the shuffle so the split is repeatable.
train,test = train_test_split(text,test_size=.15,random_state=4)

In [72]:
# Presumably one length per sentence for each side of the pairs.
# NOTE(review): `_get_sentence_lengths` is a private helper not shown here; confirm whether
# "length" means words or characters.
in_lang_lengths,out_lang_lengths = obj._get_sentence_lengths(text)

In [74]:
# Largest value among the output-language lengths computed above.
max(out_lang_lengths)

14

In [37]:
# Scratch check: `+` concatenates the two string literals without inserting a separator.
'dD DD'+ 'kjkj'

'dD DDkjkj'

In [81]:
# Display the loaded sentence pairs (NumPy elides the middle rows of large arrays).
text

array([['go', 'va'],
       ['go', 'marche'],
       ['go', 'en route'],
       ...,
       ['i need to keep moving', 'je dois continuer  avancer'],
       ['i need to make a copy', 'je dois faire une copie'],
       ['i need to make my bed', 'il faut que je fasse mon lit']],
      dtype='<U68')

In [83]:
# Display the held-out split returned by train_test_split.
test

array([['drink more water', 'bois davantage deau'],
       ['theyre boys', 'ce sont des garons'],
       ['youre not safe here', 'vous ntes pas en scurit ici'],
       ...,
       ['i liked your story', 'jai bien aim ton histoire'],
       ['i like swimming', 'jaime nager'],
       ['come alone', 'viens seul']], dtype='<U68')

In [30]:
# Total Data Length = 150,000 sequence pairs (per data source)
def load_data(path:str, max_lines:"int | None"=102):
    """Read the leading lines of a UTF-8 text file into a NumPy array.

    Args:
        path: Location of the text file to read.
        max_lines: Maximum number of lines to return. The default of 102 is
            exactly what the former hard-coded ``i > 100`` cut-off produced
            (the check ran after the append, so lines 0..101 were kept).
            Pass ``None`` to read the whole file.

    Returns:
        A 1-D array with one element per line read. Lines are returned
        verbatim, including their trailing newline.

    Raises:
        ValueError: If ``max_lines`` is negative.
    """
    if max_lines is not None and max_lines < 0:
        raise ValueError('max_lines must be non-negative or None')
    lines = []
    with open(path,mode='r',encoding='utf-8') as txt_file:
        for line in txt_file:
            # Test the cap before appending so exactly `max_lines` lines are kept.
            if max_lines is not None and len(lines) >= max_lines:
                break
            lines.append(line)
    return np.asarray(lines)

In [31]:
# Pull a small sample of raw lines, then report the array shape and the first ten entries.
data = load_data('fra.txt')
print(
    'Text Shape:{}'.format(data.shape),
    'Text Subset:\n{}'.format(data[:10]),
    sep='\n',
)

Text Shape:(102,)
Text Subset:
['Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)\n'
 'Go.\tMarche.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)\n'
 'Go.\tEn route !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)\n'
 'Go.\tBouge !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #9022935 (Micsmithel)\n'
 'Hi.\tSalut !\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)\n'
 'Hi.\tSalut.\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4320462 (gillux)\n'
 'Run!\tCours\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906331 (sacredceltic)\n'
 'Run!\tCourez\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906332 (sacredceltic)\n'
 'Run!\tPrenez vos jambes à vos cous !\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #2077449 (sacredceltic)\n'
 'Run!\tFile !\tCC-BY 2

### Observations
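1. The data set contains about 150,000 sequence pairs; we will likely train on a subset to keep time and compute costs manageable.
2. Each line holds tab-separated (`\t`) fields: the English sentence, the French sentence, and a licence/attribution string.
3. The sentences contain punctuation (`.`, `!`, etc.), which should be removed so that it does not add noise to the model's vocabulary.
4. The text should be lower-cased for the same reason: `Go` and `go` should map to the same token.
5. We only need the English–French sentence pair from each line, so the attribution field can be dropped.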

In [35]:
# Test Cleaner (V1)
# Work on a raw line from `data` (the load_data output); by this point `text` already
# holds cleaned [source, target] pairs, so `text[0]` is an array without `.split`.
seq = data[0].split('\t')
# Keep only the first two tab-separated fields and drop everything after them.
seq = '|'.join(seq[:2])
# Remove punctuation, digits and underscores while keeping Unicode letters (accented
# French characters included), whitespace and the '|' separator.
seq = re.sub(r"[^\w\s|]|[\d_]",'',seq)
# Lower-case, split back into the two sentences and collapse leftover whitespace.
seq = [' '.join(part.split()) for part in seq.lower().split('|')]
print(seq)

['go', 'va']


In [41]:
# Alternative: Avoids joining and re-splitting string by special character
# Work on a raw line from `data` (the load_data output); by this point `text` already
# holds cleaned [source, target] pairs, so `text[0]` is an array without `.strip`.
seq = data[0].strip().split('\t')
# Remove punctuation, digits and underscores while keeping Unicode letters (accented
# French characters included) and whitespace, so multi-word sentences stay separated.
eng = re.sub(r"[^\w\s]|[\d_]",'',seq[0])
fre = re.sub(r"[^\w\s]|[\d_]",'',seq[1])
# Lower-case and collapse the whitespace left behind where punctuation was removed.
seq = [' '.join(eng.lower().split()),' '.join(fre.lower().split())]
seq

['go', 'va']