In [1]:
%load_ext autoreload
%autoreload 2

In [19]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from SequenceEncoding import SequenceEncoding
from ModelArgs import ModelArgs
from TranslationModel import TranslationModel

## Data Loading Sandbox

In [30]:
# Total Data Length = 150,000 sequence pairs (per data source)
def load_data(path:str, max_lines=102):
    """Read the leading lines of a UTF-8 text file into a NumPy string array.

    Args:
        path: Location of the text file to read.
        max_lines: Maximum number of lines to load. The default of 102 keeps
            the sample size this sandbox has always used; pass ``None`` to
            read the whole file.

    Returns:
        A 1-D NumPy array of strings with one element per line read. Lines
        are returned unmodified (trailing newlines are not stripped).

    Raises:
        ValueError: If ``max_lines`` is negative.
    """
    # Imported locally so the function works even if the import cell was not re-run.
    from itertools import islice

    with open(path,mode='r',encoding='utf-8') as txt_file:
        # islice stops iterating once max_lines lines have been collected.
        text = list(islice(txt_file, max_lines))
    # dtype=str keeps an empty file from producing a float64 array.
    return np.asarray(text, dtype=str)

In [31]:
# Load the sample of the corpus, then report its shape and preview the first ten lines.
data = load_data('fra.txt')
shape_report = 'Text Shape:{}'.format(data.shape)
subset_report = 'Text Subset:\n{}'.format(data[:10])
print(shape_report)
print(subset_report)

Text Shape:(102,)
Text Subset:
['Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)\n'
 'Go.\tMarche.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)\n'
 'Go.\tEn route !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)\n'
 'Go.\tBouge !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #9022935 (Micsmithel)\n'
 'Hi.\tSalut !\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)\n'
 'Hi.\tSalut.\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4320462 (gillux)\n'
 'Run!\tCours\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906331 (sacredceltic)\n'
 'Run!\tCourez\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906332 (sacredceltic)\n'
 'Run!\tPrenez vos jambes à vos cous !\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #2077449 (sacredceltic)\n'
 'Run!\tFile !\tCC-BY 2

### Observations
1. The dataset is 150,000 sequence pairs long; we likely don't want to use all of it, to save time and compute.
2. Each line holds a pair of sequences (English, then French) separated by a tab (\t), followed by a third tab-separated attribution field.
3. The data contains punctuation (`.`, `!`, etc.), which needs to be removed because the model doesn't like special characters.
4. The data should be lowercased for similar reasons.
5. We only need the English–French sequence pairs, so any other data on each line (the attribution field) can be discarded.

In [35]:
# Test Cleaner (V1)
# Cleans the first line held in `data` (the array returned by load_data).
seq = data[0].split('\t')
# Keep only the first two tab-separated fields (the sequence pair) and join them
# with '|' so both can be cleaned by a single substitution.
seq = '|'.join(seq[:2])
# NOTE(review): this pattern removes everything except ASCII letters and '|', so
# spaces and accented letters (e.g. 'à') are stripped along with punctuation —
# confirm that is intended before applying it to multi-word sentences.
seq = re.sub(r"[^a-zA-Z|]",'',seq)
seq = seq.lower().split('|')
print(seq)

['go', 'va']


In [41]:
# Alternative: Avoids joining and re-splitting string by special character
# Cleans the first line held in `data` (the array returned by load_data).
seq = data[0].strip().split('\t')
# Fields 0 and 1 are the sequence pair; each is cleaned on its own.
# NOTE(review): the pattern removes everything except ASCII letters, so spaces
# and accented letters (e.g. 'à') are stripped along with punctuation — confirm
# that is intended before applying it to multi-word sentences.
eng = re.sub(r"[^a-zA-Z]",'',seq[0])
fre = re.sub(r"[^a-zA-Z]",'',seq[1])
seq = [eng.lower(),fre.lower()]
seq

['go', 'va']

## Sequence Data Analysis

In [16]:
# Run the project's SequenceEncoding pipeline on the corpus file, in order:
# load the text, collect the vocabulary metadata, then build the train/test arrays.
# NOTE(review): SequenceEncoding is project-local; how much of the file it reads
# and how it splits train/test are not visible in this notebook — confirm in the class.
path = 'fra.txt'
encoding_obj = SequenceEncoding(path)
encoding_obj.load_text()
vocab_metadata = encoding_obj.get_vocab_metadata()
x_train, x_test, y_train, y_test = encoding_obj.process_data(vocab_metadata)

Max IN: 139, Max OUT: 139


In [17]:
# Inspect the metadata returned by get_vocab_metadata().
vocab_metadata

{'max_in_length': 139,
 'in_vocab_size': 15838,
 'in_tok': <keras.preprocessing.text.Tokenizer at 0x7f88a1456a00>,
 'out_vocab_size': 30224,
 'max_out_length': 139,
 'out_tok': <keras.preprocessing.text.Tokenizer at 0x7f88a1456ee0>}

In [11]:
# Show the shapes of the training arrays returned by process_data, followed by
# the arrays themselves.
display(x_train.shape, y_train.shape, x_train, y_train)

(40000, 8)

(40000, 23)

array([[   2,   26,   16, ...,    0,    0,    0],
       [   1, 1495, 1191, ...,    0,    0,    0],
       [   1,   44,  643, ...,    0,    0,    0],
       ...,
       [ 136,    4, 1711, ...,    0,    0,    0],
       [  23,    6,   32, ...,    0,    0,    0],
       [  18,    3,  259, ...,    0,    0,    0]], dtype=int32)

array([[   1,   55,   48, ...,    0,    0,    0],
       [   2,  738,  525, ...,    0,    0,    0],
       [   2,   10,  234, ...,    0,    0,    0],
       ...,
       [  99,    3, 2610, ...,    0,    0,    0],
       [  22,    7,  925, ...,    0,    0,    0],
       [  43,    8,   56, ...,    0,    0,    0]], dtype=int32)

In [21]:
# Training configuration passed to ModelArgs.
optimizer = 'rmsprop'
loss = 'sparse_categorical_crossentropy'
epochs = 5
# NOTE(review): with the recorded vocab_metadata (max_out_length 139,
# out_vocab_size 30224), one batch of 1_000 output sequences is
# 1_000 x 139 x 30_224 (about 4.2e9) values — presumably too large to hold in
# memory on most machines; consider a smaller batch size. TODO confirm.
batch_size = 1_000
args = ModelArgs(
    optimizer,
    loss,
    epochs,
    batch_size,
    vocab_metadata['in_vocab_size'],
    vocab_metadata['out_vocab_size'],
    vocab_metadata['max_in_length'],
    vocab_metadata['max_out_length'],
    validation_split=0.15,
)

In [27]:
# Hand the hyperparameters and training arrays to the project's TranslationModel
# wrapper, then ask it to build the network.
# NOTE(review): TranslationModel is project-local; what create_model() builds is
# only visible through model.summary() — confirm details in the class.
nmt_obj = TranslationModel(args, x_train,y_train)
model = nmt_obj.create_model()

In [29]:
# Print the layer-by-layer architecture and parameter counts of the model.
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 139, 32)           506816    
                                                                 
 lstm_4 (LSTM)               (None, 32)                8320      
                                                                 
 repeat_vector_2 (RepeatVect  (None, 139, 32)          0         
 or)                                                             
                                                                 
 lstm_5 (LSTM)               (None, 139, 32)           8320      
                                                                 
 dense_2 (Dense)             (None, 139, 30224)        997392    
                                                                 
Total params: 1,520,848
Trainable params: 1,520,848
Non-trainable params: 0
____________________________________________

In [30]:
# Train the model through the TranslationModel wrapper and keep what train() returns.
# NOTE(review): the return type of train() is not visible here — confirm whether
# it is the fitted model or a training-history object.
trained_model = nmt_obj.train(model)

Epoch 1/5


: 

: 