In [3]:
# Load IPython's autoreload extension and set it to mode 2, so edits to the
# local modules imported below (Sequences, ModelArgs, TranslationModel) are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from Sequences import SequenceEncoding, SequenceDecoding
from ModelArgs import ModelArgs
from TranslationModel import TranslationModel
from keras.models import load_model

## Data Loading Sandbox

In [17]:
# Total Data Length = 150,000 sequence pairs (per data source)
def load_data(path: str, max_lines: int = 102):
    """Read the first lines of a UTF-8 text file into a NumPy string array.

    Parameters
    ----------
    path : str
        Path of the text file to read.
    max_lines : int or None, default 102
        Maximum number of lines to return. The default reproduces the
        original hard-coded behaviour (the old ``i > 100`` check stopped only
        after 102 lines had been appended). Pass ``None`` to read the whole
        file; ``0`` or a negative value returns an empty array.

    Returns
    -------
    numpy.ndarray
        1-D array of the raw lines, in file order. Lines are not stripped, so
        each one keeps its trailing newline.
    """
    text = []
    with open(path, mode='r', encoding='utf-8') as txt_file:
        for i, line in enumerate(txt_file):
            # Check the cap before appending so exactly `max_lines` are kept.
            if max_lines is not None and i >= max_lines:
                break
            text.append(line)
    return np.asarray(text)

In [18]:
# Load the sample of raw lines, then report its shape and the first ten entries.
data = load_data('fra.txt')
print(
    'Text Shape:{}'.format(data.shape),
    'Text Subset:\n{}'.format(data[:10]),
    sep='\n',
)

Text Shape:(102,)
Text Subset:
['Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)\n'
 'Go.\tMarche.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)\n'
 'Go.\tEn route !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)\n'
 'Go.\tBouge !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #9022935 (Micsmithel)\n'
 'Hi.\tSalut !\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)\n'
 'Hi.\tSalut.\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4320462 (gillux)\n'
 'Run!\tCours\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906331 (sacredceltic)\n'
 'Run!\tCourez\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906332 (sacredceltic)\n'
 'Run!\tPrenez vos jambes à vos cous !\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #2077449 (sacredceltic)\n'
 'Run!\tFile !\tCC-BY 2

### Observations
1. The dataset is about 150,000 sequence pairs long; we likely don't want to use all of it, to save time and compute.
2. Each line is structured as a pair of sequences separated by tabs (`\t`).
3. The data contains punctuation (`.`, `!`, etc.), which needs to be removed because the model doesn't handle special characters well.
4. The data should be lowercased for similar reasons.
5. We only need the French–English sequence pairs, so we can discard any other data on each line (e.g. the attribution text).

In [None]:
# Test Cleaner (V1)
# The loaded lines live in `data`; `text` only exists as a local variable
# inside load_data, so referencing it here fails on a fresh kernel.
seq = data[0].split('\t')
# Keep only the first two tab-separated fields and join them with '|' so a
# single regex pass can clean both while preserving the boundary.
seq = '|'.join(seq[:2])
# NOTE(review): this pattern keeps ASCII letters and '|' only, so it also
# removes spaces and accented letters -- multi-word sentences collapse into
# one token. Confirm that is intended before reusing it beyond this sandbox.
seq = re.sub(r"[^a-zA-Z|]",'',seq)
seq = seq.lower().split('|')
print(seq)

In [41]:
# Alternative: Avoids joining and re-splitting string by special character
# The loaded lines live in `data`; `text` only exists as a local variable
# inside load_data, so referencing it here fails on a fresh kernel.
seq = data[0].strip().split('\t')
# NOTE(review): the pattern keeps ASCII letters only, so spaces and accented
# letters are removed too -- confirm that is intended before reusing it.
eng = re.sub(r"[^a-zA-Z]",'',seq[0])
fre = re.sub(r"[^a-zA-Z]",'',seq[1])
seq = [eng.lower(),fre.lower()]
seq

['go', 'va']

## Sequence Data Analysis

In [6]:
# Run the project's SequenceEncoding pipeline on the corpus file. No
# line_limit is passed in this cell.
path = 'fra.txt'
encoding_obj = SequenceEncoding(path)
encoding_obj.load_text()
# Dict of vocabulary sizes, max sequence lengths and tokenizers (its contents
# are displayed in the following cell).
vocab_metadata = encoding_obj.get_vocab_metadata()
# process_data returns four values, unpacked here as train/test inputs (x)
# and train/test targets (y).
x_train, x_test, y_train, y_test = encoding_obj.process_data(vocab_metadata)

208906
Max IN: 139, Max OUT: 139


In [15]:
# Inspect the metadata dict returned by get_vocab_metadata().
vocab_metadata

{'max_in_length': 139,
 'in_vocab_size': 16164,
 'in_tok': <keras.preprocessing.text.Tokenizer at 0x243b9f0d040>,
 'out_vocab_size': 30905,
 'max_out_length': 139,
 'out_tok': <keras.preprocessing.text.Tokenizer at 0x243b9f0d310>}

In [16]:
# Show the shapes of the encoded training arrays, then the arrays themselves.
display(
    x_train.shape,
    y_train.shape,
    x_train,
    y_train,
)

(167124, 139)

(167124, 139)

array([[  11,  122,   57, ...,    0,    0,    0],
       [  96,   22,   19, ...,    0,    0,    0],
       [   7,  670,    3, ...,    0,    0,    0],
       ...,
       [  22,    2,  407, ...,    0,    0,    0],
       [   1,  207,    7, ...,    0,    0,    0],
       [   4,  376, 8762, ...,    0,    0,    0]])

array([[  11,    5,   34, ...,    0,    0,    0],
       [  71,   44,   99, ...,    0,    0,    0],
       [  10,    8,  459, ...,    0,    0,    0],
       ...,
       [  19,  172,   31, ...,    0,    0,    0],
       [ 233,    4,   10, ...,    0,    0,    0],
       [   7, 1095,    8, ...,    0,    0,    0]])

In [11]:
# Same inspection as the earlier x_train / y_train display cell.
# NOTE(review): the shapes recorded under this cell differ from the ones
# recorded under that earlier cell, so the two outputs presumably come from
# different runs -- TODO confirm, re-run top to bottom, and drop one copy.
display(x_train.shape, y_train.shape)
display(x_train,y_train)

(40000, 8)

(40000, 23)

array([[   2,   26,   16, ...,    0,    0,    0],
       [   1, 1495, 1191, ...,    0,    0,    0],
       [   1,   44,  643, ...,    0,    0,    0],
       ...,
       [ 136,    4, 1711, ...,    0,    0,    0],
       [  23,    6,   32, ...,    0,    0,    0],
       [  18,    3,  259, ...,    0,    0,    0]], dtype=int32)

array([[   1,   55,   48, ...,    0,    0,    0],
       [   2,  738,  525, ...,    0,    0,    0],
       [   2,   10,  234, ...,    0,    0,    0],
       ...,
       [  99,    3, 2610, ...,    0,    0,    0],
       [  22,    7,  925, ...,    0,    0,    0],
       [  43,    8,   56, ...,    0,    0,    0]], dtype=int32)

In [21]:
# Training hyperparameters for the model built in the next cells.
optimizer = 'rmsprop'
loss = 'sparse_categorical_crossentropy'
epochs = 5
batch_size = 1_000

# ModelArgs takes these positionally; only validation_split is passed by keyword.
args = ModelArgs(
    optimizer,
    loss,
    epochs,
    batch_size,
    vocab_metadata['in_vocab_size'],
    vocab_metadata['out_vocab_size'],
    vocab_metadata['max_in_length'],
    vocab_metadata['max_out_length'],
    validation_split=0.15,
)

In [27]:
# Wrap the model arguments and training arrays in the project's
# TranslationModel, then build the model (it is not trained in this cell).
nmt_obj = TranslationModel(args, x_train,y_train)
model = nmt_obj.create_model()

In [29]:
# Print the layer-by-layer architecture and parameter counts of the model.
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 139, 32)           506816    
                                                                 
 lstm_4 (LSTM)               (None, 32)                8320      
                                                                 
 repeat_vector_2 (RepeatVect  (None, 139, 32)          0         
 or)                                                             
                                                                 
 lstm_5 (LSTM)               (None, 139, 32)           8320      
                                                                 
 dense_2 (Dense)             (None, 139, 30224)        997392    
                                                                 
Total params: 1,520,848
Trainable params: 1,520,848
Non-trainable params: 0
____________________________________________

In [7]:
def main(path: str, batch_size: int, epochs: int, line_limit = None,
         optimizer: str = 'rmsprop',
         loss: str = 'sparse_categorical_crossentropy',
         validation_split: float = 0.15):
    """Encode the corpus at `path`, build a translation model and train it.

    Parameters
    ----------
    path : str
        Path of the text corpus, passed to SequenceEncoding.
    batch_size : int
        Batch size forwarded to ModelArgs.
    epochs : int
        Number of epochs forwarded to ModelArgs.
    line_limit : optional
        Forwarded unchanged to SequenceEncoding (default None).
    optimizer : str, default 'rmsprop'
        Optimizer name forwarded to ModelArgs.
    loss : str, default 'sparse_categorical_crossentropy'
        Loss name forwarded to ModelArgs.
    validation_split : float, default 0.15
        Validation split forwarded to ModelArgs.

    Returns
    -------
    tuple
        ``(history, x_test, y_test)``: whatever ``TranslationModel.train``
        returns, plus the held-out test inputs and targets produced by
        ``process_data``. The model object itself is not returned.
    """
    encoding_obj = SequenceEncoding(path, line_limit=line_limit)
    encoding_obj.load_text()
    vocab_metadata = encoding_obj.get_vocab_metadata()
    x_train, x_test, y_train, y_test = encoding_obj.process_data(vocab_metadata, output_tokenizers=True)

    # ModelArgs is called positionally, in the same order as before.
    args = ModelArgs(
        optimizer, loss, epochs, batch_size,
        vocab_metadata['in_vocab_size'], vocab_metadata['out_vocab_size'],
        vocab_metadata['max_in_length'], vocab_metadata['max_out_length'],
        validation_split=validation_split,
    )

    nmt_obj = TranslationModel(args, x_train, y_train)
    model = nmt_obj.create_model()
    history = nmt_obj.train(model)
    return history, x_test, y_test

In [8]:
# Run the full pipeline and keep the training history plus the held-out test
# arrays. line_limit=50_000 presumably caps how many corpus lines are used --
# confirm in SequenceEncoding.
history, x_test, y_test = main('fra.txt', batch_size=100, epochs=10, line_limit=50_000)

Max IN: 8, Max OUT: 27
Epoch 1/10

Epoch 1: val_loss improved from inf to 1.05798, saving model to best_model.keras
340/340 - 65s - loss: 2.5545 - val_loss: 1.0580 - 65s/epoch - 191ms/step
Epoch 2/10

Epoch 2: val_loss improved from 1.05798 to 0.97803, saving model to best_model.keras
340/340 - 58s - loss: 1.0106 - val_loss: 0.9780 - 58s/epoch - 171ms/step
Epoch 3/10

Epoch 3: val_loss improved from 0.97803 to 0.94922, saving model to best_model.keras
340/340 - 59s - loss: 0.9581 - val_loss: 0.9492 - 59s/epoch - 173ms/step
Epoch 4/10

Epoch 4: val_loss improved from 0.94922 to 0.93347, saving model to best_model.keras
340/340 - 59s - loss: 0.9332 - val_loss: 0.9335 - 59s/epoch - 175ms/step
Epoch 5/10

Epoch 5: val_loss improved from 0.93347 to 0.91635, saving model to best_model.keras
340/340 - 59s - loss: 0.9142 - val_loss: 0.9164 - 59s/epoch - 175ms/step
Epoch 6/10

Epoch 6: val_loss improved from 0.91635 to 0.90218, saving model to best_model.keras
340/340 - 59s - loss: 0.8976 - val

In [10]:
# Create the project's SequenceDecoding helper with the saved model file path
# as its only argument.
# NOTE(review): the output recorded under this cell is a History repr, which
# an assignment statement cannot produce -- it looks stale; re-run to refresh.
seq_dec = SequenceDecoding('best_model.keras',)

<keras.callbacks.History at 0x243296f21c0>

In [11]:
# Load the saved model file 'best_model.keras' (the path named in the
# training log's "saving model to" lines) back into memory.
model = load_model('best_model.keras')

In [13]:
# Print the architecture of the reloaded model. The previous `model.pred` was
# a truncated attribute access; the output recorded for this cell is a model
# summary, so the summary call is restored.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 32)             108640    
                                                                 
 lstm (LSTM)                 (None, 32)                8320      
                                                                 
 repeat_vector (RepeatVector  (None, 27, 32)           0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 27, 32)            8320      
                                                                 
 dense (Dense)               (None, 27, 7070)          233310    
                                                                 
Total params: 358,590
Trainable params: 358,590
Non-trainable params: 0
__________________________________________________