# Seq2seq

In [1]:
# Reload imported modules automatically before each cell executes, so edits
# to local code (e.g. the `pipeline` module imported below) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import random

from IPython.display import display, HTML
import pylab as plt
import numpy as np

from pipeline import *

Using TensorFlow backend.


### Test with synthetic data

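We generate some easy synthetic data: each input is a short sequence of digits, and the target is the same sequence with every digit replaced by a letter (0→A, 1→B, …, 9→J).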

In [3]:
# Configuration for the synthetic-data experiment.

# Fix the seeds of both RNG libraries imported by this notebook so that data
# generation and the random sampling further down are repeatable across runs.
# NOTE(review): which RNG `generate_sequences` uses is not visible here, so
# both are seeded.
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)

# Sizes passed positionally to generate_sequences.
n_samples = 5000       # number of (input, target) pairs to generate
n_timesteps_in = 5     # encoded inputs printed below have 5 tokens each
n_timesteps_out = 30   # NOTE(review): printed targets are only 5 symbols long — confirm meaning
n_features_in = 10     # input vocabulary printed below has 10 digits plus 'PAD'
n_features_out = 26    # NOTE(review): only letters A-J appear in the printed target vocab — confirm meaning

# When False, the model cell skips training and loads weights from
# `weights_file`; when True, it trains and then saves weights to that path.
train = False
weights_file = '../models/test_synthetic_weights.h5'

In [4]:
# Build the synthetic corpus from the sizes set in the config cell.
# NOTE(review): the meaning of delete / multiply / permute is defined in
# `pipeline` and is not visible here; with these values the evaluation output
# further down shows each digit mapped position-by-position to a letter
# (e.g. "62081" -> "GCAIB").
X, y = generate_sequences(n_samples, n_timesteps_in, n_timesteps_out, n_features_in, n_features_out, 
                          delete=0, multiply=1, permute=0)

In [5]:
# Encode both sides with `encode` (from `pipeline`). The vocabularies it prints
# start with a 'PAD' entry on each side; the target side additionally contains
# '\t' and '\n' (indices 1 and 2), which open and close every encoded target
# printed by the next cell.
encoded_X = encode(X)
encoded_y = encode(y)

vocab (11) ['PAD', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
vocab (13) ['PAD', '\t', '\n', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']


In [6]:
# Sanity check: print the first 21 integer-encoded (source, target) pairs.
pairs = zip(encoded_X.int_encoded, encoded_y.int_encoded)
for shown, (source, target) in enumerate(pairs, start=1):
    print(source, '-->', target)
    # Stop once the 21st pair has been printed.
    if shown > 20:
        break

[ 7 10  8 10  4] --> [ 1  9 12 10 12  6  2]
[7 5 4 3 8] --> [ 1  9  7  6  5 10  2]
[9 6 7 3 5] --> [ 1 11  8  9  5  7  2]
[ 8 10  1  4  5] --> [ 1 10 12  3  6  7  2]
[4 4 3 2 2] --> [1 6 6 5 4 4 2]
[3 8 5 4 4] --> [ 1  5 10  7  6  6  2]
[5 5 5 7 1] --> [1 7 7 7 9 3 2]
[ 8  4  1 10  9] --> [ 1 10  6  3 12 11  2]
[1 5 9 2 6] --> [ 1  3  7 11  4  8  2]
[9 3 8 1 6] --> [ 1 11  5 10  3  8  2]
[6 7 4 9 1] --> [ 1  8  9  6 11  3  2]
[ 2  1  6  8 10] --> [ 1  4  3  8 10 12  2]
[ 2  3  7  1 10] --> [ 1  4  5  9  3 12  2]
[8 5 6 5 9] --> [ 1 10  7  8  7 11  2]
[6 3 6 9 8] --> [ 1  8  5  8 11 10  2]
[ 3  3 10  2  5] --> [ 1  5  5 12  4  7  2]
[ 6 10  6  7  3] --> [ 1  8 12  8  9  5  2]
[6 6 8 8 4] --> [ 1  8  8 10 10  6  2]
[10  1  9 10  7] --> [ 1 12  3 11 12  9  2]
[5 9 4 7 8] --> [ 1  7 11  6  9 10  2]
[8 5 4 4 3] --> [ 1 10  7  6  6  5  2]


In [7]:
# Model / training hyperparameters consumed by get_model and train_model below.
model_choice = 1  # selector passed to get_model / train_model / evaluate_model; options live in `pipeline`
latent_dim = 32  # NOTE(review): presumably the recurrent state size — confirm in `pipeline`
batch_size = 32  # only passed to train_model (i.e. used when train is True)
epochs = 20  # only passed to train_model (i.e. used when train is True)

In [8]:
# Dimensions shared by every get_model / train_model call in this cell:
# (input timesteps, output timesteps, input features, output features).
model_dims = (
    encoded_X.max_timesteps,
    encoded_y.max_timesteps,
    encoded_X.max_features,
    encoded_y.max_features,
)

if not train:
    # Skip training: build the model and initialise it from the saved weights.
    model, additional = get_model(model_choice, latent_dim, *model_dims,
                                  weights_file=weights_file)
else:
    # Build a fresh model, fit it on the encoded data, then persist the weights
    # so later runs can set train = False and reload them.
    model, additional = get_model(model_choice, latent_dim, *model_dims)
    train_model(model, encoded_X, encoded_y, model_choice,
                latent_dim, batch_size, epochs,
                *model_dims)
    model.save_weights(weights_file)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 11)     0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 13)     0                                            
__________________________________________________________________________________________________
masking_1 (Masking)             (None, None, 11)     0           input_1[0][0]                    
__________________________________________________________________________________________________
masking_2 (Masking)             (None, None, 13)     0           input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LS

Generate some new data to evaluate performance

In [9]:
# Generate a small, fresh batch of sequences for evaluation.
# A dedicated name is used for the batch size instead of rebinding `n_samples`:
# the config cell sets n_samples = 5000, and overwriting it here would make a
# later re-run of the training-data generation cell produce only 10 sequences.
n_eval_samples = 10
X_new, y_new = generate_sequences(n_eval_samples, n_timesteps_in, n_timesteps_out,
                                  n_features_in, n_features_out,
                                  delete=0, multiply=1, permute=0)

In [10]:
# Run the model on the freshly generated sequences. Per the output below,
# evaluate_model prints each query with its actual and predicted target,
# followed by a count of correct predictions.
evaluate_model(X_new, y_new, encoded_X, encoded_y, model, model_choice, additional, sep='')

Query     "62081"
Actual    "GCAIB"
Predicted "GCAIB"
['G', 'C', 'A', 'I', 'B', '\n']

Query     "62217"
Actual    "GCCBH"
Predicted "GCCBH"
['G', 'C', 'C', 'B', 'H', '\n']

Query     "33394"
Actual    "DDDJE"
Predicted "DDDJE"
['D', 'D', 'D', 'J', 'E', '\n']

Query     "74464"
Actual    "HEEGE"
Predicted "HEEGE"
['H', 'E', 'E', 'G', 'E', '\n']

Query     "46984"
Actual    "EGJIE"
Predicted "EGJIE"
['E', 'G', 'J', 'I', 'E', '\n']

Query     "53650"
Actual    "FDGFA"
Predicted "FDGFA"
['F', 'D', 'G', 'F', 'A', '\n']

Query     "52002"
Actual    "FCAAC"
Predicted "FCAAC"
['F', 'C', 'A', 'A', 'C', '\n']

Query     "64489"
Actual    "GEEIJ"
Predicted "GEEIJ"
['G', 'E', 'E', 'I', 'J', '\n']

Query     "13404"
Actual    "BDEAE"
Predicted "BDEAE"
['B', 'D', 'E', 'A', 'E', '\n']

Query     "24356"
Actual    "CEDFG"
Predicted "CEDFG"
['C', 'E', 'D', 'F', 'G', '\n']

correct 10


Also check on a random sample of the training data.

In [11]:
# Draw a random subset of the training pairs to evaluate on. The same indices
# are used for X and y so each query stays aligned with its target.
n_train_check = 20
# random.sample picks distinct indices without shuffling the whole index list;
# min() keeps the original behaviour of using every example when fewer than
# n_train_check are available.
picked = random.sample(range(len(X)), min(n_train_check, len(X)))
X_new = np.array([X[i] for i in picked])
y_new = np.array([y[i] for i in picked])

In [12]:
# Same evaluation call as before, now on the training examples sampled into
# X_new / y_new by the previous cell.
evaluate_model(X_new, y_new, encoded_X, encoded_y, model, model_choice, additional, sep='')

Query     "01815"
Actual    "ABIBF"
Predicted "ABIBF"
['A', 'B', 'I', 'B', 'F', '\n']

Query     "02690"
Actual    "ACGJA"
Predicted "ACGJA"
['A', 'C', 'G', 'J', 'A', '\n']

Query     "48367"
Actual    "EIDGH"
Predicted "EIDGH"
['E', 'I', 'D', 'G', 'H', '\n']

Query     "16840"
Actual    "BGIEA"
Predicted "BGIEA"
['B', 'G', 'I', 'E', 'A', '\n']

Query     "41655"
Actual    "EBGFF"
Predicted "EBGFF"
['E', 'B', 'G', 'F', 'F', '\n']

Query     "64550"
Actual    "GEFFA"
Predicted "GEFFA"
['G', 'E', 'F', 'F', 'A', '\n']

Query     "29139"
Actual    "CJBDJ"
Predicted "CJBDJ"
['C', 'J', 'B', 'D', 'J', '\n']

Query     "80006"
Actual    "IAAAG"
Predicted "IAAAG"
['I', 'A', 'A', 'A', 'G', '\n']

Query     "63432"
Actual    "GDEDC"
Predicted "GDEDC"
['G', 'D', 'E', 'D', 'C', '\n']

Query     "48771"
Actual    "EIHHB"
Predicted "EIHHB"
['E', 'I', 'H', 'H', 'B', '\n']

Query     "81164"
Actual    "IBBGE"
Predicted "IBBGE"
['I', 'B', 'B', 'G', 'E', '\n']

Query     "25468"
Actual    "CFEGI"
Predict