In [1]:
import re
import string
from unicodedata import normalize
import numpy
import random

# load doc into memory
def load_doc(filename):
    """Read an entire UTF-8 text file into memory.

    Args:
        filename: Path of the file to read.

    Returns:
        The complete file contents as a single ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    # The context manager closes the handle even if read() raises, which the
    # earlier explicit open()/close() pair did not guarantee.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()


# split a loaded document into sentences
def to_pairs(doc):
    """Break a loaded document into per-line lists of tab-separated fields.

    Leading and trailing whitespace of the whole document is stripped first,
    then the text is split on newlines and each line on tab characters.

    Args:
        doc: The document text.

    Returns:
        A list with one entry per line; each entry is the list of that
        line's tab-separated fields.
    """
    pairs = []
    for row in doc.strip().split('\n'):
        pairs.append(row.split('\t'))
    return pairs

def clean_data(lines):
    """Normalise every sentence of every pair and return them as an array.

    Each sentence goes through the same steps, in this order: Unicode NFD
    decomposition with non-ASCII code points dropped, whitespace
    tokenisation, lower-casing, removal of ASCII punctuation, removal of
    non-printable characters, and finally removal of any token that is not
    purely alphabetic. The surviving tokens are re-joined with single spaces.

    Args:
        lines: Iterable of pairs, each pair an iterable of sentence strings.

    Returns:
        ``numpy.ndarray`` built from the list of cleaned pairs.
    """
    # Compiled once: matches any character outside string.printable.
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # Translation table that deletes every ASCII punctuation character.
    strip_punct = str.maketrans('', '', string.punctuation)

    def scrub(sentence):
        # Decompose accented characters, then drop whatever is not ASCII.
        ascii_text = normalize('NFD', sentence).encode('ascii', 'ignore').decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            token = token.lower().translate(strip_punct)
            token = non_printable.sub('', token)
            # Discards tokens containing digits and tokens emptied above.
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return numpy.array([[scrub(sentence) for sentence in pair] for pair in lines])

In [2]:
# Tab-separated parallel corpus that is read by load_doc() in the next cell
# (another corpus, e.g. 'Data/deu.txt', can be substituted here).
filename = 'fra.txt'

# Fraction of all sentence pairs kept for training + validation; the
# remaining tail of the file is held out as the test set.
train_val_percentage = 0.9
# Fraction of the train+val pairs used for training (the rest is validation).
train_val_split = 0.8
# Floating-point subtraction: this prints as 0.09999999999999998, not 0.1.
test_percentage = 1 - train_val_percentage
print("Train val / Test split is: ",train_val_percentage, " and ", test_percentage)

Train val / Test split is:  0.9  and  0.09999999999999998


In [3]:
# French dataset
# load dataset
doc = load_doc(filename)

# split into Language1-Language2 pairs
pairs = to_pairs(doc)

# Clean the whole corpus ONCE and slice the result; the cleaning pass is the
# expensive step and previously ran twice over every sentence.
all_clean_pairs = clean_data(pairs)

# Row index separating the train+val head from the held-out test tail.
# NOTE(review): the split is positional (no shuffling), so the test set is
# whatever sits at the end of the file -- confirm that this is intended.
n_train_val = int(len(pairs) * train_val_percentage)
clean_pairs = all_clean_pairs[:n_train_val, :]
clean_pairs_test = all_clean_pairs[n_train_val:, :]
test_indices = list(range(len(clean_pairs_test)))

In [4]:
# Preview ten cleaned pairs (rows 3000-3009) as "[source] => [target]".
for row in range(3000, 3010):
    source, target = clean_pairs[row, 0], clean_pairs[row, 1]
    print('[' + source + '] => [' + target + ']')

[were smart] => [nous sommes intelligentes]
[were sorry] => [nous sommes desoles]
[were sorry] => [nous sommes desolees]
[were stuck] => [nous sommes coinces]
[were stuck] => [nous sommes coincees]
[were tired] => [nous sommes fatigues]
[were tired] => [nous sommes fatiguees]
[were twins] => [nous sommes jumeaux]
[were twins] => [nous sommes jumelles]
[what a bore] => [quel emmerdeur]


In [5]:
# Shuffle the row indices of clean_pairs with a dedicated, seeded generator so
# that the train/validation split is identical after Restart & Run All.
# A private RandomState is used so the global NumPy random state is untouched.
split_rng = numpy.random.RandomState(42)
train_val_indices = list(range(len(clean_pairs)))
split_rng.shuffle(train_val_indices)

# First train_val_split fraction of the shuffled indices trains the model,
# the remainder validates it.
n_train = int(len(clean_pairs) * train_val_split)
train_indices = train_val_indices[:n_train]
val_indices = train_val_indices[n_train:]

print(len(train_val_indices), len(train_indices), len(val_indices))

158060 126448 31612


In [6]:
# Source sentences: first column of the cleaned pairs (a NumPy array).
input_texts = clean_pairs[:, 0]
# Target sentences: second column, each wrapped in a leading tab and a
# trailing newline -- presumably start/end-of-sequence markers for the
# character-level decoder (TODO confirm against the inference code).
target_texts = ['\t' + text + '\n' for text in clean_pairs[:, 1]]

print('Length of input_texts:  ' + str(input_texts.shape))
# target_texts is a plain list (no .shape); report ITS length rather than
# repeating input_texts.shape, in the same "(n,)" format as the line above.
print('Length of target_texts: ' + str((len(target_texts),)))

Length of input_texts:  (158060,)
Length of target_texts: (158060,)


In [7]:
# Longest source / target sentence measured in characters; the target length
# includes the tab and newline wrapped around each target sentence.
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))

print('max length of input  sentences: %d' % max_encoder_seq_length)
print('max length of target sentences: %d' % max_decoder_seq_length)

max length of input  sentences: 45
max length of target sentences: 95


In [8]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# encode and pad sequences
def text2sequences(max_len, lines):
    """Encode texts as padded sequences of character indices.

    A fresh character-level ``Tokenizer`` (no filtered characters) is fitted
    on ``lines``; the texts are converted to index sequences and padded at
    the end ('post') to ``max_len``.

    Args:
        max_len: Length every returned sequence is padded to.
        lines: The texts to fit on and encode.

    Returns:
        A tuple ``(padded_sequences, word_index)`` where ``word_index`` is
        the fitted tokenizer's character-to-index mapping.
    """
    char_tokenizer = Tokenizer(char_level=True, filters='')
    char_tokenizer.fit_on_texts(lines)
    padded = pad_sequences(char_tokenizer.texts_to_sequences(lines),
                           maxlen=max_len, padding='post')
    return padded, char_tokenizer.word_index


# Encode source and target texts separately: each call fits its own
# tokenizer, so the two languages get independent character vocabularies.
encoder_input_seq, input_token_index = text2sequences(max_encoder_seq_length, 
                                                      input_texts)
decoder_input_seq, target_token_index = text2sequences(max_decoder_seq_length, 
                                                       target_texts)

# The *_token_index values are dicts, so their lines report len() (the
# vocabulary size) even though the label says "shape".
print('shape of encoder_input_seq: ' + str(encoder_input_seq.shape))
print('shape of input_token_index: ' + str(len(input_token_index)))
print('shape of decoder_input_seq: ' + str(decoder_input_seq.shape))
print('shape of target_token_index: ' + str(len(target_token_index)))


Using TensorFlow backend.


shape of encoder_input_seq: (158060, 45)
shape of input_token_index: 27
shape of decoder_input_seq: (158060, 95)
shape of target_token_index: 29


In [9]:
# One-hot width for each side: vocabulary size plus one.
# NOTE(review): the +1 presumably reserves index 0 for the padding value
# written by pad_sequences (Keras Tokenizer indices start at 1) -- confirm
# against the Keras documentation.
num_encoder_tokens = len(input_token_index) + 1
num_decoder_tokens = len(target_token_index) + 1

print('num_encoder_tokens: ' + str(num_encoder_tokens))
print('num_decoder_tokens: ' + str(num_decoder_tokens))

num_encoder_tokens: 28
num_decoder_tokens: 30


In [10]:
from keras.utils import to_categorical

# one hot encode target sequence
def onehot_encode(sequences, max_len, vocab_size):
    """One-hot encode a batch of equal-length index sequences.

    The whole batch is encoded with a single vectorised NumPy assignment
    instead of one helper call (and one temporary array) per sequence.

    Args:
        sequences: Array-like of shape ``(n, max_len)`` holding class indices.
            Float-valued indices (e.g. from ``numpy.zeros``) are cast to int.
        max_len: Number of time steps in every sequence.
        vocab_size: Number of classes, i.e. the size of the one-hot axis.

    Returns:
        ``numpy.ndarray`` of float64 with shape ``(n, max_len, vocab_size)``
        containing 1.0 at ``[i, t, sequences[i][t]]`` and 0.0 elsewhere.

    Raises:
        ValueError: If a non-empty ``sequences`` is not ``(n, max_len)``.
        IndexError: If an index is outside ``[-vocab_size, vocab_size)``.
    """
    indices = numpy.asarray(sequences).astype('int64')
    n = len(indices)
    data = numpy.zeros((n, max_len, vocab_size))
    if n == 0:
        # Nothing to encode; keep the documented 3-D shape.
        return data
    if indices.shape != (n, max_len):
        raise ValueError('expected sequences of shape %r, got %r'
                         % ((n, max_len), indices.shape))
    # Broadcast (n, 1) sample ids and (1, max_len) time steps against the
    # (n, max_len) class ids to address one cell per (sample, step).
    sample_ids = numpy.arange(n)[:, None]
    step_ids = numpy.arange(max_len)[None, :]
    data[sample_ids, step_ids, indices] = 1.0
    return data

In [11]:
import numpy as np
import keras

class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that builds one-hot seq2seq batches on demand.

    Only the integer-encoded sequences are kept in memory; each batch is
    one-hot encoded when requested via ``__getitem__``.

    Args:
        indices: Row indices (into both sequence arrays) served by this
            generator. The list is copied, so the caller's object is never
            reordered by the per-epoch shuffle.
        encoder_input_seq: Integer-encoded source sequences, indexable by row.
        decoder_input_seq: Integer-encoded target sequences, indexable by row.
        num_encoder_tokens: One-hot width of the encoder input.
        num_decoder_tokens: One-hot width of the decoder input and target.
        max_encoder_seq_length: Time steps per encoder sequence.
        max_decoder_seq_length: Time steps per decoder sequence.
        batch_size: Samples per batch.
        shuffle: Whether to reshuffle the indices at construction time and
            whenever ``on_epoch_end`` is called.
    """

    def __init__(self, indices, encoder_input_seq, decoder_input_seq, num_encoder_tokens,
                 num_decoder_tokens, max_encoder_seq_length, max_decoder_seq_length,
                 batch_size=512, shuffle=True):
        self.encoder_input_seq = encoder_input_seq
        self.decoder_input_seq = decoder_input_seq
        self.num_encoder_tokens = num_encoder_tokens
        self.num_decoder_tokens = num_decoder_tokens
        self.max_encoder_seq_length = max_encoder_seq_length
        self.max_decoder_seq_length = max_decoder_seq_length
        self.batch_size = batch_size
        # Private copy: on_epoch_end() shuffles this list in place and must
        # not silently reorder the list object owned by the caller.
        self.list_IDs = list(indices)
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch.

        Floor division: a trailing partial batch is not served.
        """
        return len(self.list_IDs) // self.batch_size

    def __getitem__(self, index):
        """Return batch number ``index`` as ``(inputs, targets)``."""
        start = index * self.batch_size
        batch_ids = self.list_IDs[start:start + self.batch_size]
        X, y = self.__data_generation(batch_ids)
        return X, y

    def on_epoch_end(self):
        """Reshuffle the sample order when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.list_IDs)

    def __data_generation(self, list_IDs_temp):
        """One-hot encode the samples whose row indices are given.

        Returns:
            ``([encoder_input_data, decoder_input_data], [decoder_target_data])``
        """
        # Gather each batch once (the decoder batch used to be stacked
        # three separate times).
        encoder_seq = np.stack([self.encoder_input_seq[ID] for ID in list_IDs_temp])
        decoder_seq = np.stack([self.decoder_input_seq[ID] for ID in list_IDs_temp])

        encoder_input_data = onehot_encode(encoder_seq,
                                           self.max_encoder_seq_length,
                                           self.num_encoder_tokens)
        decoder_input_data = onehot_encode(decoder_seq,
                                           self.max_decoder_seq_length,
                                           self.num_decoder_tokens)

        # Target at step t is the decoder input at step t + 1; the final
        # step keeps index 0.
        decoder_target_seq = np.zeros(decoder_seq.shape)
        decoder_target_seq[:, :-1] = decoder_seq[:, 1:]
        decoder_target_data = onehot_encode(decoder_target_seq,
                                            self.max_decoder_seq_length,
                                            self.num_decoder_tokens)

        return [encoder_input_data, decoder_input_data], [decoder_target_data]


In [12]:
# Batch generators over the training and validation row indices; both read
# from the same encoded sequence arrays and use the default batch size.
train_generator = DataGenerator(
    indices=train_indices,
    encoder_input_seq=encoder_input_seq,
    decoder_input_seq=decoder_input_seq,
    num_encoder_tokens=num_encoder_tokens,
    num_decoder_tokens=num_decoder_tokens,
    max_encoder_seq_length=max_encoder_seq_length,
    max_decoder_seq_length=max_decoder_seq_length,
)

val_generator = DataGenerator(
    indices=val_indices,
    encoder_input_seq=encoder_input_seq,
    decoder_input_seq=decoder_input_seq,
    num_encoder_tokens=num_encoder_tokens,
    num_decoder_tokens=num_decoder_tokens,
    max_encoder_seq_length=max_encoder_seq_length,
    max_decoder_seq_length=max_decoder_seq_length,
)

In [13]:
from keras.layers import Input, LSTM, Bidirectional, Concatenate
from keras.models import Model

# Units per LSTM direction in the encoder; the decoder LSTM uses 2*latent_dim.
latent_dim = 128

# inputs of the encoder network: one-hot characters, variable sequence length
encoder_inputs = Input(shape=(None, num_encoder_tokens), 
                       name='encoder_inputs')

# Bidirectional LSTM. With return_state=True the call below yields the layer
# output followed by the forward (h, c) and backward (h, c) states.
encoder_bilstm = Bidirectional(LSTM(latent_dim, return_state=True, 
                                  dropout=0.4, name='encoder_lstm'))
# The per-step output is discarded; only the final states are kept.
_, forward_h, forward_c, backward_h, backward_c = encoder_bilstm(encoder_inputs)

# Join both directions so each state is 2*latent_dim wide
# (256 in the model summary).
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])

# build the encoder network model: one-hot source sequence -> [state_h, state_c]
encoder_model = Model(inputs=encoder_inputs, 
                      outputs=[state_h, state_c],
                      name='encoder')

In [14]:
encoder_model.summary()

Model: "encoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     (None, None, 28)     0                                            
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) [(None, 256), (None, 160768      encoder_inputs[0][0]             
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 256)          0           bidirectional_1[0][1]            
                                                                 bidirectional_1[0][3]            
__________________________________________________________________________________________________
concatenate_2 (Concatenate)     (None, 256)          0           bidirectional_1[0][2]      

In [15]:
from keras.layers import Input, LSTM, Dense, Dropout
from keras.models import Model

# inputs of the decoder network: initial hidden/cell states (2*latent_dim
# wide, matching the encoder's concatenated states) plus the one-hot
# target-side character sequence
decoder_input_h = Input(shape=(2*latent_dim,), name='decoder_input_h')
decoder_input_c = Input(shape=(2*latent_dim,), name='decoder_input_c')
decoder_input_x = Input(shape=(None, num_decoder_tokens), name='decoder_input_x')

# set the LSTM layer; it returns the full output sequence and its final states
decoder_lstm = LSTM(2*latent_dim, return_sequences=True, 
                    return_state=True, dropout=0.4, name='decoder_lstm')
# NOTE(review): this rebinds state_h / state_c, which the encoder cell also
# assigns; from here on the names refer to the decoder's output states.
decoder_lstm_outputs, state_h, state_c = decoder_lstm(decoder_input_x, 
                                                      initial_state=[decoder_input_h, decoder_input_c])

# set the dense layer: softmax over the target vocabulary at every time step
# (the Dropout line below is disabled)
#decoder_lstm_outputs = Dropout(0.2)(decoder_lstm_outputs)
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_lstm_outputs)

# build the decoder network model:
# (target chars, h, c) -> (char probabilities, new h, new c)
decoder_model = Model(inputs=[decoder_input_x, decoder_input_h, decoder_input_c],
                      outputs=[decoder_outputs, state_h, state_c],
                      name='decoder')

In [16]:
decoder_model.summary()

Model: "decoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
decoder_input_x (InputLayer)    (None, None, 30)     0                                            
__________________________________________________________________________________________________
decoder_input_h (InputLayer)    (None, 256)          0                                            
__________________________________________________________________________________________________
decoder_input_c (InputLayer)    (None, 256)          0                                            
__________________________________________________________________________________________________
decoder_lstm (LSTM)             [(None, None, 256),  293888      decoder_input_x[0][0]            
                                                                 decoder_input_h[0][0]      

In [17]:
# input layers of the end-to-end training model
# NOTE(review): decoder_input_x is rebound here to a NEW Input carrying the
# same layer name as the one the stand-alone decoder model was built with.
encoder_input_x = Input(shape=(None, num_encoder_tokens), name='encoder_input_x')
decoder_input_x = Input(shape=(None, num_decoder_tokens), name='decoder_input_x')

# connect encoder to decoder: the encoder's [state_h, state_c] outputs seed the
# decoder LSTM. decoder_lstm and decoder_dense are the same layer objects used
# by decoder_model, so both models share these layers.
encoder_final_states = encoder_model([encoder_input_x])
decoder_lstm_output, _, _ = decoder_lstm(decoder_input_x, initial_state=encoder_final_states)
decoder_pred = decoder_dense(decoder_lstm_output)

# training model: (source chars, target chars) -> per-step char probabilities
model = Model(inputs=[encoder_input_x, decoder_input_x], 
              outputs=[decoder_pred], 
              name='model_training')

In [18]:
# Debug check: state_h here is the decoder LSTM's output state (the name was
# reassigned in the decoder cell); it is printed next to the decoder's
# state-input placeholder, and the recorded output shows both as (None, 256).
print(state_h)
print(decoder_input_h)

Tensor("decoder_lstm/while:4", shape=(None, 256), dtype=float32)
Tensor("decoder_input_h:0", shape=(None, 256), dtype=float32)


In [19]:
model.summary()

Model: "model_training"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input_x (InputLayer)    (None, None, 28)     0                                            
__________________________________________________________________________________________________
decoder_input_x (InputLayer)    (None, None, 30)     0                                            
__________________________________________________________________________________________________
encoder (Model)                 [(None, 256), (None, 160768      encoder_input_x[0][0]            
__________________________________________________________________________________________________
decoder_lstm (LSTM)             [(None, None, 256),  293888      decoder_input_x[0][0]            
                                                                 encoder[1][0]       

In [None]:
from keras import optimizers

# RMSprop with lr=0.002; categorical cross-entropy against the one-hot
# decoder targets produced by the generators.
model.compile(optimizer=optimizers.RMSprop(lr=0.002), loss='categorical_crossentropy')

# Train for 50 epochs from the batch generators, using 4 worker processes
# to prepare batches.
model.fit_generator(generator=train_generator,
                    validation_data=val_generator,
                    use_multiprocessing=True,
                    workers=4,
                    epochs = 50)

# The model is written to disk only after fit_generator returns, so an
# interrupted run leaves no saved model.
model.save('seq2seq.h5')