In [1]:
from nltk.tokenize import word_tokenize
from keras.models import Model
from keras.layers import Input, LSTM, Dense, GRU, TimeDistributed, Activation
from keras.layers.embeddings import Embedding
import numpy as np

Using TensorFlow backend.


In [2]:
batch_size = 128
n_epoch = 10
latent_dim = 100
n_samples = 10000
data_path = 'fra-eng/fra.txt' # you have to download dataset from http://www.manythings.org/anki/

In [3]:
input_texts, target_texts = [], []
input_vocab, target_vocab = set(), set()
lines = open(data_path).read().split('\n')
for line in lines[:min(n_samples, len(lines) -1)]:
    in_txt, tg_txt = line.split('\t', 1)
    in_txt = word_tokenize(in_txt)
    tg_txt = ['\t'] + word_tokenize(tg_txt) + ['\n']
#     tg_txt = '\t' + tg_txt + '\n' # \t for <start> word and \n for <end> word for Decoder
    input_texts.append(in_txt)
    target_texts.append(tg_txt)
    for w in in_txt:
        if w not in input_vocab:
            input_vocab.add(w)
    for w in tg_txt:
        if w not in target_vocab:
            target_vocab.add(w)

input_vocab = sorted(list(input_vocab))
target_vocab = sorted(list(target_vocab)) 
n_input_vocab = len(input_vocab)
n_target_vocab = len(target_vocab)
max_encoder_seq_len = max([len(txt) for txt in input_texts])
max_decoder_seq_len = max([len(txt) for txt in target_texts])

In [4]:
print('n_input_vocab', n_input_vocab)
print('n_target_vocab', n_target_vocab)
print('max enc len', max_encoder_seq_len)
print('max dec len', max_decoder_seq_len)

n_input_vocab 2420
n_target_vocab 4935
max enc len 6
max dec len 14


['Go', '.']
['\t', 'Va', '!', '\n']


In [6]:
input_w2i = {w:i for i,w in enumerate(input_vocab)}
input_i2w = {i:w for i,w in enumerate(input_vocab)}
target_w2i = {w:i for i,w in enumerate(target_vocab)}
target_i2w = {i:w for i,w in enumerate(target_vocab)}

In [7]:
enc_input_data = np.zeros( (len(input_texts), max_encoder_seq_len) )
dec_input_data = np.zeros( (len(input_texts), max_decoder_seq_len) )
dec_target_data = np.zeros( (len(input_texts), max_decoder_seq_len, n_target_vocab) )

for i, (in_text, tg_text) in enumerate(zip(input_texts, target_texts)):
    for t, w in enumerate(in_text):
        enc_input_data[i, t] = input_w2i[w]
    for t, w in enumerate(tg_text):
        dec_input_data[i, t] = target_w2i[w]
        if t > 0:
            dec_target_data[i, t-1, target_w2i[w]] = 1.

In [46]:
print('input:', input_texts[0])
print('enc in:', enc_input_data[0])
print('target:', target_texts[0])
print('enc tg:', dec_target_data[0])

input: ['Go', '.']
enc in: [ 164.   10.    0.    0.    0.    0.]
target: ['\t', 'Va', '!', '\n']
enc tg: [[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  1. ...,  0.  0.  0.]
 [ 0.  1.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [51]:
print('enc_input_data:', enc_input_data.shape) # (N, seq_len, vocab_size)
# Encoder
# embd_size = 100
encoder_inputs = Input(shape=(None, ), name='EncoderInput')
print('encInputs', encoder_inputs.shape)
# enc_input_embedding = Embedding(input_dim=n_input_vocab, output_dim=latent_dim, input_length=max_encoder_seq_len, name='EncoderEmbedding')
enc_input_embedding = Embedding(input_dim=n_input_vocab, output_dim=latent_dim, name='EncoderEmbedding')
embedded_enc_inputs = enc_input_embedding(encoder_inputs)
print('embd_enc_inputs', embedded_enc_inputs.shape)
encoder = GRU(latent_dim, return_state=True, name='EncoderRNN')
encoder_outputs, encoder_state_h = encoder(embedded_enc_inputs)
print('encoder_outputs', encoder_outputs.shape)
print('encoder_state_h', encoder_state_h.shape)
# encoder_states = [state_h] # The decoder use only last output of the encoder and discard encoder_outputs
# print(state_h.shape, encoder_states[0].shape)

# Decoder
decoder_inputs = Input(shape=(None,), name='DecoderInput')
# dec_output_embedding = Embedding(input_dim=n_target_vocab, output_dim=embd_size, input_length=max_decoder_seq_len, name='DecoderEmbedding')
dec_output_embedding = Embedding(input_dim=n_target_vocab, output_dim=latent_dim, name='DecoderEmbedding')
# embedded_dec_inputs = Activation('relu')(dec_output_embedding(decoder_inputs))
embedded_dec_inputs = dec_output_embedding(decoder_inputs)
decoder = GRU(latent_dim, return_sequences=True, return_state=True, name='DecoderGRU')
print('embedded_dec_inputs', embedded_dec_inputs.shape)
decoder_outputs, _ = decoder(embedded_dec_inputs, initial_state=encoder_state_h)
decoder_dense = Dense(n_target_vocab, activation='softmax', name='FinalDense')
decoder_outputs = decoder_dense(decoder_outputs)
print('decoder_coutputs', decoder_outputs.shape)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
print(model.summary())

enc_input_data: (10000, 6)
encInputs (?, ?)
embd_enc_inputs (?, ?, 100)
encoder_outputs (?, 100)
encoder_state_h (?, 100)
embedded_dec_inputs (?, ?, 100)
decoder_coutputs (?, ?, 4935)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
EncoderInput (InputLayer)       (None, None)         0                                            
__________________________________________________________________________________________________
DecoderInput (InputLayer)       (None, None)         0                                            
__________________________________________________________________________________________________
EncoderEmbedding (Embedding)    (None, None, 100)    242000      EncoderInput[0][0]               
__________________________________________________________________________________________________
DecoderEmbedding (Embedd

In [71]:
model.fit([enc_input_data, dec_input_data], dec_target_data,
          batch_size=batch_size,
          epochs=15,
          validation_split=.2)
model.save('seq2seq_word.h5')

Train on 8000 samples, validate on 2000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


  str(node.arguments) + '. They will not be included '


# Inference

In [72]:
encoder_model = Model(encoder_inputs, encoder_state_h) # ((?, ?), (?, 100))

decoder_state_input_h = Input(shape=(latent_dim,), name='decoder_state_input_h')
decoder_inputs = Input(shape=(None,), name='InfDecoderInput')
embedded_dec_inputs = dec_output_embedding(decoder_inputs)
decoder_outputs, decoder_state_h = decoder(embedded_dec_inputs, initial_state=decoder_state_input_h)
decoder_outputs = decoder_dense(decoder_outputs)
print('decoder_inputs:', decoder_inputs.shape)
print('embedded_dec_inputs', embedded_dec_inputs.shape)
print('decoder_state_input_h:', decoder_state_input_h.shape)
print('decoder_outputs:', decoder_outputs.shape)
print('decoder_state_h', decoder_state_h.shape)

# decoder_inputs2 = Input(shape=(max_decoder_seq_len,), name='DecoderInput')
decoder_model = Model(
#     [decoder_inputs] + decoder_state_input_h,
    [decoder_inputs, decoder_state_input_h],
    [decoder_outputs, decoder_state_h],
    name='InfDecoderModel'
)
print('Encoder Model')
print(encoder_model.summary())
print('Decoder Model')
print(decoder_model.summary())

decoder_inputs: (?, ?)
embedded_dec_inputs (?, ?, 100)
decoder_state_input_h: (?, 100)
decoder_outputs: (?, ?, 4935)
decoder_state_h (?, 100)
Encoder Model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
EncoderInput (InputLayer)    (None, None)              0         
_________________________________________________________________
EncoderEmbedding (Embedding) (None, None, 100)         242000    
_________________________________________________________________
EncoderRNN (GRU)             [(None, 100), (None, 100) 60300     
Total params: 302,300
Trainable params: 302,300
Non-trainable params: 0
_________________________________________________________________
None
Decoder Model
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
InfDecoderInput (InputLayer

In [73]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
#     target_seq = np.zeros((1, n_target_vocab)) # (n_samples, seq_len, n_vocab)
    target_seq = np.zeros((1, 1)) # (n_samples, seq_len, n_vocab)

#     target_seq[0, target_w2i['\t']] = 1. # start character
    target_seq[0, 0] = target_w2i['\t'] # start character
    
    stop_cond = False
    decoded_sentence = ''
    ct = 0
    while not stop_cond:
        output_tokens, h = decoder_model.predict([target_seq, states_value])
        if ct == 0:
            print('[target_seq, states_value]', target_seq.shape, states_value.shape)
            print('output_tokens', output_tokens.shape)
            print('h', h.shape)
            ct  += 1

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = target_i2w[sampled_token_index]
        decoded_sentence += ' ' + sampled_word
        
        if sampled_word == '\n' or len(decoded_sentence) > max_decoder_seq_len:
            stop_cond = True
        
#         target_seq = np.zeros((1, n_target_vocab))
#         target_seq[0, sampled_token_index] = 1.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        
        states_value = h
    
    return decoded_sentence

for seq_idx in range(10):
    input_seq = enc_input_data[seq_idx: seq_idx+1]
    decoded_sentence = decode_sequence(input_seq)
    print('----')
    print('input sentence:', input_texts[seq_idx])
    print('decoded sentence:', decoded_sentence)    

[target_seq, states_value] (1, 1) (1, 100)
output_tokens (1, 1, 4935)
h (1, 100)
----
input sentence: ['Go', '.']
decoded sentence:  Va le chercher
[target_seq, states_value] (1, 1) (1, 100)
output_tokens (1, 1, 4935)
h (1, 100)
----
input sentence: ['Run', '!']
decoded sentence:  Va chercher votre
[target_seq, states_value] (1, 1) (1, 100)
output_tokens (1, 1, 4935)
h (1, 100)
----
input sentence: ['Run', '!']
decoded sentence:  Va chercher votre
[target_seq, states_value] (1, 1) (1, 100)
output_tokens (1, 1, 4935)
h (1, 100)
----
input sentence: ['Wow', '!']
decoded sentence:  Vous êtes en train
[target_seq, states_value] (1, 1) (1, 100)
output_tokens (1, 1, 4935)
h (1, 100)
----
input sentence: ['Fire', '!']
decoded sentence:  Laissez tomber
[target_seq, states_value] (1, 1) (1, 100)
output_tokens (1, 1, 4935)
h (1, 100)
----
input sentence: ['Help', '!']
decoded sentence:  Laissez tomber
[target_seq, states_value] (1, 1) (1, 100)
output_tokens (1, 1, 4935)
h (1, 100)
----
input sen