In [1]:
from nltk.tokenize import word_tokenize
from keras.models import Model
from keras.layers import Input, LSTM, Dense, GRU, TimeDistributed, Activation
from keras.layers.embeddings import Embedding
import numpy as np

Using TensorFlow backend.


In [2]:
batch_size = 128
n_epoch = 10
latent_dim = 100
n_samples = 10000
data_path = 'fra-eng/fra.txt' # you have to download dataset from http://www.manythings.org/anki/

In [4]:
input_texts, target_texts = [], []
input_vocab, target_vocab = set(), set()
lines = open(data_path).read().split('\n')
for line in lines[:min(n_samples, len(lines) -1)]:
    in_txt, tg_txt = line.split('\t', 1)
    in_txt = word_tokenize(in_txt)
    tg_txt = ['\t'] + word_tokenize(tg_txt) + ['\n']
    input_texts.append(in_txt)
    target_texts.append(tg_txt)
    for w in in_txt:
        if w not in input_vocab:
            input_vocab.add(w)
    for w in tg_txt:
        if w not in target_vocab:
            target_vocab.add(w)

input_vocab = ['UNK'] + sorted(list(input_vocab))
target_vocab = ['UNK'] + sorted(list(target_vocab)) 
n_input_vocab = len(input_vocab)
n_target_vocab = len(target_vocab)
max_encoder_seq_len = max([len(txt) for txt in input_texts])
max_decoder_seq_len = max([len(txt) for txt in target_texts])

In [5]:
print('n_input_vocab', n_input_vocab)
print('n_target_vocab', n_target_vocab)
print('max enc len', max_encoder_seq_len)
print('max dec len', max_decoder_seq_len)

n_input_vocab 2421
n_target_vocab 4936
max enc len 6
max dec len 14


In [7]:
input_w2i = {w:i for i,w in enumerate(input_vocab)}
input_i2w = {i:w for i,w in enumerate(input_vocab)}
target_w2i = {w:i for i,w in enumerate(target_vocab)}
target_i2w = {i:w for i,w in enumerate(target_vocab)}

In [8]:
enc_input_data = np.zeros( (len(input_texts), max_encoder_seq_len) )
dec_input_data = np.zeros( (len(input_texts), max_decoder_seq_len) )
dec_target_data = np.zeros( (len(input_texts), max_decoder_seq_len, n_target_vocab) )

for i, (in_text, tg_text) in enumerate(zip(input_texts, target_texts)):
    for t, w in enumerate(in_text):
        enc_input_data[i, t] = input_w2i[w]
    for t, w in enumerate(tg_text):
        dec_input_data[i, t] = target_w2i[w]
        if t > 0:
            dec_target_data[i, t-1, target_w2i[w]] = 1.

In [9]:
print('input:', input_texts[0])
print('enc in:', enc_input_data[0])
print('target:', target_texts[0])
print('enc tg:', dec_target_data[0])

input: ['Go', '.']
enc in: [ 165.   11.    0.    0.    0.    0.]
target: ['\t', 'Va', '!', '\n']
enc tg: [[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  1. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [13]:
print('enc_input_data:', enc_input_data.shape) # (N, seq_len, vocab_size)
# Encoder
encoder_inputs = Input(shape=(None, ), name='EncoderInput')
enc_input_embedding = Embedding(input_dim=n_input_vocab, output_dim=latent_dim, name='EncoderEmbedding')
embedded_enc_inputs = enc_input_embedding(encoder_inputs)
encoder = GRU(latent_dim, return_state=True, name='EncoderRNN')
encoder_outputs, encoder_state_h = encoder(embedded_enc_inputs)

# Decoder
decoder_inputs = Input(shape=(None,), name='DecoderInput')
dec_output_embedding = Embedding(input_dim=n_target_vocab, output_dim=latent_dim, name='DecoderEmbedding')
embedded_dec_inputs = dec_output_embedding(decoder_inputs)
decoder = GRU(latent_dim, return_sequences=True, return_state=True, name='DecoderGRU')
decoder_outputs, _ = decoder(embedded_dec_inputs, initial_state=encoder_state_h)
decoder_dense = Dense(n_target_vocab, activation='softmax', name='FinalDense')
decoder_outputs = decoder_dense(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
print(model.summary())

enc_input_data: (10000, 6)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
EncoderInput (InputLayer)       (None, None)         0                                            
__________________________________________________________________________________________________
DecoderInput (InputLayer)       (None, None)         0                                            
__________________________________________________________________________________________________
EncoderEmbedding (Embedding)    (None, None, 100)    242100      EncoderInput[0][0]               
__________________________________________________________________________________________________
DecoderEmbedding (Embedding)    (None, None, 100)    493600      DecoderInput[0][0]               
__________________________________________________________________________________

In [14]:
model.fit([enc_input_data, dec_input_data], dec_target_data,
          batch_size=batch_size,
          epochs=30,
          validation_split=.2)
model.save('seq2seq_word.h5')

Train on 8000 samples, validate on 2000 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


  str(node.arguments) + '. They will not be included '


# Inference

In [15]:
encoder_model = Model(encoder_inputs, encoder_state_h) # ((?, ?), (?, 100))

decoder_state_input_h = Input(shape=(latent_dim,), name='decoder_state_input_h')
decoder_inputs = Input(shape=(None,), name='InfDecoderInput')
embedded_dec_inputs = dec_output_embedding(decoder_inputs)
decoder_outputs, decoder_state_h = decoder(embedded_dec_inputs, initial_state=decoder_state_input_h)
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs, decoder_state_input_h],
    [decoder_outputs, decoder_state_h],
    name='InfDecoderModel'
)
print('Encoder Model')
print(encoder_model.summary())
print('Decoder Model')
print(decoder_model.summary())

Encoder Model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
EncoderInput (InputLayer)    (None, None)              0         
_________________________________________________________________
EncoderEmbedding (Embedding) (None, None, 100)         242100    
_________________________________________________________________
EncoderRNN (GRU)             [(None, 100), (None, 100) 60300     
Total params: 302,400
Trainable params: 302,400
Non-trainable params: 0
_________________________________________________________________
None
Decoder Model
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
InfDecoderInput (InputLayer)    (None, None)         0                                            
______________________________________________________________________

In [28]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    
    target_seq = np.zeros((1, 1)) # (n_samples, seq_len)
    target_seq[0, 0] = target_w2i['\t'] # start character
    
    stop_cond = False
    decoded_sentence = ''
    while not stop_cond:
        output_tokens, h = decoder_model.predict([target_seq, states_value])

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = target_i2w[sampled_token_index]
        decoded_sentence += ' ' + sampled_word
        
        if sampled_word == '\n' or len(decoded_sentence) > max_decoder_seq_len:
            if decoded_sentence[-1] == '\n':
                decoded_sentence = decoded_sentence[:-1]
            stop_cond = True
        
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        
        states_value = h
    
    return decoded_sentence

for seq_idx in range(10):
    input_seq = enc_input_data[seq_idx: seq_idx+1]
    decoded_sentence = decode_sequence(input_seq)
    print('----')
    print('input sentence:', input_texts[seq_idx])
    print('decoded sentence:', decoded_sentence)
    print('correct sentence:', ' '.join(target_texts[seq_idx][1:-1]))

----
input sentence: ['Go', '.']
decoded sentence:  Fais un verre !
correct sentence: Va !
----
input sentence: ['Run', '!']
decoded sentence:  Bien joué ! 
correct sentence: Cours !
----
input sentence: ['Run', '!']
decoded sentence:  Bien joué ! 
correct sentence: Courez !
----
input sentence: ['Wow', '!']
decoded sentence:  Ça a l'air bien
correct sentence: Ça alors !
----
input sentence: ['Fire', '!']
decoded sentence:  Quelle ? 
correct sentence: Au feu !
----
input sentence: ['Help', '!']
decoded sentence:  Allons-y ! 
correct sentence: À l'aide !
----
input sentence: ['Jump', '.']
decoded sentence:  Parle avec moi
correct sentence: Saute .
----
input sentence: ['Stop', '!']
decoded sentence:  Arrêtez ! 
correct sentence: Ça suffit !
----
input sentence: ['Stop', '!']
decoded sentence:  Arrêtez ! 
correct sentence: Stop !
----
input sentence: ['Stop', '!']
decoded sentence:  Arrêtez ! 
correct sentence: Arrête-toi !
