In [1]:
from nltk.tokenize import word_tokenize
from keras.models import Model
from keras.layers import Input, LSTM, Dense, GRU, TimeDistributed, Activation
from keras.layers.embeddings import Embedding
import numpy as np

Using TensorFlow backend.


In [37]:
batch_size = 128
n_epoch = 10
latent_dim = 200
n_samples = 10000
data_path = 'fra-eng/fra.txt' # you have to download dataset from http://www.manythings.org/anki/

In [63]:
input_texts, target_texts = [], []
input_vocab, target_vocab = set(), set()
lines = open(data_path).read().split('\n')
for line in lines[:min(n_samples, len(lines) -1)]:
    in_txt, tg_txt = line.split('\t', 1)
    in_txt = word_tokenize(in_txt)
    tg_txt = ['\t'] + word_tokenize(tg_txt) + ['\n']
#     tg_txt = '\t' + tg_txt + '\n' # \t for <start> word and \n for <end> word for Decoder
    input_texts.append(in_txt)
    target_texts.append(tg_txt)
    for w in in_txt:
        if w not in input_vocab:
            input_vocab.add(w)
    for w in tg_txt:
        if w not in target_vocab:
            target_vocab.add(w)

input_vocab = sorted(list(input_vocab))
target_vocab = sorted(list(target_vocab)) 
n_input_vocab = len(input_vocab)
n_target_vocab = len(target_vocab)
max_encoder_seq_len = max([len(txt) for txt in input_texts])
max_decoder_seq_len = max([len(txt) for txt in target_texts])

In [64]:
print('n_input_vocab', n_input_vocab)
print('n_target_vocab', n_target_vocab)
print('max enc len', max_encoder_seq_len)
print('max dec len', max_decoder_seq_len)

n_input_vocab 2420
n_target_vocab 4935
max enc len 6
max dec len 14


In [65]:
print(input_texts[0])
print(target_texts[0])

['Go', '.']
['\t', 'Va', '!', '\n']


In [66]:
input_w2i = {w:i for i,w in enumerate(input_vocab)}
input_i2w = {i:w for i,w in enumerate(input_vocab)}
target_w2i = {w:i for i,w in enumerate(target_vocab)}
target_i2w = {i:w for i,w in enumerate(target_vocab)}

In [68]:
enc_input_data = np.zeros( (len(input_texts), max_encoder_seq_len) )
dec_input_data = np.zeros( (len(input_texts), max_decoder_seq_len) )
dec_target_data = np.zeros( (len(input_texts), max_decoder_seq_len, n_target_vocab) )

for i, (in_text, tg_text) in enumerate(zip(input_texts, target_texts)):
    for t, w in enumerate(in_text):
        enc_input_data[i, t] = input_w2i[w]
    for t, w in enumerate(tg_text):
        dec_input_data[i, t] = target_w2i[w]
        if t > 0:
            dec_target_data[i, t-1, target_w2i[w]] = 1.

In [69]:
enc_input_data[0].shape

(6,)

In [71]:
print('enc_input_data:', enc_input_data.shape) # (N, seq_len, vocab_size)
# Encoder
embd_size = 100
encoder_inputs = Input(shape=(max_encoder_seq_len, ), name='EncoderInput')
print('encInputs', encoder_inputs.shape)
enc_input_embedding = Embedding(input_dim=n_input_vocab, output_dim=embd_size, input_length=max_encoder_seq_len, name='EncoderEmbedding')
embedded_enc_inputs = enc_input_embedding(encoder_inputs)
print('embd_enc_inputs', embedded_enc_inputs.shape)
encoder = GRU(latent_dim, return_state=True, name='EncoderRNN')
encoder_outputs, encoder_state_h = encoder(embedded_enc_inputs)
# encoder_states = [state_h] # The decoder use only last output of the encoder and discard encoder_outputs
# print(state_h.shape, encoder_states[0].shape)

# Decoder
decoder_inputs = Input(shape=(max_decoder_seq_len,), name='DecoderInput')
dec_output_embedding = Embedding(input_dim=n_target_vocab, output_dim=embd_size, input_length=max_decoder_seq_len, name='DecoderEmbedding')
embedded_dec_inputs = Activation('relu')(dec_output_embedding(decoder_inputs))
decoder = GRU(latent_dim, return_sequences=True, return_state=True, name='DecoderGRU')
print('embedded_dec_inputs', embedded_dec_inputs.shape)
decoder_outputs, _ = decoder(embedded_dec_inputs, initial_state=encoder_state_h)
# decoder_dense = TimeDistributed(Dense(n_target_vocab, activation='softmax', name='FinalDense'))
decoder_dense = TimeDistributed(Dense(n_target_vocab, activation='softmax', name='FinalDense'))
decoder_outputs = decoder_dense(decoder_outputs)
print('decoder_coutputs', decoder_outputs.shape)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
print(model.summary())
model.fit([enc_input_data, dec_input_data], dec_target_data,
          batch_size=batch_size,
          epochs=1,
          validation_split=.2)

enc_input_data: (10000, 6)
encInputs (?, 6)
embd_enc_inputs (?, 6, 100)
embedded_dec_inputs (?, 14, 100)
decoder_coutputs (?, 14, 4935)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
DecoderInput (InputLayer)       (None, 14)           0                                            
__________________________________________________________________________________________________
EncoderInput (InputLayer)       (None, 6)            0                                            
__________________________________________________________________________________________________
DecoderEmbedding (Embedding)    (None, 14, 100)      493500      DecoderInput[0][0]               
__________________________________________________________________________________________________
EncoderEmbedding (Embedding)    (None, 6, 100)       242000      Encoder

<keras.callbacks.History at 0x7f18e0a01a58>

In [72]:
model.save('seq2seq_word.h5')

  str(node.arguments) + '. They will not be included '


# Inference

In [78]:
encoder_model = Model(encoder_inputs, encoder_state_h)
decoder_state_input_h = Input(shape=(max_decoder_seq_len, latent_dim,))
# decoder_state_input_c = Input(shape=(latent_dim,))
# decoder_states_inputs = [decoder_state_input_h]
embedded_dec_inputs_2 = dec_output_embedding(decoder_inputs)
print('decoder_inputs', decoder_inputs.shape)
print('embedded_dec_inputs_2', embedded_dec_inputs_2.shape)
decoder_outputs2, decoder_state_h = decoder(embedded_dec_inputs_2, initial_state=decoder_state_input_h)
print('decoder_outputs2', decoder_outputs2.shape)
# decoder_states = [state_h]
decoder_outputs = decoder_dense(decoder_outputs2)
print('dec_inputs:', decoder_inputs.shape)
print('dec_state-input_h:', decoder_state_input_h.shape)
decoder_model = Model(
    [decoder_inputs] + decoder_state_input_h,
    [decoder_outputs] + decoder_states
)
print('Encoder Model')
print(encoder_model.summary())
print('Decoder Model')
print(decoder_model.summary())

decoder_inputs (?, 14)
embedded_dec_inputs_2 (?, 14, 100)
decoder_outputs2 (?, ?, 14, 200)


ValueError: Input 0 is incompatible with layer time_distributed_4: expected ndim=3, found ndim=4

In [None]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    
    target_seq = np.zeros((1, 1, n_target_vocab)) # (n_samples, seq_len, n_vocab)
    target_seq[0, 0, target_w2i['\t']] = 1. # start character
    
    stop_cond = False
    decoded_sentence = ''
    while not stop_cond:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = target_i2w[sampled_token_index]
        decoded_sentence += sampled_char
        
        if sampled_word == '\n' or len(decoded_sentence) > max_decoder_seq_len:
            stop_cond = True
        
        target_seq = np.zeros((1, 1, n_target_vocab))
        target_seq[0, 0, sampled_token_index] = 1.
        
        states_value = [h, c]
    
    return decoded_sentence

for seq_idx in range(100):
    input_seq = enc_input_data[seq_idx: seq_idx+1]
    decoded_sentence = decode_sequence(input_seq)
    print('----')
    print('input sentence:', input_texts[seq_idx])
    print('decoded sentence:', decoded_sentence)    