In [1]:
from nltk.tokenize import word_tokenize
from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np

Using TensorFlow backend.


In [2]:
batch_size = 128
n_epoch = 10
latent_dim = 256
n_samples = 10000
data_path = 'fra-eng/fra.txt' # you have to download dataset from http://www.manythings.org/anki/

In [3]:
input_texts, target_texts = [], []
input_vocab, target_vocab = set(), set()
lines = open(data_path).read().split('\n')
for line in lines[:min(n_samples, len(lines) -1)]:
    in_txt, tg_txt = line.split('\t')
    tg_txt = '\t' + tg_txt + '\n' # \t for <start> word and \n for <end> word for Decoder
    input_texts.append(in_txt)
    target_texts.append(tg_txt)
    for w in word_tokenize(in_txt):
        if w not in input_vocab:
            input_vocab.add(w)
    for w in word_tokenize(tg_txt):
        if w not in target_vocab:
            target_vocab.add(w)

input_vocab = sorted(list(input_vocab) + ['\t', '\n'])
target_vocab = sorted(list(target_vocab) + ['\t', '\n']) 
n_input_vocab = len(input_vocab)
n_target_vocab = len(target_vocab)
max_encoder_seq_len = max([len(txt) for txt in input_texts])
max_decoder_seq_len = max([len(txt) for txt in target_texts])

In [4]:
print('n_input_vocab', n_input_vocab)
print('n_target_vocab', n_target_vocab)
print('max enc len', max_encoder_seq_len)
print('max dec len', max_decoder_seq_len)

n_input_vocab 2422
n_target_vocab 4935
max enc len 16
max dec len 59


In [5]:
input_w2i = {w:i for i,w in enumerate(input_vocab)}
input_i2w = {i:w for i,w in enumerate(input_vocab)}
target_w2i = {w:i for i,w in enumerate(target_vocab)}
target_i2w = {i:w for i,w in enumerate(target_vocab)}

In [7]:
enc_input_data = np.zeros( (len(input_texts), max_encoder_seq_len, n_input_vocab) )
dec_input_data = np.zeros( (len(input_texts), max_decoder_seq_len, n_target_vocab) )
dec_target_data = np.zeros( (len(input_texts), max_decoder_seq_len, n_target_vocab) )

for i, (in_text, tg_text) in enumerate(zip(input_texts, target_texts)):
    for t, w in enumerate(word_tokenize(in_text)):
        enc_input_data[i, t, input_w2i[w]] = 1.
    for t, w in enumerate(word_tokenize(tg_text)):
        dec_input_data[i, t, target_w2i[w]] = 1.
        if t > 0:
            dec_target_data[i, t-1, target_w2i[w]] = 1.

In [8]:
# Encoder
encoder_inputs = Input(shape=(None, n_input_vocab))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c] # discard encoder_outputs

# Decoder
decoder_inputs = Input(shape=(None, n_target_vocab))
decoder = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(n_target_vocab, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
print(model.summary())
model.fit([enc_input_data, dec_input_data], dec_target_data,
          batch_size=batch_size,
          epochs=2,
          validation_split=.2)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, None, 2422)    0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, None, 4935)    0                                            
____________________________________________________________________________________________________
lstm_1 (LSTM)                    [(None, 256), (None,  2743296     input_1[0][0]                    
____________________________________________________________________________________________________
lstm_2 (LSTM)                    [(None, None, 256), ( 5316608     input_2[0][0]                    
                                                                   lstm_1[0][1]            

<keras.callbacks.History at 0x7f2cfa173470>

In [9]:
model.save('seq2seq_word.h5')

# Inference

In [10]:
encoder_model = Model(encoder_inputs, encoder_states)
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)
print('Encoder Model')
print(encoder_model.summary())
print('Decoder Model')
print(decoder_model.summary())

Encoder Model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None, 2422)        0         
_________________________________________________________________
lstm_1 (LSTM)                [(None, 256), (None, 256) 2743296   
Total params: 2,743,296
Trainable params: 2,743,296
Non-trainable params: 0
_________________________________________________________________
None
Decoder Model
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_2 (InputLayer)             (None, None, 4935)    0                                            
____________________________________________________________________________________________________
input_3 (InputLayer)             (None, 256)           0                                   

In [11]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    
    target_seq = np.zeros((1, 1, n_target_vocab)) # (n_samples, seq_len, n_vocab)
    target_seq[0, 0, target_w2i['\t']] = 1. # start character
    
    stop_cond = False
    decoded_sentence = ''
    while not stop_cond:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = target_i2w[sampled_token_index]
        decoded_sentence += sampled_char
        
        if sampled_word == '\n' or len(decoded_sentence) > max_decoder_seq_len:
            stop_cond = True
        
        target_seq = np.zeros((1, 1, n_target_vocab))
        target_seq[0, 0, sampled_token_index] = 1.
        
        states_value = [h, c]
    
    return decoded_sentence

for seq_idx in range(100):
    input_seq = enc_input_data[seq_idx: seq_idx+1]
    decoded_sentence = decode_sequence(input_seq)
    print('----')
    print('input sentence:', input_texts[seq_idx])
    print('decoded sentence:', decoded_sentence)    

NameError: name 'sampled_char' is not defined