<a href="https://colab.research.google.com/github/jihokwak/nlp/blob/master/seq2seq_word_level_with_keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import tensorflow as tf
from keras import layers, models
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [0]:
# Turn off TensorFlow eager execution before any layer or model is created.
# NOTE(review): presumably needed so the standalone `keras` package runs in
# graph mode -- confirm against the installed TF/Keras versions.
tf.compat.v1.disable_eager_execution()

In [0]:
import numpy as np
import pandas as pd
import os

In [0]:
# Parallel accumulators for the raw sentence pairs: entry k of one list
# belongs with entry k of the other.
input_texts, target_texts = [], []

In [0]:
# Corpus size cap and training hyper-parameters used by the cells below.
num_samples = 10000  # upper bound on the number of sentence pairs kept
latent_dim = 50      # size of the embeddings and of the GRU state
epochs = 100         # number of passes requested from model.fit
batch_size = 128     # examples per training batch

In [0]:
import os

# The corpus is stored next to the notebook, i.e. in the current working directory.
CUR_DIR = os.path.abspath(os.curdir)
FILE_NAME = os.path.join(CUR_DIR, "fra-eng.txt")

In [0]:
import requests

# Fetch the tab-separated sentence-pair corpus and store it at FILE_NAME.
DATA_URL = "https://raw.githubusercontent.com/jinfagang/pytorch_chatbot/master/datasets/eng-fra.txt"
# A timeout keeps the cell from hanging forever on a stalled connection.
resp = requests.get(DATA_URL, timeout=60)
# Fail loudly on HTTP errors instead of silently saving an error page as the corpus.
resp.raise_for_status()
with open(FILE_NAME, "wb") as f:
    f.write(resp.content)

In [0]:
# Read the whole corpus as UTF-8 text, then break it into raw lines.
with open(FILE_NAME, mode='r', encoding='utf-8') as f:
    raw_corpus = f.read()
lines = raw_corpus.split("\n")

In [0]:
# Order the raw lines from shortest to longest (by character count), so the
# num_samples cut applied later keeps the shortest sentence pairs.
lines = list(lines)
lines.sort(key=len)

In [0]:
# Split every raw line into its first two tab-separated fields (source text,
# target text). Lines with fewer than two fields (e.g. the empty string left
# after the final newline) are skipped.
# The accumulators are re-created here so that re-running this cell does not
# append a second copy of every pair.
input_texts = []
target_texts = []
for line in lines:
    fields = line.split("\t")
    if len(fields) < 2:
        continue
    input_text, target_text = fields[:2]
    input_texts.append(input_text)
    target_texts.append(target_text)

In [0]:
# Put the sentence pairs in a two-column frame and keep at most `num_samples`
# rows. Slicing with `:num_samples` already clamps to the frame length, so no
# explicit min() is needed (the previous `len(lines) - 1` bound dropped the
# last pair whenever the corpus had `num_samples` rows or fewer).
# .copy() detaches the subset so the column assignments in the following
# cells operate on an independent frame rather than on a slice.
lines = pd.DataFrame({'eng': input_texts, 'fra': target_texts})
lines = lines.iloc[:num_samples].copy()

In [59]:
# Sanity check: (number of sentence pairs, number of columns).
lines.shape

(10000, 2)

In [0]:
# Data cleanup, step 1: fold both languages to lower case.
for column in ('eng', 'fra'):
    lines[column] = lines[column].apply(str.lower)

In [0]:
import re


def _drop_apostrophes_tag_commas(text):
    """Delete apostrophes and replace every comma with the literal ' COMMA'."""
    without_apostrophes = re.sub("'", '', text)
    return re.sub(",", ' COMMA', without_apostrophes)


# Data cleanup, step 2: applied identically to both languages.
for column in ('eng', 'fra'):
    lines[column] = lines[column].apply(_drop_apostrophes_tag_commas)

In [0]:
import string

# Every ASCII punctuation character is removed from both languages.
exclude = set(string.punctuation)


def _strip_punctuation(text):
    """Return `text` without any character contained in `exclude`."""
    kept = [ch for ch in text if ch not in exclude]
    return ''.join(kept)


for column in ('eng', 'fra'):
    lines[column] = lines[column].apply(_strip_punctuation)

In [0]:
# Translation table that deletes the characters 0-9.
remove_digits = str.maketrans('', '', string.digits)


def _strip_digits(text):
    """Return `text` with all ASCII digits removed."""
    return text.translate(remove_digits)


for column in ('eng', 'fra'):
    lines[column] = lines[column].apply(_strip_digits)

In [0]:
# Wrap every target sentence in explicit start/end marker words; the decoder
# is seeded with 'START_' and stops on '_END' at inference time.
lines.fra = lines.fra.apply(lambda sentence: ' '.join(('START_', sentence, '_END')))

In [0]:
# Source vocabulary: every distinct whitespace-separated word in the English column.
all_eng_words = {word for sentence in lines.eng for word in sentence.split()}

In [0]:
# Target vocabulary: every distinct whitespace-separated word in the French
# column, which at this point includes the START_ / _END markers.
all_fra_words = {word for sentence in lines.fra for word in sentence.split()}

In [74]:
# Vocabulary sizes: source (English) first, target (French) second.
print(len(all_eng_words), len(all_fra_words))

2386 4545


In [0]:
# Longest sentence, counted in words, on each side; these fix the padded
# width of the training arrays.
max_encoder_seq_length = np.max([len(tokens) for tokens in map(str.split, lines.eng)])
max_decoder_seq_length = np.max([len(tokens) for tokens in map(str.split, lines.fra)])

In [78]:
# Longest source / target sentence length in words.
print(max_encoder_seq_length,max_decoder_seq_length)

6 8


In [0]:
# Alphabetically ordered vocabularies; a word's position in these lists
# becomes its integer id.
input_words = sorted(all_eng_words)
target_words = sorted(all_fra_words)
num_encoder_tokens, num_decoder_tokens = len(input_words), len(target_words)

In [80]:
# Number of distinct source / target tokens (embedding and softmax sizes).
print(num_encoder_tokens,num_decoder_tokens)

2386 4545


In [0]:
# word -> integer id lookups, with ids starting at 0 in sorted-vocabulary order.
input_token_index = {word: idx for idx, word in enumerate(input_words)}
target_token_index = {word: idx for idx, word in enumerate(target_words)}

In [0]:
# Zero-initialised training arrays:
#   encoder_input_data  - source token ids, one row per sentence
#   decoder_input_data  - target token ids, one row per sentence
#   decoder_target_data - one-hot next-word targets (float16 to limit memory)
num_pairs = len(lines.eng)
encoder_input_data = np.zeros(shape=(num_pairs, max_encoder_seq_length), dtype='float32')
decoder_input_data = np.zeros(shape=(num_pairs, max_decoder_seq_length), dtype='float32')
decoder_target_data = np.zeros(
    shape=(num_pairs, max_decoder_seq_length, num_decoder_tokens),
    dtype="float16",
)

In [0]:
# Fill the padded id arrays. The target array is the decoder input shifted
# one step to the left, so position t of the target holds the word that
# follows position t of the decoder input.
# NOTE(review): unused positions stay 0, which is also the id of the first
# vocabulary word -- padding and that word are indistinguishable to the model.
for i, (input_text, target_text) in enumerate(zip(lines.eng, lines.fra)):
    for pos, token in enumerate(input_text.split()):
        encoder_input_data[i, pos] = input_token_index[token]
    for pos, token in enumerate(target_text.split()):
        token_id = target_token_index[token]
        decoder_input_data[i, pos] = token_id
        if pos >= 1:
            decoder_target_data[i, pos - 1, token_id] = 1

In [0]:
# Encoder: variable-length sequence of source token ids -> embedding -> GRU.
# Only the state returned by the GRU (its second output) is kept.
encoder_inputs = layers.Input(shape=(None,))
en_x = layers.Embedding(input_dim=num_encoder_tokens, output_dim=latent_dim)(encoder_inputs)
encoder = layers.GRU(units=latent_dim, return_state=True)
state_h = encoder(en_x)[1]

In [0]:
# Decoder input and its embedding layer. The layer object is kept in `de_x`
# so the inference decoder can reuse the trained weights.
decoder_inputs = layers.Input(shape=(None,))
de_x = layers.Embedding(input_dim=num_decoder_tokens, output_dim=latent_dim)
final_dex = de_x(decoder_inputs)

In [0]:
# Decoder GRU, started from the encoder state. It emits an output for every
# time step; the state it also returns is not needed during training.
decoder_gru = layers.GRU(units=latent_dim, return_sequences=True, return_state=True)
decoder_outputs = decoder_gru(final_dex, initial_state=state_h)[0]

In [0]:
# Per-time-step softmax over the target vocabulary.
# NOTE(review): `decoder_outputs` is rebound to the dense output, so running
# this cell twice stacks a second dense application on the first.
decoder_dense = layers.Dense(units=num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [0]:
# Training model: (source ids, target ids) -> next-word distribution per step.
model = models.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
)

In [90]:
# Categorical cross-entropy matches the one-hot targets in decoder_target_data.
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 50)     119300      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 50)     227250      input_2[0][0]                    
__________________________________________________________________________________________________
gru_1 (GRU

In [0]:
# Checkpoint directory: <cwd>/seq2seq_keras/model. os.makedirs creates the
# intermediate seq2seq_keras directory as well.
BASE_DIR = os.path.abspath(".")
WORK_DIR = "seq2seq_keras"
model_dir = os.path.join(BASE_DIR, WORK_DIR, "model")
os.makedirs(model_dir, exist_ok=True)

# The checkpoint file is rewritten only when the monitored quantity improves.
# NOTE(review): this monitors the training `loss`, not `val_loss` -- confirm
# that is intended given the validation split used in fit().
checkpoint_path = os.path.join(model_dir, "seq2seq2_gru_word_level_model.h5")
callbacks = [
    ModelCheckpoint(filepath=checkpoint_path, monitor='loss', save_best_only=True),
]

In [92]:
# Train with teacher forcing: the decoder receives the reference target
# sentence as input. A tenth of the data is set aside for validation.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=callbacks,
)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 9000 samples, validate on 1000 samples
Epoch 1/100


  '. They will not be included '


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<keras.callbacks.History at 0x7ff17fa12cc0>

In [93]:
# Inference-time encoder: source token ids -> GRU state, sharing the trained layers.
encoder_model = models.Model(inputs=encoder_inputs, outputs=state_h)
encoder_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, None, 50)          119300    
_________________________________________________________________
gru_1 (GRU)                  [(None, 50), (None, 50)]  15150     
Total params: 134,450
Trainable params: 134,450
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Inference-time decoder. It reuses the trained embedding, GRU and dense
# layers, but its initial state comes from an explicit input and the updated
# state is returned, so decoding can advance one step per predict() call.
decoder_state_input_h = layers.Input(shape=(latent_dim,))

final_de_x2 = de_x(decoder_inputs)
decoder_gru_seq2, state_h2 = decoder_gru(final_de_x2, initial_state=decoder_state_input_h)
decoder_outputs2 = decoder_dense(decoder_gru_seq2)

decoder_model = models.Model(
    inputs=[decoder_inputs, decoder_state_input_h],
    outputs=[decoder_outputs2, state_h2],
)

In [0]:
# Integer id -> word lookups. The names say "char" but the entries are whole words.
reverse_input_char_index = {idx: word for word, idx in input_token_index.items()}
reverse_target_char_index = {idx: word for word, idx in target_token_index.items()}

In [0]:
def decode_sequence(input_seq, max_decoded_chars=52):
    """Greedily decode one source sequence into a string of target words.

    Uses the notebook-level ``encoder_model``, ``decoder_model``,
    ``target_token_index`` and ``reverse_target_char_index``.

    Parameters
    ----------
    input_seq : array-like
        Passed unchanged to ``encoder_model.predict``. Assumed to be a single
        row of source token ids (batch size 1) -- TODO confirm against callers.
    max_decoded_chars : int, optional
        Decoding stops as soon as the decoded string is longer than this many
        characters. The default of 52 is the limit that used to be hard-coded.

    Returns
    -------
    str
        The predicted words, each preceded by one space. The ``'_END'`` marker
        is part of the string when the decoder produced it before the length
        limit was reached.
    """
    # Encode the input; the resulting state seeds the decoder.
    states_value = encoder_model.predict(input_seq)

    # The decoder is fed one token at a time, beginning with the start marker.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index['START_']

    decoded_words = []
    decoded_length = 0  # length of the final string: one space plus the word, per token
    while True:
        output_tokens, states_value = decoder_model.predict([target_seq, states_value])

        # Greedy choice: most probable word at the last (only) time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_target_char_index[sampled_token_index]
        decoded_words.append(sampled_word)
        decoded_length += 1 + len(sampled_word)

        # Stop on the end marker or once the character budget is exceeded.
        if sampled_word == '_END' or decoded_length > max_decoded_chars:
            break

        # Feed the chosen word back in as the next decoder input.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

    return ''.join(' ' + word for word in decoded_words)