### Word-level Sequence-to-Sequence for Machine Translation 

Use the same approach as the character-level model, but with words rather than characters as the input tokens.

In [1]:
# Use a dataset of pairs of English sentences and their French translation

!wget http://www.manythings.org/anki/fra-eng.zip
!unzip fra-eng.zip -d fra-eng
!more fra-eng/fra.txt

--2019-07-19 08:50:45--  http://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.24.108.196, 104.24.109.196, 2606:4700:30::6818:6dc4, ...
Connecting to www.manythings.org (www.manythings.org)|104.24.108.196|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3467257 (3.3M) [application/zip]
Saving to: ‘fra-eng.zip.1’


2019-07-19 08:50:47 (2.74 MB/s) - ‘fra-eng.zip.1’ saved [3467257/3467257]

Archive:  fra-eng.zip
replace fra-eng/_about.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace fra-eng/fra.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
Go.	Va !
Hi.	Salut !
Hi.	Salut.
Run!	Cours !
Run!	Courez !
Who?	Qui ?
Wow!	Ça alors !
Fire!	Au feu !
Help!	À l'aide !
Jump.	Saute.
Stop!	Ça suffit !
Stop!	Stop !
Stop!	Arrête-toi !
Wait!	Attends !
Wait!	Attendez !
Go on.	Poursuis.
Go on.	Continuez.
Go on.	Poursuivez.
Hello!	Bonjour !
Hello!	Salut !
I see.	Je comprends.
I try.	J'essaye.
I won!	J'ai gagné !
[K

In [0]:
from __future__ import print_function

from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.layers.embeddings import Embedding
import numpy as np

batch_size = 64  # Number of samples per gradient update; passed to model.fit.
epochs = 100  # Number of passes over the training data; passed to model.fit.
latent_dim = 256  # Size of the word embeddings and of the LSTM hidden/cell states.
num_samples = 10000  # Maximum number of sentence pairs to read from the data file.
# Path to the data txt file on disk (one "<english>TAB<french>" pair per line).
data_path = 'fra-eng/fra.txt'

# "start sequence" and "end sequence" marker words.
# end_token is appended to every sentence; start_token is prepended to the
# French sentences that are fed to the Decoder as input.
start_token = "sss"
end_token = "eee"

In [13]:
#----------------------------------------------
# Split a sentence string into a list of its whitespace-separated words
#----------------------------------------------
def line2words (line):
  # split() without a separator splits on any run of whitespace and drops
  # empty pieces, so leading/trailing blanks and newlines never become words
  return line.split()

#----------------------------------------------
# Given a list of sentences (each a list of words), return the number of
# words in the longest sentence
#----------------------------------------------
def max_sentence_length (word_sentences):
  # One length per sentence; max() picks the largest of them
  sentence_lengths = map (len, word_sentences)
  return max (sentence_lengths)

#----------------------------------------------
# Read up to num_samples sentence pairs from the data file, and return three
# lists of sentences. Each sentence is a list of words.
#
# eng_word_sentences = [['Go.', 'eee'], ['Hi.', 'eee'], ...]
# fr_word_sentences = [['sss', 'Va', '!', 'eee'], ['sss', 'Salut', '!', 'eee'], ...]
# fr_word_sentences_shift = [['Va', '!', 'eee'], ['Salut', '!', 'eee'], ...]
#
# The 'eng_word_sentences' are the original English sentences and will be input to the Encoder
# The 'fr_word_sentences' are the original French sentences starting with a start-token and will be the input to the Decoder
# The 'fr_word_sentences_shift' are the original French sentences and will be the target output of the Decoder
#
# Note that we append an end-token to all sentences
#
# Parameters:
#   data_path   - path of a UTF-8 text file with one "<eng_sentence>TAB<fr_sentence>"
#                 pair per line; any further tab-separated columns are ignored
#   num_samples - maximum number of sentence pairs to read
#
# Returns the tuple (eng_word_sentences, fr_word_sentences, fr_word_sentences_shift).
# All three lists have the same length, which is smaller than num_samples when
# the file contains fewer usable lines.
#----------------------------------------------
def read_sentences (data_path, num_samples):
  eng_word_sentences = []
  fr_word_sentences = []
  fr_word_sentences_shift = []

  # The French text contains accented characters, so the encoding is given
  # explicitly instead of relying on the platform default
  with open(data_path, "r", encoding="utf-8") as fp:
    for line in fp:
      # We will read up to num_samples sentence pairs
      if len (eng_word_sentences) >= num_samples:
        break

      # Split the line as <eng_sentence>TAB<fr_sentence>[TAB<anything else>]
      fields = line.rstrip ('\r\n').split ('\t')
      if len (fields) < 2:
        # Blank or malformed line: there is no sentence pair to read, skip it
        continue
      eng_sentence, fr_sentence = fields[0], fields[1]

      # Decoder target: French sentence followed by the end-token
      fr_sentence_shift = fr_sentence + ' ' + end_token
      # Decoder input: the same sentence with the start-token in front
      fr_sentence = start_token + ' ' + fr_sentence_shift
      # Encoder input: English sentence followed by the end-token
      eng_sentence = eng_sentence + ' ' + end_token

      # Convert each sentence from a string to a list of words and store it
      eng_word_sentences.append (line2words (eng_sentence))
      fr_word_sentences.append (line2words (fr_sentence))
      fr_word_sentences_shift.append (line2words (fr_sentence_shift))

  return (eng_word_sentences, fr_word_sentences, fr_word_sentences_shift)

# Read up to num_samples sentence pairs from the data file as lists of words
eng_word_sentences, fr_word_sentences, fr_word_sentences_shift = read_sentences (data_path, num_samples)

# Get the length (in words) of the longest sentence for each language.
# The French length is taken from the start-token version, which is one word
# longer than the shifted target version of the same sentence.
max_eng_sentence_length = max_sentence_length (eng_word_sentences)
max_fr_sentence_length = max_sentence_length (fr_word_sentences)

# Sanity checks: number of sentences, two sample sentences in all three forms,
# and the maximum sentence lengths
print (len (eng_word_sentences), len (fr_word_sentences))
print (eng_word_sentences[10], fr_word_sentences[10], fr_word_sentences_shift[10])
print (eng_word_sentences[75], fr_word_sentences[75], fr_word_sentences_shift[75])
print (max_eng_sentence_length, max_fr_sentence_length)

10000 10000
['Stop!', 'eee'] ['sss', 'Ça', 'suffit', '!', 'eee'] ['Ça', 'suffit', '!', 'eee']
['Awesome!', 'eee'] ['sss', 'Fantastique', '!', 'eee'] ['Fantastique', '!', 'eee']
6 12


In [14]:
#----------------------------------------------
# Build the vocab for a corpus of sentences: the sorted list of all distinct
# words, together with the number of distinct words
#----------------------------------------------
def vocab (word_sentences):
  # Collect every word of every sentence; the set keeps one copy of each
  unique_words = set ()
  for word_sentence in word_sentences:
    unique_words.update (word_sentence)

  # Sorting gives every word a stable position, which is used as its index
  vocab_words = sorted (unique_words)
  return (vocab_words, len (vocab_words))

# Get the sorted list of distinct words (and its size) for each language.
# The French vocab is built from the start-token sentences, so it contains
# both the start-token and the end-token.
eng_vocab_words, eng_vocab_size = vocab (eng_word_sentences)
fr_vocab_words, fr_vocab_size = vocab (fr_word_sentences)

print (eng_vocab_size, fr_vocab_size)

#----------------------------------------------
# Dictionary to map from a word to its index in the vocab list
#----------------------------------------------
eng_vocab_dict = {word:i for i, word in enumerate (eng_vocab_words)}
fr_vocab_dict = {word:i for i, word in enumerate (fr_vocab_words)}

#----------------------------------------------
# Word -> index utility function
# Looks the word up in the given word->index dictionary; a word that is not
# in the dictionary raises KeyError
#----------------------------------------------
def word2idx (vocab_dict, word):
  index = vocab_dict[word]
  return index

#----------------------------------------------
# Index -> word utility function
# Returns the word stored at position i of the given vocab list
#----------------------------------------------
def idx2word (vocab_words, i):
  word = vocab_words[i]
  return word
  
# Spot-check both lookup directions on the English vocab
print (word2idx (eng_vocab_dict, 'Fire!'))
print (idx2word (eng_vocab_words, 27))

3102 5788
148
American.


In [23]:
#----------------------------------------------
# The training data consists of three arrays, two of which are 2D arrays of shape (num of samples, num of timesteps)
#
# Each sample is one sentence
#
# The input at each timestep will be a single word from a sentence. Since we will have a fixed 
# number of timesteps, it will be taken as the maximum number of words of any sentence. Each word 
# will be a single value viz. the index of that word
# 
# The third array, which is the target, is a 3D array of shape (num of samples, num of timesteps, num of target values per timestep)
# Since each target word will be one-hot encoded from the vocab, the number of target values per timestep
# will be the size of the French vocab
#
# The reason for the target array to be 3D is that the Decoder output predictions are 3D with
# the third dimension being the Softmax values for each possible word in the target vocabulary.
# Hence the target array also needs to be of matching 3D shape. It will have a 1 for the target word
# and a 0 for all the others ie. it will be one-hot encoded
#----------------------------------------------

# Index of the end-token in each vocab; it is also used as the padding value
eng_end_token_idx = word2idx (eng_vocab_dict, end_token)
fr_end_token_idx = word2idx (fr_vocab_dict, end_token)

# Size the arrays from the sentences that were actually read. num_samples is
# only an upper bound: if the data file has fewer lines, sizing by num_samples
# would leave extra rows that contain nothing but padding and an all-zero target
num_sentences = len (eng_word_sentences)

# Create the training data of the right shape, and initialise all values to the end_token index
# so that every position after the end of a sentence is padded with the end-token
encoder_input_data = np.full((num_sentences, max_eng_sentence_length), float(eng_end_token_idx), dtype='float32')
decoder_input_data = np.full((num_sentences, max_fr_sentence_length), float(fr_end_token_idx), dtype='float32')

# Since the target values are one-hot encoded, we initialise all values to 0
decoder_target_data = np.zeros((num_sentences, max_fr_sentence_length, fr_vocab_size), dtype='float32')

#----------------------------------------------
# Populate a 2D training array in place
# Row i is sentence i and column j is the j-th word of that sentence; each
# cell receives the vocab index of its word, stored as a float. Cells beyond
# the end of a sentence are left untouched. Nothing is returned.
#----------------------------------------------
def word_indices (data, word_sentences, vocab_dict):
  for sample_no, sentence in enumerate (word_sentences):
    for step_no, token in enumerate (sentence):
      # Look the word up and store its index at (sample, timestep)
      data [sample_no, step_no] = float (word2idx (vocab_dict, token))

#----------------------------------------------
# Populate a 3D training array in place with one-hot encoded words
# For the j-th word of sentence i, the cell (i, j, index of that word) is set
# to 1.0; every other cell is left untouched. Nothing is returned.
#----------------------------------------------
def word_indices_onehot (data, word_sentences, vocab_dict):
  for sample_no, sentence in enumerate (word_sentences):
    for step_no, token in enumerate (sentence):
      # The word's vocab index selects the position along the third dimension
      hot_position = word2idx (vocab_dict, token)
      data [sample_no, step_no, hot_position] = 1.0

      
      
# Prepare all three arrays for training (each call fills its array in place):
#   encoder input  <- English sentences, as word indices
#   decoder input  <- French sentences starting with the start-token, as word indices
#   decoder target <- French sentences without the start-token, one-hot encoded
word_indices (encoder_input_data, eng_word_sentences, eng_vocab_dict)
word_indices (decoder_input_data, fr_word_sentences, fr_vocab_dict)
word_indices_onehot (decoder_target_data, fr_word_sentences_shift, fr_vocab_dict)

# Sanity checks: the padding indices, one sample sentence pair, and its encoded rows
print (eng_end_token_idx, fr_end_token_idx)
print (eng_word_sentences[3], fr_word_sentences[3])
print (encoder_input_data[3], decoder_input_data[3], decoder_target_data[3])

1186 2441
[ 316. 1186. 1186. 1186. 1186. 1186.] [5060.  216.    0. 2441. 2441. 2441. 2441. 2441. 2441. 2441. 2441. 2441.] [[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] ['Run!', 'eee'] ['sss', 'Cours', '!', 'eee']


In [0]:
#----------------------------------------------
# Now that the training data has been prepared, we will build an LSTM Encoder-Decoder model and
# train it to predict decoder_target_data given encoder_input_data and decoder_input_data.
#
# The training process and inference process (decoding sentences) are quite different, so we will
# use different models for both. However both models will use the same inner layers
#
# We have one Embedding Layer and one Encoder LSTM layer
# We have another Embedding Layer, one Decoder LSTM layer and a Dense layer for predictions
# We build the model using the Keras Functional API
#----------------------------------------------

#----------------------------------------------
# Build the Encoder Embedding layer to convert input word indices into word embeddings
# It takes the encoder_input_data as input
# shape=(None, ) leaves the number of timesteps unspecified
encoder_inputs = Input(shape=(None, ))
encoder_embedding = Embedding(eng_vocab_size, latent_dim)(encoder_inputs)

#----------------------------------------------
# Build the Encoder - it is a LSTM layer that takes the encoder embeddings as input
# We discard its output and keep the states, so that they can be passed to the Decoder

# return_state=True tells the RNN layer to return a list where the first entry is the
# outputs and the next entries are the internal RNN states
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embedding)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]
#----------------------------------------------

#----------------------------------------------
# Build the Decoder Embedding layer to convert input word indices into word embeddings
# It takes the decoder_input_data as input
decoder_inputs = Input(shape=(None, ))
decoder_embedding = Embedding(fr_vocab_size, latent_dim)(decoder_inputs)

#----------------------------------------------
# Build the Decoder - it is a LSTM layer that takes the decoder embeddings as input
# and uses the `encoder_states` as initial state.

# return_sequences=True tells the RNN layer to return its full sequence of outputs (instead
# of just the last output, which is the default behavior). We also return the internal
# states. We don't use those states in the training but we will use them during inference
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding,
                                     initial_state=encoder_states)
# Pass the decoder outputs through a Dense layer with Softmax to get the predictions
# (one probability per word of the French vocab, for every timestep)
decoder_dense = Dense(fr_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
#----------------------------------------------

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Run training
# categorical_crossentropy matches the one-hot encoded decoder_target_data;
# validation_split=0.2 holds out 20% of the samples for validation
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

In [43]:
#----------------------------------------------
# We will build the Inference model using the Encoder-Decoder layers we created
# earlier.
#----------------------------------------------

# This is simply to change some variable names
max_encoder_seq_length = max_eng_sentence_length
max_decoder_seq_length = max_fr_sentence_length
input_token_index = eng_vocab_dict
target_token_index = fr_vocab_dict
input_texts = eng_word_sentences

# ------------------------------------------
# Build the Encoder model, which returns the encoder states
encoder_model = Model(encoder_inputs, encoder_states)
# ------------------------------------------

# ------------------------------------------
# The Decoder will work in a loop. For its first iteration, its input sequence contains only
# the start token. And its initial state is the Encoder state. It then generates the
# next word output along with its own internal state.
#
# Now for the next iteration, it appends the next word to the input sequence
# and uses that as input. It also takes its own internal state and feeds that back
# to itself as the initial state for the next iteration
#
# Hence, when building the model, we define the Decoder Initial State as an input
# variable (rather than using the Encoder state directly since we will take the Encoder state
# only for the first time)
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# Use the Decoder LSTM layer created during training, but with different inputs
# and initial states
# NOTE: this rebinds `decoder_outputs`, `state_h` and `state_c`. From here on
# `state_h` and `state_c` are the Decoder's output states, no longer the
# Encoder's states from the training cell.
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
# Use the Dense layer created during training to generate predictions from the
# Decoder's outputs
decoder_outputs = decoder_dense(decoder_outputs)
# Inputs: the word-index sequence plus the two state tensors
# Outputs: the per-timestep predictions plus the two updated state tensors
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)
# ------------------------------------------

# Try to understand shapes
# Because of the rebinding above, the first line shows the Encoder input and
# output tensors followed by the *Decoder* state tensors
print (encoder_inputs, '\n', encoder_outputs, '\n', state_h, '\n', state_c)
print (decoder_inputs, '\n', decoder_outputs, '\n', decoder_states_inputs)

# ------------------------------------------
# Implements the Inference process.
#
# Translates one encoded input sentence by greedy decoding: the Decoder is
# started with the start-token and, at every step, the most probable word is
# picked and fed back until the end-token is produced or the maximum number of
# words is reached.
#
# Parameters:
#   input_seq         - encoder input for a single sentence (a batch of size 1)
#   extend_target_seq - selects how the Decoder is fed on each iteration:
#       True (default): the sampled word is appended to the target sequence and
#           the whole sequence is decoded again from the Encoder states
#       False: the target sequence always has length 1 (the last sampled word)
#           and the Decoder's own states are fed back as the initial state
#       KD - both alternates were giving the same results. But I should experiment
#       some more, try out other unseen input sequences and see how both
#       alternates perform
#
# Returns the decoded sentence as a string in which every word, including the
# end-token when it was produced, is preceded by a single space.
# ------------------------------------------
def decode_sequence(input_seq, extend_target_seq=True):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Target sequence of length 1 that holds only the index of the start token.
    start_token_index = word2idx (target_token_index, start_token)
    target_seq = np.full((1, 1), float(start_token_index))

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    decoded_words = []
    while True:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token: the most probable word at the last timestep
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = idx2word (fr_vocab_words, sampled_token_index)
        decoded_words.append(sampled_word)

        # Exit condition: either find the end token or hit the max length.
        # The length is counted in words, because max_decoder_seq_length is a
        # number of words (counting characters here would cut off almost every
        # sentence before its end token).
        if (sampled_word == end_token or
                len(decoded_words) >= max_decoder_seq_length):
            break

        if extend_target_seq:
            # Append the sampled word to the target sequence and keep
            # decoding from the original Encoder states
            target_seq = np.concatenate(
                [target_seq, [[float(sampled_token_index)]]], axis=1)
        else:
            # Update the target sequence (of length 1). It holds a word index,
            # not a one-hot vector, because the Decoder starts with an
            # Embedding layer.
            target_seq = np.full((1, 1), float(sampled_token_index))

            # Update states
            states_value = [h, c]

    return ''.join(' ' + word for word in decoded_words)


# Decode the first 20 training sentences and show them next to their input
for seq_index in range(20):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    # Slicing (rather than indexing) keeps the leading batch dimension of size 1
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)

Tensor("input_23:0", shape=(?, ?), dtype=float32) 
 Tensor("lstm_7/TensorArrayReadV3:0", shape=(?, 256), dtype=float32) 
 Tensor("lstm_8_1/while/Exit_2:0", shape=(?, 256), dtype=float32) 
 Tensor("lstm_8_1/while/Exit_3:0", shape=(?, 256), dtype=float32)
Tensor("input_24:0", shape=(?, ?), dtype=float32) 
 Tensor("dense_4_1/truediv:0", shape=(?, ?, 5788), dtype=float32) 
 [<tf.Tensor 'input_25:0' shape=(?, 256) dtype=float32>, <tf.Tensor 'input_26:0' shape=(?, 256) dtype=float32>]
-
Input sentence: ['Go.', 'eee']
Decoded sentence:  Va ! eee
-
Input sentence: ['Hi.', 'eee']
Decoded sentence:  Salut ! eee
-
Input sentence: ['Hi.', 'eee']
Decoded sentence:  Salut ! eee
-
Input sentence: ['Run!', 'eee']
Decoded sentence:  Cours ! eee
-
Input sentence: ['Run!', 'eee']
Decoded sentence:  Cours ! eee
-
Input sentence: ['Who?', 'eee']
Decoded sentence:  Qui ? eee
-
Input sentence: ['Wow!', 'eee']
Decoded sentence:  Ça alors ! eee
-
Input sentence: ['Fire!', 'eee']
Decoded sentence:  Au feu ! eee