Datasets: [Tab-delimited Bilingual Sentence Pairs ](http://www.manythings.org/anki/)

In [1]:
import numpy as np
import pandas as pd
import string
import re

In [2]:
file_name = "./data/fra.txt"

In [3]:
df = pd.read_table(file_name, names=["en", "fra"], encoding="utf-8-sig")

In [4]:
df.head(3)

Unnamed: 0,en,fra
0,Go.,Va !
1,Hi.,Salut !
2,Run!,Cours !


## Word embedding
we make a dictionary of the words used in the text we want to process (or sometimes we use the most frequent 10,000 words of the text’s language.)<br>
To make it easy to understand what we are going to do, we will :
1. Convert text to <b>lowercase</b>
2. Clean data from <b>digits and punctuation</b>
3. Append <b>"SOS" (_Start of Sentence_)</b> and <b>"EOS" (_End of Sentence_)</b> to the target data
4. Make dictionaries to <b>convert words to indexed numbers</b>
5. Use <b>embedding layer</b> to convert each word to a fixed length vector.<b>(Word embeddings provide a dense representation of words and their relative meanings)</b>
6. Now, <b>the data is ready to be used by seq2seq network.</b>

### 1. Convert text to lowercase

In [5]:
df.iloc[0].en

'Go.'

In [6]:
df.en  = df.en.apply(lambda x:x.lower())
df.fra = df.fra.apply(lambda x:x.lower())

In [7]:
df.iloc[0].en

'go.'

### 2. Clean data from digits and punctuation

In [8]:
df.iloc[182].en

'me, too.'

In [9]:
df.en  = df.en.apply(lambda x: re.sub("'", '', x)).apply(lambda x: re.sub(",", ' COMMA', x))
df.fra = df.fra.apply(lambda x: re.sub("'", '', x)).apply(lambda x: re.sub(",", ' COMMA', x))

In [10]:
df.iloc[182].en

'me COMMA too.'

***

In [11]:
df.iloc[3].en

'run!'

In [12]:
exclude = set(string.punctuation)
df.en  = df.en.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
df.fra = df.fra.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [13]:
df.iloc[3].en

'run'

***

In [14]:
df.iloc[51].en

'im 19'

In [15]:
remove_digits = str.maketrans('', '', string.digits)
df.en  = df.en.apply(lambda x: x.translate(remove_digits))
df.fra = df.fra.apply(lambda x: x.translate(remove_digits))

In [16]:
df.iloc[51].en

'im '

### 3. Append "SOS" _(Start of Sentence)_ and "EOS" _(End of Sentence)_ to the target data

In [17]:
df.fra.iloc[0]

'va '

In [18]:
# Appending SOS andEOS to target data :
df.fra = df.fra.apply(lambda x : 'SOS_ '+ x + ' _EOS')

In [19]:
df.fra.iloc[0]

'SOS_ va  _EOS'

### 4. Make dictionaries to convert words to indexed numbers

In [20]:
en_words=set()
for line in df.en:
    for word in line.split(" "):
        if word not in en_words:
            en_words.add(word)
    
fra_words=set()
for line in df.fra:
    for word in line.split(" "):
        if word not in fra_words:
            fra_words.add(word)

In [21]:
num_en_words  = len(en_words)
num_fra_words = len(fra_words)
print("num_en_words:  {}".format(num_en_words))
print("num_fra_words: {}".format(num_fra_words))

num_en_words:  14251
num_fra_words: 32357


In [22]:
max_en_words_per_sample  = max([len(sample.split(" ")) for sample in df.en])+5
max_fra_words_per_sample = max([len(sample.split(" ")) for sample in df.fra])+5
print("max_en_words_per_sample:  {}".format(max_en_words_per_sample))
print("max_fra_words_per_sample: {}".format(max_fra_words_per_sample))

max_en_words_per_sample:  54
max_fra_words_per_sample: 63


In [23]:
num_en_samples = len(df.en)
num_fra_samples = len(df.fra)
print("num_en_samples:  {}".format(num_en_samples))
print("num_fra_samples: {}".format(num_fra_samples))

num_en_samples:  167130
num_fra_samples: 167130


***

In [24]:
print("en words sample:  {}".format(list(en_words)[:3]))
print("fra words sample: {}".format(list(fra_words)[:3]))

en words sample:  ['', 'locusts', 'certain']
fra words sample: ['', 'durèrent', 'présent\xa0']


In [25]:
input_words = sorted(list(en_words))
target_words = sorted(list(fra_words))

In [26]:
print("en words sample:  {}".format(input_words[:3]))
print("fra words sample: {}".format(target_words[:3]))

en words sample:  ['', 'COMMA', 'COMMA\xa0ill']
fra words sample: ['', 'COMMA', 'COMMAenfila']


***

In [27]:
# Tokenizing the words (Convert to numbers)
en_token_to_int = dict()
en_int_to_token = dict()

for i,token in enumerate(input_words):
    en_token_to_int[token] = i
    en_int_to_token[i]     = token

In [28]:
en_int_to_token[en_token_to_int["COMMA"]] == "COMMA"

True

In [29]:
fra_token_to_int = dict()
fra_int_to_token = dict()
for i,token in enumerate(target_words):
    fra_token_to_int[token] = i
    fra_int_to_token[i]     = token

In [30]:
fra_int_to_token[fra_token_to_int["COMMA"]] == "COMMA"

True

***

In [31]:
# initiate numpy arrays to hold the data that our seq2seq model will use:
encoder_input_data  = np.zeros((num_en_samples,  max_en_words_per_sample),  dtype='float32')
decoder_input_data  = np.zeros((num_fra_samples, max_fra_words_per_sample), dtype='float32')
decoder_target_data = np.zeros((num_fra_samples, max_fra_words_per_sample, num_fra_words), dtype='float32')

In [32]:
print("encoder_input_data shape:  {}".format(encoder_input_data.shape))
print("decoder_input_data shape:  {}".format(decoder_input_data.shape))
print("decoder_target_data shape: {}".format(decoder_target_data.shape))

encoder_input_data shape:  (167130, 54)
decoder_input_data shape:  (167130, 63)
decoder_target_data shape: (167130, 63, 32357)


***

In [33]:
for i, (input_text, target_text) in enumerate(zip(df.en, df.fra)):
    """
    ex.)
    input_text ：Go
    target_text：SOS_ Va  _EOS
    """
    for t, word in enumerate(input_text.split(" ")):
        """ ex.) word：Go """
        encoder_input_data[i, t] = en_token_to_int[word]
    for t, word in enumerate(target_text.split(" ")):
        """ ex.) word：SOS_ """
        decoder_input_data[i, t] = fra_token_to_int[word]
        if t > 0:
            decoder_target_data[i, t - 1, fra_token_to_int[word]] = 1.

***

In [34]:
encoder_input_data[0]

array([5420.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.],
      dtype=float32)

In [35]:
decoder_input_data[0]

array([6.0000e+00, 3.0359e+04, 0.0000e+00, 7.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00], dtype=float32)

In [36]:
decoder_target_data[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

***

### 5. Use embedding layer to convert each word to a fixed length vector

In [37]:
import keras
from keras.models import Sequential, Model
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [38]:
# Defining some constants: 
vec_len       = 300   # Length of the vector that we willl get from the embedding layer
latent_dim    = 1024  # Hidden layers dimension 
dropout_rate  = 0.2   # Rate of the dropout layers
batch_size    = 64    # Batch size
epochs        = 30    # Number of epochs

## Build the encoder.

|Layer name|Brief content|
|:--|:--|
|Input Layer|Takes the English sentence and pass it to the embedding layer.|
|Embedding Layer|  Takes the English sentence and convert each word to fixed size vector.|
|First LSTM Layer|  Every time step, it takes a vector that represents a word and pass its output to the next layer, We used CuDNNLSTM layer instead of LSTM because it’s much much faster.|
|Second LSTM Layer|  It does the same thing as the previous layer, but instead of passing its output, it passes its states to the first LSTM layer of the decoder.|

<summary>figure is below;
    <details><img src="https://cdn-images-1.medium.com/max/1600/1*3pH2NH_8i7QMxpV0TFOdxw.jpeg"></details>
</summary>

In [39]:
# Input layer:
encoder_input = keras.layers.Input(shape=(None,))

# Hidden layers
encoder_embedding = keras.layers.Embedding(input_dim=num_en_words, output_dim=vec_len)(encoder_input) 
encoder_dropout   = (keras.layers.TimeDistributed(keras.layers.Dropout(rate=dropout_rate)))(encoder_embedding)
encoder_LSTM      = keras.layers.LSTM(latent_dim, return_sequences=True)(encoder_dropout)

# Output layer:
encoder_LSTM2_layer = keras.layers.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_LSTM2_layer(encoder_LSTM)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


***

## Building the decoder.

|Layer name|Brief content|
|:--|:--|
|Input Layer | Takes the French sentence and pass it to the embedding layer.|
|Embedding Layer | Takes the French sentence and convert each word to fixed size vector|
|First LSTM Layer | Every time step, it takes a vector that represents a word and pass its output to the next layer, but here in the decoder, we initialize the state of this layer to be the last state of the last LSTM layer from the decoder .|
|Second LSTM Layer | Processing the output from the previous layer and passes its output to a dense layer .|
|Dense Layer (Output Layer) | Takes the output from the previous layer and outputs a one hot vector representing the target French word|

<summary>figure is below;
    <details><img src="https://cdn-images-1.medium.com/max/1600/1*sDlV9_-PXBlt8jol-7Xjhg.jpeg"></details>
</summary>

In [40]:
# Input layer:
decoder_input = keras.layers.Input(shape=(None,))

# Hidden layers:
decoder_embedding_layer = keras.layers.Embedding(input_dim=num_fra_words, output_dim=vec_len)
decoder_embedding = decoder_embedding_layer(decoder_input)

decoder_dropout_layer = (keras.layers.TimeDistributed(keras.layers.Dropout(rate = dropout_rate)))
decoder_dropout = decoder_dropout_layer(decoder_embedding)

decoder_LSTM_layer = keras.layers.LSTM(latent_dim, return_sequences=True)
decoder_LSTM = decoder_LSTM_layer(decoder_dropout, initial_state=encoder_states)

decoder_LSTM_2_layer = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_LSTM_2,_,_ = decoder_LSTM_2_layer(decoder_LSTM)

# Output layer:
decoder_dense = keras.layers.Dense(num_fra_words, activation='softmax')
decoder_outputs = decoder_dense(decoder_LSTM_2)

***

<div style="padding:10px;border-radius: 10px; border: 5px double #FA8072;">
<b>Note :</b><br>
We have to know that we don’t convert each English sentence into French in one time step, we do that in a number of time steps that equals the number of words that the longest English sentence has.<br>
so if the longest English sentence has 10 words, we have to take 10 time steps to get its French translation.
</div>

## Bringing the encoder and decoder together into one model

In [41]:
model = Model([encoder_input, decoder_input], decoder_outputs)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 300)    4275300     input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, None, 300)    0           embedding_1[0][0]                
__________________________________________________________________________________________________
embedding_

In [42]:
from keras.utils.vis_utils import plot_model
plot_model(model, to_file="./img/model.png", show_shapes=True)

<summary>figure is below;
    <details><img src="./img/model.png"></details>
</summary>

In [43]:
# Define a checkpoint callback :
checkpoint_name = './weights/Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
callbacks_list = [checkpoint]

## Training

In [44]:
num_train_samples = 9000

```python
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([encoder_input_data[:num_train_samples,:],
               decoder_input_data[:num_train_samples,:]],
               decoder_target_data[:num_train_samples,:,:],
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.08,
          callbacks = callbacks_list)
```