In [1]:
# Download the English-French sentence-pair archive (-c resumes a partial
# download) and unpack it, overwriting previously extracted files (-o).
!wget -c http://www.manythings.org/anki/fra-eng.zip && unzip -o fra-eng.zip

--2022-01-13 09:05:36--  http://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.21.92.44, 172.67.186.54, 2606:4700:3030::6815:5c2c, ...
Connecting to www.manythings.org (www.manythings.org)|104.21.92.44|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6532197 (6.2M) [application/zip]
Saving to: ‘fra-eng.zip’


2022-01-13 09:05:36 (66.8 MB/s) - ‘fra-eng.zip’ saved [6532197/6532197]

Archive:  fra-eng.zip
  inflating: _about.txt              
  inflating: fra.txt                 


In [2]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Dense, Embedding, Input, LSTM
import numpy as np

In [3]:
# fra.txt is tab-separated with the English sentence in column 0 and the
# French sentence in column 1. Select those two columns by position instead
# of "dropping the last column": that approach removes the French column
# whenever the file has exactly two columns.
df = pd.read_csv('fra.txt', sep='\t', header=None, usecols=[0, 1])
df.columns = ['English', 'French']

# Keep the first 60,000 pairs. .copy() makes df an independent frame, so the
# later column assignment does not raise SettingWithCopyWarning on a slice.
df = df.iloc[:60000].copy()
df.sample(10)

Unnamed: 0,English,French
17228,Close the window.,Fermez la fenêtre !
52355,We're behind schedule.,Nous sommes en retard sur l'emploi du temps.
51782,Tom didn't feel tired.,Tom ne s'est pas senti fatigué.
54203,Don't make me kill you.,Ne m'obligez pas à vous tuer !
57208,Let's go and say hello.,Allons-y dire bonjour.
21838,You'll need this.,Tu auras besoin de ça.
11457,She loves cats.,Elle adore les chats.
23493,I have a daughter.,J'ai une fille.
299,Help me.,Aide-moi.
20217,The room is dark.,La pièce est sombre.


In [4]:
# <sos> : start of sentence (use '\t')
# <eos> : end of sentence   (use '\n')

# Wrap each French sentence in its start ('\t') and end ('\n') markers,
# separated from the text by a single space.
df['French'] = ['\t ' + sentence + ' \n' for sentence in df['French']]
df.sample(10)

Unnamed: 0,English,French
29780,I was not drinking.,\t Je n'étais pas en train de boire. \n
24384,I'm proud of that.,\t Je suis fière de cela. \n
54579,He is a very smart boy.,\t C'est un garçon très intelligent. \n
29435,I love my children.,\t J'aime mes enfants. \n
55838,I read a lot of novels.,\t Je lis de nombreux romans. \n
31000,She put on her hat.,\t Elle a mis son chapeau. \n
39490,You should dump Tom.,\t Tu devrais rompre avec Tom. \n
41649,I have a good salary.,\t J'ai un bon salaire. \n
34992,I can't swim at all.,\t Je ne sais pas du tout nager. \n
786,Tom left.,\t Tom est parti. \n


In [5]:
# Distinct characters per language; a set comprehension de-duplicates them.
# English character set
eng_vocab = {char for sent in df.English for char in sent}

# French character set (covers whatever characters the French column holds)
fra_vocab = {char for sent in df.French for char in sent}

In [6]:
# One extra slot per language on top of the distinct characters
# (character indices start at 1 further down, which leaves index 0 free).
ENG_vocab_size, FRA_vocab_size = len(eng_vocab) + 1, len(fra_vocab) + 1

print('No. of English char: ', ENG_vocab_size)
print('No. of French char: ', FRA_vocab_size)

No. of English char:  80
No. of French char:  105


In [7]:
# Put the characters in a fixed (sorted) order so that each one can be
# given a stable index; sorted() accepts the sets directly.

ENG_vocab = sorted(eng_vocab)
FRA_vocab = sorted(fra_vocab)
print(ENG_vocab[30:40])
print(FRA_vocab[30:40])

['H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q']
['E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N']


In [8]:
# {char: index} lookup tables; numbering starts at 1, so index 0 is unused here
ENG_idx_dict = {char: idx for idx, char in enumerate(ENG_vocab, start=1)}
FRA_idx_dict = {char: idx for idx, char in enumerate(FRA_vocab, start=1)}

print(ENG_idx_dict)
print(FRA_idx_dict)

{' ': 1, '!': 2, '"': 3, '$': 4, '%': 5, '&': 6, "'": 7, ',': 8, '-': 9, '.': 10, '/': 11, '0': 12, '1': 13, '2': 14, '3': 15, '4': 16, '5': 17, '6': 18, '7': 19, '8': 20, '9': 21, ':': 22, '?': 23, 'A': 24, 'B': 25, 'C': 26, 'D': 27, 'E': 28, 'F': 29, 'G': 30, 'H': 31, 'I': 32, 'J': 33, 'K': 34, 'L': 35, 'M': 36, 'N': 37, 'O': 38, 'P': 39, 'Q': 40, 'R': 41, 'S': 42, 'T': 43, 'U': 44, 'V': 45, 'W': 46, 'X': 47, 'Y': 48, 'Z': 49, 'a': 50, 'b': 51, 'c': 52, 'd': 53, 'e': 54, 'f': 55, 'g': 56, 'h': 57, 'i': 58, 'j': 59, 'k': 60, 'l': 61, 'm': 62, 'n': 63, 'o': 64, 'p': 65, 'q': 66, 'r': 67, 's': 68, 't': 69, 'u': 70, 'v': 71, 'w': 72, 'x': 73, 'y': 74, 'z': 75, '°': 76, 'é': 77, '’': 78, '€': 79}
{'\t': 1, '\n': 2, ' ': 3, '!': 4, '"': 5, '$': 6, '%': 7, '&': 8, "'": 9, '(': 10, ')': 11, ',': 12, '-': 13, '.': 14, '0': 15, '1': 16, '2': 17, '3': 18, '4': 19, '5': 20, '6': 21, '7': 22, '8': 23, '9': 24, ':': 25, '?': 26, 'A': 27, 'B': 28, 'C': 29, 'D': 30, 'E': 31, 'F': 32, 'G': 33, 'H': 3

In [9]:
# Encoder input: every English sentence becomes a list of character
# indices, giving a list of lists --> [ [...], [...], [...] ]

ENCODER_input = [
  [ENG_idx_dict[english_char] for english_char in sent]   # character level
  for sent in df.English                                  # sentence level
]

for encoded in ENCODER_input[:10]:
  print('Encoded English sentence: ', encoded)

Encoded English sentence:  [30, 64, 10]
Encoded English sentence:  [30, 64, 10]
Encoded English sentence:  [30, 64, 10]
Encoded English sentence:  [31, 58, 10]
Encoded English sentence:  [31, 58, 10]
Encoded English sentence:  [41, 70, 63, 2]
Encoded English sentence:  [41, 70, 63, 2]
Encoded English sentence:  [41, 70, 63, 2]
Encoded English sentence:  [41, 70, 63, 2]
Encoded English sentence:  [41, 70, 63, 2]


In [10]:
# Decoder input: every French sentence as a list of character indices

DECODER_input = [
  [FRA_idx_dict[french_char] for french_char in sent]
  for sent in df.French
]

for encoded in DECODER_input[:5]:
  print('Encoded French sentence: ', encoded)

Encoded French sentence:  [1, 3, 48, 53, 3, 4, 3, 2]
Encoded French sentence:  [1, 3, 39, 53, 70, 55, 60, 57, 14, 3, 2]
Encoded French sentence:  [1, 3, 28, 67, 73, 59, 57, 3, 4, 3, 2]
Encoded French sentence:  [1, 3, 45, 53, 64, 73, 72, 3, 4, 3, 2]
Encoded French sentence:  [1, 3, 45, 53, 64, 73, 72, 14, 3, 2]


In [11]:
# Decoder target: the same index sequence as the decoder input, minus its
# first element (the "<sos>" marker), so the target at each position is the
# character that follows the corresponding decoder input.

DECODER_TARGET = [
  [FRA_idx_dict[french_char] for french_char in sent[1:]]   # skip <sos>
  for sent in df.French
]

for encoded in DECODER_TARGET[:5]:
  print('Target French sentence: ', encoded)

Target French sentence:  [3, 48, 53, 3, 4, 3, 2]
Target French sentence:  [3, 39, 53, 70, 55, 60, 57, 14, 3, 2]
Target French sentence:  [3, 28, 67, 73, 59, 57, 3, 4, 3, 2]
Target French sentence:  [3, 45, 53, 64, 73, 72, 3, 4, 3, 2]
Target French sentence:  [3, 45, 53, 64, 73, 72, 14, 3, 2]


In [12]:
# Longest sentence (in characters) per language; used as the padding length
max_ENG_len = max(map(len, df.English))
max_FRA_len = max(map(len, df.French))

print("MAX English sentence length: ", max_ENG_len)
print("MAX French sentence length: ", max_FRA_len)

MAX English sentence length:  23
MAX French sentence length:  76


In [13]:
# Pad at the end ('post') so every sequence of a language has equal length
ENCODER_input = pad_sequences(ENCODER_input, maxlen=max_ENG_len, padding='post')
DECODER_input, DECODER_target = (
  pad_sequences(sequences, maxlen=max_FRA_len, padding='post')
  for sequences in (DECODER_input, DECODER_TARGET)
)


In [14]:
from tensorflow.keras.utils import to_categorical

# Guard against running this cell twice: the arrays are overwritten below, so
# a second run would one-hot encode data that is already one-hot encoded.
if not (np.ndim(ENCODER_input) == np.ndim(DECODER_input) == np.ndim(DECODER_target) == 2):
  raise RuntimeError('Expected 2-D padded index arrays; the inputs look already '
                     'one-hot encoded. Re-run the notebook from the encoding cells.')

# Process one-hot-encoding.
# num_classes is passed explicitly so the last axis always equals the
# vocabulary size used for the model's Input layers, instead of being inferred
# from the largest index that happens to occur in the data.
ENCODER_input = to_categorical(ENCODER_input, num_classes=ENG_vocab_size)
DECODER_input = to_categorical(DECODER_input, num_classes=FRA_vocab_size)
DECODER_target = to_categorical(DECODER_target, num_classes=FRA_vocab_size)


In [15]:
# First five time steps of the first encoded English sentence
ENCODER_input[0, :5]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

### TRAINING Seq2Seq Model

#### Encoder LSTM

In [16]:
# LSTM(units, return_state, return_sequences)
#   units            : dimensionality of the output space
#   return_state     : also return the final states next to the output
#   return_sequences : return the output of every time step (True) or only
#                      the output of the last time step (False, the default)
#
# state_h: hidden state (SHORT TERM memory)
# state_c: cell state (LONG TERM memory)

ENCODER_lstm = LSTM(256, return_state=True)
ENCODER_inputs = Input(shape=(None, ENG_vocab_size))

# The call yields [output, state_h, state_c]; the two states are kept
# together as the list that is later handed to the decoder.
ENCODER_outputs, *ENCODER_states = ENCODER_lstm(ENCODER_inputs)
state_h, state_c = ENCODER_states

ENCODER_outputs

<KerasTensor: shape=(None, 256) dtype=float32 (created by layer 'lstm')>

#### Decoder LSTM

In [17]:
# Decoder layers: an LSTM that emits one output per time step, followed by a
# softmax over the French vocabulary.
DECODER_inputs = Input(shape=(None, FRA_vocab_size))
DECODER_lstm = LSTM(256, return_state=True, return_sequences=True)
DECODER_softmax_layer = Dense(FRA_vocab_size, activation='softmax')

# Start the decoder from the encoder's final states; only the per-step
# outputs (element 0 of the result) are needed here, the returned states are not.
DECODER_hidden_seq = DECODER_lstm(DECODER_inputs, initial_state=ENCODER_states)[0]
DECODER_outputs = DECODER_softmax_layer(DECODER_hidden_seq)


In [18]:
# Training model: (encoder input, decoder input) -> per-step character
# distribution produced by the decoder.
INPUTS = [ENCODER_inputs, DECODER_inputs]
OUTPUTS = DECODER_outputs

model = Model(inputs=INPUTS, outputs=OUTPUTS)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, 80)]   0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None, 105)]  0           []                               
                                                                                                  
 lstm (LSTM)                    [(None, 256),        345088      ['input_1[0][0]']                
                                 (None, 256),                                                     
                                 (None, 256)]                                                     
                                                                                              

In [19]:
# Display the list of symbolic input tensors the training model was built from
INPUTS

[<KerasTensor: shape=(None, None, 80) dtype=float32 (created by layer 'input_1')>,
 <KerasTensor: shape=(None, None, 105) dtype=float32 (created by layer 'input_2')>]

In [20]:
# Shapes of the one-hot encoded arrays passed to model.fit
for array in (ENCODER_input, DECODER_input, DECODER_target):
  print(array.shape)

(60000, 23, 80)
(60000, 76, 105)
(60000, 76, 105)


In [21]:
# Symbolic tensors of the model graph, for comparison with the array shapes
for tensor in (ENCODER_inputs, DECODER_inputs, DECODER_outputs):
  print(tensor)

KerasTensor(type_spec=TensorSpec(shape=(None, None, 80), dtype=tf.float32, name='input_1'), name='input_1', description="created by layer 'input_1'")
KerasTensor(type_spec=TensorSpec(shape=(None, None, 105), dtype=tf.float32, name='input_2'), name='input_2', description="created by layer 'input_2'")
KerasTensor(type_spec=TensorSpec(shape=(None, None, 105), dtype=tf.float32, name=None), name='dense/Softmax:0', description="created by layer 'dense'")


In [22]:
# Train with the decoder fed the ground-truth previous characters
# (DECODER_input) and asked to predict the next ones (DECODER_target).
# NOTE(review): Keras takes validation_split from the *end* of the arrays
# before shuffling; if fra.txt is ordered by sentence length (TODO confirm),
# the validation set consists of the longest sentences only.
x = [ENCODER_input, DECODER_input]
fit_options = dict(batch_size=64, epochs=40, validation_split=0.2)

model.fit(x, DECODER_target, **fit_options)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7f2bfc40c0d0>

### TESTING the trained Seq2Seq Model
* The structure of the Seq2Seq model is different when it is TRAINED and when it is TESTED. 
* When an English sentence enters the ENCODER, the encoder returns its final "hidden state" and "cell state".
* The "\t" character, which corresponds to the 'SOS' token, is sent to the DECODER as its first input.
* The translation is generated one character at a time until the decoder produces "\n", which corresponds to the 'EOS' token.

In [23]:
# Inference-time encoder: maps an encoded English sentence to the LSTM's
# final [hidden state, cell state].
encoder_model = Model(ENCODER_inputs, ENCODER_states)
encoder_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None, 80)]        0         
                                                                 
 lstm (LSTM)                 [(None, 256),             345088    
                              (None, 256),                       
                              (None, 256)]                       
                                                                 
Total params: 345,088
Trainable params: 345,088
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Inference-time decoder: given the decoder input and the previous LSTM
# states, return the character distribution together with the updated states.

# Tensors that will carry the previous states. Their width is read from the
# trained layer so it cannot drift from the size used during training.
latent_dim = DECODER_lstm.units
DECODER_state_input_h = Input(shape=(latent_dim,))
DECODER_state_input_c = Input(shape=(latent_dim,))
DECODER_states_inputs = [DECODER_state_input_h, DECODER_state_input_c]

# Reuse the trained layers. The results are bound to inference-specific names:
# overwriting DECODER_outputs / INPUTS / OUTPUTS / state_h / state_c here would
# make the training-model cell fail (or silently change) when it is re-run.
DECODER_inference_outputs, DECODER_state_h, DECODER_state_c = DECODER_lstm(
    DECODER_inputs, initial_state=DECODER_states_inputs)

# Keep the hidden state and the cell state,
# which were discarded in the training graph
DECODER_states_outputs = [DECODER_state_h, DECODER_state_c]
DECODER_inference_outputs = DECODER_softmax_layer(DECODER_inference_outputs)

# Input: decoder input tensor + previous decoder states
# Output: decoder output distribution + updated decoder states
DECODER_model = Model(inputs=[DECODER_inputs] + DECODER_states_inputs,
                      outputs=[DECODER_inference_outputs] + DECODER_states_outputs)

DECODER_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, None, 105)]  0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 256)]        0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 256)]        0           []                               
                                                                                                  
 lstm_1 (LSTM)                  [(None, None, 256),  370688      ['input_2[0][0]',                
                                 (None, 256),                     'input_3[0][0]',          

In [25]:
# Invert the lookup tables: {token : idx}  -->  {idx : token}

idx_ENG_dict = {idx: char for char, idx in ENG_idx_dict.items()}
idx_FRA_dict = {idx: char for char, idx in FRA_idx_dict.items()}

print(idx_ENG_dict)
print(idx_FRA_dict)

{1: ' ', 2: '!', 3: '"', 4: '$', 5: '%', 6: '&', 7: "'", 8: ',', 9: '-', 10: '.', 11: '/', 12: '0', 13: '1', 14: '2', 15: '3', 16: '4', 17: '5', 18: '6', 19: '7', 20: '8', 21: '9', 22: ':', 23: '?', 24: 'A', 25: 'B', 26: 'C', 27: 'D', 28: 'E', 29: 'F', 30: 'G', 31: 'H', 32: 'I', 33: 'J', 34: 'K', 35: 'L', 36: 'M', 37: 'N', 38: 'O', 39: 'P', 40: 'Q', 41: 'R', 42: 'S', 43: 'T', 44: 'U', 45: 'V', 46: 'W', 47: 'X', 48: 'Y', 49: 'Z', 50: 'a', 51: 'b', 52: 'c', 53: 'd', 54: 'e', 55: 'f', 56: 'g', 57: 'h', 58: 'i', 59: 'j', 60: 'k', 61: 'l', 62: 'm', 63: 'n', 64: 'o', 65: 'p', 66: 'q', 67: 'r', 68: 's', 69: 't', 70: 'u', 71: 'v', 72: 'w', 73: 'x', 74: 'y', 75: 'z', 76: '°', 77: 'é', 78: '’', 79: '€'}
{1: '\t', 2: '\n', 3: ' ', 4: '!', 5: '"', 6: '$', 7: '%', 8: '&', 9: "'", 10: '(', 11: ')', 12: ',', 13: '-', 14: '.', 15: '0', 16: '1', 17: '2', 18: '3', 19: '4', 20: '5', 21: '6', 22: '7', 23: '8', 24: '9', 25: ':', 26: '?', 27: 'A', 28: 'B', 29: 'C', 30: 'D', 31: 'E', 32: 'F', 33: 'G', 34: 'H

In [26]:
def decoding_func(INPUT_seq):
  """Greedily decode the French translation of one encoded English sentence.

  Args:
    INPUT_seq: encoder input for a single sentence, in the form accepted by
      ``encoder_model.predict`` (this notebook passes
      ``ENCODER_input[i : i + 1]``).

  Returns:
    The decoded characters as one string. The string ends with the
    end-of-sentence character (newline) when the decoder produced it or fell
    back to the padding index; it has no trailing newline when decoding was
    cut off after exceeding ``max_FRA_len`` characters.
  """
  # Receive the final states of the ENCODER; they seed the decoder.
  states_value = list(encoder_model.predict(INPUT_seq, verbose=0))

  # Decoding starts from <SOS> ('\t')
  sampled_token_index = FRA_idx_dict['\t']
  decoded_sentence = ""

  while True:
    # One-hot vector of the previously produced character (first: <SOS>)
    target_sequence = np.zeros((1, 1, FRA_vocab_size))
    target_sequence[0, 0, sampled_token_index] = 1

    # Use the states of the previous timestep as the initial state of the
    # current timestep, and keep the new states for the next one.
    output_tokens, h, c = DECODER_model.predict([target_sequence] + states_value,
                                                verbose=0)
    states_value = [h, c]

    # Most probable next character
    sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

    # Character indices start at 1, so index 0 (padding) has no entry in
    # idx_FRA_dict. Padding only follows the end of a sentence, so treat it
    # as <EOS> instead of failing with a KeyError.
    sampled_char = idx_FRA_dict.get(sampled_token_index, '\n')
    decoded_sentence += sampled_char

    # Stop when
    #   1. <EOS> ("\n") was produced, or
    #   2. the output grew beyond the longest known French sentence
    if sampled_char == '\n' or len(decoded_sentence) > max_FRA_len:
      return decoded_sentence



In [27]:
# Compare the model's translation with the reference for a few sentences.
for index in [3, 50, 100, 300, 1001]:
  input_sequence = ENCODER_input[index : index + 1]
  decoded_sent = decoding_func(input_sequence)

  print(20*"-")

  # ENCODER_input is indexed by position, so read the DataFrame by position
  # too (.iloc) rather than by label.
  input_sentence = df.English.iloc[index]
  # strip() removes the '\t' / '\n' markers and the spaces around them. Unlike
  # fixed-offset slicing it leaves no trailing space and does not cut off a
  # real character when decoding stopped without producing '\n'.
  answer = df.French.iloc[index].strip()
  predicted_sentence = decoded_sent.strip()

  print("Input sentence: ", input_sentence)
  print("Answer sentence: ", answer)
  print("Predicted sentence: ", predicted_sentence)

--------------------
Input sentence:  Hi.
Answer sentence:  Salut ! 
Predicted sentence:  Salut. 
--------------------
Input sentence:  I see.
Answer sentence:  Aha. 
Predicted sentence:  Je le vois. 
--------------------
Input sentence:  Hug me.
Answer sentence:  Serrez-moi dans vos bras ! 
Predicted sentence:  Serrez-moi dans vos partir ! 
--------------------
Input sentence:  Help me.
Answer sentence:  Aidez-moi. 
Predicted sentence:  Aide-moi. 
--------------------
Input sentence:  I am sure.
Answer sentence:  Je suis sûr. 
Predicted sentence:  Je suis terrible. 


In [28]:
# Same comparison for a second set of sentences.
for index in [10, 20, 30, 40, 50]:
  input_sequence = ENCODER_input[index : index + 1]
  decoded_sent = decoding_func(input_sequence)

  print(20*"-")

  # ENCODER_input is indexed by position, so read the DataFrame by position
  # too (.iloc) rather than by label.
  input_sentence = df.English.iloc[index]
  # strip() removes the '\t' / '\n' markers and the spaces around them. Unlike
  # fixed-offset slicing it leaves no trailing space and does not cut off a
  # real character when decoding stopped without producing '\n'.
  answer = df.French.iloc[index].strip()
  predicted_sentence = decoded_sent.strip()

  print("Input sentence: ", input_sentence)
  print("Answer sentence: ", answer)
  print("Predicted sentence: ", predicted_sentence)

--------------------
Input sentence:  Run!
Answer sentence:  Cours ! 
Predicted sentence:  File ! 
--------------------
Input sentence:  Run.
Answer sentence:  Fuyons ! 
Predicted sentence:  Cours ! 
--------------------
Input sentence:  Jump!
Answer sentence:  Saute. 
Predicted sentence:  Saute ! 
--------------------
Input sentence:  Wait.
Answer sentence:  Attends. 
Predicted sentence:  Attends ! 
--------------------
Input sentence:  I see.
Answer sentence:  Aha. 
Predicted sentence:  Je le vois. 
