The dataset is taken from the site: http://www.manythings.org/anki/

There are many datasets which could have been taken. I selected English to French as it had enough samples to train the model effectively.

In [0]:
import pandas as pd
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import io

from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout,Concatenate
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings("ignore")

In [0]:
from google.colab import files
uploaded = files.upload()

Saving fra.txt to fra (1).txt


In [0]:
df = pd.read_table(io.BytesIO(uploaded['fra.txt']),header = None)

In [0]:
#Dataset for the model

df.columns = ['English','French']
df.head()

Unnamed: 0,English,French
0,Go.,Va !
1,Hi.,Salut !
2,Hi.,Salut.
3,Run!,Cours !
4,Run!,Courez !


In [0]:
# Setting Parameters

BATCH_SIZE = 32            # Batch size for the training set. After each BATCH_SIZE the weights will be updated
EPOCHS = 100               # Number of times we will train the model
LSTM_units = 256           # Output nits for the LSTM Layer
MAX_SEQUENCE_LENGTH = 100  # Maximum number of words in a single sentence
VOCAB_SIZE = 20000         # Vocab size for the dataset
EMBEDDING_DIM = 50         # Embedding units to represent a single word in English
EMBEDDING_DIM_FRENCH = 100 # Embedding units to represent a single word in French

In [0]:
english_text_input = df['English'][:15000].values                                 # Input for the training Encoder
french_text_input = df['French'][:15000].apply(lambda x:'<sos> ' + x).values      # Input for the training Decoder
french_text_output = df['French'][:15000].apply(lambda x:x + ' <eos>').values     # Output for the training Decoder

# We are using <sos> and <eos> as we will be using Teacher Forcing.

In [0]:
print("Number of samples for the training data: ",len(english_text_input))  # 170190 samples

Number of samples for the training data:  15000


In [0]:
# Tokenize the English Text
tokenizer_english = Tokenizer(num_words=VOCAB_SIZE)
sentences = tokenizer_english.fit_on_texts(english_text_input)
english_sequences_input = tokenizer_english.texts_to_sequences(english_text_input)
english_sequences_input[:5]

[[18], [668], [668], [146], [146]]

The words are now converted into numbers

In [0]:
# Get the word to index Mapping
word2idx_english = tokenizer_english.word_index
print('Unique english words: ',len(word2idx_english))    # Identified 14384 unique letters

Unique english words:  2921


In [0]:
#Tokenize the French Text
tokenizer_french = Tokenizer(num_words=VOCAB_SIZE,filters = '')
sentences = tokenizer_french.fit_on_texts(french_text_input + french_text_output)  # We use both the dataset so that <sos> and <eos> are also included in the tokenize set of words
french_sequences_input = tokenizer_french.texts_to_sequences(french_text_input)
french_sequences_output = tokenizer_french.texts_to_sequences(french_text_output)
print('Input: ',french_sequences_input[:5])
print('Output: ',french_sequences_output[:5])

Input:  [[1, 58, 6], [1, 1208, 6], [1], [1], [1]]
Output:  [[58, 6, 2], [1208, 6, 2], [2], [2], [2]]


In [0]:
# Get the word to index Mapping

word2idx_french = tokenizer_french.word_index
print('Unique french words: ',len(word2idx_french))    # Identified 14384 unique letters


Unique french words:  16547


In [0]:
# Getting Max length for both the texts
max_english = max(len(s) for s in english_sequences_input)   # Max sequence length in English
max_french = max(len(s) for s in french_sequences_input)     # Max sequence length in French
num_words_french = len(word2idx_french) + 1                  # Possible outputs for the french language

In [0]:
# Padding the sequences

encode_english_input = pad_sequences(english_sequences_input,maxlen = max_english,padding = 'post')
decode_french_input = pad_sequences(french_sequences_input,maxlen = max_french,padding = 'post')
decode_french_output = pad_sequences(french_sequences_output,maxlen = max_french,padding = 'post')

# Size of the input and output

print('Size of Encode Input: ',encode_english_input.shape)
print('Size of Decode Input: ',decode_french_input.shape)
print('Size of Decode Output: ',decode_french_output.shape)


Size of Encode Input:  (15000, 5)
Size of Decode Input:  (15000, 12)
Size of Decode Output:  (15000, 12)


In [0]:
# Loading pretrained word vector

word2vec = {}
with open(os.path.join('/content/drive/My Drive/Colab Notebooks/Dataset/glove.6B.50d.txt')) as f:
  for line in f:
    values = line.split()
    word = values[0]
    vec = np.asarray(values[1:], dtype='float32')
    word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))




Found 400000 word vectors.


In [0]:
num_words = min(VOCAB_SIZE, len(word2idx_english) + 1)

print('Number of words',num_words)

embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))  # Creating the embedding matrix with each word having dimension of 50
print('Shape of Embedding Matrix',embedding_matrix.shape)

for word, i in word2idx_english.items():
  if i < VOCAB_SIZE:
    embedding_vector = word2vec.get(word)
    if embedding_vector is not None:
      # words not found in embedding index will be all zeros.
      embedding_matrix[i] = embedding_vector



Number of words 2922
Shape of Embedding Matrix (2922, 50)


In [0]:
embedding_matrix

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 1.18910000e-01,  1.52549997e-01, -8.20730031e-02, ...,
        -5.75119972e-01, -2.66710013e-01,  9.21209991e-01],
       [-1.09190005e-03,  3.33240002e-01,  3.57430011e-01, ...,
        -4.56970006e-01, -4.89690006e-02,  1.13160002e+00],
       ...,
       [ 2.15690002e-01, -9.00229990e-01,  6.82510018e-01, ...,
         4.65460002e-01,  1.81079999e-01, -1.22239999e-01],
       [ 9.63559985e-01, -5.39669991e-01,  2.77429998e-01, ...,
        -3.87650013e-01,  1.31150007e-01,  6.29419982e-01],
       [-4.29910004e-01,  5.82780004e-01, -8.21919963e-02, ...,
        -6.38769984e-01, -6.83719963e-02, -8.71749997e-01]])

In [0]:
# from google.colab import drive
# drive.mount('/content/drive')

# MODEL CREATION

In [0]:
# Embedding Layer

embedding_layer = Embedding(input_dim = num_words, output_dim = EMBEDDING_DIM, weights = [embedding_matrix],input_length = max_english)


W0723 20:32:34.286412 140518383687552 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.



In [0]:
# Creating the target variable

decoder_target_onehot = np.zeros((len(english_text_input),max_french,num_words_french),dtype = 'float32')
print('Shape: ',decoder_target_onehot.shape)   

# 10000 represents each sequence
# 11 represents max length in english
# 11903 represents max length in french

Shape:  (15000, 12, 16548)


In [0]:
# Assiging the values to the output

for i,d in enumerate(decode_french_output):
  for t,word in enumerate(d):
    decoder_target_onehot[i,t,word] = 1

## Adding Layers

In [0]:
encoder_input = Input(shape = (max_english,))  # Assigning input for the encoder

print(encoder_input.shape)

(?, 5)


In [0]:
x = embedding_layer(encoder_input)            # Embedding the words from the input
print(x.shape)

# Each of the words will now be represented by 50 values

(?, 5, 50)


In [0]:
encoder_lstm = Bidirectional(LSTM(LSTM_units,return_state = True,dropout = 0.2))   # return_states is True so that it will return hidden state and cell values to pass it to decoder
encoder_output,hidden_state_f,hidden_state_b,cell_state_f,cell_state_b= encoder_lstm(x)

print(encoder_output.shape)    # Only the output from the last cell is taken hence shape is (?,256)



(?, 512)


In [0]:
hidden_state = Concatenate()([hidden_state_f, hidden_state_b])
cell_state = Concatenate()([cell_state_f, cell_state_b])

encoder_states = [hidden_state,cell_state]   # Storing the hidden state and cell state so that it can be passed to decoder as initial state


TensorShape([Dimension(None), Dimension(512)])

In [0]:
decoder_input = Input(shape = (max_french,))   # Assigning input for the decoder

print(decoder_input.shape)

(?, 12)


In [0]:
decoder_embedding = Embedding(num_words_french,EMBEDDING_DIM_FRENCH)   # Performing Embedding for French
decoder_input_emb = decoder_embedding(decoder_input)                   # Embedding the words from the input 

print(decoder_input_emb.shape)

# Each of the words will now be represented by 100 values

(?, 12, 100)


In [0]:
decoder_lstm = LSTM(LSTM_units*2,return_sequences = True,return_state = True,dropout = 0.2)

# return sequences is True so that it returns output at every instance
# return state is True so that the state from given time step can be passed onto the next time step

decoder_output,_,_= decoder_lstm(decoder_input_emb,initial_state = encoder_states)

# Not storing hidden state and cell state as it is not required now but will be used while doing the prediction

print(decoder_output.shape)

(?, ?, 512)


In [0]:
# Dense layer to get the probability of each word

decoder_dense_layer_1 = Dense(2048,activation = 'relu')
decoder_output = decoder_dense_layer_1(decoder_output)
print(decoder_output.shape) 

decoder_dense_layer_2 = Dense(1024,activation = 'relu')
decoder_output = decoder_dense_layer_2(decoder_output)
print(decoder_output.shape) 

decoder_dense_layer_3 = Dense(512,activation = 'relu')
decoder_output = decoder_dense_layer_3(decoder_output)
print(decoder_output.shape) 

decoder_dense_layer_4 = Dense(256,activation = 'relu')
decoder_output = decoder_dense_layer_4(decoder_output)
print(decoder_output.shape) 

decoder_dense = Dense(num_words_french,activation = 'softmax')
decoder_output = decoder_dense(decoder_output)

print(decoder_output.shape) 

# Each word in the sequence is now represent by 11903 probabilities. The word with the highest probability will be selected.

(?, 12, 2048)
(?, 12, 1024)
(?, 12, 512)
(?, 12, 256)
(?, 12, 16548)


In [0]:
model = Model([encoder_input,decoder_input],decoder_output)

model.compile(optimizer = 'rmsprop',loss = 'categorical_crossentropy',metrics = ['accuracy'])

In [0]:
r = model.fit([encode_english_input,decode_french_input],decoder_target_onehot,epochs = EPOCHS,batch_size=BATCH_SIZE)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [0]:
model.save('EngToFra.h5')

In [0]:
encoder_model = Model(encoder_input,encoder_states)   # Creating an encoder_model so that we can take the hidden states and cell states for the input sequence

In [0]:
decoder_state_h = Input(shape = (LSTM_units*2,))   # Setting input of hidden state for the decoder for predicting
decoder_state_c = Input(shape = (LSTM_units*2,))   # Setting input of cell state for the decoder for predicting

decoder_state_input = [decoder_state_h,decoder_state_c]
print(decoder_state_input[1].shape)
print(encoder_states[1].shape)

(?, 512)
(?, 512)


In [0]:
decoder_input_word = Input(shape = (1,))     # Expecting one word for the input

print(decoder_input_word.shape)

(?, 1)


In [0]:
decoder_input_word_emb = decoder_embedding(decoder_input_word)   # Embedding the input for the decoder

print(decoder_input_word_emb.shape)

(?, 1, 100)


In [0]:
decoder_output,hidden_state_decoder,cell_state_decoder = decoder_lstm(decoder_input_word_emb,initial_state = decoder_state_input)

decoder_state = [hidden_state_decoder,cell_state_decoder]

print(decoder_output.shape)

(?, ?, 512)


In [0]:
decoder_output = decoder_dense_layer_1(decoder_output)
decoder_output = decoder_dense_layer_2(decoder_output)
decoder_output = decoder_dense_layer_3(decoder_output)
decoder_output = decoder_dense_layer_4(decoder_output)

decoder_output = decoder_dense(decoder_output)

print(decoder_output.shape)

(?, 1, 16548)


In [0]:
decoder_model = Model([decoder_input_word] + decoder_state_input,[decoder_output] + decoder_state)

In [0]:
idx2words_english = {v:k for k,v in word2idx_english.items()}   # mapping from integer to word for english
idx2words_french = {v:k for k,v in word2idx_french.items()}     # mapping from integer to word for french


In [0]:
def decode_sequences(input_sequence):
  
  encode_states = encoder_model.predict(input_sequence)  # To get the states for the particular input sequence
  
  target = np.zeros((1,1))   # Creating a space for the target word or the next word that will be predicted. Since a single word will be predicted the shape is (1,1)
  
  target[0,0] = word2idx_french['<sos>']  # since we will be staring with <sos>
  
  eos_idx = word2idx_french['<eos>']          # If we find this stop predicting
  
  output_sequence = []   # To store the entire predicted sequence
  
  for _ in range(max_french):
    
    output_prob,h,c = decoder_model.predict([target] + encode_states)   # Predicting the next word and storing h and c for further use
    
    idx_of_predicted_word = np.argmax(output_prob[0,0,:])    # Taking the probability for all the words

    if eos_idx == idx_of_predicted_word:         # If we found the end of sentence tag
      break
    
    if idx_of_predicted_word > 0:                                # Since 0 will be stored for unknown words
      predicted_word = idx2words_french[idx_of_predicted_word]   # Converting the predicted index to its corresponding word
      output_sequence.append(predicted_word)                     # Storing it into the sequence
      
    
    target[0,0] = idx_of_predicted_word
    
    encode_states = [h,c]
  
  return " ".join(output_sequence)
    
    

In [0]:
while(True):
  
  i = np.random.choice(len(english_text_input))
  
  input_sequence = encode_english_input[i:i+1]
  
  translation = decode_sequences(input_sequence)
  
  print('-')
  
  print('Input: ', english_text_input[i])
  print('Expected: ', french_text_input[i])
  print('Translation: ', translation)
  
  ans = input("Continue?")
  
  if ans and ans.lower().startswith('n'):
    break
    
    
  

-
Input:  I hate to lose.
Expected:  <sos> Je déteste perdre.
Translation:  je déteste parler.
Continue?y
-
Input:  Go back to bed.
Expected:  <sos> Retournez au lit !
Translation:  retourne au lit !
Continue?y
-
Input:  I'm by your side.
Expected:  <sos> Je suis à vos côtés.
Translation:  je suis à votre côté.
Continue?y
-
Input:  See below.
Expected:  <sos> Voir ci-dessous.
Translation:  voyez ci-dessous.
Continue?n


In [0]:
text = ['I love you']

question = pad_sequences(tokenizer_english.texts_to_sequences(text),maxlen=max_english,padding = 'post')
decode_sequences(question)

"je t'aime !"