In [None]:
# importing libraries
import pandas as pd
import numpy as np
from keras.models import Model
import re
import random
import string
from keras.callbacks import EarlyStopping

In [None]:
# Locations of the two parallel corpus files (presumably English and French,
# judging by the `_en` / `_fr` suffixes).
data1 = "/content/small_vocab_en"
data2 = "/content/small_vocab_fr"

# Read each file in full, trim leading/trailing whitespace, and keep one
# sentence per newline-separated entry.
with open(data1, mode = 'r', encoding = 'utf-8') as source_file:
  lines = source_file.read().strip().split('\n')

with open(data2, mode = 'r', encoding = 'utf-8') as target_file:
  line1 = target_file.read().strip().split('\n')

In [None]:
# data cleaning
# Keep only runs of letters and digits and re-join them with single spaces,
# which drops punctuation. `[^\W_]` is Unicode-aware for str patterns, so
# accented letters stay inside their word; an ASCII-only class such as
# [A-Za-z0-9] would cut words apart at every accented character.
WORD_PATTERN = re.compile(r"[^\W_]+")
lines = [" ".join(WORD_PATTERN.findall(line)) for line in lines]
line1 = [" ".join(WORD_PATTERN.findall(line)) for line in line1]
# Pair each cleaned input sentence with the target sentence at the same index.
pairs = list(zip(lines, line1))

In [None]:
# Build the working corpus and both vocabularies from the first 500 pairs.
input_docs = []
target_docs = []
input_tokens = set()
target_tokens = set()
for input_doc, target_doc in pairs[:500]:
  input_docs.append(input_doc)
  # Wrap every target sentence in explicit start/end markers for the decoder.
  target_doc = '<START> ' + target_doc + ' <END>'
  target_docs.append(target_doc)
  input_tokens.update(input_doc.split())
  # Use split() (not split(" ")) so an empty target sentence cannot add an
  # empty-string token to the vocabulary; the encoding step also uses split().
  target_tokens.update(target_doc.split())
# Sorting gives every token a stable index across runs.
input_tokens = sorted(input_tokens)
target_tokens = sorted(target_tokens)
num_encoder_tokens = len(input_tokens)
num_Decoder_tokens = len(target_tokens)

In [None]:
# Lookup tables between tokens and their integer indices.
# Forward direction: token -> position in the sorted vocabulary list.
input_feature_dict = {token: index for index, token in enumerate(input_tokens)}
target_feature_dict = {token: index for index, token in enumerate(target_tokens)}

# Reverse direction: index -> token, used to turn predictions back into words.
rev_input_feature_dict = {index: token for token, index in input_feature_dict.items()}
rev_target_feature_dict = {index: token for token, index in target_feature_dict.items()}
target_feature_dict

{'<END>': 0,
 '<START>': 1,
 'a': 2,
 'able': 3,
 'agr': 4,
 'aim': 5,
 'aimait': 6,
 'aime': 7,
 'aiment': 8,
 'aimez': 9,
 'aimons': 10,
 'all': 11,
 'amusant': 12,
 'animal': 13,
 'animaux': 14,
 'ao': 15,
 'au': 16,
 'automne': 17,
 'automobile': 18,
 'aux': 19,
 'avril': 20,
 'banane': 21,
 'bananes': 22,
 'beau': 23,
 'belle': 24,
 'blanc': 25,
 'blanche': 26,
 'bleu': 27,
 'bleue': 28,
 'california': 29,
 'californie': 30,
 'calme': 31,
 'camion': 32,
 'ce': 33,
 'cembre': 34,
 'cet': 35,
 'cette': 36,
 'chat': 37,
 'chaud': 38,
 'chaude': 39,
 'chaux': 40,
 'che': 41,
 'cher': 42,
 'ches': 43,
 'cheval': 44,
 'chevaux': 45,
 'chien': 46,
 'chiens': 47,
 'chine': 48,
 'citron': 49,
 'citrons': 50,
 'comme': 51,
 'conduisait': 52,
 'conduit': 53,
 'cours': 54,
 'd': 55,
 'de': 56,
 'dernier': 57,
 'des': 58,
 'difficile': 59,
 'doux': 60,
 'e': 61,
 'elle': 62,
 'en': 63,
 'enneig': 64,
 'entre': 65,
 'espagnol': 66,
 'est': 67,
 'et': 68,
 'f': 69,
 'fait': 70,
 'favori': 71,
 '

In [None]:
# Longest input / target sequence, measured in tokens.
# The one-hot tensors are indexed by token position (enumerate over
# doc.split()), so the timestep axis must be sized by token count; len(doc)
# would count characters and make the tensors far larger than needed.
max_input_seq_length = max(len(input_doc.split()) for input_doc in input_docs)
max_target_seq_length = max(len(target_doc.split()) for target_doc in target_docs)

**We need three one-hot encoded matrices to train our seq2seq model: one for the encoder input, a second for the decoder input, and a third for the decoder output.**

In [None]:
# Zero-filled float32 tensors laid out as (document, timestep, vocabulary index).
encoder_input_data = np.zeros(
  shape = (len(input_docs), max_input_seq_length, num_encoder_tokens),
  dtype = np.float32,
)
decoder_input_data = np.zeros(
  shape = (len(target_docs), max_target_seq_length, num_Decoder_tokens),
  dtype = np.float32,
)
# The decoder target tensor has exactly the same shape and dtype as the decoder input.
decoder_output_data = np.zeros_like(decoder_input_data)


In [None]:
# One-hot encode every sentence pair into the pre-allocated tensors.
for doc_index, (source_text, target_text) in enumerate(zip(input_docs, target_docs)):
  for step, word in enumerate(source_text.split()):
    encoder_input_data[doc_index, step, input_feature_dict[word]] = 1
  for step, word in enumerate(target_text.split()):
    word_index = target_feature_dict[word]
    decoder_input_data[doc_index, step, word_index] = 1
    # The decoder target is the decoder input shifted one step to the left,
    # so the first token (step 0) never appears as a target.
    if step >= 1:
      decoder_output_data[doc_index, step - 1, word_index] = 1

**TRAINING SETUP**

In [None]:
# Keras building blocks for the encoder-decoder graph.
from tensorflow import keras 
from keras.models import Model
from keras.layers import Dense, LSTM, Input
# Number of units in each LSTM, i.e. the size of its hidden and cell state vectors.
dimensionality = 256

# Encoder: reads one-hot input sequences of any length (None timesteps).
encoder_input = Input(shape = (None, num_encoder_tokens))
# return_state = True makes the layer also return its final hidden and cell states.
lstm = LSTM(dimensionality, return_state = True)
encoder_output, encoder_hidden_state, encoder_cell_state = lstm(encoder_input)
# Only the final states are kept; they are passed to the decoder as its initial state.
encoder_states = [encoder_hidden_state, encoder_cell_state]

# Decoder: reads one-hot target sequences and is initialised with the encoder states.
decoder_input = Input(shape = (None, num_Decoder_tokens))
# NOTE: `lstm` is rebound here, so from this point on the name refers to the
# decoder LSTM (the encoder LSTM stays reachable only through the graph).
# return_sequences = True yields an output for every timestep, not just the last.
lstm = LSTM(dimensionality, return_state = True, return_sequences = True)
decoder_output, decoder_state_hidden, decoder_cell_state = lstm(decoder_input, initial_state = encoder_states)
# Softmax over the target vocabulary at every timestep.
decoder_dense = Dense(num_Decoder_tokens, activation = 'softmax')
decoder_output = decoder_dense(decoder_output)

In [None]:
# Early stopping watches val_loss: training halts once it has failed to
# improve for 3 consecutive epochs, and the best weights seen are restored.
earlystop = EarlyStopping(
  monitor = 'val_loss',
  patience = 3,
  min_delta = 0,
  restore_best_weights = True,
  verbose = 1,
)
callbacks_list = [earlystop]

**BUILDING AND TRAINING ENCODER AND DECODER MODEL**

In [None]:
# Training model: maps (encoder input, decoder input) to the decoder's
# per-timestep softmax output.
training_model = Model([encoder_input, decoder_input], decoder_output)
# Targets are one-hot vectors, hence categorical cross-entropy.
training_model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
# Fit with the last 20% of the samples held out for validation. The callbacks
# must be passed explicitly, otherwise the configured early stopping never
# runs and all 100 epochs are always executed.
history = training_model.fit(
  [encoder_input_data, decoder_input_data],
  decoder_output_data,
  batch_size = 256,
  epochs = 100,
  validation_split = 0.2,
  callbacks = callbacks_list,
)
# Persist the trained model to the working directory.
training_model.save('training_model.h5')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
# Inference-time models, built from the layers of the trained network.
#training_model = load_model('/content/drive/My Drive/MachineTranslation/training_model.h5')

# Encoder model: input sequence -> final [hidden, cell] states.
encoder_inputs = training_model.input[0]
# layers[2] is expected to be the encoder LSTM (it follows the two Input
# layers) -- check training_model.summary() if the architecture changes.
encoder_outputs, state_h_enc, state_c_enc = training_model.layers[2].output
encoder_states = [state_h_enc, state_c_enc]
encoder_model = Model(encoder_inputs, encoder_states)

# The state inputs must have exactly the size the LSTMs were built with, so
# derive it from `dimensionality` instead of repeating the literal.
latent_dim = dimensionality
decoder_state_input_hidden = Input(shape=(latent_dim,))
decoder_state_input_cell = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_hidden, decoder_state_input_cell]
# `lstm` and `decoder_dense` are the trained decoder layers; calling them again
# reuses their weights, but with states supplied from outside so decoding can
# proceed one step at a time.
decoder_outputs, state_hidden, state_cell = lstm(decoder_input, initial_state=decoder_states_inputs)
decoder_states = [state_hidden, state_cell]
decoder_outputs = decoder_dense(decoder_outputs)
# Decoder model: (previous token, previous states) -> (token probabilities, new states).
decoder_model = Model([decoder_input] + decoder_states_inputs, [decoder_outputs] + decoder_states)



In [None]:
# Greedy decoding with the inference models
def decode_response(test_input):
    """Translate one encoded input sequence by greedy, step-by-step decoding.

    The input is run through ``encoder_model`` to obtain the initial decoder
    states. ``decoder_model`` is then called one token at a time, always
    feeding back the most probable token, until '<END>' is produced or
    ``max_target_seq_length`` tokens have been generated.

    Args:
        test_input: One-hot encoded input passed unchanged to
            ``encoder_model.predict`` (a batch holding a single sequence).

    Returns:
        str: The generated tokens, each preceded by a single space. The
        '<END>' marker is included when the decoder emitted it.
    """
    # Encoder states seed the decoder; list() keeps the concatenation below
    # valid whatever sequence type predict() returns.
    states_value = list(encoder_model.predict(test_input, verbose=0))
    # Start with a length-1 target sequence holding only the start token.
    target_seq = np.zeros((1, 1, num_Decoder_tokens))
    target_seq[0, 0, target_feature_dict['<START>']] = 1.

    decoded_tokens = []
    while True:
        # Predict the next-token distribution and the updated states.
        output_tokens, hidden_state, cell_state = decoder_model.predict(
            [target_seq] + states_value, verbose=0)
        # Greedy choice: the most probable token at the last timestep.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_token = rev_target_feature_dict[sampled_token_index]
        decoded_tokens.append(sampled_token)
        # Stop on the end marker, or once the token budget is used up. The
        # limit is counted in tokens, not in characters of the output string.
        if sampled_token == '<END>' or len(decoded_tokens) >= max_target_seq_length:
            break
        # Feed the chosen token and the new states into the next step.
        target_seq = np.zeros((1, 1, num_Decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.
        states_value = [hidden_state, cell_state]
    return "".join(" " + token for token in decoded_tokens)

In [None]:
class Translator:
  """Interactive console loop that translates the sentences the user types.

  Each reply is one-hot encoded with the training vocabulary, decoded with
  ``decode_response`` and printed as the next prompt, until the user enters
  one of ``exit_commands``.
  """

  # Replies that end the session when entered on their own.
  exit_commands = ("quit", "pause", "exit", "goodbye", "bye", "later", "stop")

  #Method to start the translator
  def start(self):
    """Ask for the first sentence and enter the translation loop."""
    user_response = input("Give in an English sentence. :) \n")
    self.translate(user_response)

  #Method to handle the conversation
  def translate(self, reply):
    """Keep translating replies until one of them is an exit command.

    Args:
      reply: The first sentence entered by the user.
    """
    while not self.make_exit(reply):
      # The translation is shown as the prompt for the next sentence.
      reply = input(self.generate_response(reply)+"\n")

  #Method to convert user input into a matrix
  def string_to_matrix(self, user_input):
    """One-hot encode a sentence with the input vocabulary.

    Args:
      user_input: Raw sentence typed by the user.

    Returns:
      A float32 array of shape (1, max_input_seq_length, num_encoder_tokens).
      Tokens missing from ``input_feature_dict`` are dropped, and tokens
      beyond the matrix length are truncated instead of raising IndexError.
    """
    # Keep only letter/digit runs, mirroring the alphanumeric-only cleaning
    # that was applied to the training sentences.
    tokens = re.findall(r"[^\W_]+", user_input)
    known_tokens = [token for token in tokens if token in input_feature_dict]
    user_input_matrix = np.zeros(
      (1, max_input_seq_length, num_encoder_tokens),
      dtype='float32')
    for timestep, token in enumerate(known_tokens[:max_input_seq_length]):
      user_input_matrix[0, timestep, input_feature_dict[token]] = 1.
    return user_input_matrix

  #Method that will create a response using seq2seq model we built
  def generate_response(self, user_input):
    """Translate one sentence and return it without the marker tokens.

    Args:
      user_input: Raw sentence typed by the user.

    Returns:
      The decoded sentence with '<START>'/'<END>' removed and surrounding
      whitespace stripped.
    """
    input_matrix = self.string_to_matrix(user_input)
    chatbot_response = decode_response(input_matrix)
    #Remove <START> and <END> tokens from chatbot_response
    chatbot_response = chatbot_response.replace("<START>",'')
    chatbot_response = chatbot_response.replace("<END>",'')
    return chatbot_response.strip()

  #Method to check for exit commands
  def make_exit(self, reply):
    """Report whether the reply is an exit command, saying goodbye if so.

    The whole reply (trimmed, case-insensitive) must equal a command, so a
    sentence that merely contains a word such as "stop" or "later" is still
    translated instead of ending the session.

    Args:
      reply: Text entered by the user.

    Returns:
      True when the session should end, False otherwise.
    """
    if reply.strip().lower() in self.exit_commands:
      print("Ok, have a great day!")
      return True
    return False

translator = Translator()