In [1]:
import random
import string
import re
from pickle import dump
from pickle import load
from unicodedata import normalize
import numpy
from numpy import array
from numpy.random import rand
from numpy.random import shuffle
from numpy import argmax
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import SimpleRNN,GRU,LSTM
from keras.layers import Dense, Embedding, RepeatVector, TimeDistributed
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_accuracy
from keras.utils.vis_utils import plot_model
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Fetching and decompressing the dataset 
!!curl -O http://www.manythings.org/anki/fra-eng.zip
!!unzip fra-eng.zip

['Archive:  fra-eng.zip',
 'replace _about.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL',
 '(EOF or read error, treating as "[N]one" ...)']

In [3]:
# Loading the text data preserving the Unicode french characters

def load_doc(filename):
    
	# opening the text file in read only mode with unicode encoding 
	file = open(filename, mode='rt', encoding='utf-8')
    
	# reading the text from the opened file
	text = file.read()
    
	# finally closing the file
	file.close()
    
	return text

In [4]:
# Each line of the text file contain English sentence and its French translation seperated by tab character.  

def to_pairs(doc):
    
    # Obtaining each line in the file
	lines = doc.strip().split('\n')
    
    # Obtaining the pairs of English sentence and its french translation
	pairs = [line.split('\t') for line in  lines]
    
	return pairs

In [5]:
# Cleaning the lines by removing all the non-printable characters, punctuation characters
# clean a list of lines
def clean_pairs(lines):
    
	cleaned = list()
    
	# using regular expression for removing non-printable characters
	re_print = re.compile('[^%s]' % re.escape(string.printable))
    
	# using regular expression for removing punctuation characters and obtaining translation table
	table = str.maketrans('', '', string.punctuation)
    
	for pair in lines:
        
		clean_pair = list()
		for line in pair:
            
			# normalization to remove canonical and compatibility related issues
			line = normalize('NFD', line).encode('ascii', 'ignore')
			line = line.decode('UTF-8')
            
			# tokenizing the white space
			line = line.split()
            
			# normalizing the text to lowercase
			line = [word.lower() for word in line]
            
			# removing punctuation from each token using regular expression table 
			line = [word.translate(table) for word in line]
            
			# removing non-printable characters using the above regular expression
			line = [re_print.sub('', w) for w in line]
            
			# removing the non-alphabetic tokens such as numbers
			line = [word for word in line if word.isalpha()]
            
			# store as string
			clean_pair.append(' '.join(line))
		cleaned.append(clean_pair)
	return array(cleaned)

In [6]:
# saving the list of clean sentences to file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

In [7]:
# loading the text dataset
filename = 'fra.txt'
doc = load_doc(filename)

In [8]:
# split the lines in the text file into english-french pairs
pairs = to_pairs(doc)

In [9]:
# cleaning the sentences
clean_pairs = clean_pairs(pairs)

In [10]:
# saving the cleaned pairs to file
save_clean_data(clean_pairs, 'eng-fre.pkl')

Saved: eng-fre.pkl


In [11]:
# Total number of translation sentences
print("There a total of {} pairs of tranlations in the dataset".format(clean_pairs.shape[0]))

There a total of 190206 pairs of tranlations in the dataset


In [12]:
# Looking at few sentences

rand_list = random.sample(range(0, clean_pairs.shape[0]), 30)
for i in rand_list:
	print('English : %s\nFrench : %s \n' % (clean_pairs[i,0], clean_pairs[i,1]))

English : turn to the right
French : tournez a droite sil vous plait 

English : why did you stay home yesterday
French : pourquoi estu reste chez toi hier 

English : this is irrelevant
French : cest hors de propos 

English : i lost my balance
French : jai perdu lequilibre 

English : why do you say one thing and then go and do another
French : pourquoi distu une chose et ensuite tu y vas et tu fais autre chose 

English : i am the same age
French : je suis du meme age 

English : dont change a thing
French : ne changez rien 

English : give me a sec
French : donnemoi une seconde 

English : i thought it was very easy
French : je pensais que cetait du gateau 

English : his house was struck by lightning
French : sa maison a ete frappee par la foudre 

English : well take it
French : nous le prendrons 

English : youre being weird
French : tes pas normal 

English : whatever you do dont blink
French : quoi que tu fasses ne cligne pas les yeux 

English : its not irrelevant
French : ce

In [13]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

In [14]:
# loading the previously cleaned data 
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

In [15]:
# saving the cleaned sentences to file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

In [16]:
# loading the raw dataset
raw_dataset = load_clean_sentences('eng-fre.pkl')

In [17]:
#There are over 190000 pairs of sentences, it will take long time for training and testing the model
# Hence dataset size is reduced
n_sentences = 50000 #clean_pairs.shape[0]
dataset = raw_dataset[:n_sentences, :]
train_size = numpy.rint(0.7 * n_sentences)
validation_size = numpy.rint(0.1 * n_sentences)
test_size = numpy.rint(0.2 * n_sentences)

# randomly shuffling the dataset
shuffle(dataset)

# spliting the reduced dataset into train, validation and test
split_1 = int(train_size)
split_2 = int(train_size+validation_size)
split_3 = int(train_size+validation_size+test_size)
train, validation, test = dataset[:split_1], dataset[split_1:split_2], dataset[split_2:split_3]

In [18]:
train.shape, validation.shape, test.shape

((35000, 3), (5000, 3), (10000, 3))

In [19]:
# saving the reduced dataset to training, validation and testing data

save_clean_data(dataset, 'eng-fre-total.pkl')
save_clean_data(train, 'eng-fre-train.pkl')
save_clean_data(validation, 'eng-fre-validation.pkl')
save_clean_data(test, 'eng-fre-test.pkl')

Saved: eng-fre-total.pkl
Saved: eng-fre-train.pkl
Saved: eng-fre-validation.pkl
Saved: eng-fre-test.pkl


In [20]:
# loading the previously cleaned dataset 

def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

In [21]:
# Using keras tokenize class, for mapping the words to integers needed for modeling

def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

In [22]:
# Obtaining the maximum sentence length from the list of phrases 

def max_length(lines):
	return max(len(line.split()) for line in lines)

In [23]:
# encoding and padding the input and output sequences

def encode_sequences(tokenizer, length, lines):
	# Each input and output sequence are  encoded to integers
	X = tokenizer.texts_to_sequences(lines)
	# Obtained sequences are padded with 0 values at the end to make their lenght as maxmim phrase length
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

In [24]:
# The output target sequences (English sentences) has to be one hot encoded as the model will 
# predicts probability of each word in the vocabulary as output. 

def encode_output(sequences, vocab_size):
	ylist = list()
	for sequence in sequences:
		encoded = to_categorical(sequence, num_classes=vocab_size)
		ylist.append(encoded)
	y = array(ylist)
	y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
	return y

In [25]:
# reverse mapping an predicted sequence of integers to a words by looking up tokenizer

def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

In [26]:
# mapping the sequence of integers for generating string of words

def predict_sequence(model, tokenizer, source):
	prediction = model.predict(source, verbose=0)[0]
	integers = [argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

In [27]:
# Evaluating the performance of each model using BLEU score by comparing predicted result to original/expected sequences 


def evaluate_model(model, tokenizer, sources, raw_dataset):
	actual, predicted = list(), list()
	bleu_scores = []
	for i, source in enumerate(sources):
		# translating the encoded input text sequence
		source = source.reshape((1, source.shape[0]))
		translation = predict_sequence(model, eng_tokenizer, source)
		raw_target, raw_src = raw_dataset[i][0], raw_dataset[i][1]
        
        # Printing 50 French to English translations by the model
        
		if i < 50:
			print('French(Source) : %s\nTarget : %s\nPredicted : %s \n' % (raw_src, raw_target, translation))
        
        
		actual.append([raw_target.split()])
		predicted.append(translation.split())
        
    # Calculating BLEU score
	bleu_1 = corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0))
	bleu_2 = corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0))
	bleu_3 = corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0))
	bleu_4 = corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25))
    
    # Saving BLEU scores to return 
	bleu_scores = [bleu_1, bleu_2, bleu_3, bleu_4]
    
                         
	# calculate BLEU score
	print('BLEU-1: %f' % bleu_1)
	print('BLEU-2: %f' % bleu_2)
	print('BLEU-3: %f' % bleu_3)
	print('BLEU-4: %f' % bleu_4)
    
	return bleu_scores

In [28]:
# loading the total, train, validation and test datasets 
dataset = load_clean_sentences('eng-fre-total.pkl')
train = load_clean_sentences('eng-fre-train.pkl')
valid = load_clean_sentences('eng-fre-validation.pkl')
test = load_clean_sentences('eng-fre-test.pkl')

In [29]:
# preparing the English tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))\

# preparing the French tokenizer
fre_tokenizer = create_tokenizer(dataset[:, 1])
fre_vocab_size = len(fre_tokenizer.word_index) + 1
fre_length = max_length(dataset[:, 1])
print('French Vocabulary Size: %d' % fre_vocab_size)
print('French Max Length: %d' % (fre_length))

English Vocabulary Size: 6010
English Max Length: 7
French Vocabulary Size: 12184
French Max Length: 14


In [30]:
# Word index of English tokenizer
eng_tokenizer.word_index

{'i': 1,
 'you': 2,
 'tom': 3,
 'a': 4,
 'is': 5,
 'it': 6,
 'the': 7,
 'to': 8,
 'im': 9,
 'he': 10,
 'me': 11,
 'this': 12,
 'do': 13,
 'was': 14,
 'that': 15,
 'youre': 16,
 'are': 17,
 'we': 18,
 'have': 19,
 'dont': 20,
 'your': 21,
 'my': 22,
 'not': 23,
 'were': 24,
 'its': 25,
 'go': 26,
 'did': 27,
 'be': 28,
 'can': 29,
 'they': 30,
 'like': 31,
 'all': 32,
 'no': 33,
 'in': 34,
 'what': 35,
 'she': 36,
 'very': 37,
 'of': 38,
 'here': 39,
 'want': 40,
 'how': 41,
 'ill': 42,
 'get': 43,
 'on': 44,
 'thats': 45,
 'know': 46,
 'need': 47,
 'cant': 48,
 'up': 49,
 'one': 50,
 'for': 51,
 'him': 52,
 'out': 53,
 'so': 54,
 'at': 55,
 'now': 56,
 'good': 57,
 'just': 58,
 'love': 59,
 'come': 60,
 'please': 61,
 'help': 62,
 'has': 63,
 'there': 64,
 'too': 65,
 'why': 66,
 'theyre': 67,
 'look': 68,
 'who': 69,
 'got': 70,
 'us': 71,
 'hes': 72,
 'lets': 73,
 'will': 74,
 'see': 75,
 'take': 76,
 'his': 77,
 'had': 78,
 'am': 79,
 'let': 80,
 'well': 81,
 'think': 82,
 'stop': 8

In [31]:
# Word index of French tokenizer
fre_tokenizer.word_index

{'je': 1,
 'tom': 2,
 'pas': 3,
 'a': 4,
 'de': 5,
 'vous': 6,
 'ne': 7,
 'est': 8,
 'il': 9,
 'le': 10,
 'nous': 11,
 'la': 12,
 'suis': 13,
 'cest': 14,
 'jai': 15,
 'un': 16,
 'tu': 17,
 'que': 18,
 'en': 19,
 'me': 20,
 'une': 21,
 'ca': 22,
 'les': 23,
 'ce': 24,
 'etes': 25,
 'tout': 26,
 'elle': 27,
 'sont': 28,
 'fait': 29,
 'qui': 30,
 'estce': 31,
 'sommes': 32,
 'mon': 33,
 'ils': 34,
 'des': 35,
 'es': 36,
 'ma': 37,
 'te': 38,
 'faire': 39,
 'tres': 40,
 'nest': 41,
 'du': 42,
 'elles': 43,
 'cela': 44,
 'votre': 45,
 'se': 46,
 'bien': 47,
 'ete': 48,
 'y': 49,
 'besoin': 50,
 'ici': 51,
 'lair': 52,
 'veux': 53,
 'peux': 54,
 'pour': 55,
 'moi': 56,
 'personne': 57,
 'etesvous': 58,
 'etait': 59,
 'comment': 60,
 'ton': 61,
 'ou': 62,
 'pourquoi': 63,
 'aller': 64,
 'tous': 65,
 'avons': 66,
 'dans': 67,
 'toi': 68,
 'si': 69,
 'au': 70,
 'monde': 71,
 'sur': 72,
 'avec': 73,
 'sest': 74,
 'maintenant': 75,
 'faut': 76,
 'plus': 77,
 'jaime': 78,
 'va': 79,
 'cette': 80,

In [32]:
# preparing the training data
trainX = encode_sequences(fre_tokenizer, fre_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)

# preparing the validation data
validX = encode_sequences(fre_tokenizer, fre_length, valid[:, 1])
validY = encode_sequences(eng_tokenizer, eng_length, valid[:, 0])
validY = encode_output(validY, eng_vocab_size)

# preparing testing data
testX = encode_sequences(fre_tokenizer, fre_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

In [33]:
# Model 1: Both Encoder GRU and Decoder GRU cell
def both_gru(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	model = Sequential()
	model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
	model.add(GRU(n_units))
	model.add(RepeatVector(tar_timesteps))
	model.add(GRU(n_units,return_sequences=True))
	model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
	return model

In [34]:
# define model

learning_rate = 1e-2
both_gru = both_gru(fre_vocab_size, eng_vocab_size, fre_length, eng_length, 256)
both_gru.compile(optimizer=Adam(learning_rate), loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [35]:
# summarize defined model
print(both_gru.summary())
plot_model(both_gru, to_file='gru_model.png', show_shapes=True)
# fit model
filename = 'both_gru.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
history_both_gru = both_gru.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(validX, validY), callbacks=[checkpoint], verbose=2)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 14, 256)           3119104   
_________________________________________________________________
gru (GRU)                    (None, 256)               394752    
_________________________________________________________________
repeat_vector (RepeatVector) (None, 7, 256)            0         
_________________________________________________________________
gru_1 (GRU)                  (None, 7, 256)            394752    
_________________________________________________________________
time_distributed (TimeDistri (None, 7, 6010)           1544570   
Total params: 5,453,178
Trainable params: 5,453,178
Non-trainable params: 0
_________________________________________________________________
None
('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.g

In [36]:
df_history_both_gru = pd.DataFrame(history_both_gru.history)
df_history_both_gru.to_csv('df_history_both_gru.csv')
print(df_history_both_gru)

        loss  categorical_accuracy  val_loss  val_categorical_accuracy
0   3.163437              0.545771  2.693422                  0.597029
1   2.411328              0.618302  2.337207                  0.630629
2   2.045118              0.646539  2.150177                  0.645971
3   1.802673              0.663731  2.045769                  0.653343
4   1.639592              0.677837  1.992630                  0.662200
5   1.525113              0.686392  1.964557                  0.667086
6   1.426029              0.697498  1.958693                  0.656943
7   1.356985              0.705510  1.932390                  0.673886
8   1.299796              0.712890  1.909459                  0.671943
9   1.255346              0.718588  1.906954                  0.671571
10  1.211441              0.724588  1.917022                  0.675886
11  1.177698              0.729151  1.907092                  0.674914
12  1.148300              0.733567  1.906054                  0.679914
13  1.

In [37]:
# load model
both_gru = load_model('both_gru.h5')
# test on some training sequences
print('train')
bleu_train_both_gru = evaluate_model(both_gru, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
bleu_test_both_gru = evaluate_model(both_gru, eng_tokenizer, testX, test)

train
French(Source) : jai debranche la tele
Target : i unplugged the tv
Predicted : i unplugged the tv 

French(Source) : je suis desolee davoir dit cela
Target : im sorry i said that
Predicted : im sorry i said that 

French(Source) : jenfilai une chemise blanche
Target : i wore a white shirt
Predicted : i i a a pants 

French(Source) : il etait hors dhaleine
Target : he was out of breath
Predicted : it was in race 

French(Source) : cest un jour venteux
Target : its a windy day
Predicted : its a windy day 

French(Source) : je suis assez content
Target : im happy enough
Predicted : im too with 

French(Source) : amusonsnous
Target : lets have fun
Predicted : lets have fun 

French(Source) : questce qui vous trouble
Target : whats troubling you
Predicted : what troubling you 

French(Source) : cest impossible
Target : its impossible
Predicted : thats impossible 

French(Source) : personne ne change jamais
Target : no one ever changes
Predicted : no one changes 

French(Source) : ecou

In [38]:
# Model 2: Both Encoder LSTM and Decoder LSTM cell
def both_lstm(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	learning_rate = 1e-2
	model = Sequential()
	model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
	model.add(LSTM(n_units))
	model.add(RepeatVector(tar_timesteps))
	model.add(LSTM(n_units,return_sequences=True))
	model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
	return model

In [39]:
# define model

learning_rate = 1e-2
both_lstm = both_lstm(fre_vocab_size, eng_vocab_size, fre_length, eng_length, 256)
both_lstm.compile(optimizer=Adam(learning_rate), loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [40]:
# summarize defined model
print(both_lstm.summary())
plot_model(both_lstm, to_file='both_lstm.png', show_shapes=True)
# fit model
filename = 'both_lstm.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
history_both_lstm = both_lstm.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(validX, validY), callbacks=[checkpoint], verbose=2)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 14, 256)           3119104   
_________________________________________________________________
lstm (LSTM)                  (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 7, 256)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 7, 256)            525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 7, 6010)           1544570   
Total params: 5,714,298
Trainable params: 5,714,298
Non-trainable params: 0
_________________________________________________________________
None
('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz

In [41]:
df_history_both_lstm = pd.DataFrame(history_both_lstm.history)
df_history_both_lstm
df_history_both_lstm.to_csv('df_history_both_lstm.csv')

In [42]:
# load model
both_lstm = load_model('both_lstm.h5')
# test on some training sequences
print('train')
bleu_train_both_lstm = evaluate_model(both_lstm, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
bleu_test_both_lstm = evaluate_model(both_lstm, eng_tokenizer, testX, test)

train
French(Source) : jai debranche la tele
Target : i unplugged the tv
Predicted : i unplugged the tv 

French(Source) : je suis desolee davoir dit cela
Target : im sorry i said that
Predicted : im sorry i said that 

French(Source) : jenfilai une chemise blanche
Target : i wore a white shirt
Predicted : i a a flat 

French(Source) : il etait hors dhaleine
Target : he was out of breath
Predicted : he was in a of 

French(Source) : cest un jour venteux
Target : its a windy day
Predicted : its a real 

French(Source) : je suis assez content
Target : im happy enough
Predicted : im happy happy 

French(Source) : amusonsnous
Target : lets have fun
Predicted : lets coming some 

French(Source) : questce qui vous trouble
Target : whats troubling you
Predicted : whats troubling you 

French(Source) : cest impossible
Target : its impossible
Predicted : thats is 

French(Source) : personne ne change jamais
Target : no one ever changes
Predicted : no one stopped 

French(Source) : ecoutetoi par

In [43]:
# Model 3:  Encoder GRU and Decoder LSTM cell

def gru_lstm(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	learning_rate = 1e-2
	model = Sequential()
	model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
	model.add(GRU(n_units))
	model.add(RepeatVector(tar_timesteps))
	model.add(LSTM(n_units,return_sequences=True))
	model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
	return model

In [44]:
# define model

learning_rate = 1e-2
gru_lstm = gru_lstm(fre_vocab_size, eng_vocab_size, fre_length, eng_length, 256)
gru_lstm.compile(optimizer=Adam(learning_rate), loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [45]:
# summarize defined model
print(gru_lstm.summary())
plot_model(gru_lstm, to_file='gru_lstm.png', show_shapes=True)
# fit model
filename = 'gru_lstm.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
history_gru_lstm = gru_lstm.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(validX, validY), callbacks=[checkpoint], verbose=2)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 14, 256)           3119104   
_________________________________________________________________
gru_2 (GRU)                  (None, 256)               394752    
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 7, 256)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 7, 256)            525312    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 7, 6010)           1544570   
Total params: 5,583,738
Trainable params: 5,583,738
Non-trainable params: 0
_________________________________________________________________
None
('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz

In [46]:
df_history_gru_lstm = pd.DataFrame(history_both_lstm.history)
df_history_gru_lstm.to_csv('df_history_gru_lstm.csv')
print(df_history_gru_lstm)

        loss  categorical_accuracy  val_loss  val_categorical_accuracy
0   3.252434              0.535098  2.765865                  0.575057
1   2.497374              0.601127  2.447517                  0.613943
2   2.123063              0.636559  2.207898                  0.641400
3   1.850369              0.660959  2.089215                  0.651429
4   1.630965              0.681522  2.006674                  0.662886
5   1.456967              0.699527  1.970055                  0.668371
6   1.337380              0.712902  1.926979                  0.672800
7   1.223832              0.727918  1.920150                  0.677314
8   1.142345              0.739245  1.918651                  0.680000
9   1.073350              0.750331  1.911030                  0.680629
10  1.012037              0.760049  1.920534                  0.680771
11  0.960292              0.769069  1.928720                  0.677171
12  0.922774              0.775514  1.929744                  0.683429
13  0.

In [47]:
# load model
gru_lstm = load_model('gru_lstm.h5')
# test on some training sequences
print('train')
bleu_train_gru_lstm = evaluate_model(gru_lstm, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
bleu_test_gru_lstm = evaluate_model(gru_lstm, eng_tokenizer, testX, test)

train
French(Source) : jai debranche la tele
Target : i unplugged the tv
Predicted : i have the tv 

French(Source) : je suis desolee davoir dit cela
Target : im sorry i said that
Predicted : im sorry i that 

French(Source) : jenfilai une chemise blanche
Target : i wore a white shirt
Predicted : i was a a 

French(Source) : il etait hors dhaleine
Target : he was out of breath
Predicted : he was very at 

French(Source) : cest un jour venteux
Target : its a windy day
Predicted : its a a huge 

French(Source) : je suis assez content
Target : im happy enough
Predicted : im pretty enough 

French(Source) : amusonsnous
Target : lets have fun
Predicted : lets have fun fun 

French(Source) : questce qui vous trouble
Target : whats troubling you
Predicted : what troubling you 

French(Source) : cest impossible
Target : its impossible
Predicted : this impossible 

French(Source) : personne ne change jamais
Target : no one ever changes
Predicted : no one ever 

French(Source) : ecoutetoi parler

In [48]:
# Model 4:  Encoder LSTM and Decoder GRU cell

def lstm_gru(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	learning_rate = 1e-2
	model = Sequential()
	model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
	model.add(LSTM(n_units))
	model.add(RepeatVector(tar_timesteps))
	model.add(GRU(n_units,return_sequences=True))
	model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
	return model

In [49]:
# define model

learning_rate = 1e-2
lstm_gru = lstm_gru(fre_vocab_size, eng_vocab_size, fre_length, eng_length, 256)
lstm_gru.compile(optimizer=Adam(learning_rate), loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [50]:
# summarize defined model
print(lstm_gru.summary())
plot_model(lstm_gru, to_file='lstm_gru.png', show_shapes=True)

# fit model
filename = 'lstm_gru.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
history_lstm_gru = lstm_gru.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(validX, validY), callbacks=[checkpoint], verbose=2)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 14, 256)           3119104   
_________________________________________________________________
lstm_3 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_3 (RepeatVecto (None, 7, 256)            0         
_________________________________________________________________
gru_3 (GRU)                  (None, 7, 256)            394752    
_________________________________________________________________
time_distributed_3 (TimeDist (None, 7, 6010)           1544570   
Total params: 5,583,738
Trainable params: 5,583,738
Non-trainable params: 0
_________________________________________________________________
None
('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz

In [51]:
df_history_lstm_gru = pd.DataFrame(history_both_lstm.history)
df_history_lstm_gru.to_csv('df_history_lstm_gru.csv')
print(df_history_lstm_gru)

        loss  categorical_accuracy  val_loss  val_categorical_accuracy
0   3.252434              0.535098  2.765865                  0.575057
1   2.497374              0.601127  2.447517                  0.613943
2   2.123063              0.636559  2.207898                  0.641400
3   1.850369              0.660959  2.089215                  0.651429
4   1.630965              0.681522  2.006674                  0.662886
5   1.456967              0.699527  1.970055                  0.668371
6   1.337380              0.712902  1.926979                  0.672800
7   1.223832              0.727918  1.920150                  0.677314
8   1.142345              0.739245  1.918651                  0.680000
9   1.073350              0.750331  1.911030                  0.680629
10  1.012037              0.760049  1.920534                  0.680771
11  0.960292              0.769069  1.928720                  0.677171
12  0.922774              0.775514  1.929744                  0.683429
13  0.

In [52]:
# load model
lstm_gru = load_model('lstm_gru.h5')

# test on some training sequences
print('train')
bleu_train_lstm_gru = evaluate_model(lstm_gru, eng_tokenizer, trainX, train)

# test on some test sequences
print('test')
bleu_test_lstm_gru = evaluate_model(lstm_gru, eng_tokenizer, testX, test)

train
French(Source) : jai debranche la tele
Target : i unplugged the tv
Predicted : i accepted the tv 

French(Source) : je suis desolee davoir dit cela
Target : im sorry i said that
Predicted : im sorry sorry said that 

French(Source) : jenfilai une chemise blanche
Target : i wore a white shirt
Predicted : he is a shirt shirt 

French(Source) : il etait hors dhaleine
Target : he was out of breath
Predicted : he was in to 

French(Source) : cest un jour venteux
Target : its a windy day
Predicted : this a a a 

French(Source) : je suis assez content
Target : im happy enough
Predicted : im quite enough 

French(Source) : amusonsnous
Target : lets have fun
Predicted : lets have in 

French(Source) : questce qui vous trouble
Target : whats troubling you
Predicted : what troubling you 

French(Source) : cest impossible
Target : its impossible
Predicted : its is 

French(Source) : personne ne change jamais
Target : no one ever changes
Predicted : no one ever 

French(Source) : ecoutetoi pa