SOURCE: https://towardsdatascience.com/implementing-neural-machine-translation-using-keras-8312e4844eb8

In [0]:
# Import the required libraries
import pandas as pd
import numpy as np
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model

### Read the data from the file
Read the file that contains the English-Spanish translations that we downloaded from the here
http://www.manythings.org/anki/

In [0]:
 # Path to the data txt file on disk.
#data_path = "\\NMT\\spa-eng\\spa.txt"
data_path = "spa.txt"
#data_path = "ben.txt"
# open the file eng-spa.txt and read
lines= pd.read_table(data_path,  names =['source', 'target', 'comments'])
#printing sample data from lines
lines.sample(6)

Unnamed: 0,source,target,comments
2612,You made it.,Lo han conseguido ustedes.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
42683,It'll be dark in an hour.,Se oscurecerá en una hora.,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
73505,Tom has been seeing a therapist.,Tom ha estado visitando a un terapeuta.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
26414,This is only for you.,Esto es solo para ti.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
36880,He isn't richer than me.,Él no es más rico que yo.,CC-BY 2.0 (France) Attribution: tatoeba.org #8...
37727,I saw the car hit a man.,Vi al coche golpear a un hombre.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...


### Clean the source and target sentences.

We apply the following text cleaning

Convert text to lower case

Remove quotes

Remove all special characters like “@, !, *, $, #, ?, %, etc.”

Clean digits from the source and target sentences. If the source or the target language use different symbols for the numbers, then remove those symbols.

Remove spaces

In [0]:
# convert source and target text to Lowercase 
lines.source=lines.source.apply(lambda x: x.lower())
lines.target=lines.target.apply(lambda x: x.lower())
# Remove quotes from source and target text
lines.source=lines.source.apply(lambda x: re.sub("'", '', x))
lines.target=lines.target.apply(lambda x: re.sub("'", '', x))
# create a set of all special characters
special_characters= set(string.punctuation)
# Remove all the special characters
lines.source = lines.source.apply(lambda x: ''.join(char1 for char1 in x if char1 not in special_characters))
lines.target = lines.target.apply(lambda x: ''.join(char1 for char1 in x if char1 not in special_characters))
# Remove digits from source and target sentences
num_digits= str.maketrans('','', digits)
lines.source=lines.source.apply(lambda x: x.translate(num_digits))
lines.target= lines.target.apply(lambda x: x.translate(num_digits))
# Remove extra spaces
lines.source=lines.source.apply(lambda x: x.strip())
lines.target=lines.target.apply(lambda x: x.strip())
lines.source=lines.source.apply(lambda x: re.sub(" +", " ", x))
lines.target=lines.target.apply(lambda x: re.sub(" +", " ", x))

In [0]:
# Add start and end tokens to target sequences
lines.target = lines.target.apply(lambda x : 'START_ '+ x + ' _END')
lines.sample(6)

Unnamed: 0,source,target,comments
40674,can you pass me the milk,START_ ¿me alcanza la leche _END,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
43663,this book belongs to you,START_ este libro es tuyo _END,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
33231,i promise i wont tell,START_ prometo que no voy a decir _END,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
66456,do you believe that god exists,START_ ¿crees que dios existe _END,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
53177,which one is your favorite,START_ ¿cuál es tu favorito _END,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
17154,i have few friends,START_ tengo pocos amigos _END,CC-BY 2.0 (France) Attribution: tatoeba.org #3...


In [0]:
# Create a set of unique words both for source and target language from the dataset and sort them alphabetically
# Find all the source and target words and sort them
# Vocabulary of Source language
all_source_words=set()
for source in lines.source:
    for word in source.split():
        if word not in all_source_words:
            all_source_words.add(word)
# Vocabulary of Target 
all_target_words=set()
for target in lines.target:
    for word in target.split():
        if word not in all_target_words:
            all_target_words.add(word)
# sort all unique source and target words
source_words= sorted(list(all_source_words))
target_words=sorted(list(all_target_words))

In [0]:
#Find maximum sentence length in  the source and target data
source_length_list=[]
for l in lines.source:
    source_length_list.append(len(l.split(' ')))
max_source_length= max(source_length_list)
print(" Max length of the source sentence",max_source_length)
target_length_list=[]
for l in lines.target:
    target_length_list.append(len(l.split(' ')))
max_target_length= max(target_length_list)
print(" Max length of the target sentence",max_target_length)

 Max length of the source sentence 10
 Max length of the target sentence 16


In [0]:
# creating a word to index(word2idx) for source and target
source_word2idx= dict([(word, i+1) for i,word in enumerate(source_words)])
target_word2idx=dict([(word, i+1) for i, word in enumerate(target_words)])

In [0]:
print(source_word2idx)
#source_word2idx



In [0]:
print(target_word2idx)
#target_word2idx

{'START_': 1, '_END': 2, 'a': 3, 'aabe': 4, 'aah': 5, 'aaron': 6, 'abajo': 7, 'abandona': 8, 'abandonada': 9, 'abandonado': 10, 'abandonados': 11, 'abandonamos': 12, 'abandonan': 13, 'abandonar': 14, 'abandonaremos': 15, 'abandonarlos': 16, 'abandonaron': 17, 'abandonaré': 18, 'abandonaría': 19, 'abandonarías': 20, 'abandonas': 21, 'abandonaste': 22, 'abandonen': 23, 'abandono': 24, 'abandoné': 25, 'abandonó': 26, 'abandónalo': 27, 'abanicando': 28, 'abanico': 29, 'abarrotada': 30, 'abarrotado': 31, 'abarrotes': 32, 'abatido': 33, 'abatió': 34, 'abdicar': 35, 'abdicó': 36, 'abdomen': 37, 'abdominales': 38, 'abducido': 39, 'abecedario': 40, 'abeja': 41, 'abejas': 42, 'aberración': 43, 'abeto': 44, 'abierta': 45, 'abiertas': 46, 'abierto': 47, 'abiertos': 48, 'abnegada': 49, 'abocado': 50, 'abofeteó': 51, 'abogada': 52, 'abogado': 53, 'abogados': 54, 'abolir': 55, 'abolió': 56, 'abollado': 57, 'abordó': 58, 'aborrecemos': 59, 'aborreció': 60, 'aborrezco': 61, 'abotonarme': 62, 'abotonó':

In [0]:
#creating a dictionary for index to word for source and target vocabulary
source_idx2word= dict([(i, word) for word, i in  source_word2idx.items()])
print(source_idx2word)
target_idx2word =dict([(i, word) for word, i in target_word2idx.items()])



# Shuffle the data

Shuffling helps with.

Reducing variance.

Ensures models remain generic and overfit less.

Batches between epochs do not look alike.

Makes model more robust.

In [0]:
#Shuffle the data
lines = shuffle(lines)

## Creating training and test dataset


In [0]:
# Train - Test Split
X, y = lines.source, lines.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)
X_train.shape, X_test.shape

((76590,), (8511,))

## Create data for training the encoder-decoder model.

We will use fit_generator() instead of the fit() method as our data is too large to fit into the memory. fit_generator() needs an underlying function to generate the data.
We create the underlying function generate_batch() for generating data in batches

The fit_generator() will accept a batch of data from the underlying function, generate_batch()
To train a sequence to sequence model, we need to create one-hot encoded data for

encoder inputs: The 2D array will be of shape (batch_size, max source sentence length). For a batch_size of 128 and a max source sentence length of 47, the shape of encoder_input will be (128,47)

decoder inputs: The 2D array will be of shape (batch_size, max target sentence length). For a batch_size of 128 and a max target sentence length of 55, the shape of decoder inputs will be (128,55)

decoder outputs: The 3D array will be of shape (batch_size, max target sentence length, number of unique words in target sentences). For a batch_size of 128 and a max target sentence length of 55, the shape of decoder output will be (128,55, 27200).

Number of unique words in the target_sentence is 27199 which we zero pad, and hence the third parameter in decoder output is 27200

In [0]:
# Input tokens for encoder
num_encoder_tokens=len(source_words)
# Input tokens for decoder zero padded
num_decoder_tokens=len(target_words) +1

In [0]:
#We now create the generator_batch function()
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_source_length),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_target_length),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_target_length, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = source_word2idx[word] 
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_word2idx[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        #print(word)
                        decoder_target_data[i, t - 1, target_word2idx[word]] = 1.
                    
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [0]:
# Build the sequence to sequence model
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 50
latent_dim=256

### Build the Encoder

In [0]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]





NameError: ignored

###  Build the Decoder

In [0]:
!pip install latentdim

[31mERROR: Could not find a version that satisfies the requirement latentdim (from versions: none)[0m
[31mERROR: No matching distribution found for latentdim[0m


In [0]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

NameError: ignored

In [0]:
# Define the model that takes encoder and decoder input 
# to output decoder_outputs
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

NameError: ignored