In [1]:
# Importing Dependencies
import string
import re
import numpy as np
from numpy import array, argmax, random, take
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers
from tensorflow import keras

In [2]:
# function to read raw text file
def read_text(filename):
        # open the file
        file = open(filename, mode='rt', encoding='utf-8')
        
        # read all text
        text = file.read()
        file.close()
        return text

In [3]:
# split a text into sentences
def to_lines(text):
      sents = text.strip().split('\n')
      sents = [i.split('\t') for i in sents]
      return sents

In [4]:
data = read_text("fra.txt")
fra_eng = to_lines(data)
fra_eng = array(fra_eng)

In [5]:
len(fra_eng)

192341

In [6]:
fra_eng=fra_eng[:100000,:2]
fra_eng

array([['Go.', 'Va !'],
       ['Go.', 'Marche.'],
       ['Go.', 'Bouge !'],
       ...,
       ["I don't want to sing anymore.", 'Je ne veux plus chanter.'],
       ["I don't want to stay at home.",
        "Je n'ai pas envie de rester à la maison."],
       ["I don't want to stay at home.",
        'Je ne veux pas rester chez moi.']], dtype='<U349')

In [7]:
# Remove punctuation
fra_eng[:,0] = [s.translate(str.maketrans('', '', string.punctuation)) for s in fra_eng[:,0]]
fra_eng[:,1] = [s.translate(str.maketrans('', '', string.punctuation)) for s in fra_eng[:,1]]

fra_eng

array([['Go', 'Va '],
       ['Go', 'Marche'],
       ['Go', 'Bouge '],
       ...,
       ['I dont want to sing anymore', 'Je ne veux plus chanter'],
       ['I dont want to stay at home',
        'Je nai pas envie de rester à la maison'],
       ['I dont want to stay at home', 'Je ne veux pas rester chez moi']],
      dtype='<U349')

In [8]:
# convert text to lowercase
for i in range(len(fra_eng)):
    fra_eng[i,0] = fra_eng[i,0].lower()
    fra_eng[i,1] = fra_eng[i,1].lower()

fra_eng

array([['go', 'va '],
       ['go', 'marche'],
       ['go', 'bouge '],
       ...,
       ['i dont want to sing anymore', 'je ne veux plus chanter'],
       ['i dont want to stay at home',
        'je nai pas envie de rester à la maison'],
       ['i dont want to stay at home', 'je ne veux pas rester chez moi']],
      dtype='<U349')

In [9]:
# empty lists
eng_l = []
fra_l = []

# populate the lists with sentence lengths
for i in fra_eng[:,0]:
      eng_l.append(len(i.split()))

for i in fra_eng[:,1]:
      fra_l.append(len(i.split()))

In [10]:
print(max(eng_l),max(fra_l))

8 14


In [11]:
# function to build a tokenizer
def tokenization(lines):
      tokenizer = Tokenizer()
      tokenizer.fit_on_texts(lines)
      return tokenizer

In [12]:
# prepare english tokenizer
eng_tokenizer = tokenization(fra_eng[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1

eng_length = max(eng_l)
print('English Vocabulary Size: %d' % eng_vocab_size)

English Vocabulary Size: 9165


In [13]:
# prepare French tokenizer
fra_tokenizer = tokenization(fra_eng[:, 1])
fra_vocab_size = len(fra_tokenizer.word_index) + 1

fra_length = max(fra_l)
print('Deutch Vocabulary Size: %d' % fra_vocab_size)

Deutch Vocabulary Size: 21655


In [14]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
         # integer encode sequences
         seq = tokenizer.texts_to_sequences(lines)
         # pad sequences with 0 values
         seq = pad_sequences(seq, maxlen=length, padding='post')
         return seq

## Model Building

In [15]:
from sklearn.model_selection import train_test_split

# split data into train and test set
train, test = train_test_split(fra_eng, test_size=0.1, random_state = 12)

In [16]:
# prepare training data
trainX = encode_sequences(fra_tokenizer, fra_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])

# prepare validation data
testX = encode_sequences(fra_tokenizer, fra_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])

In [17]:
# build NMT model
def define_model(in_vocab,out_vocab, in_timesteps,out_timesteps,units):
      model = Sequential()
      model.add(Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True))
      model.add(LSTM(units))
      model.add(RepeatVector(out_timesteps))
      model.add(LSTM(units, return_sequences=True))
      model.add(Dense(out_vocab, activation='softmax'))
      return model

In [18]:
# model compilation
model = define_model(fra_vocab_size, eng_vocab_size, fra_length, eng_length, 512)

In [19]:
rms = keras.optimizers.RMSprop(lr=0.001)
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy',metrics=['acc'])

  super(RMSprop, self).__init__(name, **kwargs)


In [20]:
# train model
model.fit(trainX, trainY.reshape(trainY.shape[0], trainY.shape[1], 1),
                    epochs=30, batch_size=512, validation_split = 0.2, 
                  verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fd7e4c80810>

## Prediction

In [None]:
preds = model.predict(testX.reshape((testX.shape[0],testX.shape[1])))

In [22]:
new=[]
for i in range(len(preds)):
  new.append(argmax(preds[i],axis=1))


In [23]:
new=np.array(new)
new.shape

(10000, 8)

In [24]:
def get_word(n, tokenizer):
      for word, index in tokenizer.word_index.items():
          if index == n:
              return word
      return None

In [25]:
preds_text = []
for i in new:
       temp = []
       for j in range(len(i)):
            t = get_word(i[j], eng_tokenizer)
            if j > 0:
                if (t == get_word(i[j-1], eng_tokenizer)) or (t == None):
                     temp.append('')
                else:
                     temp.append(t)
            else:
                   if(t == None):
                          temp.append('')
                   else:
                          temp.append(t) 

       preds_text.append(' '.join(temp))

In [26]:
pred_df = pd.DataFrame({'actual' : test[:,0], 'predicted' : preds_text})

In [30]:
# print 15 rows randomly
pred_df.sample(15)

Unnamed: 0,actual,predicted
2510,i just got here last week,i only here night
5436,he works in a bank,he is working in a bank
2975,you never asked why,you never you why
6330,the proof is trivial,the toilet is defective
2390,she loves shopping,she loves shopping
8948,he asked her some questions,he asked out a few
8860,everyone looked puzzled,everyone looked puzzled
5898,tom continued working,tom kept working
4595,we dont have a garden,we dont have a garden
9628,its getting bigger,he is from up


In [28]:
ref=[]
cand=[]
for i in range(len(test)):
    ref.append(test[i][0].split())
    cand.append(preds_text[i].split())

## Calculating BLUE Score

In [29]:
# two references for one document
from nltk.translate.bleu_score import corpus_bleu
score = corpus_bleu(ref, cand)
print(score)

0.5330622548958478


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
