## Language Translator

In [5]:
import nltk
import pandas as pd
import numpy as np
import collections


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from keras.losses import sparse_categorical_crossentropy

from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical

from keras.utils.vis_utils import plot_model
from numpy.random import rand
from numpy.random import shuffle

from keras.models import load_model

from nltk.translate.bleu_score import corpus_bleu

from numpy import argmax

import re

From `nltk` we can download translated sentences between different languages. You can see the example between **English and French** below but feel free to try different combination as well.

In [6]:
from nltk.corpus import comtrans
print(comtrans.aligned_sents('alignment-en-fr.txt')[0])

<AlignedSent: 'Resumption of the se...' -> 'Reprise de la sessio...'>


### Data exploration and retrival

In [7]:
print(comtrans.aligned_sents('alignment-en-fr.txt')[0].words)
print(comtrans.aligned_sents('alignment-en-fr.txt')[0].mots)
print(comtrans.aligned_sents('alignment-en-fr.txt')[0].alignment)

['Resumption', 'of', 'the', 'session']
['Reprise', 'de', 'la', 'session']
0-0 1-1 2-2 3-3


* data retrival function

In [8]:
def get_words(corpus_text):
    corpus = comtrans.aligned_sents(corpus_text)
    language_1 = [i.words for i in corpus]
    language_2 = [i.mots for i in corpus]
    return language_1, language_2

In [9]:
language_1 = get_words('alignment-en-fr.txt')[0]
language_2 = get_words('alignment-en-fr.txt')[1]

df_lang_1 = pd.DataFrame({'sentence':language_1})
df_lang_2 = pd.DataFrame({'sentence':language_2})

In [10]:
df_lang_all = pd.concat([df_lang_1, df_lang_2], ignore_index=True)

* data cleaning function for dataframes

In [11]:
def preprocess(df):
    
    #remove all tokens that are not alphanumeric
    words = df['sentence'].apply(lambda x: [i for i in x if i.isalnum()])
    
    # STEP 3: convert text to lowercase
    words = words.apply(lambda x: [i.lower() for i in x])
    
    return words
clean_l1 = preprocess(df_lang_1).tolist()
clean_l2 = preprocess(df_lang_2).tolist()
clean_all = preprocess(df_lang_all).tolist()

In [12]:
def listToString(s): 
    
    # initialize an empty string
    str1 = " " 
    
    # return string  
    return (str1.join(s))

In [13]:
l1 = list(map(listToString, clean_l1))
l2 = list(map(listToString, clean_l2))
lang_all = list(map(listToString, clean_all))

* filtering for long sentences

In [14]:
# using regex (findall()) function
res = len(re.findall(r'\w+', l1[3]))
# total no of words
print ("The number of words in string are : " + str(res))

The number of words in string are : 8


In [15]:
def filter_sentence_length(sentences_l1, sentences_l2):
    filtered_sentences_l1 = []
    filtered_sentences_l2 = []
    for i  in range(len(l1)): 
        if 0 <= len(re.findall(r'\w+', sentences_l1[i])) <= 20 and 0 <= len(re.findall(r'\w+', sentences_l2[i])) <= 20:
            filtered_sentences_l1.append(sentences_l1[i])
            filtered_sentences_l2.append(sentences_l2[i])
            
    return filtered_sentences_l1,filtered_sentences_l2

In [16]:
data_l1, data_l2 = filter_sentence_length(l1, l2)
print("# Filtered Corpora length (i.e. number of sentences)")
print(len(data_l1))
assert len(data_l1) == len(data_l2)

# Filtered Corpora length (i.e. number of sentences)
17748


* combined l1 and l2

In [17]:
dataset = np.array(list(zip(data_l1, data_l2)))

* reduce size of dataset + train/test split

In [18]:
# reduce dataset size
n_sentences = int(6500)
split_size = int(n_sentences * 0.8)

data_lang = dataset[:n_sentences, :]
# random shuffle
shuffle(data_lang)

# split into train/test
train, test = data_lang[:split_size], data_lang[split_size:]

* word vectorization and padding

In [19]:
def tokenize(x):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    return tokenizer

In [20]:
# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

In [None]:
#l1 tokenizer
l1_tokenizer = tokenize(data_lang[:, 0])
l1_vocab_size = len(l1_tokenizer.word_index) + 1
l1_length = max_length(data_lang[:, 0])
print('English Vocabulary Size: %d' % l1_vocab_size)
print('English Max Length: %d' % (l1_length))

# l2 tokenizer
l2_tokenizer = tokenize(data_lang[:, 1])
l2_vocab_size = len(l2_tokenizer.word_index) + 1
l2_length = max_length(data_lang[:, 1])
print('French Vocabulary Size: %d' % l2_vocab_size)
print('French Max Length: %d' % (l2_length))

In [21]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

In [22]:
def encode_output(sequences, vocab_size):
	ylist = []
	for sequence in sequences:
		encoded = to_categorical(sequence, num_classes=vocab_size)
		ylist.append(encoded)
	y = np.array(ylist)
	y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
	return y

* populate training and splitting

In [None]:
# prepare training data
trainX = encode_sequences(l1_tokenizer, l1_length, train[:, 0])
trainY = encode_sequences(l2_tokenizer, l2_length, train[:, 1])
#trainY = encode_output(trainY, l2_vocab_size)

# prepare validation data
testX = encode_sequences(l1_tokenizer, l1_length, test[:, 0])
testY = encode_sequences(l2_tokenizer, l2_length, test[:, 1])
#testY = encode_output(testY, l2_vocab_size)

In [None]:
from tensorflow.keras.utils import Sequence

class SentenceGenerator (Sequence):
    def __init__(self, X, y, vocab_size, batch_size, to_fit=True):
        self.X = X
        self.y = y
        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.on_epoch_end()
        self.to_fit = to_fit
        
        #self.shuffle = shuffle
    def __len__(self):
        return len(self.y) // self.batch_size
    
    def __getitem__(self, item):
        X = self.X[item * self.batch_size : (item + 1) * self.batch_size]
        if self.to_fit:
            y = self.y[item * self.batch_size : (item + 1) * self.batch_size]
            y = encode_output(y, self.vocab_size)
            return X, y
        
        else:
            return X


In [None]:
testdataset = SentenceGenerator(testX, testY, l2_vocab_size, 32)
traindataset = SentenceGenerator(trainX, trainY, l2_vocab_size, 32)

* modeling 

In [None]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(Dropout(0.2))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model

In [None]:
# define model
model = define_model(l1_vocab_size, l2_vocab_size, l1_length, l2_length, 256)

model.compile(optimizer='adam', loss='categorical_crossentropy',  metrics=['accuracy'])
# summarize defined model
print(model.summary())

In [None]:
# fit model
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(traindataset, validation_data=testdataset, epochs=17, batch_size=64, callbacks=[checkpoint], verbose=2)

## Model evaluation

* creating source and doing preprocessing

In [23]:
data_lang_test = dataset[5000:5010, :]
# random shuffle
shuffle(data_lang_test)
# split into train/test
train_val, test_val = data_lang_test[:1], data_lang_test[:2]

In [24]:
#l1 tokenizer
l1_tokenizer = tokenize(data_lang_test[:, 0])
l1_vocab_size = len(l1_tokenizer.word_index) + 1
l1_length = max_length(data_lang_test[:, 0])

# l2 tokenizer
l2_tokenizer = tokenize(data_lang_test[:, 1])
l2_vocab_size = len(l2_tokenizer.word_index) + 1
l2_length = max_length(data_lang_test[:, 1])

# prepare data
trainX = encode_sequences(l1_tokenizer, l1_length, train_val[:, 0])
testX = encode_sequences(l1_tokenizer, l1_length, test_val[:, 0])

# load model
model = load_model('model.h5') 

* evaluation

In [27]:
# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

In [28]:
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	prediction = model.predict(source, verbose=0)[0]
	integers = [argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

In [50]:
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, l2_tokenizer, source)
        raw_src, raw_target = raw_dataset[i]
        print(raw_dataset)
        print(raw_scr)
        print(raw_targe)
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
            actual.append([raw_target.split()])
            predicted.append(translation.split())
	# calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [62]:
# evaluate the skill of the model
def evaluate_model1(model, tokenizer, sources, raw_dataset):
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text
        
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, l2_tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        print(raw_dataset)
        print(raw_src)
        print(raw_target)
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
            actual.append([raw_target.split()])
            predicted.append(translation.split())
	# calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [52]:
# test on some training sequences
print('train')
evaluate_model(model, l2_tokenizer, trainX, train_val)

train
[['these are the modern means to be used'
  'nous allons utiliser tous ces moyens modernes']]
src=[these are the modern means to be used], target=[nous allons utiliser tous ces moyens modernes], predicted=[allons des des de de]
BLEU-1: 0.134064
BLEU-2: 0.000000
BLEU-3: 0.000000
BLEU-4: 0.000000


In [63]:
# test on some training sequences
print('train')
evaluate_model1(model, l2_tokenizer, trainX, train_val)

train
[['these are the modern means to be used'
  'nous allons utiliser tous ces moyens modernes']]
nous allons utiliser tous ces moyens modernes
these are the modern means to be used
src=[nous allons utiliser tous ces moyens modernes], target=[these are the modern means to be used], predicted=[allons des des de de]
BLEU-1: 0.000000
BLEU-2: 0.000000
BLEU-3: 0.000000
BLEU-4: 0.000000


In [53]:
# test on some test sequences
print('test')
evaluate_model(model, l2_tokenizer, testX, test_val)

test
[['these are the modern means to be used'
  'nous allons utiliser tous ces moyens modernes']
 ['it is shameful' 'est une honte']]
src=[these are the modern means to be used], target=[nous allons utiliser tous ces moyens modernes], predicted=[allons des des de de]
[['these are the modern means to be used'
  'nous allons utiliser tous ces moyens modernes']
 ['it is shameful' 'est une honte']]
src=[it is shameful], target=[est une honte], predicted=[une des]
BLEU-1: 0.186125
BLEU-2: 0.000000
BLEU-3: 0.000000
BLEU-4: 0.000000
