## Language Translator

In [8]:
import nltk
import pandas as pd
import numpy as np
import collections


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from keras.losses import sparse_categorical_crossentropy

from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical

from keras.utils.vis_utils import plot_model
from numpy.random import rand
from numpy.random import shuffle

from keras.models import load_model

from nltk.translate.bleu_score import corpus_bleu

With `nltk` we can download a corpus of sentences that have been translated and aligned between different languages. The example below uses the **English–French** pairs, but feel free to try a different language combination as well.

In [2]:
# Fetch the 'comtrans' aligned-sentence corpus into the local nltk data directory.
nltk.download('comtrans')

[nltk_data] Downloading package comtrans to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package comtrans is already up-to-date!


True

In [3]:
from nltk.corpus import comtrans
# Show the first aligned sentence pair from the English-French alignment file.
print(comtrans.aligned_sents('alignment-en-fr.txt')[0])

<AlignedSent: 'Resumption of the se...' -> 'Reprise de la sessio...'>


In [4]:
# Display the corpus reader object; its repr shows where the corpus is stored.
comtrans

<AlignedCorpusReader in 'C:\\Users\\kevin\\AppData\\Roaming\\nltk_data\\corpora\\comtrans.zip/comtrans/'>

In [5]:
# Number of aligned sentence pairs in the English-French file.
len(comtrans.aligned_sents('alignment-en-fr.txt'))

33334

### Data exploration and retrieval

In [6]:
# Load the first aligned pair once, then show both token lists and the word alignment.
first_pair = comtrans.aligned_sents('alignment-en-fr.txt')[0]
print(first_pair.words)
print(first_pair.mots)
print(first_pair.alignment)

['Resumption', 'of', 'the', 'session']
['Reprise', 'de', 'la', 'session']
0-0 1-1 2-2 3-3


* data retrieval function

In [7]:
def get_words(corpus_text):
    """Load an aligned corpus file and split it into its two languages.

    Parameters
    ----------
    corpus_text : str
        File id passed to ``comtrans.aligned_sents`` (e.g. 'alignment-en-fr.txt').

    Returns
    -------
    tuple of (list, list)
        The ``.words`` token lists and the ``.mots`` token lists of every
        aligned sentence, in corpus order.
    """
    words_side, mots_side = [], []
    for aligned_sentence in comtrans.aligned_sents(corpus_text):
        words_side.append(aligned_sentence.words)
        mots_side.append(aligned_sentence.mots)

    return words_side, mots_side

In [8]:
# Read the corpus once and unpack both sides; calling get_words() separately for
# each language would load and materialise the whole corpus twice.
language_1, language_2 = get_words('alignment-en-fr.txt')

# One row per sentence; each cell holds that sentence's list of tokens.
df_lang_1 = pd.DataFrame({'sentence': language_1})
df_lang_2 = pd.DataFrame({'sentence': language_2})

In [9]:
# Stack both languages into one frame (l1 rows first, then l2) with a fresh 0..n-1 index.
df_lang_all = pd.concat([df_lang_1, df_lang_2], ignore_index=True)

* data cleaning function for dataframes

In [10]:
def preprocess(df):
    """Clean the tokenized sentences stored in ``df['sentence']``.

    Every sentence is a list of string tokens. Tokens for which
    ``str.isalnum()`` is false (punctuation, tokens containing apostrophes or
    hyphens, ...) are dropped, and the remaining tokens are lower-cased.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'sentence' column of token lists.

    Returns
    -------
    pandas.Series
        One cleaned token list per input row.
    """
    def _clean(tokens):
        # Filter on the original token, then lower-case what is kept.
        return [token.lower() for token in tokens if token.isalnum()]

    return df['sentence'].apply(_clean)
# Clean every frame and convert the resulting Series into plain Python lists.
clean_l1, clean_l2, clean_all = (
    preprocess(frame).tolist() for frame in (df_lang_1, df_lang_2, df_lang_all)
)

In [11]:
def listToString(s):
    """Join the strings in ``s`` into one string, separated by single spaces.

    An empty iterable yields the empty string.
    """
    return " ".join(s)

In [12]:
# Turn every cleaned token list back into a space-separated sentence string.
l1 = [listToString(tokens) for tokens in clean_l1]
l2 = [listToString(tokens) for tokens in clean_l2]
lang_all = [listToString(tokens) for tokens in clean_all]

In [13]:
# Pair each l1 sentence with its l2 counterpart: a string array with one row per pair and two columns (l1, l2).
dataset = np.array(list(zip(l1, l2)))

In [14]:
# Peek at the first (l1, l2) sentence pair.
dataset[0, :]

array(['resumption of the session', 'reprise de la session'],
      dtype='<U270')

In [15]:
# (number of sentence pairs, 2)
dataset.shape

(33334, 2)

* reduce size of dataset + train/test split

In [29]:
# reduce dataset size
n_sentences = 5000
n_train = n_sentences * 9 // 10  # 90/10 train/test split (4500 / 500)
# Copy the slice: a basic slice is a view, so shuffling it in place would also
# reorder the first rows of `dataset`.
data_lang = dataset[:n_sentences, :].copy()
# random shuffle (rows are permuted in place); seed the global numpy RNG first
# so the train/test split is the same on every run
np.random.seed(42)
shuffle(data_lang)
# split into train/test
train, test = data_lang[:n_train], data_lang[n_train:]

* word vectorization and padding

In [30]:
def tokenize(x):
    """Create a Keras ``Tokenizer`` and fit it on the texts in ``x``.

    Parameters
    ----------
    x : iterable of str
        Sentences used to build the vocabulary.

    Returns
    -------
    Tokenizer
        The fitted tokenizer.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(x)
    return fitted

In [31]:
# max sentence length
def max_length(lines):
    """Return the largest whitespace-separated token count among ``lines``.

    Raises ValueError (from ``max``) when ``lines`` is empty.
    """
    token_counts = [len(sentence.split()) for sentence in lines]
    return max(token_counts)

In [32]:
# l1 tokenizer: fitted on column 0 of data_lang (all selected sentences, train and test)
l1_tokenizer = tokenize(data_lang[:, 0])
# +1 leaves room for index 0, the value used for padding when sequences are encoded
l1_vocab_size = len(l1_tokenizer.word_index) + 1
# longest l1 sentence, in tokens; used as the padded source length
l1_length = max_length(data_lang[:, 0])
print('English Vocabulary Size: %d' % l1_vocab_size)
print('English Max Length: %d' % (l1_length))

# l2 tokenizer: same steps for column 1 of data_lang
l2_tokenizer = tokenize(data_lang[:, 1])
l2_vocab_size = len(l2_tokenizer.word_index) + 1
l2_length = max_length(data_lang[:, 1])
print('French Vocabulary Size: %d' % l2_vocab_size)
print('French Max Length: %d' % (l2_length))

English Vocabulary Size: 6623
English Max Length: 39
French Vocabulary Size: 8498
French Max Length: 38


In [33]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Integer-encode ``lines`` and pad them to a common length.

    Parameters
    ----------
    tokenizer : fitted tokenizer exposing ``texts_to_sequences``
    length : int
        Value passed to ``pad_sequences`` as ``maxlen``.
    lines : iterable of str
        Sentences to encode.

    Returns
    -------
    The array produced by ``pad_sequences`` with zero padding appended after
    each sequence (``padding='post'``).
    """
    # integer encode sequences
    encoded = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    return pad_sequences(encoded, maxlen=length, padding='post')

In [34]:
def encode_output(sequences, vocab_size):
    """One-hot encode a 2-D array of integer token ids.

    ``to_categorical`` accepts the whole array at once, so no per-sequence
    Python loop is needed. This also avoids holding both a list of per-sequence
    one-hot matrices and the final stacked copy in memory at the same time,
    which matters because the result has ``vocab_size`` values per token.

    Parameters
    ----------
    sequences : numpy.ndarray
        Integer array of shape (n_sequences, sequence_length).
    vocab_size : int
        Number of classes, i.e. the size of the last output axis.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_sequences, sequence_length, vocab_size).
    """
    y = to_categorical(sequences, num_classes=vocab_size)
    # to_categorical drops a trailing axis of size 1, so restore the full
    # 3-D shape explicitly (a no-op in the usual case).
    return y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)

* encode the training and test splits

In [35]:
# Training split: padded integer sequences for the source (column 0),
# one-hot encoded padded sequences for the target (column 1).
trainX = encode_sequences(l1_tokenizer, l1_length, train[:, 0])
trainY = encode_output(
    encode_sequences(l2_tokenizer, l2_length, train[:, 1]),
    l2_vocab_size,
)
# Validation split: encoded exactly like the training split.
testX = encode_sequences(l1_tokenizer, l1_length, test[:, 0])
testY = encode_output(
    encode_sequences(l2_tokenizer, l2_length, test[:, 1]),
    l2_vocab_size,
)

* modeling 

In [36]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Build the (uncompiled) encoder-decoder translation model.

    Parameters
    ----------
    src_vocab : int
        Source vocabulary size (input dimension of the embedding).
    tar_vocab : int
        Target vocabulary size (width of the softmax output).
    src_timesteps : int
        Length of the padded source sequences.
    tar_timesteps : int
        Number of output steps; the encoder output is repeated this many times.
    n_units : int
        Embedding dimension and number of units in both LSTM layers.

    Returns
    -------
    Sequential
        Embedding -> LSTM -> RepeatVector -> LSTM (sequences) -> per-step softmax.
    """
    layer_stack = [
        # index 0 is treated as padding and masked out
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        # encoder: only the final output is kept
        LSTM(n_units),
        # feed the encoder output to every decoder step
        RepeatVector(tar_timesteps),
        # decoder: emits one vector per target step
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    return Sequential(layer_stack)

In [10]:
# define model (256 = embedding size and LSTM units)
model = define_model(l1_vocab_size, l2_vocab_size, l1_length, l2_length, 256)

# categorical_crossentropy matches the one-hot encoded targets
model.compile(optimizer='adam', loss='categorical_crossentropy',  metrics=['accuracy'])
# summarize defined model
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
model.summary()

NameError: name 'define_model' is not defined

In [39]:
# fit model
filename = 'model.h5'
# Save the model to `filename` only when val_loss improves on the best value seen so far.
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# NOTE(review): the test split is used as validation data, so it also drives checkpoint selection.
model.fit(trainX, trainY, epochs=17, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

Epoch 1/17
71/71 - 76s - loss: 3.3942 - val_loss: 3.5311

Epoch 00001: val_loss improved from inf to 3.53113, saving model to model.h5
Epoch 2/17
71/71 - 76s - loss: 3.3736 - val_loss: 3.5434

Epoch 00002: val_loss did not improve from 3.53113
Epoch 3/17
71/71 - 76s - loss: 3.3460 - val_loss: 3.5117

Epoch 00003: val_loss improved from 3.53113 to 3.51166, saving model to model.h5
Epoch 4/17
71/71 - 76s - loss: 3.3206 - val_loss: 3.4947

Epoch 00004: val_loss improved from 3.51166 to 3.49472, saving model to model.h5
Epoch 5/17
71/71 - 77s - loss: 3.3038 - val_loss: 3.5063

Epoch 00005: val_loss did not improve from 3.49472
Epoch 6/17
71/71 - 75s - loss: 3.2959 - val_loss: 3.5047

Epoch 00006: val_loss did not improve from 3.49472
Epoch 7/17
71/71 - 77s - loss: 3.2899 - val_loss: 3.4991

Epoch 00007: val_loss did not improve from 3.49472
Epoch 8/17
71/71 - 75s - loss: 3.2795 - val_loss: 3.4999

Epoch 00008: val_loss did not improve from 3.49472
Epoch 9/17
71/71 - 76s - loss: 3.2643 - va

<tensorflow.python.keras.callbacks.History at 0x21283b43670>

* Model evaluation

In [7]:
# Example English sentence. NOTE(review): no cell visible in this section uses this variable.
source ='the quick brown fox jumps over the lazy dog'

In [3]:
# load model
# 'model.h5' is the same filename the ModelCheckpoint callback writes to during training.
model = load_model('model.h5')

In [4]:
# map an integer to a word
def word_for_id(integer, tokenizer):
    """Look up the word whose index in ``tokenizer.word_index`` equals ``integer``.

    Returns the first matching word, or None when no word has that index.
    """
    matching_words = (
        word
        for word, index in tokenizer.word_index.items()
        if index == integer
    )
    return next(matching_words, None)

In [5]:
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    """Translate one encoded source sequence into a target-language string.

    Parameters
    ----------
    model : object with ``predict(source, verbose=0)``
        Only the first item of the returned batch is used; it is expected to
        hold one score vector per output step.
    tokenizer : object with a ``word_index`` mapping (word -> integer index)
        Tokenizer of the target language.
    source : array
        Encoded source sequence, already shaped as a batch.

    Returns
    -------
    str
        Predicted words joined by single spaces. Decoding stops at the first
        predicted index that has no word in ``tokenizer.word_index``.
    """
    prediction = model.predict(source, verbose=0)[0]
    # Highest-scoring vocabulary index at every output step. The bare name
    # `argmax` is never imported in this notebook, so use numpy's explicitly.
    integers = np.argmax(prediction, axis=-1)
    # Build the reverse lookup once instead of scanning the vocabulary per word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    target = []
    for i in integers:
        word = index_to_word.get(int(i))
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

In [9]:
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    """Translate every encoded source and report corpus BLEU scores.

    Parameters
    ----------
    model : trained translation model, passed to ``predict_sequence``
    tokenizer : tokenizer of the target language, used to decode predictions
    sources : array
        Encoded source sequences, one row per sentence (e.g. ``trainX``/``testX``).
    raw_dataset : array
        Raw text rows aligned with ``sources``. Each row is
        (source sentence, target sentence), the column order used when
        ``dataset`` is built from ``zip(l1, l2)``.

    Prints the first 10 translations and the BLEU-1..4 scores; returns None.
    """
    actual, predicted = [], []
    for i, source in enumerate(sources):
        # translate encoded source text (add the batch dimension first)
        source = source.reshape((1, source.shape[0]))
        # Use the tokenizer that was passed in; the previous code referenced an
        # undefined global `eng_tokenizer`.
        translation = predict_sequence(model, tokenizer, source)
        # Column 0 is the source text and column 1 the reference translation.
        raw_src, raw_target = raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        # corpus_bleu expects a list of reference translations per hypothesis
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    # NOTE(review): these weights sum to 0.9 rather than 1 — confirm this is intended.
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))