## Language Translator

In [28]:
import nltk
import pandas as pd
import numpy as np
import collections


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from keras.losses import sparse_categorical_crossentropy

from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical

from keras.utils.vis_utils import plot_model
from numpy.random import rand
from numpy.random import shuffle

from keras.models import load_model

from nltk.translate.bleu_score import corpus_bleu

from numpy import argmax


From `nltk` we can download translated sentences between different languages. You can see the example between **English and French** below but feel free to try different combination as well.

In [29]:
nltk.download('comtrans')

[nltk_data] Downloading package comtrans to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package comtrans is already up-to-date!


True

In [30]:
from nltk.corpus import comtrans
print(comtrans.aligned_sents('alignment-en-fr.txt')[0])

<AlignedSent: 'Resumption of the se...' -> 'Reprise de la sessio...'>


In [31]:
comtrans

<AlignedCorpusReader in 'C:\\Users\\kevin\\AppData\\Roaming\\nltk_data\\corpora\\comtrans.zip/comtrans/'>

In [32]:
len(comtrans.aligned_sents('alignment-en-fr.txt'))

33334

### Data exploration and retrival

In [33]:
print(comtrans.aligned_sents('alignment-en-fr.txt')[0].words)
print(comtrans.aligned_sents('alignment-en-fr.txt')[0].mots)
print(comtrans.aligned_sents('alignment-en-fr.txt')[0].alignment)

['Resumption', 'of', 'the', 'session']
['Reprise', 'de', 'la', 'session']
0-0 1-1 2-2 3-3


* data retrival function

In [34]:
def get_words(corpus_text):
    corpus = comtrans.aligned_sents(corpus_text)
    language_1 = [i.words for i in corpus]
    language_2 = [i.mots for i in corpus]
    
    return language_1, language_2

In [35]:
language_1 = get_words('alignment-en-fr.txt')[0]
language_2 = get_words('alignment-en-fr.txt')[1]
    
df_lang_1 = pd.DataFrame({'sentence':language_1})
df_lang_2 = pd.DataFrame({'sentence':language_2})

In [36]:
df_lang_all = pd.concat([df_lang_1, df_lang_2], ignore_index=True)

* data cleaning function for dataframes

In [37]:
def preprocess(df):
    
    #remove all tokens that are not alphanumeric
    words = df['sentence'].apply(lambda x: [i for i in x if i.isalnum()])
    
    # STEP 3: convert text to lowercase
    words = words.apply(lambda x: [i.lower() for i in x])
    
    return words
clean_l1 = preprocess(df_lang_1).tolist()
clean_l2 = preprocess(df_lang_2).tolist()
clean_all = preprocess(df_lang_all).tolist()

In [38]:
def listToString(s): 
    
    # initialize an empty string
    str1 = " " 
    
    # return string  
    return (str1.join(s))

In [84]:
l1 = list(map(listToString, clean_l1))
l2 = list(map(listToString, clean_l2))
lang_all = list(map(listToString, clean_all))

'i declare resumed the session of the european parliament adjourned on friday 17 december 1999 and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period'

* decrease shape of sentences, DOES NOT WORK YET

In [89]:
def filter_sentence_length(sentences_l1, sentences_l2, min_len=0, max_len=20):
    filtered_sentences_l1 = []
    filtered_sentences_l2 = []
    for i in range(len(sentences_l1)):
        if min_len <= len(sentences_l1[i]) <= max_len and min_len <= len(sentences_l2[i]) <= max_len:
            filtered_sentences_l1.append(sentences_l1[i])
            filtered_sentences_l2.append(sentences_l2[i])
            return filtered_sentences_l1, filtered_sentences_l2

In [90]:
filt_clean_sen_l1, filt_clean_sen_l2 = filter_sentence_length(l1, l2)
print("# Filtered Corpora length (i.e. number of sentences)")
print(len(filt_clean_sen_l1))
assert len(filt_clean_sen_l1) == len(filt_clean_sen_l2)

# Filtered Corpora length (i.e. number of sentences)
1


* combined l1 and l2

In [40]:
dataset = np.array(list(zip(l1, l2)))

* reduce size of dataset + train/test split

In [43]:
# reduce dataset size
n_sentences = int(7000)
split_size = int(n_sentences * 0.8)

data_lang = dataset[:n_sentences, :]
# random shuffle
shuffle(data_lang)
# split into train/test
train, test = data_lang[:split_size], data_lang[split_size:]

* word vectorization and padding

In [44]:
def tokenize(x):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    return tokenizer

In [45]:
# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

In [46]:
#l1 tokenizer
l1_tokenizer = tokenize(data_lang[:, 0])
l1_vocab_size = len(l1_tokenizer.word_index) + 1
l1_length = max_length(data_lang[:, 0])
print('English Vocabulary Size: %d' % l1_vocab_size)
print('English Max Length: %d' % (l1_length))

# l2 tokenizer
l2_tokenizer = tokenize(data_lang[:, 1])
l2_vocab_size = len(l2_tokenizer.word_index) + 1
l2_length = max_length(data_lang[:, 1])
print('French Vocabulary Size: %d' % l2_vocab_size)
print('French Max Length: %d' % (l2_length))

English Vocabulary Size: 7685
English Max Length: 39
French Vocabulary Size: 10110
French Max Length: 38


In [69]:
from tempfile import mkdtemp
import os.path as path
filename = path.join(mkdtemp(), 'newfile.dat

SyntaxError: EOL while scanning string literal (<ipython-input-69-6e9a920945cd>, line 3)

In [47]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

In [48]:
def encode_output(sequences, vocab_size):
	ylist = []
	for sequence in sequences:
		encoded = to_categorical(sequence, num_classes=vocab_size)
		ylist.append(encoded)
	y = np.array(ylist)
	y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
	return y

* populate training and splitting

In [49]:
# prepare training data
trainX = encode_sequences(l1_tokenizer, l1_length, train[:, 0])
trainY = encode_sequences(l2_tokenizer, l2_length, train[:, 1])
trainY = encode_output(trainY, l2_vocab_size)
# prepare validation data
testX = encode_sequences(l1_tokenizer, l1_length, test[:, 0])
testY = encode_sequences(l2_tokenizer, l2_length, test[:, 1])
testY = encode_output(testY, l2_vocab_size)

* modeling 

In [66]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(Dropout(0.2))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model

In [67]:
# define model
model = define_model(l1_vocab_size, l2_vocab_size, l1_length, l2_length, 256)

model.compile(optimizer='adam', loss='categorical_crossentropy',  metrics=['accuracy'])
# summarize defined model
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 39, 256)           1967360   
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
repeat_vector (RepeatVector) (None, 38, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 38, 256)           525312    
_________________________________________________________________
dropout_1 (Dropout)          (None, 38, 256)           0         
_________________________________________________________________
time_distributed (TimeDistri (None, 38, 10110)        

In [68]:
# fit model
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=17, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

Epoch 1/17
88/88 - 120s - loss: 4.7104 - accuracy: 0.5077 - val_loss: 3.9638 - val_accuracy: 0.5135

Epoch 00001: val_loss improved from inf to 3.96382, saving model to model.h5
Epoch 2/17
88/88 - 108s - loss: 3.8689 - accuracy: 0.5136 - val_loss: 3.7632 - val_accuracy: 0.5134

Epoch 00002: val_loss improved from 3.96382 to 3.76317, saving model to model.h5
Epoch 3/17
88/88 - 117s - loss: 3.6557 - accuracy: 0.5136 - val_loss: 3.6354 - val_accuracy: 0.5131

Epoch 00003: val_loss improved from 3.76317 to 3.63542, saving model to model.h5
Epoch 4/17
88/88 - 115s - loss: 3.5655 - accuracy: 0.5135 - val_loss: 3.5484 - val_accuracy: 0.5138

Epoch 00004: val_loss improved from 3.63542 to 3.54839, saving model to model.h5
Epoch 5/17
88/88 - 115s - loss: 3.5058 - accuracy: 0.5185 - val_loss: 3.5227 - val_accuracy: 0.5211

Epoch 00005: val_loss improved from 3.54839 to 3.52275, saving model to model.h5
Epoch 6/17
88/88 - 111s - loss: 3.4649 - accuracy: 0.5237 - val_loss: 3.5271 - val_accuracy: 0

<tensorflow.python.keras.callbacks.History at 0x270f37e1940>

## Model evaluation

* creating source and doing preprocessing

In [7]:
source ='the quick brown fox jumps over the lazy dog'

In [70]:
# load model
model = load_model('model.h5')

In [71]:
# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

In [72]:
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	prediction = model.predict(source, verbose=0)[0]
	integers = [argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

In [73]:
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		# translate encoded source text
		source = source.reshape((1, source.shape[0]))
		translation = predict_sequence(model, l2_tokenizer, source)
		raw_target, raw_src = raw_dataset[i]
		if i < 10:
			print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
		actual.append([raw_target.split()])
		predicted.append(translation.split())
	# calculate BLEU score
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

* loading some clean data

In [74]:
data_lang_test = dataset[5000:5010, :]
# random shuffle
shuffle(data_lang_test)
# split into train/test
train, test = data_lang_test[:1], data_lang_test[1:]

In [75]:
#l1 tokenizer
l1_tokenizer = tokenize(data_lang_test[:, 0])
l1_vocab_size = len(l1_tokenizer.word_index) + 1
l1_length = max_length(data_lang_test[:, 0])

# l2 tokenizer
l2_tokenizer = tokenize(data_lang_test[:, 1])
l2_vocab_size = len(l2_tokenizer.word_index) + 1
l2_length = max_length(data_lang_test[:, 1])

# prepare data
trainX = encode_sequences(l1_tokenizer, l1_length, train[:, 1])
testX = encode_sequences(l1_tokenizer, l1_length, test[:, 1])

# load model
model = load_model('model.h5') 

In [76]:
# test on some training sequences
print('train')
evaluate_model(model, l2_tokenizer, trainX, train)

train
src=[la cohérence en matière application le risque une application incohérente des règles de concurrence doit être gardé à esprit mais je crois il ne doit pas être exagéré], target=[the question of consistent application the risk of inconsistent application of competition rules must be borne in mind but i do not believe that it should be exaggerated], predicted=[une]
BLEU-1: 0.000000
BLEU-2: 0.000000
BLEU-3: 0.000000
BLEU-4: 0.000000


In [77]:
# test on some test sequences
print('test')
evaluate_model(model, l2_tokenizer, testX, test)

test
src=[récemment la commission a manifesté un intérêt renouvelé pour un impôt global comme la taxe tobin en tant que moyen de parvenir à une mondialisation on pourrait qualifier de socialement acceptable], target=[the commission recently observed renewed interest in a global tax such as the tobin tax as a means of achieving socially responsible globalisation so to speak], predicted=[une la]
src=[nombre des points généraux que le rapport de notre commission soulève ont été traités dans le rapport de mme van der laan], target=[many of the general points raised in our committee report have been covered in mrs van der laan report], predicted=[une la]
src=[la parole est à patten au nom de la commission], target=[mr patten has the floor on behalf of the commission], predicted=[une]
src=[la commission ne pense pas que les amendements puissent améliorer équilibre sans créer des tensions en même temps], target=[the commission does not consider that the amendments would improve the balance wi