## Train Neural Translation Model

In [None]:
# Mount Google Drive at /content/drive (this cell only works inside a Colab runtime)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from pickle import load
from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import RepeatVector
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.callbacks import ModelCheckpoint
from keras.models import load_model
from numpy import argmax
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

In [None]:
# load a clean dataset
def load_clean_sentences(filename):
	"""Unpickle and return the object stored in ``filename``.

	Args:
		filename: path to a pickle file.

	Returns:
		Whatever object was pickled into the file.

	Raises:
		OSError: if the file cannot be opened.
	"""
	# NOTE: unpickling can execute arbitrary code; only load files you trust.
	# The context manager closes the handle even if unpickling fails.
	with open(filename, 'rb') as handle:
		return load(handle)

In [None]:
# Directory holding the pickled sentence files (relative to the working directory)
loc = 'drive/MyDrive/nmt_test/'

# load datasets: the combined set plus the separate train and test splits
dataset = load_clean_sentences(loc + 'english-nepali-both.pkl')
train = load_clean_sentences(loc + 'english-nepali-train.pkl')
test = load_clean_sentences(loc + 'english-nepali-test.pkl')

In [None]:
# fit a tokenizer
def create_tokenizer(lines):
	"""Build a fresh ``Tokenizer`` and fit it on ``lines``.

	Args:
		lines: the texts to fit on.

	Returns:
		The fitted ``Tokenizer`` instance.
	"""
	fitted = Tokenizer()
	fitted.fit_on_texts(lines)
	return fitted

In [None]:
# max sentence length
def max_length(lines):
	"""Return the largest whitespace-separated word count found in ``lines``.

	Args:
		lines: iterable of strings.

	Returns:
		The word count of the longest line.

	Raises:
		ValueError: if ``lines`` is empty.
	"""
	word_counts = [len(text.split()) for text in lines]
	return max(word_counts)

In [None]:
# prepare english tokenizer (column 0 of the combined dataset holds the English sentences)
eng_tokenizer = create_tokenizer(dataset[:, 0])
# +1 presumably because word indices start at 1 and 0 is kept for padding — TODO confirm
eng_vocab_size = len(eng_tokenizer.word_index) + 1
# longest English sentence, measured in whitespace-separated words
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare nepali tokenizer (column 1 of the combined dataset holds the Nepali sentences)
nep_tokenizer = create_tokenizer(dataset[:, 1])
nep_vocab_size = len(nep_tokenizer.word_index) + 1
# longest Nepali sentence, measured in whitespace-separated words
nep_length = max_length(dataset[:, 1])
print('Nepali Vocabulary Size: %d' % nep_vocab_size)
print('Nepali Max Length: %d' % (nep_length))

English Vocabulary Size: 9285
English Max Length: 20
Nepali Vocabulary Size: 17085
Nepali Max Length: 20


In [None]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	"""Convert ``lines`` to integer sequences and pad them to ``length``.

	Args:
		tokenizer: fitted tokenizer providing ``texts_to_sequences``.
		length: value passed to ``pad_sequences`` as ``maxlen``.
		lines: the texts to encode.

	Returns:
		The result of ``pad_sequences`` with padding (0 values) added after
		each sequence.
	"""
	return pad_sequences(
		tokenizer.texts_to_sequences(lines),
		maxlen=length,
		padding='post',
	)

In [None]:
# one hot encode target sequence
def encode_output(sequences, vocab_size):
	"""One-hot encode every integer sequence in ``sequences``.

	Args:
		sequences: 2-D array of integer-encoded sequences (its first two
			``shape`` entries are used for the final reshape).
		vocab_size: number of classes for the one-hot encoding.

	Returns:
		Array reshaped to ``(sequences.shape[0], sequences.shape[1], vocab_size)``.
	"""
	one_hot_rows = [to_categorical(row, num_classes=vocab_size) for row in sequences]
	n_samples, n_steps = sequences.shape[0], sequences.shape[1]
	return array(one_hot_rows).reshape(n_samples, n_steps, vocab_size)

In [None]:
# prepare training data: Nepali (column 1) is the model input, English (column 0) the target
trainX = encode_sequences(nep_tokenizer, nep_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
# targets are one-hot encoded over the English vocabulary
trainY = encode_output(trainY, eng_vocab_size)
# prepare validation data (encoded the same way from the test split)
testX = encode_sequences(nep_tokenizer, nep_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)
# display one raw training pair as a sanity check
train[1]

array(['epidemiological data revealed retrospectively that the index case of sars had a contact history with game animals',
       'महामारी सम्बन्धी डाटाले पूर्वव्यापी ढङ्गमा sars को सूचक केसमा खेल पशुहरूसँग सम्पर्क इतिहास भएको कुरा खुलासा गर्\u200dयो।'],
      dtype='<U1491')

In [None]:
# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units, embedding_dim=128):
	"""Build the (uncompiled) sequence-to-sequence translation model.

	Layers, in order: Embedding -> LSTM -> RepeatVector -> LSTM (returning
	sequences) -> TimeDistributed softmax Dense.

	Args:
		src_vocab: source vocabulary size (Embedding input dimension).
		tar_vocab: target vocabulary size (width of the softmax output).
		src_timesteps: source sequence length fed to the Embedding layer.
		tar_timesteps: number of times the encoder output is repeated, i.e.
			the number of output timesteps.
		n_units: number of units in each of the two LSTM layers.
		embedding_dim: Embedding output width. Defaults to 128, the value
			that was previously hard-coded.

	Returns:
		The assembled ``Sequential`` model.
	"""
	model = Sequential()
	# mask_zero=True marks index 0 (the padding value) as masked for later layers
	model.add(Embedding(src_vocab, embedding_dim, input_length=src_timesteps, mask_zero=True))
	# encoder: only the final LSTM output is kept
	model.add(LSTM(n_units))
	# repeat the encoder output once per target timestep
	model.add(RepeatVector(tar_timesteps))
	# decoder: emit one vector per target timestep
	model.add(LSTM(n_units, return_sequences=True))
	# per-timestep softmax over the target vocabulary
	model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
	return model

# define model: Nepali is the source side, English the target side, 256 units per LSTM
model = define_model(nep_vocab_size, eng_vocab_size, nep_length, eng_length, 256)
# targets are one-hot encoded, hence categorical cross-entropy
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model (left disabled)
# print(model.summary())
# plot_model(model, to_file='model.png', show_shapes=True)

In [None]:
# fit model
# Checkpoint file: rewritten only when the validation loss reaches a new minimum
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# NOTE(review): the test split is used as validation data here, so the checkpointed
# "best" model is selected on the same data it is later evaluated on.
model.fit(trainX, trainY, epochs=500, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

Epoch 1/500
83/83 - 20s - loss: 5.5600 - val_loss: 4.9666

Epoch 00001: val_loss improved from inf to 4.96661, saving model to model.h5
Epoch 2/500
83/83 - 6s - loss: 4.7489 - val_loss: 4.8276

Epoch 00002: val_loss improved from 4.96661 to 4.82763, saving model to model.h5
Epoch 3/500
83/83 - 6s - loss: 4.6165 - val_loss: 4.7543

Epoch 00003: val_loss improved from 4.82763 to 4.75429, saving model to model.h5
Epoch 4/500
83/83 - 6s - loss: 4.6213 - val_loss: 4.7638

Epoch 00004: val_loss did not improve from 4.75429
Epoch 5/500
83/83 - 6s - loss: 4.4544 - val_loss: 4.6634

Epoch 00005: val_loss improved from 4.75429 to 4.66340, saving model to model.h5
Epoch 6/500
83/83 - 6s - loss: 4.3055 - val_loss: 4.6791

Epoch 00006: val_loss did not improve from 4.66340
Epoch 7/500
83/83 - 6s - loss: 4.2549 - val_loss: 4.6772

Epoch 00007: val_loss did not improve from 4.66340
Epoch 8/500
83/83 - 6s - loss: 4.2172 - val_loss: 4.6796

Epoch 00008: val_loss did not improve from 4.66340
Epoch 9/500

<keras.callbacks.History at 0x7f2ba0326a50>

In [None]:
# Rebuild the datasets, tokenizers and encoded inputs needed for evaluation.
# NOTE(review): this repeats the earlier loading/tokenizing steps, presumably so the
# evaluation cells can be run without re-running training — confirm.
loc = 'drive/MyDrive/nmt_test/'

# load datasets
dataset = load_clean_sentences(loc + 'english-nepali-both.pkl')
train = load_clean_sentences(loc + 'english-nepali-train.pkl')
test = load_clean_sentences(loc + 'english-nepali-test.pkl')

# prepare english tokenizer (fitted on column 0 of the combined dataset)
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])

# prepare nepali tokenizer (fitted on column 1 of the combined dataset)
nep_tokenizer = create_tokenizer(dataset[:, 1])
nep_vocab_size = len(nep_tokenizer.word_index) + 1
nep_length = max_length(dataset[:, 1])

# prepare data: only the Nepali inputs are encoded; the raw English text is used as reference later
trainX = encode_sequences(nep_tokenizer, nep_length, train[:, 1])
testX = encode_sequences(nep_tokenizer, nep_length, test[:, 1])

In [None]:
# load model from 'model.h5', the file name the training checkpoint writes to
model = load_model('model.h5')

In [None]:
# Predict on the whole encoded test set in one call.
# NOTE(review): `translation` is not read again in the visible cells — confirm it is still needed.
translation = model.predict(testX, verbose=0)

In [None]:
# map an integer to a word
def word_for_id(integer, tokenizer):
	"""Return the word whose tokenizer index equals ``integer``.

	Args:
		integer: the word index to look up.
		tokenizer: object exposing a ``word_index`` mapping (word -> index)
			and, optionally, the reverse ``index_word`` mapping.

	Returns:
		The matching word, or ``None`` when no word has that index.
	"""
	# Prefer the reverse map when the tokenizer provides one: a single dict
	# lookup instead of scanning the whole vocabulary on every call.
	reverse_index = getattr(tokenizer, 'index_word', None)
	if reverse_index:
		return reverse_index.get(integer)
	# Fallback for tokenizers without a (populated) reverse map.
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

In [None]:
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	"""Translate one encoded source sequence into a space-joined string.

	Args:
		model: object whose ``predict(source, verbose=0)`` output is indexed
			at ``[0]`` to obtain one score vector per timestep.
		tokenizer: tokenizer passed to ``word_for_id`` for index -> word lookup.
		source: the encoded source, passed to ``model.predict`` unchanged.

	Returns:
		The decoded words joined by single spaces. Decoding stops at the
		first timestep whose best index has no word.
	"""
	step_scores = model.predict(source, verbose=0)[0]
	words = []
	for scores in step_scores:
		# pick the highest-scoring index at this timestep
		decoded = word_for_id(argmax(scores), tokenizer)
		if decoded is None:
			break
		words.append(decoded)
	return ' '.join(words)

In [None]:
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
	"""Translate every source sequence and print corpus BLEU-1..4.

	The first 10 (source, target, prediction) triples are printed as a
	qualitative check before the scores.

	Args:
		model: the trained translation model.
		tokenizer: target-language tokenizer used to decode predictions.
		sources: encoded source sequences, one row per sentence.
		raw_dataset: raw text pairs aligned with ``sources``; element ``[i][0]``
			is the reference target and ``[i][1]`` the source sentence.

	Returns:
		None. Results are printed.
	"""
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		# translate encoded source text (reshaped to a batch of one)
		source = source.reshape((1, source.shape[0]))
		# decode with the tokenizer that was passed in, not a notebook global
		translation = predict_sequence(model, tokenizer, source)
		raw_target, raw_src = raw_dataset[i][0], raw_dataset[i][1]
		if i < 10:
			print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
		# corpus_bleu takes a list of references per hypothesis
		actual.append([raw_target.split()])
		predicted.append(translation.split())
	# calculate BLEU score; each weight tuple sums to 1 (BLEU-3 uses exact thirds)
	# NOTE(review): when higher-order n-grams never overlap, NLTK warns that the
	# unsmoothed scores may be undesirable; consider passing a SmoothingFunction.
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(1.0 / 3, 1.0 / 3, 1.0 / 3, 0)))
	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [None]:
# load model from 'model.h5', the file name the training checkpoint writes to
model = load_model('model.h5')
# test on some training sequences (predictions are decoded with the English tokenizer)
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)

train
src=[हामीलाई परोपकारी अर्थशास्त्रको जरुरत छ], target=[we need caring economics], predicted=[the the]
src=[महामारी सम्बन्धी डाटाले पूर्वव्यापी ढङ्गमा sars को सूचक केसमा खेल पशुहरूसँग सम्पर्क इतिहास भएको कुरा खुलासा गर्‍यो।], target=[epidemiological data revealed retrospectively that the index case of sars had a contact history with game animals], predicted=[the the the the the the the]
src=[उ मुर्मुरिदै आमाको बारेमा भन्दै थियो मैंले तिम्रो आमा बाहिर कुर्दै होला भंने ।], target=[he mumbled something about his mother and i told him shed probably find him outside anyway], predicted=[the the the the the the the]
src=[औँलाका नङहरूको मुनि फोहोर भएमा यसलाई हटाउनका लागि ब्रिस्टल ब्रश प्रयोग गर्न सकिन्छ।], target=[if there is debris under fingernails a bristle brush may be used to remove it], predicted=[the the the the the the]
src=[दर्शक मैले सम्भवत गर्न सक्छु होला अर्थर बेन्जामिन  म तपाईं संग पछि कुरा गरौँला ], target=[volunteer i can probably do it ab ill talk to you later], predicted=[

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.031293
BLEU-2: 0.089161
BLEU-3: 0.135540
BLEU-4: 0.150501
test
src=[सरकारी क्षेत्रको ऋणात्मक बचत र लगानीको अन्तरलाई विदेशी सहायता र आन्तरिक ऋणबाट बेहोरिनेछ।], target=[the deficit between the public sector negative savings and investment will be borne thorough foreign assistance and internal borrowing], predicted=[the the the the the the the the the]
src=[मृत्यु दर  बर्ष भन्दा माथिका लागि धेरै बढि तर  बर्ष भन्दा कमका लागि उल्लेखनीय रूपले कम छ।], target=[the fatality rate is much higher for those over  but significantly lower for those under ], predicted=[the the the the the the the]
src=[उनी  बर्षदेखि यो समस्यामा छिन], target=[she has lived with crisis for  years], predicted=[the the the]
src=[सिरियाली ब्लगर एस रिफाइ यस दोधारका बारे ट्वीट गर्छन्], target=[syrian blogger s rifai tweets about the confusion], predicted=[the the the the]
src=[नेपालका पुराना फोटोहरूका लागि ट्वीटरमा नेपाल इन पिक्सलाई पछ्याउनुहोस्], target=[for stunning glimpses of thcentury nepal check out nepal in 