In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/ML DL Assignment and Quizzes/Deep Learning/DL Notes/DL Phase 3 Codes

/content/drive/.shortcut-targets-by-id/11fup9dLOHGdLYHRf3LudO-ZRG9B97O1b/ML DL Assignment and Quizzes/Deep Learning/DL Notes/DL Phase 3 Codes


About the Dataset:
- It contains english phrases and their corresponding german translations

In [3]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

In [4]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, mode='rt', encoding='utf-8')
 # The file is opened in UTF-8 as ASCII mode does not have representation for
 # some german characters.

	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

In [5]:
# split a loaded document into sentences
def to_pairs(doc):
	lines = doc.strip().split('\n')
	pairs = [line.split('\t') for line in  lines]
	return pairs

In [6]:
# clean a list of lines
def clean_pairs(lines):
	cleaned = list()
	# prepare regex for char filtering
	re_print = re.compile('[^%s]' % re.escape(string.printable))

	# prepare translation table for removing punctuation
	table = str.maketrans('', '', string.punctuation)
	for pair in lines:
		clean_pair = list()
		for line in pair:
			# normalize unicode characters by converting all of them to ASCII
			line = normalize('NFD', line).encode('ascii', 'ignore')
			line = line.decode('UTF-8')
			# tokenize on white space
			line = line.split()
			# convert to lowercase
			line = [word.lower() for word in line]
			# remove punctuation from each token
			line = [word.translate(table) for word in line]
			# remove non-printable chars form each token
			line = [re_print.sub('', w) for w in line]
			# remove tokens with numbers in them
			line = [word for word in line if word.isalpha()]
			# store as string
			clean_pair.append(' '.join(line))
		cleaned.append(clean_pair)
	return array(cleaned)

`re_print = re.compile('[^%s]' % re.escape(string.printable))`
<br>
- It creates a regular expression pattern object that matches any character that is not a printable ASCII character. The re.compile() function compiles the regular expression pattern into a regular expression object that can be used to search for non-printable characters in a string.

- The string.printable constant is a string of all ASCII characters considered printable. The % operator is used to substitute the string.printable constant into the regular expression pattern. The re.escape() function is used to escape any special characters in the string.printable constant so that they are treated as literal characters in the regular expression pattern.

- The resulting regular expression pattern object, re_print, can be used with the re.search() or re.sub() functions to search for or replace non-printable characters in a string.

In [7]:
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

In [8]:
# load dataset
filename = 'deu.txt'
doc = load_doc(filename)
# split into english-german pairs
pairs = to_pairs(doc)
# clean sentences
clean_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(clean_pairs, 'english-german.pkl')
# spot check
for i in range(100):
	print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

Saved: english-german.pkl
[hi] => [hallo]
[hi] => [gru gott]
[run] => [lauf]
[wow] => [potzdonner]
[wow] => [donnerwetter]
[fire] => [feuer]
[help] => [hilfe]
[help] => [zu hulf]
[stop] => [stopp]
[wait] => [warte]
[hello] => [hallo]
[i try] => [ich probiere es]
[i won] => [ich hab gewonnen]
[i won] => [ich habe gewonnen]
[smile] => [lacheln]
[cheers] => [zum wohl]
[freeze] => [keine bewegung]
[freeze] => [stehenbleiben]
[got it] => [verstanden]
[got it] => [einverstanden]
[he ran] => [er rannte]
[he ran] => [er lief]
[hop in] => [mach mit]
[hug me] => [druck mich]
[hug me] => [nimm mich in den arm]
[hug me] => [umarme mich]
[i fell] => [ich fiel]
[i fell] => [ich fiel hin]
[i fell] => [ich sturzte]
[i fell] => [ich bin hingefallen]
[i fell] => [ich bin gesturzt]
[i know] => [ich wei]
[i lied] => [ich habe gelogen]
[i lost] => [ich habe verloren]
[im] => [ich bin jahre alt]
[im] => [ich bin]
[im ok] => [mir gehts gut]
[im ok] => [es geht mir gut]
[no way] => [unmoglich]
[no way] => [da

In [9]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

In [10]:
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

In [11]:
# load dataset
raw_dataset = load_clean_sentences('english-german.pkl')

# reduce dataset size
n_sentences = 10000
# selecting sentences using slicing
dataset = raw_dataset[:n_sentences, :]
# random shuffle
shuffle(dataset)
# split into train/test
train, test = dataset[:9000], dataset[9000:]
# save
save_clean_data(dataset, 'english-german-both.pkl')
save_clean_data(train, 'english-german-train.pkl')
save_clean_data(test, 'english-german-test.pkl')

Saved: english-german-both.pkl
Saved: english-german-train.pkl
Saved: english-german-test.pkl


In [12]:
from pickle import load
from numpy import array
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

In [13]:
# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

In [14]:
# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
# tokenizer will associate each word to a number and convert the text to
# vector. Now once this is done any sentence which comes in future will be
# converted to vector in the same fashion as we have the numerical representation
# of the vocabulary.
# Ensure that furture sentences do not have any out of vocabulary words
	tokenizer.fit_on_texts(lines)
	return tokenizer

In [15]:
# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

In [16]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# input is a tokenizer, maxium length of sentence in that language, and lines
	# of that language

	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
 # converting list of sentences to list of vectors

	# pad sequences with 0 values as once we know the maximum length of the sentence
	# we know in translating it wont go beyond it
	X = pad_sequences(X, maxlen=length, padding='post')
 # post means the zeroes will be padded after the sentence
	return X


In [17]:
# one hot encode target sequence
def encode_output(sequences, vocab_size):
	ylist = list()
	for sequence in sequences:
		encoded = to_categorical(sequence, num_classes=vocab_size)
		ylist.append(encoded)
	y = array(ylist)
	y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
	return y


`define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units)`
- src_vocab: germany vocabulary size
- tar_vocab: english vocabulary size
- src_timesteps: maximum length of sentence in germany
- tar_timesteps: maximum length of sentence in english
- n_units: number of neurons in the LSTM layer

In [18]:
# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	model = Sequential()
	model.add(Embedding(src_vocab, n_units, input_length=src_timesteps))
 # you can use word2vec or glove embeddings also

	model.add(LSTM(n_units)) # 1 layers of LSTM for encoder

	model.add(RepeatVector(tar_timesteps))
 # here we are passing the encoded information of encoder to all timesteps of
 # decoder, by repeating the last time step of encoder for tar_timestep number of
 # times, and put the LSTM decoder on top of it. Now these repeated time steps of
 # encoder are passed to the timesteps of decoder correspondingly

	model.add(LSTM(n_units, return_sequences=True)) # LSTM for decoder

	model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
	# here we are un rolling the decoder LSTM in time, and adding a dense layer with
	# number of neurons equal to number of words in english so that encoder and
	# decoder can be connected.

	return model

In [19]:
# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')

In [20]:
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
# we are tokenizing only the english words

eng_vocab_size = len(eng_tokenizer.word_index) + 1 # +1 is for <START>
eng_length = max_length(dataset[:, 0])
# to find the maximum length of an english sentence
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))

# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
# we are tokenizing only the german words
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))

English Vocabulary Size: 2404
English Max Length: 5
German Vocabulary Size: 3856
German Max Length: 10


In [21]:
# prepare training data to translate from german to english
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
print(trainY[0])

trainY = encode_output(trainY, eng_vocab_size)
# encoding output as one hot encoded vectors


print(trainY[0])
# prepare validation data
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

[ 10   3 188  31   0]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


`model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)`
- ger_vocab_size: germany vocabulary size
- eng_vocab_size: english vocabulary size
- ger_length: maximum length of sentence in germany
- eng_length: maximum length of sentence in english
- 256: number of neurons in the LSTM layer

In [22]:
# define model
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)

model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
print(model.summary())
#plot_model(model, to_file='model.png', show_shapes=True)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 10, 256)           987136    
                                                                 
 lstm (LSTM)                 (None, 256)               525312    
                                                                 
 repeat_vector (RepeatVecto  (None, 5, 256)            0         
 r)                                                              
                                                                 
 lstm_1 (LSTM)               (None, 5, 256)            525312    
                                                                 
 time_distributed (TimeDist  (None, 5, 2404)           617828    
 ributed)                                                        
                                                                 
Total params: 2655588 (10.13 MB)
Trainable params: 26555

In [23]:
# fit model
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=15, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

Epoch 1/15

Epoch 1: val_loss improved from inf to 3.79469, saving model to model.h5


  saving_api.save_model(


141/141 - 31s - loss: 4.2881 - val_loss: 3.7947 - 31s/epoch - 222ms/step
Epoch 2/15

Epoch 2: val_loss improved from 3.79469 to 3.69118, saving model to model.h5
141/141 - 27s - loss: 3.6882 - val_loss: 3.6912 - 27s/epoch - 191ms/step
Epoch 3/15

Epoch 3: val_loss improved from 3.69118 to 3.63103, saving model to model.h5
141/141 - 26s - loss: 3.6015 - val_loss: 3.6310 - 26s/epoch - 183ms/step
Epoch 4/15

Epoch 4: val_loss improved from 3.63103 to 3.53118, saving model to model.h5
141/141 - 25s - loss: 3.4937 - val_loss: 3.5312 - 25s/epoch - 174ms/step
Epoch 5/15

Epoch 5: val_loss improved from 3.53118 to 3.46423, saving model to model.h5
141/141 - 26s - loss: 3.3727 - val_loss: 3.4642 - 26s/epoch - 184ms/step
Epoch 6/15

Epoch 6: val_loss improved from 3.46423 to 3.42054, saving model to model.h5
141/141 - 26s - loss: 3.2749 - val_loss: 3.4205 - 26s/epoch - 188ms/step
Epoch 7/15

Epoch 7: val_loss improved from 3.42054 to 3.34896, saving model to model.h5
141/141 - 26s - loss: 3.1819

<keras.src.callbacks.History at 0x7be3e85d3370>

# Code to translate new sentence

In [24]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	prediction = model.predict(source, verbose=0)[0]
	integers = [argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		# translate encoded source text
		source = source.reshape((1, source.shape[0]))
		translation = predict_sequence(model, eng_tokenizer, source)
		raw_target, raw_src = raw_dataset[i]
		if i < 10:
			print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
		actual.append([raw_target.split()])
		predicted.append(translation.split())

# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
# prepare data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

# load model
model = load_model('model.h5')
# test on some training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)

train
src=[wohnen sie hier], target=[do you live here], predicted=[do you it]
src=[sei objektiv], target=[be objective], predicted=[be careful]
src=[das war laut], target=[that was loud], predicted=[she was easy]
src=[bitte setzt euch hierhin], target=[please sit here], predicted=[please please here]
src=[ich habe mir nichts dabei gedacht], target=[i didnt mean it], predicted=[i like to]
src=[ich bin online], target=[i am online], predicted=[i am busy]
src=[ich mag eiscreme], target=[i love ice cream], predicted=[i like to]
src=[zeig sie ihm], target=[show it to him], predicted=[do it to me]
src=[hat tom ihn gefunden], target=[did tom find it], predicted=[did tom do it]
src=[er hat aufgelegt], target=[he hung up], predicted=[he is]
test
src=[tom setzte sich], target=[tom sat down], predicted=[tom will]
src=[ich habe zu tun], target=[im busy], predicted=[i am a]
src=[seien sie nicht unhoflich], target=[dont be rude], predicted=[dont be me]
src=[ich bin erschopft], target=[i am exhausted