<a href="https://colab.research.google.com/github/kumaranu7/Machine-Translation/blob/master/Machine_Translation_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset path used later in this
# notebook resolves. Colab-only; the call prompts for an authorization code.
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import string
import re
from pickle import dump, load
from unicodedata import normalize
from numpy import array

In [3]:
def load_doc(filename):
  """Read a UTF-8 text file and return its whole contents as one string.

  Args:
    filename: Path of the text file to read.

  Returns:
    The complete text of the file.

  Raises:
    OSError: If the file cannot be opened.
  """
  # The context manager closes the handle even when read() raises.
  with open(filename, 'rt', encoding='utf-8') as file:
    return file.read()

def to_pairs(doc):
  """Split a document into rows of tab-separated fields.

  Leading and trailing whitespace of the whole document is stripped first,
  then each newline-delimited row is split on tab characters.

  Args:
    doc: Full text of the dataset file.

  Returns:
    A list with one list of field strings per row.
  """
  rows = []
  for row in doc.strip().split('\n'):
    rows.append(row.split('\t'))
  return rows

def clean_pairs(lines):
  """Normalise every sentence of every pair to lowercase ASCII words.

  Each sentence is NFD-decomposed and encoded to ASCII with errors ignored
  (so accents, and characters with no ASCII form, are dropped), split on
  whitespace, lowercased, stripped of punctuation and non-printable
  characters, and finally reduced to purely alphabetic tokens.

  Args:
    lines: Iterable of pairs, each pair an iterable of sentence strings.

  Returns:
    A NumPy array holding the cleaned sentences, one row per input pair.
  """
  non_printable = re.compile('[^%s]' % re.escape(string.printable))
  strip_punctuation = str.maketrans('', '', string.punctuation)

  def _clean(sentence):
    # Decompose accents, then drop every byte that is not plain ASCII.
    ascii_text = normalize('NFD', sentence).encode('ascii', 'ignore').decode('UTF-8')
    kept = []
    for token in ascii_text.split():
      token = token.lower().translate(strip_punctuation)
      token = non_printable.sub('', token)
      # Discard anything that is not purely alphabetic (numbers, empty strings).
      if token.isalpha():
        kept.append(token)
    return ' '.join(kept)

  return array([[_clean(sentence) for sentence in pair] for pair in lines])

def save_clean_data(sentences, filename):
  """Pickle `sentences` to `filename` and print a confirmation line.

  Args:
    sentences: Any picklable object (here, the array of cleaned pairs).
    filename: Destination path; an existing file is overwritten.

  Raises:
    OSError: If the file cannot be opened for writing.
  """
  # Close the handle deterministically so the pickle is fully flushed to disk
  # before anything tries to read it back.
  with open(filename, 'wb') as file:
    dump(sentences, file)
  print('Saved %s' % filename)

#loading the dataset
filename = '/content/drive/My Drive/Colab Notebooks/Projects/deu-eng/deu.txt'
doc = load_doc(filename)
pairs = to_pairs(doc)
# Store the result under its own name: rebinding `clean_pairs` would replace
# the function with an array and make this cell fail when it is re-run.
cleaned_pairs = clean_pairs(pairs)
save_clean_data(cleaned_pairs, 'english-german.pkl')

#spot check: show the first five cleaned pairs
for i in range(5):
  print('[%s] => [%s]' % (cleaned_pairs[i,0], cleaned_pairs[i,1]))

Saved english-german.pkl
[hi] => [hallo]
[hi] => [gru gott]
[run] => [lauf]
[wow] => [potzdonner]
[wow] => [donnerwetter]


In [4]:
#saving data and splitting the dataset into train, test
from pickle import load, dump
from numpy.random import rand, shuffle
def load_clean_dataset(filename):
  """Load and return the object pickled in `filename`.

  Args:
    filename: Path of a pickle file.

  Returns:
    The unpickled object.

  Raises:
    OSError: If the file cannot be opened.
  """
  # NOTE: unpickling can execute arbitrary code; only load files that this
  # notebook produced itself.
  with open(filename, 'rb') as file:
    return load(file)

raw_dataset = load_clean_dataset('english-german.pkl')
# Restrict the experiment to the first n_sentences pairs.
n_sentences = 10000
# Rows before n_train go to the training split, the rest to the test split.
n_train = 9000
dataset = raw_dataset[:n_sentences, :]
# Seed NumPy's global generator so the shuffle, and therefore the train/test
# split written to disk, is identical on every run.
from numpy.random import seed
seed(1)
# shuffle() reorders the rows in place.
shuffle(dataset)
train, test = dataset[:n_train], dataset[n_train:]
save_clean_data(train, 'english-german-train.pkl')
save_clean_data(test, 'english-german-test.pkl')
save_clean_data(dataset, 'english-german-both.pkl')


Saved english-german-train.pkl
Saved english-german-test.pkl
Saved english-german-both.pkl


In [7]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

# load a clean dataset
def load_clean_sentences(filename):
	"""Load and return the object pickled in `filename`.

	Args:
		filename: Path of a pickle file.

	Returns:
		The unpickled object.

	Raises:
		OSError: If the file cannot be opened.
	"""
	# NOTE: unpickling can execute arbitrary code; only load files that this
	# notebook produced itself.
	with open(filename, 'rb') as file:
		return load(file)

# fit a tokenizer
def create_tokenizer(lines):
	"""Build a Keras Tokenizer and fit it on the given texts.

	Args:
		lines: Texts passed to Tokenizer.fit_on_texts.

	Returns:
		The fitted Tokenizer.
	"""
	fitted = Tokenizer()
	fitted.fit_on_texts(lines)
	return fitted

# max sentence length
def max_length(lines):
	"""Return the largest whitespace-separated word count among `lines`.

	Args:
		lines: Iterable of sentence strings.

	Returns:
		The word count of the longest sentence.

	Raises:
		ValueError: If `lines` is empty.
	"""
	word_counts = map(lambda sentence: len(sentence.split()), lines)
	return max(word_counts)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	"""Integer-encode `lines` and pad each sequence to `length`.

	Args:
		tokenizer: Fitted tokenizer providing texts_to_sequences.
		length: Target sequence length passed to pad_sequences as maxlen.
		lines: Texts to encode.

	Returns:
		The padded sequences; zeros are appended after the tokens
		(padding='post').
	"""
	encoded = tokenizer.texts_to_sequences(lines)
	return pad_sequences(encoded, maxlen=length, padding='post')

# one hot encode target sequence
def encode_output(sequences, vocab_size):
	"""One-hot encode every integer sequence in `sequences`.

	Args:
		sequences: 2-D array of integer ids (rows are sequences).
		vocab_size: Number of classes for the one-hot encoding.

	Returns:
		Array of shape (sequences.shape[0], sequences.shape[1], vocab_size).
	"""
	one_hot = array([to_categorical(row, num_classes=vocab_size) for row in sequences])
	n_rows, n_steps = sequences.shape[0], sequences.shape[1]
	return one_hot.reshape(n_rows, n_steps, vocab_size)

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	"""Assemble the (uncompiled) encoder-decoder translation model.

	Args:
		src_vocab: Source vocabulary size (input dimension of the embedding).
		tar_vocab: Target vocabulary size (width of the softmax output).
		src_timesteps: Length of the padded source sequences.
		tar_timesteps: Length of the padded target sequences.
		n_units: Embedding width and LSTM unit count.

	Returns:
		A Sequential model: embedding -> LSTM -> RepeatVector -> LSTM ->
		time-distributed softmax.
	"""
	layers = [
		# Encoder: embed the source ids (zeros masked) and reduce to one vector.
		Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
		LSTM(n_units),
		# Decoder: repeat that vector once per target step and decode a sequence.
		RepeatVector(tar_timesteps),
		LSTM(n_units, return_sequences=True),
		TimeDistributed(Dense(tar_vocab, activation='softmax')),
	]
	return Sequential(layers)

# load the pickled datasets: the combined set plus its train and test splits
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')

# prepare english tokenizer (column 0 of each pair is the English sentence)
eng_tokenizer = create_tokenizer(dataset[:, 0])
# +1 leaves room for index 0, which the padded sequences use as filler
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare german tokenizer (column 1 of each pair is the German sentence)
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))

# prepare training data: German ids are the input, one-hot English ids the target
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)
# prepare validation data the same way from the test split
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

# define model: German -> English, 256 units for the embedding and both LSTMs
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" after the table.
model.summary()
plot_model(model, to_file='model.png', show_shapes=True)
# fit model, keeping only the weights with the lowest validation loss
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

Using TensorFlow backend.


English Vocabulary Size: 2200
English Max Length: 5
German Vocabulary Size: 3529
German Max Length: 9



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 9, 256)            903424    
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 5, 256)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 5, 256)            525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 5, 2200)           565400    
Total params: 2,519,448
Trainable params: 2,519,448

<keras.callbacks.History at 0x7fb1f7d2dcc0>

In [0]:
from keras.models import load_model
# Reload the checkpoint file written during training ('model.h5'), replacing
# the in-memory model with the saved one.
model = load_model('model.h5')


In [0]:
def word_for_id(integers, tokenizer):
  """Return the word whose index in `tokenizer.word_index` equals the given id.

  Args:
    integers: A single integer word id. The plural parameter name is kept for
      backward compatibility with existing callers.
    tokenizer: Object exposing a `word_index` mapping of word -> integer id.

  Returns:
    The matching word, or None when no word has that id.
  """
  for word, index in tokenizer.word_index.items():
    # Compare against the parameter; the previous body read an undefined name.
    if index == integers:
      return word
  return None

In [0]:
def predict_sequences(model, tokenizer, source):
  """Decode the model's prediction for one encoded source into a sentence.

  Args:
    model: Object whose predict(source, verbose=0) returns a batch of
      per-timestep score vectors; only the first batch item is decoded.
    tokenizer: Object exposing a `word_index` mapping of word -> integer id
      for the target language.
    source: Encoded source batch handed straight to model.predict.

  Returns:
    The predicted words joined by single spaces. Decoding stops at the first
    predicted id that has no word in the tokenizer; the result is an empty
    string when nothing is decoded.
  """
  # Imported locally because the notebook's import cells only bring in `array`.
  from numpy import argmax

  predictions = model.predict(source, verbose = 0)[0]
  integers = [int(argmax(vector)) for vector in predictions]
  # Invert the vocabulary once instead of scanning it for every predicted id.
  index_to_word = {index: word for word, index in tokenizer.word_index.items()}
  target = list()
  for i in integers:
    word = index_to_word.get(i)
    if word is None:
      # An id with no word ends the sentence.
      break
    target.append(word)
  # Return only after the whole sequence is decoded, with words space-separated.
  return ' '.join(target)

In [0]:
def evaluate_model(model, tokenizer, source, raw_dataset):
  """Translate every encoded source sentence and print corpus BLEU scores.

  Args:
    model: Trained model, passed through to predict_sequences.
    tokenizer: Target-language tokenizer used to decode the predictions.
    source: Encoded source sentences, one row per sentence; each row must
      support `.shape` and `.reshape`.
    raw_dataset: Cleaned text pairs aligned row-for-row with `source`; the
      first two fields of each row are read as (target text, source text).

  Returns:
    None. The first ten translations and the BLEU-1..4 scores are printed.
  """
  actual, predicted = list(), list()
  for i, encoded in enumerate(source):
    # Present the single sentence as a batch of one: (1, timesteps).
    encoded = encoded.reshape(1, encoded.shape[0])
    # Decode with the tokenizer that was passed in, not a notebook global.
    translation = predict_sequences(model, tokenizer, encoded)
    # Only the first two fields are used; any extra columns are ignored.
    raw_target, raw_src = raw_dataset[i][:2]

    if i < 10:
      print('src[%s], target[%s], predicted[%s]' %(raw_src, raw_target, translation))
    # Each hypothesis gets a list of references, here exactly one.
    actual.append([raw_target.split()])
    predicted.append(translation.split())

  # NOTE(review): corpus_bleu is not imported anywhere in the visible notebook;
  # presumably nltk.translate.bleu_score.corpus_bleu. Confirm and import it.
  #print BLEU score, once, over the whole corpus
  print('BLEU - 1 %f' % corpus_bleu(actual, predicted, weights = (1.0, 0, 0, 0)))
  print('BLEU - 2 %f' % corpus_bleu(actual, predicted, weights = (0.5, 0.5, 0, 0)))
  print('BLEU - 3 %f' % corpus_bleu(actual, predicted, weights = (0.33,0.33,0.33,0)))
  print('BLEU - 4 %f' % corpus_bleu(actual, predicted, weights = (0.25,0.25,0.25,0.25)))
