<a href="https://colab.research.google.com/github/jaydeepthik/NMT-neural-machine-translation/blob/master/NMT_ger2eng.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The corpus file read later in this notebook is addressed through this mount
# point, so this cell has to run first. `google.colab` is only importable when
# running on Colab.
from google.colab import drive
drive.mount("/content/drive")

In [0]:
import numpy as np
import string
from unicodedata import normalize
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Read the corpus file and build sentence pairs

def read_file(file):
    """Read a tab-separated text file into a list of field lists.

    Parameters
    ----------
    file : str or path-like
        Path to a UTF-8 encoded text file with one example per line and
        fields separated by tab characters.

    Returns
    -------
    list of list of str
        One list of fields per non-blank line, in file order. Leading and
        trailing whitespace of every line is stripped before splitting.
        Blank or whitespace-only lines are skipped.
    """
    pairs = []
    with open(file, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line would otherwise yield [''], producing a row
                # with a different number of fields than the real pairs.
                continue
            pairs.append(line.split('\t'))
    return pairs

def preprocess_pairs(data):
    """Normalise every sentence of every pair to lower-case ASCII words.

    For each sentence the text is NFD-normalised and encoded to ASCII with
    non-encodable characters ignored (so accents are stripped and characters
    without an ASCII form are dropped), lower-cased and split on whitespace.
    Punctuation and non-printable characters are then removed from each
    token, and only purely alphabetic tokens are kept.

    Parameters
    ----------
    data : iterable of iterable of str
        Sentence groups, e.g. ``[[english, german], ...]``.

    Returns
    -------
    list of list of str
        Same nesting as ``data``; each sentence is its surviving tokens
        joined by single spaces.
    """
    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
    non_printable_re = re.compile('[^%s]' % re.escape(string.printable))

    def _clean_sentence(text):
        # Decompose accented characters, then drop whatever is not ASCII.
        ascii_text = normalize('NFD', text).encode('ascii', 'ignore').decode('utf8')
        kept = []
        for token in ascii_text.lower().split():
            token = punctuation_re.sub('', token)
            token = non_printable_re.sub('', token)
            # Discards empty strings and tokens containing digits.
            if token.isalpha():
                kept.append(token)
        return " ".join(kept)

    return [[_clean_sentence(sentence) for sentence in pair] for pair in data]

def encode_optput(sequences, vocab_size):
  """One-hot encode a 2-D array of integer token sequences.

  Parameters
  ----------
  sequences : array with ``.shape`` of at least two dimensions
      Integer token ids; ``shape[0]`` rows of ``shape[1]`` positions each.
  vocab_size : int
      Number of classes passed to ``to_categorical`` for every row.

  Returns
  -------
  numpy.ndarray
      Array reshaped to ``(sequences.shape[0], sequences.shape[1], vocab_size)``.
  """
  # Encode row by row, then stack the per-row results into one array.
  one_hot = np.array([to_categorical(row, vocab_size) for row in sequences])
  n_rows, n_steps = sequences.shape[0], sequences.shape[1]
  return one_hot.reshape(n_rows, n_steps, vocab_size)

In [0]:
# Load the tab-separated corpus from the mounted Drive and normalise every
# sentence of every pair.
data = read_file("/content/drive/My Drive/Colab Notebooks/nmt_data/deu-eng/deu.txt")
data = preprocess_pairs(data)

# Size of the working subset, and how many of its rows are used for training;
# the remaining rows are held out for validation.
n_sentences = 10000
n_train = 9000
# Fixed seed so the train/validation split is the same on every fresh run.
split_seed = 42

reduced_data = np.array(data[:n_sentences])
# Shuffle rows in place with a dedicated generator so the split is
# reproducible without altering NumPy's global random state.
np.random.RandomState(split_seed).shuffle(reduced_data)

train_data, test_data = reduced_data[:n_train], reduced_data[n_train:]



In [0]:
def _fit_tokenizer(texts):
    """Return a new Tokenizer fitted on ``texts``."""
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    return tokenizer


def _longest_sentence(texts):
    """Return the largest whitespace-separated word count found in ``texts``."""
    return max(len(text.split()) for text in texts)


def _encode_padded(tokenizer, texts, length):
    """Convert ``texts`` to integer sequences and post-pad them to ``length``."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=length, padding='post')


# Column 0 is handled as the English side and column 1 as the German side.
# Vocabulary and maximum length are computed over the whole reduced set
# (training and held-out rows together).
eng_tokenizer = _fit_tokenizer(reduced_data[:, 0])
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
eng_max_len = _longest_sentence(reduced_data[:, 0])

ger_tokenizer = _fit_tokenizer(reduced_data[:, 1])
ger_vocab_size = 1 + len(ger_tokenizer.word_index)
ger_max_len = _longest_sentence(reduced_data[:, 1])

# Training set: German ids as input, one-hot English ids as target.
X_train = _encode_padded(ger_tokenizer, train_data[:, 1], ger_max_len)
y_train = encode_optput(
    _encode_padded(eng_tokenizer, train_data[:, 0], eng_max_len),
    eng_vocab_size,
)

# Held-out set, encoded the same way. The first German sequence is printed
# before padding.
X_test = ger_tokenizer.texts_to_sequences(test_data[:, 1])
print(X_test[0])
X_test = pad_sequences(X_test, maxlen=ger_max_len, padding='post')

y_test = encode_optput(
    _encode_padded(eng_tokenizer, test_data[:, 0], eng_max_len),
    eng_vocab_size,
)




In [53]:
import keras
from keras.models import Sequential
from keras import layers

# Sequence-to-sequence model, assembled as a single layer list:
#   German token ids -> embedding -> LSTM producing one vector per sentence
#   -> that vector repeated eng_max_len times -> LSTM returning a sequence
#   -> softmax over the English vocabulary at every output position.
hidden_units = 256

model = Sequential([
    layers.Embedding(
        input_dim=ger_vocab_size,
        output_dim=hidden_units,
        input_length=ger_max_len,
        mask_zero=True,
    ),
    layers.LSTM(hidden_units),
    layers.RepeatVector(eng_max_len),
    layers.LSTM(hidden_units, return_sequences=True),
    layers.TimeDistributed(layers.Dense(eng_vocab_size, activation='softmax')),
])

# Targets are one-hot encoded, hence categorical cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 10, 256)           912896    
_________________________________________________________________
lstm_5 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_3 (RepeatVecto (None, 5, 256)            0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 5, 256)            525312    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 5, 2233)           573881    
Total params: 2,537,401
Trainable params: 2,537,401
Non-trainable params: 0
_________________________________________________________________


In [54]:
# Train the model, evaluating on the held-out split after every epoch.
# The call is left as the cell's last expression so the returned History
# object is displayed.
model.fit(
    X_train,
    y_train,
    epochs=30,
    batch_size=64,
    validation_data=(X_test, y_test),
)

Instructions for updating:
Use tf.cast instead.
Train on 9000 samples, validate on 1000 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f0f3e2565f8>