<a href="https://colab.research.google.com/github/jaydeepthik/NMT-neural-machine-translation/blob/master/NMT_ger2eng_teacher_forcing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive inside the Colab VM; the corpus is read from a path
# under /content/drive in a later cell.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import numpy as np
import string
from unicodedata import normalize
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

#read file and make pairs

def read_file(file):
    """Load a tab-separated text corpus.

    Every line of the UTF-8 file at ``file`` is stripped of surrounding
    whitespace and split on tab characters.

    Returns:
        A list with one entry per line; each entry is the list of
        tab-separated fields found on that line.
    """
    rows = []
    with open(file, 'r', encoding='utf8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            rows.append(fields)
    return rows

def preprocess_pairs(data):
    """Normalise every sentence in a list of sentence groups.

    Each sentence is reduced to ASCII (accents dropped via NFD decomposition),
    lower-cased, split on whitespace, stripped of punctuation and
    non-printable characters, and filtered down to purely alphabetic words.
    The first sentence of each group is additionally wrapped in the
    "<START>" / "<END>" markers; the remaining sentences are not.

    Args:
        data: iterable of sentence groups (each an iterable of strings).

    Returns:
        A list of lists of cleaned, space-joined sentences, in input order.
    """
    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
    non_printable_re = re.compile('[^%s]' % re.escape(string.printable))

    def clean_words(text):
        # Decompose accented characters, then drop everything non-ASCII.
        ascii_text = normalize('NFD', text).encode('ascii', 'ignore').decode('utf8')
        words = []
        for token in ascii_text.lower().split():
            token = non_printable_re.sub('', punctuation_re.sub('', token))
            # Discard tokens that still contain digits or became empty.
            if token.isalpha():
                words.append(token)
        return words

    cleaned = []
    for pair in data:
        clean_pair = []
        for position, text in enumerate(pair):
            words = clean_words(text)
            if position == 0:
                # Only the first sentence of the group gets sequence markers.
                words = ["<START>", *words, "<END>"]
            clean_pair.append(" ".join(words))
        cleaned.append(clean_pair)
    return cleaned

def encode_output(sequences, vocab_size):
  """One-hot encode a batch of equal-length integer sequences.

  Args:
    sequences: 2-D array-like of token ids, shape (n_sequences, seq_len).
      Plain nested lists are accepted as well as numpy arrays.
    vocab_size: number of classes, i.e. the size of the one-hot axis.

  Returns:
    float32 array of shape (n_sequences, seq_len, vocab_size) with a single
    1.0 per (sequence, timestep) at the position given by the token id.

  Raises:
    ValueError: if ``sequences`` is not 2-D.
    IndexError: if a token id is outside ``[-vocab_size, vocab_size)``.
  """
  sequences = np.asarray(sequences, dtype='int64')
  if sequences.ndim != 2:
    raise ValueError(
        "sequences must be 2-D (n_sequences, seq_len), got shape %s"
        % (sequences.shape,))

  # Allocate the result once and fill it in place: the tensor can be several
  # GB, so avoid building a list of per-row arrays and copying it afterwards.
  y = np.zeros(sequences.shape + (vocab_size,), dtype='float32')
  rows, cols = np.indices(sequences.shape)
  y[rows, cols, sequences] = 1.0
  return y

In [4]:
# Location of the tab-separated sentence pairs on the mounted Drive.
DATA_PATH = "/content/drive/My Drive/Colab Notebooks/nmt_data/deu-eng/deu.txt"
n_sentences = 40000  # number of leading sentence pairs kept for this experiment
n_train = 30000      # pairs used for training; the remainder is the test split
SEED = 42            # fixed seed so the shuffle (and hence the split) is reproducible

data = read_file(DATA_PATH)
data = preprocess_pairs(data)

# Work on a shuffled copy of the first `n_sentences` pairs; `data` itself is
# left untouched.
np.random.seed(SEED)
reduced_data = np.array(data[:n_sentences])
np.random.shuffle(reduced_data)

train_data, test_data = reduced_data[:n_train], reduced_data[n_train:]

# Quick look at a couple of examples from each split.
print(train_data[0])
print(train_data[1])
print(test_data[0])
print(test_data[1])

['<START> my father isnt home <END>' 'mein vater ist nicht zuhause']
['<START> are you in trouble <END>' 'bist du in schwierigkeiten']
['<START> i do want it <END>' 'ich mochte es']
['<START> he kept his word <END>' 'er hat sein wort gehalten']


In [5]:
# Number of cleaned sentence pairs in the full corpus (`data` is not
# truncated; only `reduced_data` is a subset of it).
len(data)

192881

In [0]:
# Characters stripped by both tokenizers. '<' and '>' are deliberately not in
# this list, and lower=False keeps case, so the "<START>" / "<END>" markers
# survive tokenization unchanged.
TOKEN_FILTERS = '"#$%&()*+,./:;=@[\\]^_`{|}~\t\n'

# Vocabularies are fitted on the whole working subset (train + test).
# Vocabulary size is +1 because index 0 is reserved for padding.
eng_tokenizer = Tokenizer(filters=TOKEN_FILTERS, lower=False)
eng_tokenizer.fit_on_texts(reduced_data[:,0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_max_len = max(len(line.split()) for line in reduced_data[:,0])

ger_tokenizer = Tokenizer(filters=TOKEN_FILTERS, lower=False)
ger_tokenizer.fit_on_texts(reduced_data[:,1])
ger_vocab_size = len(ger_tokenizer.word_index)+1
ger_max_len = max(len(line.split()) for line in reduced_data[:,1])


# Encoder input: German sentences as zero-padded id sequences.
X_train_inp = ger_tokenizer.texts_to_sequences(train_data[:,1])
X_train_inp = pad_sequences(X_train_inp, maxlen=ger_max_len, padding='post')

# Decoder input (teacher forcing): English ids starting with <START>.
y_train = eng_tokenizer.texts_to_sequences(train_data[:,0])
y_train_inp = pad_sequences(y_train, maxlen=eng_max_len, padding='post')

# Decoder target: the same sequence shifted left by one step. Padding to
# eng_max_len+1 first keeps the target the same length as the decoder input
# after the leading <START> column is dropped.
y_train_op = pad_sequences(y_train, maxlen=eng_max_len+1, padding='post')
y_train_op = y_train_op[:,1:]
y_train_op_encoded = encode_output(y_train_op, eng_vocab_size)

X_test = ger_tokenizer.texts_to_sequences(test_data[:,1])
X_test = pad_sequences(X_test, maxlen=ger_max_len, padding='post')

# Test data follows the same convention as training: `y_test` is the decoder
# input and `y_test_enc` the one-hot encoding of the one-step-shifted target,
# so it lines up with what the model actually predicts.
y_test_seq = eng_tokenizer.texts_to_sequences(test_data[:,0])
y_test = pad_sequences(y_test_seq, maxlen=eng_max_len, padding='post')
y_test_op = pad_sequences(y_test_seq, maxlen=eng_max_len+1, padding='post')
y_test_op = y_test_op[:,1:]
y_test_enc = encode_output(y_test_op, eng_vocab_size)




In [7]:
# Sanity check: decoder input and target have the same length, and the target
# is the decoder input shifted left by one position (no leading <START>).
print(y_train_inp.shape, y_train_op.shape, y_train_op_encoded.shape)
print(y_train_inp[0], y_train_op[0])


(30000, 8) (30000, 8) (30000, 8, 5539)
[  1  18 224  56  70   2   0   0] [ 18 224  56  70   2   0   0   0]


GERMAN 2 ENGLISH

In [9]:
#model ger -> eng

from keras.models import Model
from keras.layers import LSTM, Dense, Input, Embedding, TimeDistributed

# Hyper-parameters, named once instead of repeating bare literals.
EMBEDDING_DIM = 50   # size of the word embeddings (encoder and decoder)
LATENT_DIM = 50      # LSTM state size; encoder and decoder must agree
BATCH_SIZE = 64
EPOCHS = 60

# --- Encoder: German ids -> final LSTM state (h, c) -------------------------
g2e_enc_inp = Input(shape=(None,))
g2e_enc_emb = Embedding(input_dim=ger_vocab_size, output_dim=EMBEDDING_DIM, mask_zero=True)
g2e_enc_emb_x = g2e_enc_emb(g2e_enc_inp)
g2e_enc_lstm  = LSTM(LATENT_DIM, return_state=True)
g2e_enc_x, g2e_state_h, g2e_state_c = g2e_enc_lstm(g2e_enc_emb_x)
# Only the final states are passed on; they initialise the decoder.
g2e_states = [g2e_state_h, g2e_state_c]

# --- Decoder: English ids (teacher forcing) -> next-word distribution -------
g2e_dec_inp = Input(shape=(None,))
g2e_dec_emb = Embedding(input_dim=eng_vocab_size, output_dim=EMBEDDING_DIM, mask_zero=True)
g2e_dec_emb_x = g2e_dec_emb(g2e_dec_inp)
# return_state=True is not needed for training but lets the same layer be
# reused step by step in the inference decoder.
g2e_dec_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
g2e_dec_x,_,_ = g2e_dec_lstm(g2e_dec_emb_x, initial_state=g2e_states)
g2e_dec_dense= Dense(eng_vocab_size, activation='softmax')

g2e_dec_op = g2e_dec_dense(g2e_dec_x)

g2e_model =  Model([g2e_enc_inp, g2e_dec_inp], g2e_dec_op)
g2e_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
g2e_model.summary()

# Keep the History object so loss/accuracy curves can be inspected later.
g2e_history = g2e_model.fit([X_train_inp, y_train_inp], y_train_op_encoded, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=0.2,shuffle=True)


Instructions for updating:
Colocations handled automatically by placer.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 50)     449300      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 50)     276950      input_2[0][0]                    
_____________________________________

<keras.callbacks.History at 0x7f3568a18240>

INFERENCE MODEL

In [29]:
# Encoder for inference: source sentence -> [h, c] used to start decoding.
encoder_model = Model(g2e_enc_inp,g2e_states)

# The decoder state inputs must have exactly the decoder LSTM's state size,
# so read it from the trained layer rather than hard-coding it.
decoder_state_dim = g2e_dec_lstm.units
decoder_state_inp_h = Input(shape=(decoder_state_dim, ))
decoder_state_inp_c = Input(shape=(decoder_state_dim, ))
decoder_state_inp = [decoder_state_inp_h, decoder_state_inp_c]

# Re-use the trained embedding, LSTM and dense layers, but feed the LSTM
# state explicitly and expose the updated state so decoding can proceed one
# token at a time.
dec_x_inp = g2e_dec_emb(g2e_dec_inp)
decoder_outputs, state_h, state_c = g2e_dec_lstm(dec_x_inp , initial_state=decoder_state_inp)
decoder_states = [state_h, state_c]

decoder_outputs = g2e_dec_dense(decoder_outputs)

# Inputs: [token ids, h, c]  ->  outputs: [word distribution, new h, new c]
decoder_model = Model([g2e_dec_inp]+decoder_state_inp,[decoder_outputs]+decoder_states)
decoder_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 50)     276950      input_2[0][0]                    
__________________________________________________________________________________________________
input_5 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
lstm_2 (LS

In [30]:
# Reverse lookup tables (token id -> word) for turning id sequences back into
# text: `_inp` for the German input side, `_op` for the English output side.
idx_to_word_inp = {index: word for word, index in ger_tokenizer.word_index.items()}
idx_to_word_op = {index: word for word, index in eng_tokenizer.word_index.items()}
# Show the id assigned to the start-of-sequence marker.
eng_tokenizer.word_index['<START>']

1

In [0]:
def decode_seq(inp_seq):
  """Greedily translate one encoded source sentence.

  The sentence is run through ``encoder_model`` to obtain the initial decoder
  state; ``decoder_model`` is then called one token at a time, always feeding
  back the most probable word, starting from "<START>".

  Decoding stops when "<END>" is produced, when ``eng_max_len`` words have
  been generated, or when the most probable id has no entry in
  ``idx_to_word_op`` (e.g. the padding id 0), which previously raised a
  KeyError.

  Args:
    inp_seq: padded id sequence for a single source sentence; the decoder is
      fed a (1, 1) target array, so a batch of exactly one is expected.

  Returns:
    The generated words, each preceded by a single space (so the string
    starts with a space); a trailing "<END>" is included when it was
    produced. Empty string if no word could be decoded.
  """
  states = encoder_model.predict(inp_seq)

  target_seq = np.zeros((1, 1))
  target_seq[0, 0] = eng_tokenizer.word_index['<START>']

  words = []
  while True:
    output_vec, h, c = decoder_model.predict([target_seq] + states)
    op_id = int(np.argmax(output_vec[0, -1, :]))

    sampled_word = idx_to_word_op.get(op_id)
    if sampled_word is None:
      # The softmax also covers ids without a word (padding id 0); treat
      # such a prediction as end of sentence instead of crashing.
      break
    words.append(sampled_word)

    if sampled_word == "<END>" or len(words) >= eng_max_len:
      break

    # Feed the chosen word and the updated LSTM state into the next step.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = op_id
    states = [h, c]

  return "".join(" " + word for word in words)

PREDICTIONS..

In [40]:
# Index of the training example to translate (named so it does not shadow
# the builtin `id`).
sample_idx = 8695
# Slice rather than index so the arrays keep their leading batch dimension.
input_seq = X_train_inp[sample_idx:sample_idx+1]
op_seq = y_train_inp[sample_idx:sample_idx+1]
# Ids equal to 0 are padding and are skipped when printing.
print("INPUT: "+" ".join([idx_to_word_inp[i] for i in input_seq[0] if i>0]))
print("ACTUAL: "+" ".join([idx_to_word_op[i] for i in op_seq[0] if i>0]))
print("PREDICTED: ",decode_seq(input_seq))

INPUT: ich bin mude
ACTUAL: <START> i am tired <END>
PREDICTED:   im tired <END>


In [0]:
# Index 0 is reserved for padding and is never assigned to a word by the
# Keras Tokenizer, so a direct lookup `idx_to_word_op[0]` raises KeyError.
# Check membership instead so the cell runs cleanly.
0 in idx_to_word_op