<a href="https://colab.research.google.com/github/jaydeepthik/NMT-neural-machine-translation/blob/master/NMT_ger2eng_teacher_forcing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Colab only: mount Google Drive at /content/drive so files stored under
# "My Drive" can be opened with ordinary file paths.
from google.colab import drive
drive.mount("/content/drive")

In [2]:
import numpy as np
import string
from unicodedata import normalize
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

#read file and make pairs

def read_file(file):
    """Load a tab-separated text file as a list of field lists.

    The file is read as UTF-8. Every line is stripped of leading/trailing
    whitespace and then split on tab characters.

    Returns:
        A list with one entry per line; each entry is the list of
        tab-separated fields found on that line.
    """
    rows = []
    with open(file, mode='r', encoding='utf8') as handle:
        for raw_line in handle:
            rows.append(raw_line.strip().split('\t'))
    return rows

def preprocess_pairs(data):
    """Normalise every sentence of every pair to lowercase ASCII words.

    Each sentence is NFD-normalised and reduced to ASCII (dropping accents),
    lowercased and split on whitespace. Punctuation and non-printable
    characters are removed from every token, and tokens that are not purely
    alphabetic afterwards are discarded. The first sentence of each pair is
    additionally wrapped in "<START>" ... "<END>" markers.

    Returns:
        A list parallel to ``data`` whose entries are lists of cleaned,
        space-joined sentences.
    """
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    non_printable = re.compile('[^%s]' % re.escape(string.printable))

    def _clean(sentence, is_first):
        # Decompose accented characters, then drop everything non-ASCII.
        ascii_text = normalize('NFD', sentence).encode('ascii', 'ignore').decode('utf8')
        words = []
        for token in ascii_text.lower().split():
            token = non_printable.sub('', punctuation.sub('', token))
            if token.isalpha():
                words.append(token)
        if is_first:
            # Only the first sentence of a pair gets the sequence markers.
            words = ["<START>"] + words + ["<END>"]
        return " ".join(words)

    return [
        [_clean(sentence, position == 0) for position, sentence in enumerate(pair)]
        for pair in data
    ]

def encode_output(sequences, vocab_size):
  """One-hot encode a batch of equal-length integer sequences.

  Args:
    sequences: 2-D array-like of token ids with shape (n_sequences, n_steps).
      Plain nested lists are accepted as well as NumPy arrays.
    vocab_size: number of classes, i.e. the length of the last output axis.

  Returns:
    A float32 array of shape (n_sequences, n_steps, vocab_size) holding a
    single 1 per (sequence, step) at the index of that step's token id.

  Raises:
    ValueError: if ``sequences`` is not 2-D.
    IndexError: if a token id does not fit into ``vocab_size`` classes.
  """
  ids = np.asarray(sequences, dtype=int)
  if ids.ndim != 2:
    raise ValueError(
        "sequences must be 2-D (n_sequences, n_steps), got shape %s" % (ids.shape,))
  n_sequences, n_steps = ids.shape

  # Allocate the result once and fill it by index. Building a Python list of
  # per-row one-hot arrays and copying it into a new array roughly doubles the
  # peak memory of what is already the largest array in the notebook.
  y = np.zeros((n_sequences, n_steps, vocab_size), dtype='float32')
  row_idx = np.arange(n_sequences)[:, None]
  step_idx = np.arange(n_steps)[None, :]
  y[row_idx, step_idx, ids] = 1.0
  return y

Using TensorFlow backend.


In [11]:
# Load the raw tab-separated corpus from Drive and clean every sentence.
data = read_file("/content/drive/My Drive/Colab Notebooks/nmt_data/deu-eng/deu.txt")
data = preprocess_pairs(data)

# Size of the corpus subset and the seed for the shuffle, kept in one place
# instead of being repeated as literals below.
n_sentences = 10000
SEED = 42

# Keep only the first n_sentences pairs of the file.
reduced_data = np.array(data[:n_sentences])

# Shuffle the rows with a dedicated seeded generator: the train/test split is
# then identical on every run, and the global np.random state is left untouched.
np.random.RandomState(SEED).shuffle(reduced_data)

# 90 % / 10 % train/test split (9000 / 1000 rows for 10000 sentences).
n_train = len(reduced_data) * 9 // 10
train_data, test_data = reduced_data[:n_train], reduced_data[n_train:]
print(train_data[0])
print(train_data[1])
print(test_data[0])
print(test_data[1])

['<START> shes my wife <END>' 'sie ist meine frau']
['<START> tom agrees <END>' 'tom ist einverstanden']
['<START> i like fish <END>' 'ich esse gerne fisch']
['<START> we shook hands <END>' 'wir gaben uns die hand']


In [0]:
# Characters removed by both tokenizers. "<" and ">" are not in the list, so
# the "<START>" / "<END>" markers are kept as whole tokens.
token_filters = '"#$%&()*+,./:;=@[\\]^_`{|}~\t\n'

# --- Vocabularies (fitted on the full reduced corpus, train and test rows) ---
ger_tokenizer = Tokenizer(filters=token_filters, lower=False)
ger_tokenizer.fit_on_texts(reduced_data[:,1])
ger_vocab_size = 1 + len(ger_tokenizer.word_index)   # +1: ids start at 1, 0 is used for padding
ger_max_len = max(len(sentence.split()) for sentence in reduced_data[:,1])

eng_tokenizer = Tokenizer(filters=token_filters, lower=False)
eng_tokenizer.fit_on_texts(reduced_data[:,0])
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
eng_max_len = max(len(sentence.split()) for sentence in reduced_data[:,0])

# --- Encoder inputs: German id sequences, zero-padded at the end ---
X_train_inp = ger_tokenizer.texts_to_sequences(train_data[:,1])
X_train_inp = pad_sequences(X_train_inp, maxlen=ger_max_len, padding='post')

X_test = ger_tokenizer.texts_to_sequences(test_data[:,1])
X_test = pad_sequences(X_test, maxlen=ger_max_len, padding='post')

# --- Decoder input / target for training ---
# y_train stays an unpadded list of id sequences; later cells index into it.
y_train = eng_tokenizer.texts_to_sequences(train_data[:,0])
y_train_inp = pad_sequences(y_train, maxlen=eng_max_len, padding='post')

# Target = input shifted left by one token: pad to one extra step, then drop
# the first column (the "<START>" id).
y_train_op = pad_sequences(y_train, maxlen=eng_max_len+1, padding='post')[:,1:]
y_train_op_encoded = encode_output(y_train_op, eng_vocab_size)

# --- English test sequences ---
# NOTE(review): unlike the training target, y_test_enc is encoded from the
# unshifted sequences (it still starts with "<START>") - confirm this is intended.
y_test = eng_tokenizer.texts_to_sequences(test_data[:,0])
y_test = pad_sequences(y_test, maxlen=eng_max_len, padding='post')
y_test_enc = encode_output(y_test, eng_vocab_size)




In [39]:
# Shapes of the decoder input, the integer target and the one-hot target.
print(*(arr.shape for arr in (y_train_inp, y_train_op, y_train_op_encoded)))
# First example: the target row is the input row shifted left by one token.
print(y_train_inp[0], y_train_op[0])

(9000, 7) (9000, 7) (9000, 7, 2235)
[  1 235  25 427   2   0   0] [235  25 427   2   0   0   0]


GERMAN 2 ENGLISH

In [34]:
# Training model, German -> English: an LSTM encoder-decoder trained with
# teacher forcing (the decoder is fed the reference English sentence and learns
# to predict the same sentence shifted by one token).
# NOTE(review): TimeDistributed is imported below but not used in this cell.
from keras.models import Model
from keras.layers import LSTM, Dense, Input, Embedding, TimeDistributed

# Encoder: variable-length German id sequence -> 300-d embeddings -> LSTM.
# mask_zero=True makes the padding id 0 a masked timestep.
g2e_enc_inp = Input(shape=(None,))
g2e_enc_emb = Embedding(input_dim=ger_vocab_size, output_dim=300, mask_zero=True)
g2e_enc_x   = g2e_enc_emb(g2e_enc_inp)
# return_state=True: besides its output the LSTM returns its final h and c states.
g2e_enc_lstm  = LSTM(256, return_state=True)
g2e_enc_x, g2e_state_h, g2e_state_c = g2e_enc_lstm(g2e_enc_x)
# Only the final states are passed on to the decoder.
g2e_states = [g2e_state_h, g2e_state_c]

# Decoder: English id sequence -> 300-d embeddings -> LSTM started from the
# encoder states -> softmax over the English vocabulary at every timestep.
g2e_dec_inp = Input(shape=(None,))
g2e_dec_emb = Embedding(input_dim=eng_vocab_size, output_dim=300, mask_zero=True)
g2e_dec_x = g2e_dec_emb(g2e_dec_inp)
g2e_dec_lstm = LSTM(256, return_sequences=True, return_state=True)
# The decoder's own final states are not needed during training, hence "_".
g2e_dec_x,_,_ = g2e_dec_lstm(g2e_dec_x, initial_state=g2e_states)
g2e_dec_dense= Dense(eng_vocab_size, activation='softmax')

g2e_dec_op = g2e_dec_dense(g2e_dec_x)

# Inputs: [German sequence, English decoder input]; output: per-step word distribution.
g2e_model =  Model([g2e_enc_inp, g2e_dec_inp], g2e_dec_op)
g2e_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
g2e_model.summary()
# Teacher forcing: y_train_inp is the decoder input, y_train_op_encoded the
# one-hot target; 20 % of the training rows are held out for validation.
g2e_model.fit([X_train_inp, y_train_inp], y_train_op_encoded, batch_size=64, epochs=30, validation_split=0.2,shuffle=True)


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
input_14 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, None, 300)    1069800     input_13[0][0]                   
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, None, 300)    670500      input_14[0][0]                   
__________________________________________________________________________________________________
lstm_7 (LS

<keras.callbacks.History at 0x7f37a3efefd0>

INFERENCE MODEL

In [35]:
# Encoder for inference: German id sequence -> final LSTM states [h, c].
encoder_model = Model(g2e_enc_inp, g2e_states)

# Inputs for the decoder state carried from one decoding step to the next;
# their width (256) matches the LSTM size used in training.
decoder_state_inp_h, decoder_state_inp_c = Input(shape=(256, )), Input(shape=(256, ))
decoder_state_inp = [decoder_state_inp_h, decoder_state_inp_c]

# Re-apply the trained decoder layers, this time starting from the state inputs
# above instead of the encoder output, and expose the updated states.
dec_x_inp = g2e_dec_emb(g2e_dec_inp)
decoder_outputs, state_h, state_c = g2e_dec_lstm(dec_x_inp, initial_state=decoder_state_inp)
decoder_outputs = g2e_dec_dense(decoder_outputs)
decoder_states = [state_h, state_c]

# Decoder for inference: [token ids, h, c] -> [word distribution, new h, new c].
decoder_model = Model(
    [g2e_dec_inp, decoder_state_inp_h, decoder_state_inp_c],
    [decoder_outputs, state_h, state_c],
)
decoder_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_14 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, None, 300)    670500      input_14[0][0]                   
__________________________________________________________________________________________________
input_15 (InputLayer)           (None, 256)          0                                            
__________________________________________________________________________________________________
input_16 (InputLayer)           (None, 256)          0                                            
__________________________________________________________________________________________________
lstm_8 (LS

In [36]:
# Reverse lookups (token id -> word) for the German input and English output vocabularies.
idx_to_word_inp = {index: word for word, index in ger_tokenizer.word_index.items()}
idx_to_word_op = {index: word for word, index in eng_tokenizer.word_index.items()}
# Show the id assigned to the start-of-sentence marker.
eng_tokenizer.word_index['<START>']

1

In [0]:
def decode_seq(inp_seq):
  """Greedily translate one encoded German sentence into English words.

  Args:
    inp_seq: token ids of a single sentence, either 1-D (timesteps,) or
      already batched as (1, timesteps).

  Returns:
    The decoded words, each preceded by a space (e.g. ' do you smoke <END>').
    Decoding stops after "<END>" is produced, after ``eng_max_len`` words, or
    when the model predicts an id that has no word (such as the padding id 0).
  """
  # The encoder input is declared as (batch, timesteps). A bare 1-D sequence
  # would otherwise be read as many one-token samples instead of one sentence,
  # so the decoder would start from the state of the first token only.
  inp_seq = np.asarray(inp_seq)
  if inp_seq.ndim == 1:
    inp_seq = inp_seq.reshape(1, -1)

  states = encoder_model.predict(inp_seq)

  # The decoder is fed one token at a time, starting with "<START>".
  target_seq = np.zeros((1, 1))
  target_seq[0, 0] = eng_tokenizer.word_index['<START>']

  words = []
  while True:
    output_vec, h, c = decoder_model.predict([target_seq] + list(states))
    op_id = int(np.argmax(output_vec[0, 0, :]))

    # Ids without a vocabulary entry cannot be rendered; treat them as the
    # end of the sentence instead of raising KeyError.
    sampled_word = idx_to_word_op.get(op_id)
    if sampled_word is None:
      break
    words.append(sampled_word)

    if sampled_word == "<END>" or len(words) >= eng_max_len:
      break

    # Feed the prediction and the updated states back in for the next step.
    target_seq[0, 0] = op_id
    states = [h, c]

  return "".join(" " + word for word in words)

In [43]:

# Take one training pair and print the German source and the English reference
# (padding id 0 is skipped when mapping ids back to words).
input_seq = X_train_inp[10]
op_seq = y_train[10]
print(" ".join([idx_to_word_inp[i] for i in input_seq if i>0]))
print(" ".join([idx_to_word_op[i] for i in op_seq if i>0]))

# The encoder expects a (batch, timesteps) array, so the single sentence is
# passed as a batch of one rather than as a bare 1-D sequence.
decode_seq(input_seq.reshape(1, -1))

sie hat ihn erwurgt
<START> she choked him <END>


' do you smoke <END>'

In [24]:
# English vocabulary size: number of words known to eng_tokenizer plus one.
eng_vocab_size

2235