In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import math
import download,europarl

from keras.models import Model
from keras.layers import GRU,Input,Dense,Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping,ModelCheckpoint,TensorBoard

In [25]:
# The dataset is taken from Europarl, which consists of translations between different pairs of languages spoken at
# the European Union parliament.

language_code = 'fr' # French --> English

europarl.maybe_download_and_extract(language_code=language_code)

Data has apparently already been downloaded and unpacked.


In [26]:
# Getting the source (French) and destination (English) corpus.
# The start/end marker words are passed to load_data for the destination side only; they later serve as the
# decoder's "begin" and "stop" tokens.

mark_start = 'ssss '
mark_end = ' eeee'
source_data = europarl.load_data(english=False,language_code=language_code)
dest_data = europarl.load_data(english=True,language_code=language_code,start=mark_start,end=mark_end)

In [27]:
# Example: first sentence of the source (French) corpus

source_data[0]

'Reprise de la session'

In [28]:
# First sentence of the destination (English) corpus, wrapped in the start/end markers
dest_data[0]

'ssss Resumption of the session eeee'

In [29]:
class TokenizerPlus(Tokenizer):
    '''Tokenizer that also tokenizes, optionally reverses, and pads its corpus on construction.

    Attributes set by __init__ (in addition to those of Tokenizer):
    index_to_word - dict mapping an integer token back to its word
    tokens - list of integer-token lists, one per text (reversed if reverse=True)
    number_of_tokens - list with the length of every entry in tokens
    max_tokens - length every sequence is padded/truncated to
    padded_tokens - result of pad_sequences applied to tokens
    '''

    def __init__(self,texts,padding,reverse=False,number_of_words=10000):
        '''
        parameters:
        texts - input data (the corpus to fit on and tokenize)
        padding - a string, 'pre' or 'post', which tells the side on which sequences are padded
        reverse - whether each list of integer tokens has to be reversed for better model predictions (helps the
                  model learn short term dependencies better). When True, over-long sequences are truncated at the
                  front ('pre'), otherwise at the back ('post').
        number_of_words - size of the chosen dictionary
        '''

        # Pass the requested vocabulary size on to Tokenizer. Previously 10000 was hard-coded here and the
        # number_of_words argument was silently ignored.
        Tokenizer.__init__(self,num_words=number_of_words)
        # Builds the vocabulary. The indices are given based on a word's frequency in the corpus.
        self.fit_on_texts(texts)
        # Maps indices back to words.
        self.index_to_word = {index: word for word, index in self.word_index.items()}
        # Maps every text in the data to a list of integer tokens, so tokens is a list of lists.
        self.tokens = self.texts_to_sequences(texts)

        if reverse:
            self.tokens = [list(reversed(x)) for x in self.tokens]
            # After reversal the original end of the sentence sits at the front,
            # so truncating at the front drops the tail of the original text.
            truncating = 'pre'
        else:
            truncating = 'post'

        # Sequence length used for padding/truncating: mean + 2 standard deviations of the token counts.
        self.number_of_tokens = [len(x) for x in self.tokens]
        self.max_tokens = int(np.mean(self.number_of_tokens) + 2*np.std(self.number_of_tokens))

        self.padded_tokens = pad_sequences(self.tokens,self.max_tokens,padding=padding,truncating=truncating)

    def token_to_word(self,token):
        '''Returns the word corresponding to the token; token 0 (padding) maps to a single space.'''
        return " " if token==0 else self.index_to_word[token]

    def tokens_to_string(self,tokens):
        '''Joins the words for a list of tokens into one space-separated string, skipping padding (0).'''
        return ' '.join([self.index_to_word[x] for x in tokens if x!=0])

    def text_to_tokens(self,text,reverse=False,padding=False):
        '''Converts a single text into an array holding one row of integer tokens.

        parameters:
        text - the string to tokenize
        reverse - if True the token order is flipped and truncation (when padding) happens at the front
        padding - if True the row is padded/truncated to self.max_tokens; padding is always added at the
                  front ('pre'), independent of the padding side given to __init__
        '''
        token_list_for_text = np.array(self.texts_to_sequences([text]))
        if reverse:
            token_list_for_text = np.flip(token_list_for_text,axis=-1)
            truncating = 'pre'
        else:
            truncating = 'post'

        if padding:
            token_list_for_text = pad_sequences(token_list_for_text,
                                   maxlen=self.max_tokens,
                                   padding='pre',
                                   truncating=truncating)

        return token_list_for_text

In [30]:
# Initializing tokenizers for source and destination languages.
# The source side is reversed and padded at the front; the destination side keeps its order and is padded at the back.
source_tokenizer = TokenizerPlus(texts=source_data,padding='pre',reverse=True,number_of_words=10000)
dest_tokenizer = TokenizerPlus(texts=dest_data,padding='post',reverse=False,number_of_words=10000)

In [31]:
# Variables for storing the padded tokens of source and destination. The padded tokens are a 2-d np array, where each
# row holds the tokens for one sequence present in the corpus.
source_tokens = source_tokenizer.padded_tokens
dest_tokens = dest_tokenizer.padded_tokens

In [35]:
# Tokens for start and end of texts, i.e. the integers that represent the start and end markers.
# The markers are stripped because they were defined with surrounding spaces.
start_index = dest_tokenizer.word_index[mark_start.strip()]
print(start_index)
end_index = dest_tokenizer.word_index[mark_end.strip()]
print(end_index)

2
3


In [36]:
# Examples

# Sample tokens from the source language: reversed, with zero padding at the front
source_tokens[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0, 5571, 1058,    1,  634,  366,   41,    8, 8073,    9,
        278,   67,   41,   17,    3,  385,  923, 1810, 3580,    4, 5129,
         45,  410,   14,   56,   42,   11, 1199,    2, 1653, 2430,   17])

In [38]:
# Mapping the tokens back to words (padding zeros are skipped)
source_tokenizer.tokens_to_string(source_tokens[1])

'vacances bonnes de passé avez vous que espérant en mes tous vous je et dernier décembre 17 vendredi le interrompue été avait qui européen parlement du session la reprise déclare je'

In [39]:
# The actual sentence corresponding to the tokens. Compared with the string above, the word order is reversed.
source_data[1]

'Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances.'

In [40]:
# Sample tokens from the destination language: original order, with zero padding at the back
dest_tokens[1]

array([   2,   13, 2742, 2976,    1, 1131,    4,    1,   24,   50, 4822,
         15, 3308, 1830,  904,  985,    6,   13,   31,   58,  339,  254,
          5,  298,   43,    9, 1299,   78,  162,    7,    1,  194,    8,
         43, 3799,    9, 6847,  492,    3,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0])

In [41]:
# Mapping the destination tokens back to words
dest_tokenizer.tokens_to_string(dest_tokens[1])

'ssss i declare resumed the session of the european parliament adjourned on friday 17 december 1999 and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant period eeee'

In [42]:
# The actual destination sentence, for comparison with the string above
dest_data[1]

'ssss I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period. eeee'

In [43]:
# The destination language tokens have not been reversed.

In [60]:
# The architecture for machine translation involves 2 RNNs: an encoder that encodes the source language into a
# "thought vector", and a decoder that decodes the output of the encoder. During training the decoder is given the
# corresponding translation of the source sentence as input, and the expected decoder output is that same sequence
# shifted one time-step ahead, so that the network can learn the correct mappings in a supervised way.

encoder_input_data = source_tokens
decoder_input_data = dest_tokens[:,:-1]
decoder_output_data = dest_tokens[:,1:]

In [61]:
# Decoder input for the sample sentence: begins with the start token
decoder_input_data[1]

array([   2,   13, 2742, 2976,    1, 1131,    4,    1,   24,   50, 4822,
         15, 3308, 1830,  904,  985,    6,   13,   31,   58,  339,  254,
          5,  298,   43,    9, 1299,   78,  162,    7,    1,  194,    8,
         43, 3799,    9, 6847,  492,    3,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0])

In [62]:
decoder_output_data[1] # the same sequence shifted one time-step relative to the decoder input

array([  13, 2742, 2976,    1, 1131,    4,    1,   24,   50, 4822,   15,
       3308, 1830,  904,  985,    6,   13,   31,   58,  339,  254,    5,
        298,   43,    9, 1299,   78,  162,    7,    1,  194,    8,   43,
       3799,    9, 6847,  492,    3,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0])

In [47]:
# Encoder
# Rather than relying on a ready-made Keras container such as Sequential, the layer objects are created first and
# wired together by hand afterwards, which leaves more room to customise the model.

encoder_input = Input(name='encoder_input',shape=(None,))
encoder_embedding = Embedding(name='encoder_embedding',input_dim=10000,output_dim=128)
units = 512
encoder_gru1 = GRU(units,return_sequences=True,name='encoder_gru1')
encoder_gru2 = GRU(units,return_sequences=True,name='encoder_gru2')
# The last GRU returns only its final output (return_sequences=False)
encoder_gru3 = GRU(units,return_sequences=False,name='encoder_gru3')

# Wire the layers: tokens -> embedding -> three stacked GRUs
layer = encoder_gru3(encoder_gru2(encoder_gru1(encoder_embedding(encoder_input))))

encoder_output = layer


In [68]:
# Decoder
state_size = 512
# Input for the initial GRU state, used when the decoder is run as a separate model
decoder_initial = Input(shape=(state_size,), name='decoder_initial')
decoder_input = Input(shape=(None,),name='decoder_input')
decoder_embedding = Embedding(input_dim=10000,output_dim=128,name='decoder_embedding')

# All three GRUs return full sequences, since a prediction is needed at every time-step
decoder_gru1 = GRU(state_size,name='decoder_gru1',return_sequences=True)
decoder_gru2 = GRU(state_size,name='decoder_gru2',return_sequences=True)
decoder_gru3 = GRU(state_size,name='decoder_gru3',return_sequences=True)

# Linear activation: the layer outputs raw scores over the 10000 vocabulary entries
decoder_dense = Dense(10000,activation='linear',name='decoder_output')

# connect the layers
def connect_decoder(initial_state):
    '''Wires the decoder layers together and returns the output tensor of the dense layer.

    parameters:
    initial_state - tensor handed to each of the three decoder GRUs as its initial state
    '''
    hidden = decoder_embedding(decoder_input)
    # The same initial state is given to every GRU in the stack
    for gru_layer in (decoder_gru1,decoder_gru2,decoder_gru3):
        hidden = gru_layer(hidden,initial_state=initial_state)
    return decoder_dense(hidden)


In [69]:
# Creating the models
# To realise this architecture, we'll build 3 models that share the same layer objects.
#1. A training model which connects the encoder and decoder end to end. Its inputs are the encoder input and the
# decoder input, and its output is the decoder output. The decoder's initial state is the output of the encoder.
#2. A separate model for the encoder which outputs the thought-vector, a summary of the input language sequence.
#3. A separate model for the decoder, which takes its initial state from an explicit input instead.

#1. The output of the encoder is fed into the decoder as its initial state
decoder_output = connect_decoder(initial_state=encoder_output)
training_model = Model(inputs=[encoder_input,decoder_input],outputs=[decoder_output])

#2
encoder_model = Model(inputs=[encoder_input],outputs=[encoder_output])

#3. decoder_output is rebound here; training_model keeps the tensor it was built with above
decoder_output = connect_decoder(initial_state=decoder_initial)
decoder_model = Model(inputs=[decoder_input,decoder_initial],outputs=[decoder_output])

In [70]:
def compute_loss(labels,predictions):
    '''Mean sparse softmax cross-entropy between integer labels and the model's raw scores (logits).

    The mean is taken over every element of the cross-entropy tensor; no masking is applied here.
    '''
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,logits=predictions)
    return tf.reduce_mean(cross_entropy)

optimizer = RMSprop(lr=0.001)
# Integer-typed target tensor: compute_loss uses the sparse cross-entropy, which takes token ids as labels
decoder_target = tf.placeholder(dtype='int32',shape=(None,None))
training_model.compile(optimizer=optimizer,loss=compute_loss,target_tensors=[decoder_target])

In [71]:
# Callbacks used during training: save the best weights (by val_loss), stop once val_loss has not improved for
# 3 epochs, and write logs for TensorBoard.
path_checkpoint = 'machine_trans_checkpoint.keras'
callback_checkpoint = ModelCheckpoint(filepath=path_checkpoint,monitor='val_loss',verbose=1,save_weights_only=True,
                                      save_best_only=True)
callback_early_stopping = EarlyStopping(monitor='val_loss',patience=3, verbose=1)
callback_tensorboard = TensorBoard(log_dir='./machine_trans_logs/',histogram_freq=0,write_graph=False)
callbacks = [callback_early_stopping,callback_checkpoint,callback_tensorboard]

In [72]:
# Inputs and targets are keyed by the names given to the corresponding layers above
x_data = {'encoder_input':encoder_input_data,'decoder_input':decoder_input_data}
y_data = {'decoder_output':decoder_output_data}
# Fraction of the data that corresponds to 10000 samples held out for validation
validation_split = 10000 / len(encoder_input_data)

In [1]:
# Train for at most 10 epochs; the early-stopping callback may end training sooner
training_model.fit(x=x_data,y=y_data,batch_size=640,epochs=10,validation_split=validation_split,callbacks=callbacks)

In [74]:
def translate(input_text, true_output_text=None):
    """Translate a single text-string and print the result.

    The text is tokenized with the source tokenizer (reversed and padded, the
    same way the training data was prepared), encoded with `encoder_model`, and
    then decoded greedily: at each step the highest-scoring token from
    `decoder_model` is appended to the decoder input, until the end marker is
    produced or `dest_tokenizer.max_tokens` tokens have been generated.

    parameters:
    input_text - string in the source language
    true_output_text - optional reference translation, printed for comparison

    Returns None; the input text, the translation and (if given) the reference
    translation are printed. Relies on the notebook globals source_tokenizer,
    dest_tokenizer, encoder_model, decoder_model, start_index and end_index.
    """

    # Tokenize the input with the same preprocessing as the encoder's training data.
    input_tokens = source_tokenizer.text_to_tokens(text=input_text,
                                                   reverse=True,
                                                   padding=True)

    # The encoder output is used as the initial state of the decoder GRUs.
    initial_state = encoder_model.predict(input_tokens)

    max_tokens = dest_tokenizer.max_tokens

    # Decoder input for a batch of one sequence; filled in one token per step.
    # Builtin int replaces the np.int alias, which newer NumPy versions no longer provide.
    decoder_tokens = np.zeros(shape=(1, max_tokens), dtype=int)

    token_int = start_index
    output_text = ''
    count_tokens = 0

    while token_int != end_index and count_tokens < max_tokens:
        # Feed back the token chosen in the previous step (the start token at first).
        decoder_tokens[0, count_tokens] = token_int

        # Keys must match the names of the decoder model's Input layers.
        decoder_feed = {
            'decoder_initial': initial_state,
            'decoder_input': decoder_tokens
        }

        scores = decoder_model.predict(decoder_feed)

        # Greedy choice: highest score at the current time-step.
        token_int = np.argmax(scores[0, count_tokens, :])

        sampled_word = dest_tokenizer.token_to_word(token_int)
        output_text += " " + sampled_word
        count_tokens += 1

    print("Input text:")
    print(input_text)
    print()

    print("Translated text:")
    print(output_text)
    print()

    if true_output_text is not None:
        print("True output text:")
        print(true_output_text)
        print()