In [1]:
#Importing the libraries
import os

import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
#Loading the dataset
# Location of the tab-separated sentence-pair file. The default is the original
# machine-specific path; set the FRA_DATA_PATH environment variable to point at
# another copy of the file instead of editing this cell.
data_path = os.environ.get(
    "FRA_DATA_PATH",
    "C:/Users/Mrinal Kalita/Python Projects/MAchine Translation/fra.txt",
)

In [3]:
#Vectorizing the data and storing them into a list
# Maximum number of lines of the file that are read as sentence pairs.
NUM_SAMPLES = 30000

input_texts = []
target_texts = []

#Creating vocabulary of unique characters
input_characters = set()
target_characters = set()

with open(data_path, 'r', encoding="utf-8") as f:
    lines = f.read().split('\n')

for line in lines[:NUM_SAMPLES]:
    # Skip blank lines (e.g. the empty string left behind by a trailing
    # newline); they cannot be unpacked into a sentence pair.
    if not line.strip():
        continue

    # Only the first two tab-separated fields (source, translation) are used;
    # any additional fields on the line are ignored.
    input_text, target_text = line.split("\t")[:2]
    # We use "tab" as the "start sequence" character for the targets, and "\n" as "end sequence" character.
    target_text = "\t" + target_text + "\n"
    input_texts.append(input_text)
    target_texts.append(target_text)

    # set.update adds every character of the string and deduplicates on its own.
    input_characters.update(input_text)
    target_characters.update(target_text)

In [4]:
# Turn both character sets into sorted lists so every character has a fixed position.
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)

# Length, in characters, of the longest sentence on each side.
max_sequence_length_input = max(map(len, input_texts))
max_sequence_length_target = max(map(len, target_texts))

# Report the dataset dimensions that determine the tensor shapes used later.
for label, value in (
    ('Total number of samples:', len(input_texts)),
    ('Number of unique input tokens:', len(input_characters)),
    ('Number of unique target tokens:', len(target_characters)),
    ('Max sequence length for inputs:', max_sequence_length_input),
    ('Max sequence length for target:', max_sequence_length_target),
):
    print(label, value)

Total number of samples: 30000
Number of unique input tokens: 76
Number of unique target tokens: 103
Max sequence length for inputs: 18
Max sequence length for target: 59


### Tokenizing and Vectorizing the texts

In [5]:
# Lookup tables from each character to its position in the sorted vocabulary list.
input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}

In [6]:
# Zero-filled float32 tensors indexed as (sample, timestep, vocabulary entry).
encoder_shape = (len(input_texts), max_sequence_length_input, len(input_characters))
decoder_shape = (len(input_texts), max_sequence_length_target, len(target_characters))

encoder_input_data = np.zeros(encoder_shape, dtype='float32')
# The decoder input and the decoder target share the same shape.
decoder_input_data = np.zeros(decoder_shape, dtype='float32')
decoder_target_data = np.zeros(decoder_shape, dtype='float32')

In [7]:
#Populating the arrays
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    # One-hot encode every character of the source sentence.
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.0
    # Pad the remaining timesteps with the space character. The start position
    # comes from len() rather than the leaked loop variable, so an empty
    # sentence is padded from timestep 0 instead of reusing a stale `t`.
    encoder_input_data[i, len(input_text):, input_token_index[" "]] = 1.0

    for t, char in enumerate(target_text):
        decoder_input_data[i, t, target_token_index[char]] = 1.0
        if t > 0:
            # The target is the decoder input shifted one timestep to the left,
            # so it does not contain the leading start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    decoder_input_data[i, len(target_text):, target_token_index[" "]] = 1.0
    # The shifted target is one timestep shorter, so its padding starts one earlier.
    decoder_target_data[i, len(target_text) - 1:, target_token_index[" "]] = 1.0

### Building the model (Using Encoder and Decoder Architecture)

In [10]:
#Defining the encoder
# Number of units in both LSTM layers; the encoder's final states initialise
# the decoder, so the two layers must use the same size.
latent_dim = 256

encoder_input = tf.keras.Input(shape=(None, len(input_characters)))
encoder = tf.keras.layers.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_input)

# Only the final hidden and cell states are passed on; encoder_outputs is not used.
encoder_state = [state_h, state_c]

#defining the decoder
decoder_input = tf.keras.Input(shape=(None, len(target_characters)))
# return_sequences=True yields one output per timestep; the returned states are
# ignored here and only read later by the inference model.
decoder = tf.keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_output, _, _ = decoder(decoder_input, initial_state=encoder_state)
# Softmax over the target vocabulary at every timestep.
decoder_dense = tf.keras.layers.Dense(len(target_characters), activation='softmax')
decoder_output = decoder_dense(decoder_output)

# Training model: (encoder input, decoder input) -> per-timestep character distribution.
model = tf.keras.Model([encoder_input, decoder_input], decoder_output)

In [11]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections) of the training model.
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 76)]   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 103)]  0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 256), (None, 340992      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 256),  368640      input_2[0][0]                    
                                                                 lstm[0][1]            

### Training the model

In [12]:
#defining the model checkpoint
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

# Save the weights (not the full model) to a file named after the validation
# loss, but only when that loss improves on the best value seen so far.
checkpoint = ModelCheckpoint(
    "model-{val_loss:.2f}.h5",
    monitor="val_loss",
    mode='min',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Stop training once the validation loss has not improved for 15 epochs.
stop = EarlyStopping(
    monitor="val_loss",
    mode='min',
    patience=15,
)

# Multiply the learning rate by 0.125 after 6 epochs without improvement,
# never going below 1e-6.
reduce_lr = ReduceLROnPlateau(
    monitor="val_loss",
    mode='min',
    factor=0.125,
    patience=6,
    min_lr=1e-6,
    verbose=1,
)

In [13]:
# Run training
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=["accuracy"],
)

# The last 20% of the samples are held out for validation; the callbacks
# checkpoint the weights, stop early, and lower the learning rate.
history = model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=[checkpoint, stop, reduce_lr],
)

Epoch 1/50
Epoch 00001: val_loss improved from inf to 0.77528, saving model to model-0.78.h5
Epoch 2/50
Epoch 00002: val_loss improved from 0.77528 to 0.67676, saving model to model-0.68.h5
Epoch 3/50
Epoch 00003: val_loss improved from 0.67676 to 0.61641, saving model to model-0.62.h5
Epoch 4/50
Epoch 00004: val_loss improved from 0.61641 to 0.57731, saving model to model-0.58.h5
Epoch 5/50
Epoch 00005: val_loss improved from 0.57731 to 0.54071, saving model to model-0.54.h5
Epoch 6/50
Epoch 00006: val_loss improved from 0.54071 to 0.51370, saving model to model-0.51.h5
Epoch 7/50
Epoch 00007: val_loss improved from 0.51370 to 0.48733, saving model to model-0.49.h5
Epoch 8/50
Epoch 00008: val_loss improved from 0.48733 to 0.46532, saving model to model-0.47.h5
Epoch 9/50
Epoch 00009: val_loss improved from 0.46532 to 0.45259, saving model to model-0.45.h5
Epoch 10/50
Epoch 00010: val_loss improved from 0.45259 to 0.44209, saving model to model-0.44.h5
Epoch 11/50
Epoch 00011: val_loss

In [14]:
#Saving the model
# Writes the trained training model to 'model.h5' in the current working directory.
model.save('model.h5')

In [15]:
# Reload the training model from 'model.h5' and rebind `model` to it.
# NOTE(review): the inference models built below are assembled from the
# `encoder_input`, `encoder_state`, `decoder` and `decoder_dense` objects created
# in the model-definition cell, not from this reloaded `model`.
model = tf.keras.models.load_model('model.h5')

### Inference Stage

In [16]:
# Manually deriving the decoder states and decoder outputs so that we can reuse decoder outputs as inputs for next step
# Encoder model: source sequence -> [hidden state, cell state].
encoder_model = tf.keras.Model(encoder_input, encoder_state)

# Inputs for the decoder's previous hidden and cell state. Their width is read
# from the decoder layer so it always matches the LSTM size chosen when the
# layer was created, instead of repeating that number here.
decoder_state_input_h = tf.keras.Input(shape=(decoder.units,))
decoder_state_input_c = tf.keras.Input(shape=(decoder.units,))

decoder_state_input = [decoder_state_input_h, decoder_state_input_c]
# Reuse the same decoder LSTM and dense layers, now driven by explicitly
# supplied states rather than by the encoder's output states.
decoder_output, state_h, state_c = decoder(decoder_input, initial_state=decoder_state_input)
decoder_state = [state_h, state_c]
decoder_output = decoder_dense(decoder_output)

# Decoder model: (decoder input, previous states) -> (character distribution, new states).
decoder_model = tf.keras.Model([decoder_input] + decoder_state_input, [decoder_output] + decoder_state)

In [13]:
# Inverse lookup tables: vocabulary position -> character.
reverse_input_char_index = {i: char for char, i in input_token_index.items()}
reverse_target_char_index = {i: char for char, i in target_token_index.items()}

In [11]:
#decoding the output sentence from tokenized data
def decode_sequence(input_seq):
    """Greedily decode one one-hot encoded input sequence into a string.

    The input is run through ``encoder_model`` once to obtain the initial
    decoder states. ``decoder_model`` is then called one character at a time,
    feeding back the most probable character and the updated states.

    Args:
        input_seq: one-hot encoded input passed directly to
            ``encoder_model.predict``.

    Returns:
        The decoded characters concatenated into one string. Decoding ends when
        the newline end-of-sequence character is produced (it is included in the
        result) or when the result is longer than ``max_sequence_length_target``.
    """
    vocab_size = len(target_characters)

    # Initial decoder states come from the encoder.
    states_value = encoder_model.predict(input_seq)

    # Length-1 decoder input holding only the tab start character.
    target_seq = np.zeros((1, 1, vocab_size))
    target_seq[0, 0, target_token_index["\t"]] = 1.0

    decoded_sentence = ""
    while True:
        decoder_output, state_h, state_c = decoder_model.predict([target_seq] + states_value)

        # Greedy choice: take the most probable character at the last timestep.
        sampled_token_index = np.argmax(decoder_output[0, -1, :])
        sample_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sample_char

        # Finished on the stop character or once the length limit is exceeded.
        if sample_char == "\n" or len(decoded_sentence) > max_sequence_length_target:
            break

        # Feed the chosen character and the new states into the next step.
        target_seq = np.zeros((1, 1, vocab_size))
        target_seq[0, 0, sampled_token_index] = 1.0
        states_value = [state_h, state_c]

    return decoded_sentence

In [30]:
# Decode the first 20 training samples and print each next to its source sentence.
for seq_index in range(20):
    # A one-element slice (not a plain index) keeps the leading batch axis.
    input_seq = encoder_input_data[seq_index : seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    source_sentence = input_texts[seq_index]
    print("-")
    print("Input sentence:", source_sentence)
    print("Decoded sentence:", decoded_sentence)

-
Input sentence: Go.
Decoded sentence: Allez !

-
Input sentence: Go.
Decoded sentence: Allez !

-
Input sentence: Go.
Decoded sentence: Allez !

-
Input sentence: Go.
Decoded sentence: Allez !

-
Input sentence: Hi.
Decoded sentence: Salut.

-
Input sentence: Hi.
Decoded sentence: Salut.

-
Input sentence: Run!
Decoded sentence: Fuyez !

-
Input sentence: Run!
Decoded sentence: Fuyez !

-
Input sentence: Run!
Decoded sentence: Fuyez !

-
Input sentence: Run!
Decoded sentence: Fuyez !

-
Input sentence: Run!
Decoded sentence: Fuyez !

-
Input sentence: Run!
Decoded sentence: Fuyez !

-
Input sentence: Run!
Decoded sentence: Fuyez !

-
Input sentence: Run!
Decoded sentence: Fuyez !

-
Input sentence: Run.
Decoded sentence: Fuyez !

-
Input sentence: Run.
Decoded sentence: Fuyez !

-
Input sentence: Run.
Decoded sentence: Fuyez !

-
Input sentence: Run.
Decoded sentence: Fuyez !

-
Input sentence: Run.
Decoded sentence: Fuyez !

-
Input sentence: Run.
Decoded sentence: Fuyez !



In [142]:
# Save the two inference models to separate .h5 files in the current working directory.
encoder_model.save('encoder_model.h5')
decoder_model.save('decoder_model.h5')

In [17]:
# Reload both inference models from disk and rebind the global names.
# decode_sequence looks up `encoder_model` and `decoder_model` when it is called,
# so calls made after this cell use the reloaded models.
encoder_model = tf.keras.models.load_model('encoder_model.h5')
decoder_model = tf.keras.models.load_model('decoder_model.h5')



In [38]:
text = "Go."
# Single-sample one-hot tensor with the same layout as encoder_input_data.
sample_shape = (1, max_sequence_length_input, len(input_characters))
input_data = np.zeros(sample_shape, dtype='float32')
for t, char in enumerate(text):
    token = input_token_index[char]
    input_data[0, t, token] = 1.0
# Fill the timesteps after the last character with the space character.
pad_token = input_token_index[" "]
input_data[0, t + 1:, pad_token] = 1.0

In [39]:
# Decode the hand-encoded sample; the 0:1 slice keeps the leading batch axis.
print(decode_sequence(input_data[0:1]))

Allez !



In [41]:
#prediction for any random text
def eng_to_french(text):
    """One-hot encode ``text``, decode it with ``decode_sequence`` and print the result.

    Args:
        text: source string. Every character must be a key of
            ``input_token_index`` and the string may be at most
            ``max_sequence_length_input`` characters long.

    Returns:
        None. The decoded sentence is printed, not returned.

    Raises:
        IndexError: if ``text`` is longer than ``max_sequence_length_input``.
        KeyError: if ``text`` contains a character missing from ``input_token_index``.
    """
    if len(text) > max_sequence_length_input:
        # Same exception type the out-of-range array write would raise, but
        # raised up front with a message that explains the limit.
        raise IndexError(
            "text has %d characters but the model accepts at most %d"
            % (len(text), max_sequence_length_input)
        )

    input_data = np.zeros((1, max_sequence_length_input, len(input_characters)), dtype='float32')
    for t, char in enumerate(text):
        input_data[0, t, input_token_index[char]] = 1.0
    # Pad with the space character starting right after the text. Using len(text)
    # instead of the loop variable keeps this valid for an empty string, where
    # the loop body never runs and `t` would be unbound.
    input_data[0, len(text):, input_token_index[" "]] = 1.0

    print(decode_sequence(input_data[0:1]))

In [49]:
# Example call: prints the decoded sentence for a short input string.
eng_to_french("Help me")

Aide-moi !

