<a href="https://colab.research.google.com/github/kunal-kumar-chaudhary/Machine-Translation-/blob/main/Machine_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

In [None]:
# Fetch the fra-eng archive and extract it into the working directory.
# The `!!` form runs the shell command and returns its output as a list of
# lines, which is why the unzip listing is displayed as a Python list.
!!curl -O http://www.manythings.org/anki/fra-eng.zip
!!unzip fra-eng.zip

['Archive:  fra-eng.zip',
 '  inflating: _about.txt              ',
 '  inflating: fra.txt                 ']

In [None]:
# Training hyper-parameters and dataset location.
batch_size = 64        # samples per training batch
epochs = 100           # number of passes over the training data
latent_dim = 256       # size of the LSTM state used to encode the inputs
num_samples = 10_000   # upper bound on the number of lines read from the data file
data_path = "fra.txt"  # text file the sentence pairs are read from

In [None]:
# Parse the tab-separated corpus into parallel lists of input/target
# sentences and collect the set of characters seen on each side.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

with open(data_path, "r", encoding="utf-8") as f:
  lines = f.read().split("\n")
# `len(lines) - 1` presumably drops the empty string that follows a trailing
# newline at the end of the file.
for line in lines[: min(num_samples, len(lines) - 1)]:
  fields = line.split("\t")
  if len(fields) < 2:
    # Blank or malformed line: there is no pair to extract, so skip it
    # instead of failing on tuple unpacking.
    continue
  # Only the first two columns are used; any further columns are ignored and
  # their absence is tolerated.
  input_text, target_text = fields[:2]
  # "\t" marks the start of a target sequence and "\n" marks its end.
  target_text = "\t" + target_text + "\n"
  input_texts.append(input_text)
  target_texts.append(target_text)
  # set.update adds every character of the string; duplicates are ignored.
  input_characters.update(input_text)
  target_characters.update(target_text)
       

In [None]:
# Show the distinct characters collected from the input sentences.
print(input_characters)

{'g', 'i', 'K', 'Q', '1', '$', '5', 'l', 't', 'x', '7', 's', ',', "'", 'u', 'é', 'S', ':', '&', '0', 'z', '!', 'f', '?', 'e', 'I', 'E', 'Y', '8', 'o', 'H', 'D', 'W', 'k', 'y', 'c', 'C', 'P', '2', ' ', 'U', 'R', 'N', '"', 'A', 'L', 'd', 'h', 'G', 'B', 'T', '.', 'b', 'M', 'j', 'V', 'm', 'O', '3', 'w', 'q', 'p', 'a', '9', 'r', 'v', '%', '-', 'J', 'n', 'F'}


In [None]:
# Show the distinct characters collected from the target sentences
# (includes the "\t" start and "\n" end markers added during loading).
print(target_characters)

{'à', 'K', '1', '’', "'", '&', 'z', '(', 'H', 'y', 'R', 'N', 'h', '\u202f', 'G', 'B', '3', 'r', 'J', 'Ê', '\n', '«', 'Ç', '5', 't', 'l', '»', ')', 'u', 'S', ':', 'ô', '0', '8', 'Y', 'o', 'D', 'k', 'W', 'U', 'A', 'ï', 'b', 'œ', 'T', 'V', '.', 'j', 'm', 'p', '%', 'n', 'Q', 'ç', 's', ',', 'f', '\t', 'e', 'E', 'C', '2', ' ', 'â', 'L', 'd', 'é', 'ê', 'M', 'O', 'w', 'q', '9', 'v', 'ù', 'F', 'g', 'i', 'x', 'À', '!', '?', '\xa0', 'I', 'P', 'c', 'î', 'û', 'É', '\u2009', 'a', '-', 'è'}


In [None]:
# Peek at the first ten input sentences.
print(input_texts[:10])

['Go.', 'Go.', 'Go.', 'Go.', 'Hi.', 'Hi.', 'Run!', 'Run!', 'Run!', 'Run!']


In [None]:
# Peek at the first ten target sentences, wrapped in their "\t" ... "\n" markers.
print(target_texts[:10])

['\tVa !\n', '\tMarche.\n', '\tEn route !\n', '\tBouge !\n', '\tSalut !\n', '\tSalut.\n', '\tCours\u202f!\n', '\tCourez\u202f!\n', '\tPrenez vos jambes à vos cous !\n', '\tFile !\n']


In [None]:
# Turn each character set into a sorted list so that every character has a
# fixed position, then record the vocabulary size and the longest sequence
# length on each side.
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)
num_encoder_tokens, num_decoder_tokens = len(input_characters), len(target_characters)
max_encoder_seq_length = max(len(text) for text in input_texts)
max_decoder_seq_length = max(len(text) for text in target_texts)


In [None]:
# Report the dataset dimensions that determine the one-hot tensor shapes.
for label, value in (
    ("number of samples: ", len(input_texts)),
    ("number of unique input tokens: ", num_encoder_tokens),
    ("number of unique output tokens: ", num_decoder_tokens),
    ("max sequence length for inputs: ", max_encoder_seq_length),
    ("max sequence length for outputs: ", max_decoder_seq_length),
):
    print(label, value)

number of samples:  10000
number of unique input tokens:  71
number of unique output tokens:  93
max sequence length for inputs:  15
max sequence length for outputs:  59


In [None]:
# Map every character to its position in the corresponding vocabulary list.
input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}

In [None]:
# input_token_index

In [None]:
# target_token_index

In [None]:
# Number of sentence pairs loaded; this is the first dimension of the one-hot tensors.
len(input_texts)

10000

In [None]:
# one hot representation of our data
# Pre-allocate the one-hot tensors as all zeros, each indexed as
# (sample, timestep, token). Both decoder tensors share the same shape.
encoder_shape = (len(input_texts), max_encoder_seq_length, num_encoder_tokens)
decoder_shape = (len(input_texts), max_decoder_seq_length, num_decoder_tokens)

encoder_input_data = np.zeros(encoder_shape, dtype="float32")
decoder_input_data = np.zeros(decoder_shape, dtype="float32")
decoder_target_data = np.zeros(decoder_shape, dtype="float32")

In [None]:
# Number of axes of the encoder tensor: (sample, timestep, token).
print(encoder_input_data.ndim)

3


In [None]:
# (number of samples, max_encoder_seq_length, num_encoder_tokens)
print(encoder_input_data.shape)

(10000, 15, 71)


In [None]:
# (number of samples, max_decoder_seq_length, num_decoder_tokens)
decoder_input_data.shape

(10000, 59, 93)

In [None]:
# Allocated with the same shape as decoder_input_data.
decoder_target_data.shape

(10000, 59, 93)

In [None]:
# Display the encoder tensor as allocated by np.zeros (no entries set yet
# when the cells are run in order).
encoder_input_data

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

In [None]:
# Fill the pre-allocated one-hot tensors, one sentence pair per row.
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.0
    # Pad the remaining encoder timesteps with the space character. The
    # offset comes from the text length rather than from the loop variable
    # `t`, which would be stale (left over from the previous pair) or
    # undefined if `input_text` were empty.
    encoder_input_data[i, len(input_text) :, input_token_index[" "]] = 1.0
    for t, char in enumerate(target_text):
        decoder_input_data[i, t, target_token_index[char]] = 1.0
        if t > 0:
            # decoder_target_data is ahead of decoder_input_data by one
            # timestep and does not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    # Pad both decoder tensors with spaces. The target holds one character
    # fewer than the input (no start character), so its padding starts one
    # step earlier; max() keeps the offset non-negative for an empty target.
    decoder_input_data[i, len(target_text) :, target_token_index[" "]] = 1.0
    decoder_target_data[i, max(len(target_text) - 1, 0) :, target_token_index[" "]] = 1.0

In [None]:
# Shape of a single encoded input sentence: (max_encoder_seq_length, num_encoder_tokens).
encoder_input_data[0].shape

(15, 71)

In [None]:
# Display the encoder tensor again, this time after the one-hot fill loop.
encoder_input_data

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

In [None]:
# define an input sequence and process it
# Encoder: consumes a variable-length sequence of one-hot characters.
encoder_inputs = tf.keras.layers.Input(shape=(None, num_encoder_tokens))
encoder = tf.keras.layers.LSTM(units=latent_dim, return_state=True)
# With return_state=True the layer yields its output followed by the two LSTM
# states. The output is not used further; only the states are kept.
encoder_outputs, *encoder_states = encoder(encoder_inputs)
state_h, state_c = encoder_states

In [None]:
# setup the decoder using encoder states as initial states
# Decoder: starts from the encoder states and emits one distribution over
# target characters per timestep.
decoder_inputs = tf.keras.layers.Input(shape=(None, num_decoder_tokens))
# The LSTM returns the full output sequence plus its internal states. The
# states are not needed by the training model (the original notes they are
# for inference), so only the first returned value is kept here.
decoder_lstm = tf.keras.layers.LSTM(
    units=latent_dim,
    return_sequences=True,
    return_state=True,
)
decoder_outputs = decoder_lstm(decoder_inputs, initial_state=encoder_states)[0]
decoder_dense = tf.keras.layers.Dense(units=num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# A single empty one-hot slot: one sample, one timestep, num_decoder_tokens wide.
target_seq = np.zeros((1, 1, num_decoder_tokens))

In [None]:
# Inspect the slot before any entry is set.
target_seq

array([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])

In [None]:
# Switch on the entry for "\t", the start-of-sequence character for targets.
target_seq[0, 0, target_token_index["\t"]] = 1.0

In [None]:
# Inspect the slot again now that the start-character entry is set.
target_seq

array([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])

In [None]:
# Training model: maps [encoder_input_data, decoder_input_data] to
# decoder_target_data.
model = tf.keras.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
)

# Configure the optimizer, loss and reported metric.
model.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Train; a fraction of 0.2 of the samples is held out for validation.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fc2703dc610>