<a href="https://colab.research.google.com/github/imancn/nlp/blob/main/machine_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# English to Persian Translation 

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import torch
import random
import re
import unicodedata
import os

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Data Preprocessing

In [3]:
def preProcessing(w):
    """Normalize one sentence and wrap it in ``<start>`` / ``<end>`` markers.

    Steps, each applied to the result of the previous one:
      1. lowercase and strip surrounding whitespace;
      2. NFD-decompose and drop combining marks (Unicode category ``Mn``);
      3. put a space on both sides of ``? . ! , ¿``;
      4. collapse every run of spaces / double quotes into a single space.

    Args:
        w: Raw sentence (a trailing newline is fine).

    Returns:
        The cleaned sentence in the form ``'<start> ... <end>'``.
    """
    # NFD separates base characters from their combining marks so the marks can be
    # filtered out. NOTE: this also removes Arabic-script marks (e.g. "آ" -> "ا").
    word = ''.join(c for c in unicodedata.normalize('NFD', w.lower().strip())
                   if unicodedata.category(c) != 'Mn')
    # Keep chaining on `word`; re-reading the raw `w` would discard the earlier steps.
    word = re.sub(r"([?.!,¿])", r" \1 ", word)
    word = re.sub(r'[" "]+', " ", word)
    return '<start> ' + word.strip() + ' <end>'

In [4]:
def _read_sentences(path, max_lines):
  """Read at most `max_lines` lines from `path` and run each through preProcessing."""
  sentences = []
  # Explicit UTF-8 so the Persian text never depends on the platform default encoding.
  with open(path, "r", encoding="utf-8") as corpus:
    for line_no, line in enumerate(corpus):
      if line_no >= max_lines:
        break
      sentences.append(preProcessing(line))
  return sentences


def create_dataset(
    fileEN="/content/drive/MyDrive/Colab Notebooks/NLP/machin-translation/content/TEP.en-fa.en",
    fileFA="/content/drive/MyDrive/Colab Notebooks/NLP/machin-translation/content/TEP.en-fa.fa",
    max_lines=150001,
):
  """Load the parallel corpus as two aligned lists of preprocessed sentences.

  Args:
    fileEN: Path to the English side of the corpus (one sentence per line).
    fileFA: Path to the Persian side of the corpus (one sentence per line).
    max_lines: Maximum number of lines read from each file. The default of
      150001 is the number of lines the earlier hand-written loop read (it
      appended a line before testing ``count > 150000``); the tokenizers are
      fitted on these lists, so changing it presumably invalidates
      checkpoints trained with the old vocabularies.

  Returns:
    ``(faList, enList)``: Persian sentences first, English sentences second.

  Raises:
    ValueError: if the two files yield a different number of sentences, which
      would leave the translation pairs misaligned.
  """
  enList = _read_sentences(fileEN, max_lines)
  faList = _read_sentences(fileFA, max_lines)
  if len(enList) != len(faList):
    raise ValueError(
        "Parallel corpus is misaligned: %d English vs %d Persian sentences"
        % (len(enList), len(faList)))
  return faList, enList


# tokenize

In [5]:
import tensorflow as tf

def tokenize(lang):
    """Fit a Keras tokenizer on `lang` and encode it as a post-padded id matrix.

    Args:
        lang: Iterable of preprocessed sentences.

    Returns:
        ``(tensor, lang_tokenizer)``: the padded sequences and the fitted tokenizer.
    """
    # filters='' turns off the tokenizer's default character filtering.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    tokenizer.fit_on_texts(lang)
    sequences = tokenizer.texts_to_sequences(lang)
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')
    return padded, tokenizer

In [6]:
def load_dataset():
    """Build padded id tensors and fitted tokenizers for both languages.

    Returns:
        ``(input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer)``
    """
    # create_dataset() returns (Farsi, English): Farsi is the target, English the input.
    farsi_sentences, english_sentences = create_dataset()
    input_tensor, inp_lang_tokenizer = tokenize(english_sentences)
    target_tensor, targ_lang_tokenizer = tokenize(farsi_sentences)
    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

# Create dataset (target language: Farsi, input language: English)

In [7]:
# `inp_lang` / `targ_lang` are the fitted tokenizers, not the raw sentences.
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset()
# Padded sequence length (second axis) of each side.
max_length_inp = input_tensor.shape[1]
max_length_targ = target_tensor.shape[1]

In [8]:
!pip3 install sklearn

from sklearn.model_selection import train_test_split

# Create training and validation sets using an 80/20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
120000 120000 30001 30001


# Show the index-to-word mapping for each language tokenizer


In [9]:
def convert(lang, tensor):
    """Print one ``id -> word`` line for every non-zero id in `tensor`.

    Args:
        lang: Object exposing an ``index_word`` mapping from id to word.
        tensor: Iterable of integer token ids; zeros are skipped.
    """
    for token_id in tensor:
        if token_id == 0:
            continue
        word = lang.index_word[token_id]
        print("%d -> %s" % (token_id, word))

# Decode the first training pair back into words as a sanity check.
print("Input Lang; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print()
print("Target Lang; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Lang; index to word mapping
2 -> <start>
11 -> and
102 -> then
32 -> he
568 -> started
342 -> saying
2148 -> nasty
185 -> things
8 -> to
14 -> that
5929 -> stripper
1 -> .
3 -> <end>

Target Lang; index to word mapping
2 -> <start>
99 -> بعد
180 -> شروع
121 -> کرد
7 -> به
5257 -> فحش
454 -> دادن
7 -> به
13 -> اون
8603 -> رقاصه
1 -> .
3 -> <end>


# model parameters


In [10]:
# --- Data pipeline ---
BATCH_SIZE = 256
BUFFER_SIZE = len(input_tensor_train)        # shuffle buffer spans the whole training set
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE  # floor division: full batches only

# --- Model sizes ---
embedding_dim = 256
units = 1024
# One more than the number of known words, so that id 0 (skipped as padding by
# `convert` and masked in the loss) is a valid index too.
vocab_inp_size = 1 + len(inp_lang.word_index)
vocab_tar_size = 1 + len(targ_lang.word_index)

In [11]:
# Shuffle the training pairs, then cut them into batches of exactly BATCH_SIZE
# (drop_remainder=True discards a final partial batch).
dataset = (
    tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [12]:
# Pull a single batch and display the shapes of its input and target halves.
example_input_batch, example_target_batch = next(iter(dataset))
(example_input_batch.shape, example_target_batch.shape)

(TensorShape([256, 39]), TensorShape([256, 35]))

# Encoder

In [13]:
class Encoder(tf.keras.Model):
    """Embedding followed by a GRU that returns every step's output plus its final state."""

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Embed the token ids `x` and run the GRU starting from `hidden`."""
        embedded = self.embedding(x)
        output, state = self.gru(embedded, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        """Return an all-zero tensor of shape (batch_sz, enc_units)."""
        zero_state_shape = (self.batch_sz, self.enc_units)
        return tf.zeros(zero_state_shape)

In [14]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# Run the example batch through the encoder, starting from a zero state, to check shapes.
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)

print('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (256, 39, 1024)
Encoder Hidden state shape: (batch size, units) (256, 1024)


# Attention Mechanism

In [15]:
class Attention(tf.keras.layers.Layer):
    """Additive attention: score = V(tanh(W1(query) + W2(values))), softmax over axis 1."""

    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Return ``(context_vector, attention_weights)`` for `query` over `values`."""
        # Insert an axis at position 1 so the query broadcasts against `values`.
        expanded_query = tf.expand_dims(query, 1)
        projected = self.W1(expanded_query) + self.W2(values)
        score = self.V(tf.nn.tanh(projected))
        attention_weights = tf.nn.softmax(score, axis=1)
        # Weighted sum of `values` along axis 1.
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)
        return context_vector, attention_weights

In [16]:
# Smoke test: a 10-unit attention layer applied to the encoder's sample state and outputs.
attention_layer = Attention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (256, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (256, 39, 1)


# Decoder

In [17]:
class Decoder(tf.keras.Model):
    """Single-step decoder: attention context + embedded token -> GRU -> vocabulary logits."""

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            self.dec_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = Attention(self.dec_units)

    def call(self, x, hidden, enc_output):
        """Return ``(logits, state, attention_weights)`` for one decoding step."""
        context_vector, attention_weights = self.attention(hidden, enc_output)
        embedded = self.embedding(x)
        # Prepend the context vector to the token embedding along the feature axis.
        gru_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)
        # NOTE(review): the GRU is called without initial_state, so `hidden` only
        # influences the attention scores, not the recurrent state.
        output, state = self.gru(gru_input)
        flat_output = tf.reshape(output, (-1, output.shape[2]))
        logits = self.fc(flat_output)

        return logits, state, attention_weights

In [18]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)
# Smoke test with a random (BATCH_SIZE, 1) input; only the first return value is inspected.
sample_decoder_output, _, _ = decoder(
    tf.random.uniform((BATCH_SIZE, 1)),
    sample_hidden,
    sample_output,
)

print('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (256, 51819)


# Initialize optimizer and loss functions

In [19]:
optimizer = tf.keras.optimizers.Adam()

# reduction='none' keeps one loss value per example so lossFun can mask them individually.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none',
)

def lossFun(real, pred):
    """Cross-entropy of `pred` against `real`, with positions where ``real == 0`` zeroed.

    The mean is taken over every position, so zeroed positions still count in
    the denominator.
    """
    per_example = loss_object(real, pred)
    is_real_token = tf.math.logical_not(tf.math.equal(real, 0))
    per_example *= tf.cast(is_real_token, dtype=per_example.dtype)
    return tf.reduce_mean(per_example)

# training_checkpoints

In [20]:
import os

# `cp_prefix` is the file prefix handed to cp.save() in the training loop.
cp_dir = '/content/drive/MyDrive/Colab Notebooks/NLP/machin-translation/content/training_checkpoints'
cp_prefix = os.path.join(cp_dir, "ckpt")
# Track the optimizer and both models together in one checkpoint object.
cp = tf.train.Checkpoint(
    optimizer=optimizer,
    encoder=encoder,
    decoder=decoder,
)

# train step

In [21]:
@tf.function
def train_step(inp, targ, enc_hidden):
    """Run one optimisation step on a batch, feeding the true previous token at each step.

    Args:
        inp: Batch of input token ids.
        targ: Batch of target token ids; column 0 is skipped as a prediction target.
        enc_hidden: Initial hidden state passed to the encoder.

    Returns:
        The accumulated loss divided by ``targ.shape[1]``.
    """
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        # The decoder starts from the encoder's final state.
        dec_hidden = enc_hidden
        # <start> token is the initial decoder input
        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

        # Predict target column t from the previous token; starts at 1, so the
        # loop runs targ.shape[1] - 1 times.
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += lossFun(targ[:, t], predictions)
            # Teacher forcing: the ground-truth token becomes the next decoder input.
            dec_input = tf.expand_dims(targ[:, t], 1)

    # Reported loss is normalised by the full target length (including column 0).
    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    # Gradients are taken of the un-normalised summed loss.
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

# Start Training

In [22]:
 import time

EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        # Call the train method
        batch_loss = train_step(inp, targ, enc_hidden)
        # Compute the loss (per batch)
        total_loss += batch_loss

    # Save (checkpoint)
    if (epoch + 1) % 2 == 0:
        cp.save(file_prefix = cp_prefix)
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,total_loss / steps_per_epoch))

    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Loss 1.6320
Time taken for 1 epoch 908.2921917438507 sec

Epoch 2 Loss 1.3534
Time taken for 1 epoch 890.5865161418915 sec

Epoch 3 Loss 1.1964
Time taken for 1 epoch 889.5785422325134 sec

Epoch 4 Loss 1.0470
Time taken for 1 epoch 894.8115491867065 sec

Epoch 5 Loss 0.9070
Time taken for 1 epoch 890.4350519180298 sec

Epoch 6 Loss 0.7785
Time taken for 1 epoch 926.2395641803741 sec

Epoch 7 Loss 0.6659
Time taken for 1 epoch 890.8751950263977 sec

Epoch 8 Loss 0.5738
Time taken for 1 epoch 895.5262868404388 sec

Epoch 9 Loss 0.5015
Time taken for 1 epoch 921.9271593093872 sec

Epoch 10 Loss 0.4388
Time taken for 1 epoch 894.3808171749115 sec



# Evaluate function

In [23]:
import numpy as np

def evaluate(sentence):
  """Greedily decode a translation for one input sentence.

  Args:
    sentence: Raw input sentence; it is normalized with preProcessing.

  Returns:
    ``(result, sentence, attention_plot)``: the predicted tokens joined by
    spaces (with a trailing space), the preprocessed input sentence, and a
    ``(max_length_targ, max_length_inp)`` array holding the attention weights
    of each decoding step (rows after the last step stay zero).

  Raises:
    KeyError: if the sentence contains words missing from the input
      vocabulary; the message lists all of them.
  """
  attention_plot = np.zeros((max_length_targ, max_length_inp))
  # Preprocess the sentence given
  sentence = preProcessing(sentence)

  # split() without an argument drops empty tokens, so an empty sentence
  # becomes just "<start> <end>" instead of failing on the token ''.
  tokens = sentence.split()
  unknown = [tok for tok in tokens if tok not in inp_lang.word_index]
  if unknown:
    raise KeyError('words not in the input vocabulary: ' + ', '.join(unknown))

  # Fetch the indices concerning the words in the sentence and pad the sequence
  inputs = [inp_lang.word_index[tok] for tok in tokens]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_inp, padding='post')
  # Convert the inputs to tensors
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  # Batch of one: zero initial encoder state.
  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)
  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

  # Loop until the max_length is reached for the target language (Farsi)
  for t in range(max_length_targ):
    predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
    # Flatten the weights of this step into one row of the plot.
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()
    # Greedy choice: the highest-scoring vocabulary id.
    predicted_id = tf.argmax(predictions[0]).numpy()
    predicted_word = targ_lang.index_word[predicted_id]
    result += predicted_word + ' '
    # If <end> token is reached, return
    if predicted_word == '<end>':
      return result, sentence, attention_plot
    # The predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence, attention_plot

# Translate function

In [24]:
def translate(sentence):
  """Translate `sentence` and print the preprocessed input followed by the model output."""
  translation, processed_sentence, _ = evaluate(sentence)

  print('Input: %s' % (processed_sentence))
  print('Output: {}'.format(translation))

# Restore the latest checkpoint

In [25]:
# Look up the newest checkpoint under cp_dir, then load it into the tracked objects.
latest_checkpoint_path = tf.train.latest_checkpoint(cp_dir)
cp.restore(latest_checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f88ca7e0e90>

In [30]:
# Sample sentences to eyeball translation quality.
for sample_sentence in (
    "i was very bad",
    "im at home",
    "i have a cold",
    "take me to the tunnel",
    "go away",
    "i walk everyday",
):
    translate(sample_sentence)

Input: <start> i was very bad <end>
Output: من خيلي بد بود . <end> 
Input: <start> im at home <end>
Output: من تو خونه ام . <end> 
Input: <start> i have a cold <end>
Output: يه خورده . <end> 
Input: <start> take me to the tunnel <end>
Output: منو ببر به تونل تونل بگيريد . <end> 
Input: <start> go away <end>
Output: برو . <end> 
Input: <start> i walk everyday <end>
Output: من فردا روز میرم . <end> 
