In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as f
import re
from transformer import Transformer

In [None]:
# Locations of the line-aligned parallel corpora (one sentence per line).
hindi_file_path = r"/content/new_hindi_data.txt"
english_file_path = r"/content/new_english_data.txt"

# Read both corpora as UTF-8 explicitly: the Hindi file holds Devanagari text and
# must not depend on the platform's default encoding.
# The handles are deliberately NOT named `f`, because that would rebind the
# `torch.nn.functional as f` alias created in the import cell.
with open(english_file_path, "r", encoding="utf-8") as english_file:
    raw_english_sen = english_file.readlines()
with open(hindi_file_path, "r", encoding="utf-8") as hindi_file:
    raw_hindi_sen = hindi_file.readlines()

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
# Show the first five raw lines of each corpus and the container type returned by readlines().
raw_english_sen[:5],raw_hindi_sen[:5],type(raw_hindi_sen)

(["However, Paes, who was partnering Australia's Paul Hanley, could only go as far as the quarterfinals where they lost to Bhupathi and Knowles\n",
  'Whosoever desires the reward of the world, with Allah is the reward of the world and of the Everlasting Life. Allah is the Hearer, the Seer.\n',
  'The value of insects in the biosphere is enormous because they outnumber all other living groups in measure of species richness.\n',
  'Mithali To Anchor Indian Team Against Australia in ODIs\n',
  'After the assent of the Honble President on 8thSeptember, 2016, the 101thConstitutional Amendment Act, 2016 came into existence\n'],
 ['आस्ट्रेलिया के पाल हेनली के साथ जोड़ी बनाने वाले पेस मियामी में क्वार्टरफाइनल तक ही पहुंच सके क्योंकि इस दौर में उन्हें भूपति और नोल्स ने हराया था।\n',
  'और जो शख्स (अपने आमाल का) बदला दुनिया ही में चाहता है तो ख़ुदा के पास दुनिया व आख़िरत दोनों का अज्र मौजूद है और ख़ुदा तो हर शख्स की सुनता और सबको देखता है\n',
  'जैव-मंडल में कीड़ों का मूल्य बहुत है, क्योंकि प्रज

In [None]:
# Function to preprocess the text by removing English words and unwanted parts
def preprocess_text(text):
    """Strip non-Hindi noise from ``text`` and normalise its whitespace.

    A single substitution deletes every match of the pattern below, i.e.:
      * ASCII-letter words (optionally hyphenated),
      * any character that is not Devanagari (U+0900-U+097F), an ASCII digit,
        a space, or one of the listed punctuation characters,
      * whitespace-delimited tokens shaped like ``name@host.tld`` or
        ``word.word`` (note: this also matches dotted numbers), and
      * whitespace-delimited ``http://`` / ``https://`` links.

    Runs of whitespace are then collapsed to one space and the result is
    stripped at both ends.

    Args:
        text: Raw sentence (str).

    Returns:
        The cleaned sentence (str); may be empty.
    """
    regex_pattern = r'\b[a-zA-Z]+(?:-[a-zA-Z]+)*\b|[^\u0900-\u097F0-9 !\"#$%&\'()*+,\-./:;<=>?ˌ]|(?<!\S)(\w+@\w+\.\w+|\w+(\.\w+)+)(?!\S)|(?<!\S)(https?://\S+)(?!\S)'

    # Pass 1: delete everything the pattern matches.
    without_noise = re.sub(regex_pattern, '', text)

    # Pass 2: squeeze whitespace runs and trim the ends.
    return re.sub(r'\s+', ' ', without_noise).strip()




In [None]:
# Hindi lines go through the regex cleaner; English lines only lose their
# trailing newline and are lower-cased.
hindi_sentences = list(map(preprocess_text, raw_hindi_sen))
english_sentences = [line.rstrip("\n").lower() for line in raw_english_sen]

# One long string per language, used only to collect the distinct characters.
word_hindi = "".join(hindi_sentences)
word_english = "".join(english_sentences)

In [None]:
# Compare the number of English and Hindi sentences after cleaning.
len(english_sentences),len(hindi_sentences)

(100000, 100000)

In [None]:
# Character-level vocabularies: the sorted distinct characters of each corpus.
hindi_vocab = sorted(set(word_hindi))
english_vocab = sorted(set(word_english))

In [None]:
# Special symbols added around the character vocabularies.
START_TOKEN = '<START>'
PADDING_TOKEN = '<PADDING>'
END_TOKEN = '<END>'

# Rebuild each vocabulary from the corpus characters instead of mutating the
# existing list in place. The layout is the same as before
# (START first, then the sorted characters, then PADDING and END), but the cell
# is now idempotent: re-running it no longer inserts duplicate special tokens,
# which would silently shift every index used by the lookup tables.
hindi_vocab = [START_TOKEN, *sorted(set(word_hindi)), PADDING_TOKEN, END_TOKEN]
english_vocab = [START_TOKEN, *sorted(set(word_english)), PADDING_TOKEN, END_TOKEN]

In [None]:
# Vocabulary sizes, special tokens included.
len(hindi_vocab),len(english_vocab)

(130, 70)

In [None]:
# Print each vocabulary with its own statement. Joining the two print() calls
# with a comma builds a tuple of their return values, which the notebook then
# echoes as a spurious "(None, None)".
print(english_vocab)
print(hindi_vocab)

['<START>', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', '@', '[', '\\', ']', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '<PADDING>', '<END>']
['<START>', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', 'ˌ', 'ँ', 'ं', 'ः', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ऌ', 'ऍ', 'ऎ', 'ए', 'ऐ', 'ऑ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'ऩ', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ऱ', 'ल', 'ळ', 'व', 'श', 'ष', 'स', 'ह', '़', 'ऽ', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॄ', 'ॅ', 'ॆ', 'े', 'ै', 'ॉ', 'ॊ', 'ो', 'ौ', '्', 'ॐ', 'क़', 'ख़', 'ग़', 'ज़', 'ड़', 'ढ़', 'फ़', 'य़', 'ॠ', '।', '॥', '०', '१', '२', '३', '४', '५', '६', '७'

(None, None)

In [None]:
# Two-way lookup tables between characters/special tokens and integer ids.
# The id of a symbol is its position in the corresponding vocabulary list.
hindi_to_index = {symbol: position for position, symbol in enumerate(hindi_vocab)}
index_to_hindi = dict(enumerate(hindi_vocab))
english_to_index = {symbol: position for position, symbol in enumerate(english_vocab)}
index_to_english = dict(enumerate(english_vocab))

In [None]:
def lang_encode(sentence,language_to_index):
    """Map every character of ``sentence`` to its id.

    Args:
        sentence: Text to encode, iterated character by character.
        language_to_index: Mapping from character to integer id.

    Returns:
        List of ids, one per character, in order.

    Raises:
        KeyError: If a character is missing from ``language_to_index``.
    """
    encoded = []
    for char in sentence:
        encoded.append(language_to_index[char])
    return encoded

def lang_decode(tokens,index_to_language):
    """Turn a sequence of ids back into text.

    Args:
        tokens: Iterable of ids.
        index_to_language: Mapping from id to its string symbol.

    Returns:
        The symbols for ``tokens`` concatenated in order.

    Raises:
        KeyError: If an id is missing from ``index_to_language``.
    """
    pieces = []
    for idx in tokens:
        pieces.append(index_to_language[idx])
    return "".join(pieces)

In [None]:
# Sanity check: encode the first cleaned Hindi sentence into character ids.
tokens = lang_encode(hindi_sentences[0],hindi_to_index)
print(tokens)

[37, 85, 104, 61, 104, 78, 98, 80, 90, 77, 89, 1, 51, 98, 1, 72, 89, 80, 1, 86, 98, 70, 80, 91, 1, 51, 98, 1, 85, 89, 67, 1, 58, 102, 110, 91, 1, 74, 70, 89, 70, 98, 1, 82, 89, 80, 98, 1, 72, 98, 85, 1, 76, 90, 77, 89, 76, 91, 1, 76, 98, 34, 1, 51, 104, 82, 89, 78, 104, 61, 78, 73, 89, 38, 70, 80, 1, 66, 51, 1, 86, 91, 1, 72, 86, 92, 34, 56, 1, 85, 51, 98, 1, 51, 104, 77, 102, 34, 51, 90, 1, 38, 85, 1, 68, 103, 78, 1, 76, 98, 34, 1, 40, 70, 104, 86, 98, 34, 1, 75, 93, 72, 66, 90, 1, 50, 78, 1, 70, 102, 80, 104, 85, 1, 70, 98, 1, 86, 78, 89, 77, 89, 1, 67, 89, 115]


In [None]:
# Round trip: decoding the ids should reproduce the sentence that was encoded.
sen = lang_decode(tokens,index_to_hindi)
print(sen)

आस्ट्रेलिया के पाल हेनली के साथ जोड़ी बनाने वाले पेस मियामी में क्वार्टरफाइनल तक ही पहुंच सके क्योंकि इस दौर में उन्हें भूपति और नोल्स ने हराया था।


In [None]:
import numpy as np
# Percentile of the per-sentence character-length distribution to report.
PERCENTILE = 90
# Each value printed below is the length (in characters) that PERCENTILE% of the
# sentences in that corpus do not exceed; it informs the choice of maximum sequence length.
print(f"This means that {PERCENTILE}% of sentences has the following lenght")
print( f"{PERCENTILE}th percentile length hindi: {np.percentile([len(x) for x in hindi_sentences], PERCENTILE)}" )
print( f"{PERCENTILE}th percentile length English: {np.percentile([len(x) for x in english_sentences], PERCENTILE)}" )

This means that 90% of sentences has the following lenght
90th percentile length hindi: 179.0
90th percentile length English: 192.0


In [None]:
# Scratch: bind the first sentence pair to module-level names.
# NOTE(review): these names are rebound by the filtering loop further down; this cell looks like leftover exploration.
eng_sen,hin_sen = english_sentences[0],hindi_sentences[0]

In [None]:
# Length limit (in characters) passed to valid_length() when filtering sentence pairs.
# NOTE(review): the hyperparameter cell defines a separate `max_sequence_len` with the same value — keep the two in sync.
max_seq_len = 200
def valid_tokens(sentences,vocab):
    """Return True when every character of ``sentences`` belongs to ``vocab``.

    The previous implementation returned from inside its loop, so it only ever
    examined one (arbitrary) character of the sentence. All distinct characters
    are now checked.

    An empty sentence is rejected. The old code returned ``None`` (falsy) for it,
    and keeping that outcome means blank sentences are still filtered out by callers.

    Args:
        sentences: A single sentence (str); despite the plural name it is
            iterated character by character.
        vocab: Iterable of allowed symbols.

    Returns:
        bool: True if the sentence is non-empty and all of its characters are in ``vocab``.
    """
    # issubset() accepts any iterable and hashes it once, avoiding a linear
    # list scan per character.
    return bool(sentences) and set(sentences).issubset(vocab)

def valid_length(eng_sen,hin_sen,max_seq_len):
    """Check that both sentences are shorter than ``max_seq_len - 1`` characters.

    Args:
        eng_sen: English sentence (anything supporting ``len``).
        hin_sen: Hindi sentence (anything supporting ``len``).
        max_seq_len: Sequence-length budget.

    Returns:
        bool: True only if both lengths are strictly below ``max_seq_len - 1``.
    """
    limit = max_seq_len - 1
    return len(eng_sen) < limit and len(hin_sen) < limit

# Collect the positions of sentence pairs that pass every filter.
valid_sentence_index = []
for idx in range(len(english_sentences)):
    eng_sen, hin_sen = english_sentences[idx], hindi_sentences[idx]
    # Same short-circuit order as a chained `and`: length first, then the
    # English characters, then the Hindi characters.
    if not valid_length(eng_sen, hin_sen, max_seq_len):
        continue
    if not valid_tokens(eng_sen, english_vocab):
        continue
    if not valid_tokens(hin_sen, hindi_vocab):
        continue
    valid_sentence_index.append(idx)


In [None]:
# Report how many sentence pairs survived the length/vocabulary filters.
print(f"Actual number of sentences: {len(english_sentences)}")
print(f"Valid number of sentences: {len(valid_sentence_index)}")

Actual number of sentences: 100000
Valid number of sentences: 89353


In [None]:
# Keep only the sentence pairs whose positions passed the filters.
# NOTE: this cell overwrites its own inputs, so it is not idempotent. Running it
# a second time would index the already-filtered lists with the old positions.
english_sentences = [english_sentences[keep] for keep in valid_sentence_index]
hindi_sentences = [hindi_sentences[keep] for keep in valid_sentence_index]

In [None]:
# Both lists were filtered with the same indices, so the two counts are expected to match.
len(hindi_sentences),len(english_sentences)

(89353, 89353)

In [None]:
# Hyperparameters
batch_size = 30          # sentences per batch
max_sequence_len = 200   # maximum sequence length, in characters
d_model = 512            # embedding size of each character position
number_heads = 8         # attention heads
fnn_hidden = 2048        # feed-forward layer width
drop_prob = 0.1          # dropout probability
num_layer = 5            # number of layers
hindi_vocab_len = len(hindi_vocab)

# The lookup tables (hindi_to_index / english_to_index) and the special-token
# constants (START_TOKEN / PADDING_TOKEN / END_TOKEN) are defined in earlier
# cells. The former `x = x` self-assignments were no-ops, and re-declaring the
# tokens here only risked the copies drifting apart, so both were removed.

In [None]:
class ParameteresConfig():
    """Plain attribute bag holding the model/training settings.

    Every setting has a default; any keyword argument overrides the default of
    the same name or, if no such default exists, is stored as an additional attribute.
    The ``device`` default is read from the module-level ``device`` variable at
    construction time.
    """

    def __init__(self,**kwargs):
        # Defaults, listed in the order they appear in display().
        defaults = {
            "batch_size": 30,
            "max_sequence_len": 200,
            "d_model": 512,
            "num_heads": 8,
            "fnn_hidden": 2048,
            "drop_prob": 0.1,
            "num_layer": 5,
            "device": device,
            "START_TOKEN": '<START>',
            "PADDING_TOKEN": '<PADDING>',
            "END_TOKEN": '<END>',
        }

        # Merging keeps the defaults' positions and appends genuinely new keys,
        # so attribute order matches "set defaults, then override".
        for name, value in {**defaults, **kwargs}.items():
            setattr(self, name, value)

    def display(self):
        """Print every stored setting as ``name = value``, one per line."""
        print("parameters are:")
        for key, val in self.__dict__.items():
            print(f"{key} = {val}")

In [None]:
# Build the run configuration: defaults plus the vocabulary-dependent values.
# NOTE(review): display() prints every attribute, including the full lookup dictionaries, so the output is long.
config = ParameteresConfig(hindi_vocab_len = len(hindi_vocab),
                           hindi_to_index = hindi_to_index,
                           english_to_index = english_to_index,
                           device = device)
config.display()

parameters are:
batch_size = 30
max_sequence_len = 200
d_model = 512
num_heads = 8
fnn_hidden = 2048
drop_prob = 0.1
num_layer = 5
START_TOKEN = <START>
PADDING_TOKEN = <PADDING>
END_TOKEN = <END>
hindi_vocab_len = 130
hindi_to_index = {'<START>': 0, ' ': 1, '!': 2, '"': 3, '#': 4, '$': 5, '%': 6, '&': 7, "'": 8, '(': 9, ')': 10, '*': 11, '+': 12, ',': 13, '-': 14, '.': 15, '/': 16, '0': 17, '1': 18, '2': 19, '3': 20, '4': 21, '5': 22, '6': 23, '7': 24, '8': 25, '9': 26, ':': 27, '<': 28, '=': 29, '>': 30, '?': 31, 'ˌ': 32, 'ँ': 33, 'ं': 34, 'ः': 35, 'अ': 36, 'आ': 37, 'इ': 38, 'ई': 39, 'उ': 40, 'ऊ': 41, 'ऋ': 42, 'ऌ': 43, 'ऍ': 44, 'ऎ': 45, 'ए': 46, 'ऐ': 47, 'ऑ': 48, 'ओ': 49, 'औ': 50, 'क': 51, 'ख': 52, 'ग': 53, 'घ': 54, 'ङ': 55, 'च': 56, 'छ': 57, 'ज': 58, 'झ': 59, 'ञ': 60, 'ट': 61, 'ठ': 62, 'ड': 63, 'ढ': 64, 'ण': 65, 'त': 66, 'थ': 67, 'द': 68, 'ध': 69, 'न': 70, 'ऩ': 71, 'प': 72, 'फ': 73, 'ब': 74, 'भ': 75, 'म': 76, 'य': 77, 'र': 78, 'ऱ': 79, 'ल': 80, 'ळ': 81, 'व': 82, 'श': 83, 'ष': 84, 'स

In [None]:
from torch.utils.data import DataLoader,Dataset

class TextDataset(Dataset):
    """Index-aligned English/Hindi sentence pairs.

    Item ``i`` is the tuple ``(english_sentences[i], hindi_sentences[i])``.

    Args:
        english_sentences: Sequence of English sentences.
        hindi_sentences: Sequence of Hindi sentences, aligned with the English ones.

    Raises:
        ValueError: If the two sequences differ in length. Previously a mismatch
            was accepted silently and surfaced later as an IndexError or as
            unreachable trailing sentences.
    """

    def __init__(self,english_sentences,hindi_sentences):
        if len(english_sentences) != len(hindi_sentences):
            raise ValueError(
                "english_sentences and hindi_sentences must have the same length, "
                f"got {len(english_sentences)} and {len(hindi_sentences)}"
            )
        self.english_sentences = english_sentences
        self.hindi_sentences = hindi_sentences

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.english_sentences)

    def __getitem__(self, index) :
        """Return the ``(english, hindi)`` pair stored at ``index``."""
        return self.english_sentences[index],self.hindi_sentences[index]

In [None]:
# Wrap the filtered sentence pairs and show the dataset size plus its first item.
dataset = TextDataset(english_sentences,hindi_sentences)
len(dataset), dataset[0]

(89353,
 ("however, paes, who was partnering australia's paul hanley, could only go as far as the quarterfinals where they lost to bhupathi and knowles",
  'आस्ट्रेलिया के पाल हेनली के साथ जोड़ी बनाने वाले पेस मियामी में क्वार्टरफाइनल तक ही पहुंच सके क्योंकि इस दौर में उन्हें भूपति और नोल्स ने हराया था।'))

In [None]:
# 8941/30

In [None]:
# Batches are drawn in dataset order (no shuffling is requested) and the final
# incomplete batch is dropped.
train_dataloader = DataLoader(dataset, batch_size=batch_size, drop_last=True)

In [None]:
# Pull the first batch to inspect what the loader yields.
sentence_batch = next(iter(train_dataloader))

In [None]:
# sentence_batch[0] holds the English sentences and sentence_batch[1] the Hindi ones; show the second pair.
sentence_batch[0][1],sentence_batch[1][1]

('whosoever desires the reward of the world, with allah is the reward of the world and of the everlasting life. allah is the hearer, the seer.',
 'और जो शख्स (अपने आमाल का) बदला दुनिया ही में चाहता है तो ख़ुदा के पास दुनिया व आख़िरत दोनों का अज्र मौजूद है और ख़ुदा तो हर शख्स की सुनता और सबको देखता है')

In [None]:
# Additive value used to suppress masked attention positions.
neg_inf = -1e9
def mask_creation(eng_batch,hin_batch,max_len=None):
    """Build the additive attention masks for one batch.

    A position ``p`` of a sentence counts as padding when ``p > len(sentence)``
    (the same rule as before: padding starts at index ``len + 1``). The masks
    are computed with broadcasting rather than a per-sentence Python loop.

    Args:
        eng_batch: Sequence of English sentences (source side).
        hin_batch: Sequence of Hindi sentences (target side), aligned with
            ``eng_batch``; only the first ``len(eng_batch)`` entries are read.
        max_len: Side length of each square mask. Defaults to the module-level
            ``max_sequence_len`` when omitted, which is the previous behaviour.

    Returns:
        Tuple ``(encoder_self_att_mask, decoder_self_att_mask,
        decoder_cross_att_mask)``. Each tensor has shape
        ``(len(eng_batch), max_len, max_len)`` and contains ``neg_inf`` at
        masked positions and 0 elsewhere:
          * encoder self-attention masks rows and columns that are English padding;
          * decoder self-attention masks rows and columns that are Hindi padding,
            plus every position above the diagonal (look-ahead);
          * decoder cross-attention masks English-padding columns and Hindi-padding rows.
    """
    if max_len is None:
        max_len = max_sequence_len

    number_sen = len(eng_batch)
    positions = torch.arange(max_len)
    eng_lengths = torch.tensor([len(eng_batch[idx]) for idx in range(number_sen)], dtype=torch.long)
    hin_lengths = torch.tensor([len(hin_batch[idx]) for idx in range(number_sen)], dtype=torch.long)

    # (batch, max_len) booleans: True where the position is padding.
    eng_is_padding = positions.unsqueeze(0) > eng_lengths.unsqueeze(1)
    hin_is_padding = positions.unsqueeze(0) > hin_lengths.unsqueeze(1)

    # Strictly upper-triangular (column > row): a query may not see later positions.
    look_ahead_mask = positions.unsqueeze(0) > positions.unsqueeze(1)

    # unsqueeze(1) broadcasts over rows (masks columns/keys);
    # unsqueeze(2) broadcasts over columns (masks rows/queries).
    encod_self_padding_mask = eng_is_padding.unsqueeze(1) | eng_is_padding.unsqueeze(2)
    decode_self_padding_mask = hin_is_padding.unsqueeze(1) | hin_is_padding.unsqueeze(2)
    decoder_cross_padding_mask = eng_is_padding.unsqueeze(1) | hin_is_padding.unsqueeze(2)

    encoder_self_att_mask = torch.where(encod_self_padding_mask,neg_inf,0)
    decoder_self_att_mask = torch.where(look_ahead_mask | decode_self_padding_mask,neg_inf,0)
    decoder_cross_att_mask = torch.where(decoder_cross_padding_mask,neg_inf,0)

    return encoder_self_att_mask , decoder_self_att_mask , decoder_cross_att_mask


In [None]:
# Instantiate the model from the local `transformer` module, passing the config and both character lookup tables.
transformer = Transformer(config=config,english_to_index=english_to_index,hindi_to_index=hindi_to_index)

In [None]:
# Display the module tree of the model.
transformer

Transformer(
  (encoder): Encoder(
    (sentence_embedding): SentenceEmbedding(
      (embedding): Embedding(70, 512)
      (position_encoder): PositionalEncoding()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): SequentialEncoder(
      (0): EncoderLayer(
        (attention): MultiHeadAttention(
          (in_linear): Linear(in_features=512, out_features=1536, bias=True)
          (out_linear): Linear(in_features=512, out_features=512, bias=True)
        )
        (layernorm1): LayerNormalization()
        (dropout1): Dropout(p=0.1, inplace=False)
        (pos_fnn): PositionwiseFeedForward(
          (in_linear): Linear(in_features=512, out_features=2048, bias=True)
          (out_linear): Linear(in_features=2048, out_features=512, bias=True)
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (layernorm2): LayerNormalization()
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): EncoderLayer(
        (attent

In [None]:
from torch import nn

# Per-token loss (reduction='none'); positions whose label is the padding token
# are ignored when computing the loss.
criterian = nn.CrossEntropyLoss(
    reduction='none',
    ignore_index=hindi_to_index[PADDING_TOKEN],
)

# Xavier-uniform initialisation for every parameter with more than one
# dimension; one-dimensional parameters keep their existing values.
for tensor in transformer.parameters():
    if tensor.dim() <= 1:
        continue
    nn.init.xavier_uniform_(tensor)

optim = torch.optim.Adam(transformer.parameters(), lr=1e-4)

In [None]:
# Confirm which device was selected for training.
print(device)

cuda


In [None]:
# Fetch the first batch again and show its first English sentence.
dta = next(iter(train_dataloader))
dta[0][0]

"however, paes, who was partnering australia's paul hanley, could only go as far as the quarterfinals where they lost to bhupathi and knowles"

In [None]:
# Training loop with a periodic qualitative preview.
#
# Changes from the previous version of this cell:
#   * the greedy-decoding preview runs under torch.no_grad(), so up to
#     max_sequence_len forward passes no longer build autograd graphs;
#   * the per-token debug print of the predicted index was removed;
#   * decoding stops BEFORE appending the END marker, so the literal
#     "<END>" text no longer appears in the preview translation.
transformer.train()
transformer.to(device)
total_loss = 0
num_epochs = 2

for epoch in range(num_epochs):
    print(f"Epoch {epoch}")
    for batch_num, batch in enumerate(train_dataloader):
        # The preview at the end of the loop body switches to eval(); switch back.
        transformer.train()
        eng_batch, hin_batch = batch
        encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = mask_creation(eng_batch, hin_batch)
        optim.zero_grad()
        hin_predictions = transformer(eng_batch,
                                     hin_batch,
                                     encoder_self_attention_mask.to(device),
                                     decoder_self_attention_mask.to(device),
                                     decoder_cross_attention_mask.to(device),
                                     enc_start_token=False,
                                     enc_end_token=False,
                                     dec_start_token=True,
                                     dec_end_token=True)
        # Labels carry an END token but no START token.
        # NOTE(review): presumably this offsets them one step ahead of the decoder input — confirm in batch_tokenize.
        labels = transformer.decoder.sentence_embedding.batch_tokenize(hin_batch, start_token=False, end_token=True)
        loss = criterian(
            hin_predictions.view(-1, hindi_vocab_len).to(device),
            labels.view(-1).to(device)
        ).to(device)
        # Average the per-token loss over non-padding labels only.
        valid_indicies = torch.where(labels.view(-1) == hindi_to_index[PADDING_TOKEN], False, True)
        loss = loss.sum() / valid_indicies.sum()
        loss.backward()
        optim.step()

        if batch_num % 100 == 0:
            print(f"Iteration {batch_num} : {loss.item()}")
            print(f"English: {eng_batch[0]}")
            print(f"Hindi Translation: {hin_batch[0]}")
            # Teacher-forced prediction for the first sentence of the batch.
            hin_sentence_predicted = torch.argmax(hin_predictions[0], axis=1)
            print(hin_sentence_predicted)
            predicted_sentence = ""
            for idx in hin_sentence_predicted:
                if idx == hindi_to_index[END_TOKEN]:
                    break
                predicted_sentence += index_to_hindi[idx.item()]
            print(f"Hindi Prediction: {predicted_sentence}")

            # Greedy, character-by-character decoding of a fixed probe sentence.
            transformer.eval()
            hin_sentence = ("",)
            eng_sentence = ("should we go to the mall?",)
            with torch.no_grad():
                for word_counter in range(max_sequence_len):
                    encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = mask_creation(eng_sentence, hin_sentence)
                    predictions = transformer(eng_sentence,
                                              hin_sentence,
                                              encoder_self_attention_mask.to(device),
                                              decoder_self_attention_mask.to(device),
                                              decoder_cross_attention_mask.to(device),
                                              enc_start_token=False,
                                              enc_end_token=False,
                                              dec_start_token=True,
                                              dec_end_token=False)
                    # Scores (not normalised probabilities) for the next character.
                    next_token_scores = predictions[0][word_counter]
                    next_token_index = torch.argmax(next_token_scores).item()
                    next_token = index_to_hindi[next_token_index]
                    if next_token == END_TOKEN:
                        break
                    hin_sentence = (hin_sentence[0] + next_token,)

            print(f"Evaluation translation (should we go to the mall?) : {hin_sentence}")
            print("-------------------------------------------")


Epoch 0
Iteration 0 : 5.973877429962158
English: however, paes, who was partnering australia's paul hanley, could only go as far as the quarterfinals where they lost to bhupathi and knowles
Hindi Translation: आस्ट्रेलिया के पाल हेनली के साथ जोड़ी बनाने वाले पेस मियामी में क्वार्टरफाइनल तक ही पहुंच सके क्योंकि इस दौर में उन्हें भूपति और नोल्स ने हराया था।
tensor([  7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,
          7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,  99,
          7,   7,   7,  99,   7,   7,  99,   7,   7,   7,   7,   7,   7,   7,
          7,   7,   7,   7, 128,   7,   7,   7,   7,   7,   7,   7,   7,   7,
          7,   7,   7,   7,   7,  99,  99,   7,   7,   7,   7,   7,   7,   7,
          7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,
          7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,
          7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,
        128,   7,   

In [None]:
transformer.eval()
def translate(eng_sentence):
    """Greedily translate one English sentence, one character at a time.

    Starting from an empty Hindi string, the model is run on the pair
    (English sentence, Hindi text generated so far). The highest-scoring
    symbol at the current position is appended, and the loop repeats until the
    END token is predicted or ``max_sequence_len`` steps have been taken.

    Changes from the previous version:
      * inference runs in eval mode under ``torch.no_grad()``, so no autograd
        graph is built for any of the forward passes;
      * the END marker terminates decoding without being appended, so the
        returned string no longer ends with the literal ``<END>`` text.

    Args:
        eng_sentence: English sentence (str).
            NOTE(review): presumably it must be lower-case and use only characters
            from the English vocabulary, as the training data does — confirm against the tokenizer.

    Returns:
        The generated Hindi text (str).
    """
    # Make sure dropout is disabled even if a training cell was run last.
    transformer.eval()
    eng_sentence = (eng_sentence,)
    hin_sentence = ("",)
    with torch.no_grad():
        for word_counter in range(max_sequence_len):
            encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = mask_creation(eng_sentence, hin_sentence)
            predictions = transformer(eng_sentence,
                                      hin_sentence,
                                      encoder_self_attention_mask.to(device),
                                      decoder_self_attention_mask.to(device),
                                      decoder_cross_attention_mask.to(device),
                                      enc_start_token=False,
                                      enc_end_token=False,
                                      dec_start_token=True,
                                      dec_end_token=False)
            # Scores for the symbol at the position currently being generated.
            next_token_scores = predictions[0][word_counter]
            next_token_index = torch.argmax(next_token_scores).item()
            next_token = index_to_hindi[next_token_index]
            if next_token == END_TOKEN:
                break
            # NOTE(review): if the model predicts START or PADDING, its literal
            # text is appended here; verify how the tokenizer handles that.
            hin_sentence = (hin_sentence[0] + next_token,)
    return hin_sentence[0]

In [None]:
# Try the trained model on a short, unseen sentence.
translation = translate("let's go somewhere")
print(translation)

इसके बाद में सकता है।...................................................................................................................................................ा.ा.ा..ा.ा.ा.ी.ी.ा.ी.ी.ी.ी.ी.ी.ी


In [None]:
# Colab only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Destination file on the mounted Drive for the model weights.
model_wights_path = "/content/drive/MyDrive/Colab Notebooks/Save_model/Transformer_model.pth"

In [None]:
# Save the weights of the model (its state_dict) to model_wights_path.
torch.save(transformer.state_dict(), model_wights_path)