# Building translator from scratch using Transformers
---


# Data Processing

### Vocabulary building

In [1]:
import torch
import numpy as np

english_path = "/kaggle/input/samanantar/final_data/en-or/train.en"
odia_path = "/kaggle/input/samanantar/final_data/en-or/train.or"
START_TOKEN = "<S>"
END_TOKEN = "</S>"
PADDING_TOKEN = "<P>"

english_vocab = [START_TOKEN,' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', 
               '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', 
               '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', '@', 
               'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
               'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 
               '[', '\\', ']', '^', '_', '`', 
               'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 
               'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 
               '{', '|', '}', '~', '–', '‘', '’', '“', '”', '…',
               PADDING_TOKEN, END_TOKEN]

odia_vocab = [START_TOKEN,' ', '!', '"', '#', '$','%', '&', "'", '(', ')', 
              '*','+', ',', '-', '.', '/', '0', '1', '2', '3', 
              '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', '@',
              '[', '\\', ']', '^˚', '_', '{', '|', '}', '~',
              '।', 'ଁ', 'ଂ', 'ଃ', 'ଅ', 'ଆ', 'ଇ', 'ଈ', 'ଉ', 'ଊ', 'ଋ', 'ଏ', 'ଓ', 'ଔ', 
              'କ', 'ଖ', 'ଗ', 'ଘ', 'ଙ', 'ଚ', 'ଛ', 'ଜ', 'ଝ', 'ଞ', 'ଟ', 'ଠ', 'ଡ', 'ଢ', 'ଣ',
              'ତ', 'ଥ', 'ଦ', 'ଧ', 'ନ', 'ପ', 'ଫ', 'ବ', 'ଭ', 'ମ', 'ଯ', 'ର', 'ଲ', 'ଳ',
              'ଵ', 'ଶ', 'ଷ', 'ସ', 'ହ', '଼', 'ା', 'ି', 'ୀ', 'ୁ', 'ୂ', 'ୃ', 'ୄ', 'େ', 'ୈ',
              'ୋ', 'ୌ', '୍', 'ୖ', 'ୗ', 'ଡ଼', 'ଢ଼', 'ୟ', 'ୠ', 'ୡଐ',
              '୦', '୧', '୨', '୩', '୪', '୫', '୬', '୭', '୮', '୯', 'ୱ', '–', '‘',
              '’', '“', '”', '…',
               PADDING_TOKEN, END_TOKEN]
index_to_odia = {k:v for k,v in enumerate(odia_vocab)}
odia_to_index = {v:k for k,v in enumerate(odia_vocab)}
index_to_english = {k:v for k,v in enumerate(english_vocab)}
english_to_index = {v:k for k,v in enumerate(english_vocab)}
TOTAL_SENTENCES = 200000


### Data retrival and study

In [2]:
with open(english_path,'r') as file:
    english_sentences = file.readlines()
with open(odia_path,'r') as file:
    odia_sentences = file.readlines()
    
english_sentences = [sentence.rstrip('\n') for sentence in english_sentences[:TOTAL_SENTENCES]]
odia_sentences = [sentence.rstrip("\n") for sentence in odia_sentences[:TOTAL_SENTENCES]]


In [3]:
print(*english_sentences[:10])
print(*odia_sentences[:10])

Both of them died on spot. The deceased include one female and two male. There was a tense in the hospital premises. Police have arrested the woman's husband and her brother. XFS (version %s) Villagers live in terror due to the tremors. What is the order Jaish-e-Mohammad claimed responsibility for the blast. At that time,it was the largest. Friends, Im really very happy to connect with you people today.
ଏହାଫଳରେ ଉଭୟଙ୍କର ଘଟଣାସ୍ଥଳରେ ମୃତ୍ୟୁ ଘଟିଥିଲା। ମୃତ ଶ୍ରମିକଙ୍କ ମଧ୍ୟରେ ଦୁଇ ଜଣ ମହିଳା ଓ ଜଣେ ପୁରୁଷ ଅଛନ୍ତି । ଡାକ୍ତରଖାନା ପରିସର ଅଶାନ୍ତ ହୋଇ ପଡ଼ିଥିଲା। ପୁଲିସ୍‌ ମହିଳାଙ୍କ ସମେତ ତାଙ୍କ ସ୍ୱାମୀ ଓ ଶଶୁରଙ୍କୁ ବି ଗିରଫ କରିଛି। NTFS (ସଂସ୍କରଣ %s) ହାତୀପଲଙ୍କ ଉତ୍ପାତ ଯୋଗୁଁ ଗ୍ରାମବାସୀମାନେ ଆତଙ୍କିତ ଅବସ୍ଥାରେ ରହିଛନ୍ତି। କଣ ରହିଛି ନିର୍ଦ୍ଦେଶ ବିସ୍ଫୋରଣ ପାଇଁ ନିଜକୁ ଦାୟୀ କଲା ଜୈଶ୍-ଏ-ମହମ୍ମଦ । ଯାହା ସେ ସମୟରେ ସବୁଠାରୁ ବଡ ଥିଲା । ବନ୍ଧୁଗଣ ସତରେ ଆଜି ଆପଣମାନଙ୍କ ସହିତ ମିଶି ମତେ ବହୁତ ଖୁସି ଲାଗୁଛି ।


In [4]:
max(len(x) for x in english_sentences), max(len(x) for x in odia_sentences)

(1562, 1361)

In [5]:
import numpy as np
np.percentile([len(x) for x in english_sentences],98), np.percentile([len(x) for x in odia_sentences],98)

(237.0, 227.0)

thus we set the max sequence lenght to 250

### Dataset Creation

In [6]:
from torch.utils.data import Dataset, DataLoader
max_seq_length = 250
batch_size = 32

def is_valid_token(sentence,vocab):
    for token in list(set(sentence)):
        if token not in vocab:
            return False
    return True
def is_valid_length(sentence, max_seq_length=max_seq_length):
    return len(list(sentence)) < max_seq_length-1
valid_sentence_indices = list()
for index in range(len(odia_sentences)):
    odia_sentence, english_sentence = odia_sentences[index],english_sentences[index]
    if is_valid_length(odia_sentence) and is_valid_length(english_sentence) and is_valid_token(odia_sentence,odia_to_index):
        valid_sentence_indices.append(index)
print("length of valid indices",len(valid_sentence_indices))

odia_sentences = [odia_sentences[i] for i in valid_sentence_indices]
english_sentences = [english_sentences[i] for i in valid_sentence_indices]


class EnglishOdiaDataset(Dataset):
    def __init__(self,english_sentences, odia_sentences):
        self.english_sentences = english_sentences
        self.odia_sentences = odia_sentences
    
    def __len__(self):
        return len(self.english_sentences)
    
    def __getitem__(self,index):
        return self.english_sentences[index],self.odia_sentences[index]
    
dataset = EnglishOdiaDataset(english_sentences, odia_sentences)
print(f'length of dataset being {len(dataset)}')

train_loader = DataLoader(dataset, batch_size)
iterator = iter(train_loader)

length of valid indices 173466
length of dataset being 173466


### tokenizing and encoding

In [7]:
def tokenizer(sentence, language_to_index, start_token = True, end_token = True):
    result = [language_to_index[token] for token in list(sentence)]
    if start_token: result.insert(0,language_to_index[START_TOKEN])
    if end_token: result.append(language_to_index[END_TOKEN])
    for _ in range(max(len(result),max_seq_length)):
        result.append(language_to_index[PADDING_TOKEN])
    return torch.tensor(result)
    

### creating mask

In [8]:
# NEG_INFTY = -1e9

# def create_masks(eng_batch,odia_batch):# 30 x 250 x 512
#     # creating mask matrix
#     num_sentences = len(eng_batch) # 30
#     look_ahead_mask = torch.triu(torch.full((max_seq_length, max_seq_length),fill_value = True), diagonal = 1) # 250 x 250 masking on attention values with lower tyriangular matrix all 0s
#     encoder_padding_mask = torch.full((num_sentences, max_seq_length, max_seq_length), False) # 30 x 250 x 250 ; on entire batch all at first set to False
#     decoder_padding_mask_self_attention = torch.full((num_sentences, max_seq_length, max_seq_length), False) # 30 x 250 x 250 ; on entire batch all at first set to false
#     decoder_padding_mask_cross_attention = torch.full((num_sentences, max_seq_length, max_seq_length), False) # 30 x 250 x 250 ; on entire batch all at first set to false

#     # setting up masking values
#     for idx in range(num_sentences): # idx in range(0 to 30)
#         eng_sentence_length, odia_sentence_length = len(eng_batch[idx]),len(odia_batch[idx])
#         eng_chars_padded = np.arange(eng_sentence_length+1, max_seq_length) # sentence end to 250
#         odia_chars_padded = np.arange(odia_sentence_length+1, max_seq_length)
#         # for encoder and decoder padding of attention values matrix we have to mask padding tokens in both rows and columns
#         encoder_padding_mask[idx,:,eng_chars_padded] = True # we use it for the row
#         encoder_padding_mask[idx,eng_chars_padded,:] = True # we use it in the columns 
#         decoder_padding_mask_self_attention[idx,:,odia_chars_padded] = True
#         decoder_padding_mask_self_attention[idx,odia_chars_padded,:] = True
#         decoder_padding_mask_cross_attention[idx,:,eng_chars_padded] = True
#         decoder_padding_mask_cross_attention[idx,odia_chars_padded,:] = True
        
#     encoder_self_attention_mask = torch.where(encoder_padding_mask,NEG_INFTY,0)
#     decoder_self_attention_mask = torch.where(look_ahead_mask+decoder_padding_mask_self_attention, NEG_INFTY, 0)
#     decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross_attention,NEG_INFTY,0)
#     return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask   



In [9]:
NEG_INFTY = -1e9

def create_masks(eng_batch, odia_batch):
    num_sentences = len(eng_batch)
    look_ahead_mask = torch.full([max_seq_length, max_seq_length] , True)
    look_ahead_mask = torch.triu(look_ahead_mask, diagonal=1)
    encoder_padding_mask = torch.full([num_sentences, max_seq_length, max_seq_length] , False)
    decoder_padding_mask_self_attention = torch.full([num_sentences, max_seq_length, max_seq_length] , False)
    decoder_padding_mask_cross_attention = torch.full([num_sentences, max_seq_length, max_seq_length] , False)

    for idx in range(num_sentences):
      eng_sentence_length, odia_sentence_length = len(eng_batch[idx]), len(odia_batch[idx])
      eng_chars_to_padding_mask = np.arange(eng_sentence_length + 1, max_seq_length)
      odia_chars_to_padding_mask = np.arange(odia_sentence_length + 1, max_seq_length)
      encoder_padding_mask[idx, :, eng_chars_to_padding_mask] = True
      encoder_padding_mask[idx, eng_chars_to_padding_mask, :] = True
      decoder_padding_mask_self_attention[idx, :, odia_chars_to_padding_mask] = True
      decoder_padding_mask_self_attention[idx, odia_chars_to_padding_mask, :] = True
      decoder_padding_mask_cross_attention[idx, :, eng_chars_to_padding_mask] = True
      decoder_padding_mask_cross_attention[idx, odia_chars_to_padding_mask, :] = True

    encoder_self_attention_mask = torch.where(encoder_padding_mask, NEG_INFTY, 0)
    decoder_self_attention_mask =  torch.where(look_ahead_mask + decoder_padding_mask_self_attention, NEG_INFTY, 0)
    decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross_attention, NEG_INFTY, 0)
    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask

# Transformer Utility


In [10]:
import torch
import torch.nn as nn

import math


def scaled_dot_product_attention(q,k,v,mask):
    # q,k,v = 30 x 8 x 250 x 64
    d_k = q.shape[-1] # 64
    scaled = torch.matmul(q,k.transpose(-1,-2))/math.sqrt(d_k) # q = 30 x 8 x 250 x 64; k.T = 30 x 8 x 64 x 250 ; result = 30 x 8 x 250 x 250; it is like each word in row vs in col.
    if mask is not None:
        scaled = scaled.permute(1, 0, 2, 3) + mask
        scaled = scaled.permute(1, 0, 2, 3)
    attention = torch.softmax(scaled,dim=-1) # 30 x 8 x 250 x 250; no change in shape
    values = torch.matmul(attention,v) # attention = 30 x 8 x 250 x 250 ; v = 30 x 8 x 250 x 64 ; values = 30 x 8 x 250 x 64 ; basic matrix multiplication accross last two dim
    return values,attention

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model,num_heads) -> None:
        super(MultiHeadAttention,self).__init__()
        self.d_model = d_model # 512
        self.num_heads = num_heads # 8
        self.head_dim = d_model//num_heads # 512/8 = 64
        self.qkv_layer = nn.Linear(d_model,3*d_model) # input = 512 ; output = 1536 i.e. each of the 512 dim input word is projected along 3 planes to get q k and v each of dim 512
        self.linear_layer = nn.Linear(d_model,d_model)
    def forward(self,x, mask = None):
        batch_size,seq_length, d_model = x.shape #30, 250, 512
        qkv = self.qkv_layer(x) # 30 x 250 x 1536
        qkv = qkv.view(batch_size,seq_length,self.num_heads,self.head_dim*3)# 30 x 250 x 8 x 192 ; as each head now will be of 64 dim for each of the q k and v thus 64x3 = 192
        qkv = qkv.permute(0,2,1,3)# 30 x 8 x 250 x 192 ; so that we can have division for each head 
        q,k,v = torch.chunk(input=qkv,chunks=3,dim=-1) # 30 x 8 x 250 x 64 each ; division for q k and v
        values, attention = scaled_dot_product_attention(q,k,v,mask) # attention = 30 x 8 x 250 x 250 ; values = 30 x 8 x 250 x 64; note the values is self attended values tensor
        values = values.view(batch_size,seq_length,self.num_heads*self.head_dim) # 30 x 250 x 512 ; concatenating values from all the 8 heads
        out = self.linear_layer(values)
        return out

class CrossMultiHeadAttention(nn.Module):
    def __init__(self, d_model,num_heads) -> None:
        super(CrossMultiHeadAttention,self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model//num_heads
        self.kv_layer = nn.Linear(d_model,2*d_model)
        self.q_layer = nn.Linear(d_model,d_model)
        self.linear_layer = nn.Linear(d_model,d_model)
    def forward(self,x,y,mask = None):
        batch_size, seq_length, d_model = x.size() # 30, 250, 512
        kv = self.kv_layer(x) # 30 x 250 x 1024
        q = self.q_layer(y) # 30 x 250 x 512
        kv = kv.view(batch_size,seq_length,self.num_heads,2*self.head_dim) # 30 x 250 x 8 x 128 ; as 1024 // 8 = 128
        q = q.view(batch_size,seq_length,self.num_heads, self.head_dim) # 30 x 250 x 8 x 64
        kv = kv.permute(0,2,1,3) # 30 x 8 x 250 x 128
        q = q.permute(0,2,1,3) # 30 x 8 x 250 x 64
        k, v = torch.chunk(input=kv,chunks=2,dim=-1) # 30 x 8 x 250 x 64 each
        values, attention = scaled_dot_product_attention(q,k,v,mask=None) # attention = 30 x 8 x 250 x 250 ; values = 30 x 8 x 250 x 64; note the values is self attended values tensor
        values = values.view(batch_size,seq_length,self.num_heads*self.head_dim) # 30 x 250 x 512 ; concatenating values from all the 8 heads
        out = self.linear_layer(values)
        return out


class LayerNormalization(nn.Module):
    def __init__(self, parameter_shape, eps = 1e-5) -> None:
        super(LayerNormalization,self).__init__()
        self.parameter_shape = parameter_shape # [512]
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(parameter_shape))
        self.beta = nn.Parameter(torch.zeros(parameter_shape))

    def forward(self,x): # x: 30 x 250 x 512
        dims = [-(i+1) for i in range(len(self.parameter_shape))]
        mean = torch.mean(x,dim=dims, keepdim=True) # 30 x 250 x 1; keepdim=True -> as we need 30 x 250 x 1 rather than just 30 x 250
        std = torch.std(x,dim=dims,keepdim=True) # 30 x 250 x 1; keepdim=True -> as we need 30 x 250 x 1 rather than just 30 x 250
        y = (x-mean) / std+self.eps # 30 x 250 x 512
        out = self.gamma*y + self.beta # 30 x 250 x 512
        return out
    
class FeedForward(nn.Module):
    def __init__(self, d_model, hidden_neurons, drop_prob) -> None:
        super(FeedForward,self).__init__()
        self.linear1 = nn.Linear(d_model,hidden_neurons) # 512 ; 2048
        self.linear2 = nn.Linear(hidden_neurons,d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)
    def forward(self,x): # 30 x 250 x 512
        x = self.linear1(x) # 30 x 250 x 2048
        x = self.relu(x) # 30 x 250 x 2048
        x = self.dropout(x) # 30 x 250 x 2048
        x = self.linear2(x) # 30 x 250 x 512
        return x


# Positional Encoding

In [11]:
class PositionalEncoding(nn.Module):
    def __init__(self,d_model, max_seq_length):
        super(PositionalEncoding,self).__init__()
        self.max_seq_length = max_seq_length
        self.d_model = d_model
    def forward(self):
        even_i = torch.arange(0,self.d_model,2).float() # 256
        denominator = torch.pow(10000,even_i/self.d_model) # 256
        position = torch.arange(self.max_seq_length).view(self.max_seq_length,1) # 250 x 1
        even_pos = torch.sin(position/denominator) # 250 x 256
        odd_pos = torch.cos(position/denominator) # 250 x 256
        stacked = torch.stack([even_pos, odd_pos], dim=2) # intereaving ; 250 x 256 x 2
        pos_enc = torch.flatten(stacked, start_dim=1,end_dim=2) # flattening last dim to concat; 250 x 512
        return pos_enc

# Sentence Embedding

In [12]:
class SentenceEmbedding(nn.Module):
    def __init__(self,max_seq_length,d_model, START_TOKEN , END_TOKEN, PADDING_TOKEN, language_to_index):
        super(SentenceEmbedding,self).__init__()
        self.vocab_size = len(language_to_index)
        self.max_seq_length = max_seq_length
        self.embedding = nn.Embedding(self.vocab_size, d_model)
        self.language_to_index = language_to_index
        self.position_encoder = PositionalEncoding(d_model, max_seq_length)
        self.dropout = nn.Dropout(p=0.1)
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    def batch_tokenize(self,batch,start_token=True, end_token=True):
        def tokenize(sentence, start_token = True, end_token = True):
            sentence_word_indices = [self.language_to_index[token] for token in list(sentence)]
            if start_token:
                sentence_word_indices.insert(0,self.language_to_index[self.START_TOKEN])
            if end_token:
                sentence_word_indices.append(self.language_to_index[self.END_TOKEN])
            for _ in range(len(sentence_word_indices),self.max_seq_length):
                sentence_word_indices.append(self.language_to_index[self.PADDING_TOKEN])
            return torch.tensor(sentence_word_indices)
        tokenized = list()
        for sentence_num in range(len(batch)):
            tokenized.append(tokenize(batch[sentence_num],start_token,end_token))
        tokenized = torch.stack(tokenized)
        return tokenized.to(self.device)
    def forward(self,x, start_token = False, end_token = True):
        x = self.batch_tokenize(x, start_token, end_token)
        x = self.embedding(x)
        pos = self.position_encoder().to(self.device)
        x = self.dropout(x+pos)
        return x

# Transformers

### encoder

In [13]:

class EncoderLayer(nn.Module):
    def __init__(self, d_model,ffn_neurons,drop_prob,num_heads) -> None:
        super(EncoderLayer,self).__init__()
        self.num_heads = num_heads
        self.drop_prob = drop_prob
        self.ffn_neurons = ffn_neurons
        self.d_model = d_model
        self.mlh = MultiHeadAttention(num_heads = num_heads, d_model = d_model)
        self.dropout = nn.Dropout(p = self.drop_prob)
        self.layer_norm_first = LayerNormalization(parameter_shape = [d_model])
        self.layer_norm_second = LayerNormalization(parameter_shape = [d_model])
        self.ffn = FeedForward(d_model = d_model, hidden_neurons = ffn_neurons, drop_prob = drop_prob)

    def forward(self,x,mask = None):
        # note that x.shape == d_model
        x_copy = x.clone()
        x = self.dropout(self.mlh(x,mask = mask))
        x = self.layer_norm_first(x+x_copy)
        x_copy = x.clone()
        x = self.dropout(self.ffn(x))
        x = self.layer_norm_second(x+x_copy)
        return x
class SequentialEncoder(nn.Sequential):
    def forward(self, *inputs):
        x, self_attention_mask  = inputs
        for module in self._modules.values():
            x = module(x, self_attention_mask)
        return x
class Encoder(nn.Module):
    def __init__(self,d_model,num_heads,ffn_neurons,drop_prob,max_seq_length,language_to_index,START_TOKEN,PADDING_TOKEN, END_TOKEN, num_layers=6):
        super(Encoder,self).__init__()
        self.embedding = SentenceEmbedding(max_seq_length = max_seq_length, d_model = d_model, language_to_index = language_to_index, START_TOKEN = START_TOKEN , END_TOKEN = END_TOKEN, PADDING_TOKEN = PADDING_TOKEN)
        self.layers = SequentialEncoder(*[EncoderLayer(d_model,ffn_neurons,drop_prob,num_heads) for _ in range(num_layers)])
    def forward(self,x,self_attention_mask, start_token, end_token):
        x = self.embedding(x, start_token, end_token)
        x = self.layers(x,self_attention_mask)
        return x

### decoder

In [14]:

class DecoderLayer(nn.Module):
    def __init__(self, d_model,ffn_neurons,drop_prob,num_heads) -> None:
        super(DecoderLayer,self).__init__()
        self.num_heads = num_heads
        self.drop_prob = drop_prob
        self.ffn_neurons = ffn_neurons
        self.d_model = d_model
        self.mlh = MultiHeadAttention(num_heads = num_heads, d_model = d_model)
        self.cmlh = CrossMultiHeadAttention(d_model = d_model,num_heads = num_heads)
        self.dropout = nn.Dropout(p = self.drop_prob)
        self.layer_norm_first = LayerNormalization(parameter_shape = [d_model])
        self.layer_norm_second = LayerNormalization(parameter_shape = [d_model])
        self.layer_norm_third = LayerNormalization(parameter_shape = [d_model])
        self.ffn = FeedForward(d_model = d_model, hidden_neurons = ffn_neurons, drop_prob = drop_prob)

    def forward(self,x, y, self_attention_mask, cross_attention_mask):
        # note that x.shape == d_model
        y_copy = y.clone()
        y = self.dropout(self.mlh(y,mask=self_attention_mask))
        y = self.layer_norm_first(y+y_copy)
        y_copy = y.clone()
        y = self.dropout(self.cmlh(x,y,mask=cross_attention_mask))
        y = self.layer_norm_second(y+y_copy)
        y_copy = y.clone()
        y = self.dropout(self.ffn(y))
        y = self.layer_norm_third(y+y_copy)
        return y
class SequentialDecoder(nn.Sequential):
    def forward(self, *inputs):
        x, y, self_attention_mask, cross_attention_mask = inputs
        for module in self._modules.values():
            y = module(x, y, self_attention_mask, cross_attention_mask)
        return y
class Decoder(nn.Module):
    def __init__(self,d_model,num_heads,ffn_neurons,drop_prob,language_to_index,START_TOKEN, PADDING_TOKEN, END_TOKEN, max_seq_length, num_layers=6):
        super(Decoder,self).__init__()
        self.embedding = SentenceEmbedding(max_seq_length = max_seq_length, d_model = d_model, language_to_index = language_to_index, START_TOKEN = START_TOKEN, END_TOKEN = END_TOKEN, PADDING_TOKEN = PADDING_TOKEN)
        self.layers = SequentialDecoder(*[DecoderLayer(d_model,ffn_neurons,drop_prob,num_heads) for _ in range(num_layers)])
    def forward(self,x,y, self_attention_mask, cross_attention_mask, start_token, end_token):
        '''
        x: 30 x 250 x 512
        y: 30 x 250 x 512
        mask: 250 x 250
        '''
        y = self.embedding(y, start_token, end_token)
        y = self.layers(x, y, self_attention_mask, cross_attention_mask)
        return y

### transformer

In [15]:
class Transformer(nn.Module):
    def __init__(self,
                    english_to_index,
                    odia_to_index,
                    START_TOKEN,
                    END_TOKEN,
                    PADDING_TOKEN,
                    odia_vocab_size,
                    d_model = 512,
                    num_heads = 8,
                    drop_prob = 0.1,
                    batch_size = 32,
                    max_seq_length = 250 ,
                    ffn_neurons = 2048,
                    num_layers = 5,
                    
                ):
        super(Transformer,self).__init__()
        self.encoder = Encoder(d_model=d_model,num_heads=num_heads,ffn_neurons=ffn_neurons,drop_prob= drop_prob,num_layers=num_layers,max_seq_length=max_seq_length,language_to_index=english_to_index, START_TOKEN = START_TOKEN, PADDING_TOKEN = PADDING_TOKEN, END_TOKEN = END_TOKEN)
        self.decoder = Decoder(d_model=d_model,num_heads=num_heads,ffn_neurons=ffn_neurons,drop_prob= drop_prob,num_layers=num_layers,max_seq_length=max_seq_length,language_to_index=odia_to_index, START_TOKEN = START_TOKEN, PADDING_TOKEN = PADDING_TOKEN, END_TOKEN = END_TOKEN)
        self.linear = nn.Linear(d_model,odia_vocab_size)
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        
    def forward(self,x, y,
               encoder_self_attention_mask = None,
               decoder_self_attention_mask = None,
               decoder_cross_attention_mask = None,
               enc_start_token = False,
               enc_end_token = False,
               dec_start_token = False,
               dec_end_token = False
               ):
        x = self.encoder(x = x, self_attention_mask = encoder_self_attention_mask, start_token = enc_start_token, end_token = enc_end_token)
        out = self.decoder(x = x, y = y, self_attention_mask = decoder_self_attention_mask, cross_attention_mask = decoder_cross_attention_mask, start_token=dec_start_token, end_token=dec_end_token)
        out = self.linear(out)
        return out
    
        

# Training 

In [16]:
transformer = Transformer(english_to_index = english_to_index,
                         odia_to_index = odia_to_index,
                         START_TOKEN = '<S>',
                         END_TOKEN = '</S>',
                         PADDING_TOKEN = '<P>',
                         odia_vocab_size = len(odia_vocab))

In [17]:
transformer

Transformer(
  (encoder): Encoder(
    (embedding): SentenceEmbedding(
      (embedding): Embedding(103, 512)
      (position_encoder): PositionalEncoding()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): SequentialEncoder(
      (0): EncoderLayer(
        (mlh): MultiHeadAttention(
          (qkv_layer): Linear(in_features=512, out_features=1536, bias=True)
          (linear_layer): Linear(in_features=512, out_features=512, bias=True)
        )
        (dropout): Dropout(p=0.1, inplace=False)
        (layer_norm_first): LayerNormalization()
        (layer_norm_second): LayerNormalization()
        (ffn): FeedForward(
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): EncoderLayer(
        (mlh): MultiHeadAttention(
          (qkv_layer): Linear(in_features=512, out_

In [18]:
# odia_sentence = ("",)
# english_sentence = ("Should we go and order a pizza or dine in?",)
# for word_counter in range(max_seq_length):
#     encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(english_sentence,odia_sentence)
#     predictions = transformer(
#                             x = english_sentence, 
#                             y = odia_sentence, 
#                             encoder_self_attention_mask = encoder_self_attention_mask.to(device),
#                             decoder_self_attention_mask = decoder_self_attention_mask.to(device),
#                             decoder_cross_attention_mask = decoder_cross_attention_mask.to(device),
#                             enc_start_token = False,
#                             enc_end_token = False,
#                             dec_start_token = True,
#                             dec_end_token = False
#                             )
#     next_token_prob_distribution = predictions[0][word_counter]
#     next_token_index = torch.argmax(next_token_prob_distribution).item()
#     next_token = index_to_odia[next_token_index]
#     odia_sentence = (odia_sentence[0]+next_token, )
#     if next_token == END_TOKEN:
#         break
#     print(f"Evaluation translation (Should we go and order a pizza or dine in?):{odia_sentence}")

In [19]:
loss_criterion = nn.CrossEntropyLoss(ignore_index = odia_to_index[PADDING_TOKEN], reduction = 'none')
for params in transformer.parameters():
    if params.dim()>1:
        nn.init.xavier_uniform_(params)
optim = torch.optim.NAdam(transformer.parameters(), lr = 1e-4)
device = torch.device('cuda') if torch.cuda.is_available() else torch.devicex('cpu')


transformer.train()
transformer.to(device)
total_loss = 10
num_epochs = 10

for epoch in range(num_epochs):
    print(f'Epoch -> {epoch}')
    iterator = iter(train_loader)
    for batch_num, batch in enumerate(iterator):
        transformer.train()
        eng_batch, odia_batch = batch
        encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_batch,odia_batch)
        optim.zero_grad() # resets the gradients of the model parameters
        odia_predictions = transformer(
                                        x = eng_batch, 
                                        y = odia_batch, 
                                        encoder_self_attention_mask = encoder_self_attention_mask.to(device),
                                        decoder_self_attention_mask = decoder_self_attention_mask.to(device),
                                        decoder_cross_attention_mask = decoder_cross_attention_mask.to(device),
                                        enc_start_token = False,
                                        enc_end_token = False,
                                        dec_start_token = True,
                                        dec_end_token = False
                                       )
        true_odia_labels = transformer.decoder.embedding.batch_tokenize(batch = odia_batch,start_token = False, end_token = False)
        loss = loss_criterion(
                                odia_predictions.view(-1,len(odia_vocab)).to(device), # flattens the tensor to single dimension
                                true_odia_labels.view(-1).to(device) # flattens to single dimension
                             ).to(device)
        valid_indices = torch.where(true_odia_labels.view(-1)==odia_to_index[PADDING_TOKEN], False, True)
        loss = loss.sum() / valid_indices.sum() # average loss per valid token in the batch
        loss.backward() # backward propagation of loss
        optim.step() # modifying weights based on gradient
        
        # printing loss credentials
        if batch_num %100 == 0:
            print(f"Iteration number ->{batch_num}, Loss -> {loss.item()}")
            print(f"English:-{eng_batch[0]}")
            print(f"True Odia:-{odia_batch[0]}")
            odia_predicted_sentence = torch.argmax(odia_predictions[0],axis=1)
            predicted_sentence = ''
            for idx in odia_predicted_sentence:
                if idx == odia_to_index[END_TOKEN]:
                    break
                predicted_sentence+=index_to_odia[idx.item()]
            print(f"Predicted Odia:-{predicted_sentence}")
            
            # testing on a random sentence to check accuracy 
            transformer.eval()
            odia_sentence = ("",)
            english_sentence = ("Should we go and order a pizza or dine in?",)
            for word_counter in range(max_seq_length):
                encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(english_sentence,odia_sentence)
                predictions = transformer(
                                        x = english_sentence, 
                                        y = odia_sentence, 
                                        encoder_self_attention_mask = encoder_self_attention_mask.to(device),
                                        decoder_self_attention_mask = decoder_self_attention_mask.to(device),
                                        decoder_cross_attention_mask = decoder_cross_attention_mask.to(device),
                                        enc_start_token = False,
                                        enc_end_token = False,
                                        dec_start_token = True,
                                        dec_end_token = False
                                        )
                next_token_prob_distribution = predictions[0][word_counter]
                next_token_index = torch.argmax(next_token_prob_distribution).item()
                next_token = index_to_odia[next_token_index]
                odia_sentence = (odia_sentence[0]+next_token, )
                if next_token == END_TOKEN:
                    break
            print(f"Evaluation translation (Should we go and order a pizza or dine in?):{odia_sentence}")
            print("_______________________________________________________________________________________")
                
           
            
            
        
        
        
        

Epoch -> 0
Iteration number ->0, Loss -> 5.8956708908081055
English:-Both of them died on spot.
True Odia:-ଏହାଫଳରେ ଉଭୟଙ୍କର ଘଟଣାସ୍ଥଳରେ ମୃତ୍ୟୁ ଘଟିଥିଲା।
Predicted Odia:-୩:/:<"ଞ୪ଞଞଞ""ଧଞଞଞଞଧଞଞଞ"ଧଠଧଧଧ"ଞଞ7777.7"ଗୋ"୦ଗୋୋ7ୋୋୋୋଟ"ୋ3ୋ$ସୋସସ”ୋୱ–––/–ସ–ୱ୧&ସ4–ୱ–ି–––ି&–––ସ–––––ଳ]]]]]]]]]]]]]"–]3]]]]]]]]]]ଃ]]]ସସଡସୱ–୪–୪ଢ–ସ୩୪୪ସ।୩୪ସଡସସ&ସସସ୪ଡ।୩ସଔସହ୨ହହହହ’ଔ୨ହ’ବଔଞଞଞଞଞଔଞହ୨୨ହହ]]ହହ]##7#ୠଣଣ#ଣ&#ଝୀ##ଣ######ଣ##ଝ#ଣଣ୪ଞ^˚ହ""ହ&ହ\ଖହ୧"୫ହହଃହ]ଗ&ହହହ\&(^˚"ହ&
Evaluation translation (Should we go and order a pizza or dine in?):('                                                                                                                             ସ୩୩୩୩୩୩୩୩୩୩୩ସସସ୩୩୩୩୩ସସସସ୩୩୩୩୩୩୩ସ{{{{{{{ର ଠଠଠିଠ ି  {    ଞଞ{{   ୍ଣଣଣଣଣଣଣଣଣଣଣଣଣଣଣଣଣଣଣଣ   #       7777ଡଡ""    {{ହଗ66ହହହହହହହହହହହହହ',)
_______________________________________________________________________________________
Iteration number ->100, Loss -> 3.484609842300415
English:-The Centre government has taken various steps for the welfare of all sections of the society.
True Odia:-କେନ୍ଦ

In [None]:
MODEL_SAVE_PATH = '/kaggle/working/'
torch.save(transformer.state_dict(), MODEL_SAVE_PATH)