English to Hindi Machine Translation

In [None]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import dataset

import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the tokenizer that ships with the Helsinki-NLP OPUS-MT English->Hindi checkpoint.
# Only the tokenizer is taken from the checkpoint in this notebook; the translation
# model itself is defined and trained from scratch in the cells below.
from transformers import AutoTokenizer
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]



In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

# !pip install pandas
# !pip install transformers datasets sentencepiece #sacremoses
# # !pip install -qU torchvision
# !pip install matplotlib

In [None]:
# file_path = 'Dataset_English_Hindi.csv'
file_path = '/kaggle/input/hi-en-dataset/Dataset_English_Hindi_new.csv'

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with optional causal masking.

    Every head has width ``d_k`` for queries, keys and values alike (d_v == d_k).

    Args:
        d_k: Per-head projection width.
        d_model: Width of the incoming and outgoing feature vectors.
        n_heads: Number of attention heads.
        max_len: Side length of the pre-computed causal mask (only used when
            ``causal`` is true).
        causal: When true, a query at position ``i`` may only attend to key
            positions ``<= i``.
    """

    def __init__(self,d_k, d_model, n_heads,max_len,causal=False):
        super().__init__()
        self.d_k = d_k
        self.n_heads = n_heads

        inner_dim = d_k * n_heads

        # Input projections (created in key/query/value order).
        self.key = nn.Linear(d_model, inner_dim)
        self.query = nn.Linear(d_model, inner_dim)
        self.value = nn.Linear(d_model, inner_dim)

        # Output projection back to the model width.
        self.fc = nn.Linear(inner_dim, d_model)

        self.causal = causal
        if causal:
            # Lower-triangular matrix of ones, stored as (1, 1, max_len, max_len)
            # so it broadcasts over the batch and head dimensions.
            lower_tri = torch.ones(max_len, max_len).tril()
            self.register_buffer("causal_mask", lower_tri[None, None])

    def forward(self, q,k,v,pad_mask=None):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q: Query input, (N, T_output, d_model).
            k: Key input, (N, T_input, d_model).
            v: Value input, (N, T_input, d_model).
            pad_mask: Optional (N, T_input) tensor; key positions whose entry
                equals 0 are excluded from attention.

        Returns:
            Tensor of shape (N, T_output, d_model).
        """
        q = self.query(q)
        k = self.key(k)
        v = self.value(v)

        batch = q.shape[0]
        len_q, len_k = q.shape[1], k.shape[1]
        head_dims = (self.n_heads, self.d_k)

        # (N, T, h*d_k) -> (N, h, T, d_k)
        q = q.view(batch, len_q, *head_dims).transpose(1, 2)
        k = k.view(batch, len_k, *head_dims).transpose(1, 2)
        v = v.view(batch, len_k, *head_dims).transpose(1, 2)

        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        minus_inf = float('-inf')

        if pad_mask is not None:
            # Broadcast the key mask over heads and query positions.
            blocked_keys = pad_mask[:, None, None, :] == 0
            scores = scores.masked_fill(blocked_keys, minus_inf)

        if self.causal:
            future = self.causal_mask[:, :, :len_q, :len_k] == 0
            scores = scores.masked_fill(future, minus_inf)

        weights = scores.softmax(dim=-1)

        # (N, h, T_output, d_k) -> (N, T_output, h*d_k)
        context = torch.matmul(weights, v).transpose(1, 2).contiguous()
        context = context.view(batch, len_q, self.d_k * self.n_heads)

        return self.fc(context)



In [None]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention and a feed-forward net, each followed
    by a residual connection and LayerNorm, with dropout on the block output.

    Args:
        d_k: Per-head attention width.
        d_model: Feature width of the block input and output.
        n_heads: Number of attention heads.
        max_len: Forwarded to the attention layer.
        dropout_prob: Dropout probability used inside the feed-forward net and
            on the block output.
    """

    def __init__(self, d_k, d_model, n_heads, max_len, dropout_prob=0.1):
        super().__init__()

        ff_width = d_model * 4

        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.mha = MultiHeadAttention(
            d_k, d_model, n_heads, max_len, causal=False)
        # Position-wise feed-forward network with a 4x hidden expansion.
        self.ann = nn.Sequential(
            nn.Linear(d_model, ff_width),
            nn.GELU(),
            nn.Linear(ff_width, d_model),
            nn.Dropout(dropout_prob),
        )
        self.dropout = nn.Dropout(p=dropout_prob)

    def forward(self,x,pad_mask=None):
        """Apply the block to ``x``; ``pad_mask`` is passed to self-attention."""
        attended = self.mha(x, x, x, pad_mask)
        x = self.ln1(x + attended)

        transformed = self.ann(x)
        x = self.ln2(x + transformed)

        return self.dropout(x)

In [None]:
class DecoderBlock(nn.Module):
    """One decoder layer: causal self-attention, attention over the encoder
    output, and a feed-forward net. Each sub-layer is followed by a residual
    connection and LayerNorm; dropout is applied to the block output.

    Args:
        d_k: Per-head attention width.
        d_model: Feature width of the block input and output.
        n_heads: Number of attention heads.
        max_len: Forwarded to both attention layers.
        dropout_prob: Dropout probability used inside the feed-forward net and
            on the block output.
    """

    def __init__(self, d_k, d_model, n_heads, max_len, dropout_prob=0.1):
        super().__init__()

        ff_width = d_model * 4

        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.ln3 = nn.LayerNorm(d_model)
        # mha1: causal self-attention over the decoder sequence.
        self.mha1 = MultiHeadAttention(
            d_k, d_model, n_heads, max_len, causal=True)
        # mha2: non-causal attention whose keys/values are the encoder output.
        self.mha2 = MultiHeadAttention(
            d_k, d_model, n_heads, max_len, causal=False)
        self.ann = nn.Sequential(
            nn.Linear(d_model, ff_width),
            nn.GELU(),
            nn.Linear(ff_width, d_model),
            nn.Dropout(dropout_prob),
        )
        self.dropout = nn.Dropout(p=dropout_prob)

    def forward(self,enc_output,dec_input,enc_mask=None, dec_mask=None):
        """Run the block.

        Args:
            enc_output: Encoder output used as keys/values of the second
                attention layer.
            dec_input: Decoder-side input sequence.
            enc_mask: Padding mask for ``enc_output`` (0 = ignore).
            dec_mask: Padding mask for ``dec_input`` (0 = ignore).
        """
        self_attended = self.mha1(dec_input, dec_input, dec_input, dec_mask)
        x = self.ln1(dec_input + self_attended)

        # Attention including the encoder output.
        cross_attended = self.mha2(x, enc_output, enc_output, enc_mask)
        x = self.ln2(x + cross_attended)

        x = self.ln3(x + self.ann(x))
        return self.dropout(x)

In [None]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a sequence, then dropout.

    Even feature indices hold ``sin(pos * w_i)`` and odd indices hold
    ``cos(pos * w_i)`` with ``w_i = 10000 ** (-2i / d_model)``.

    Args:
        d_model: Feature width of the input. Both even and odd widths are
            supported.
        max_len: Number of positions to pre-compute; inputs must not be longer.
        dropout_prob: Dropout probability applied after adding the encoding.
    """

    def __init__(self, d_model, max_len=2048,dropout_prob=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout_prob)

        position = torch.arange(max_len).unsqueeze(1)
        exp_term = torch.arange(0,d_model,2)
        div_term = torch.exp(exp_term*(-math.log(10000.0)/d_model))
        # (max_len, ceil(d_model / 2)) table of angles pos * w_i
        angles = position * div_term

        pe = torch.zeros(1,max_len,d_model)
        pe[0, : ,0::2] = torch.sin(angles)
        # There are only floor(d_model / 2) odd feature indices, so for an odd
        # d_model the last frequency has no cosine slot and must be dropped.
        pe[0, : ,1::2] = torch.cos(angles[:, :d_model // 2])
        # Buffer: moves with the module across devices but is not trained.
        self.register_buffer('pe',pe)

    def forward(self,x):
        """Return ``dropout(x + pe[:, :T])`` for ``x`` of shape (N, T, d_model)."""
        x=x + self.pe[:, :x.size(1), :]
        return self.dropout(x)


In [None]:
class Encoder(nn.Module):
    """Transformer encoder: token embedding, positional encoding, a stack of
    ``EncoderBlock`` layers and a final LayerNorm.

    Args:
        vocab_size: Number of rows in the token embedding table.
        max_len: Maximum sequence length supported by the positional encoding.
        d_k: Per-head attention width.
        d_model: Embedding / feature width.
        n_heads: Number of attention heads per block.
        n_layers: Number of encoder blocks.
        dropout_prob: Dropout probability used throughout.
    """

    def __init__(self,
                 vocab_size,
                 max_len,
                 d_k,
                 d_model,
                 n_heads,
                 n_layers,
                 dropout_prob):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size,d_model)
        self.pos_encoding = PositionalEncoding(d_model,max_len,dropout_prob)
        # nn.ModuleList rather than nn.Sequential: each block takes the padding
        # mask as a second argument, so the stack is iterated manually in
        # forward() and could never be invoked as a Sequential. Sub-module
        # names ("transformer_blocks.0", ...) are the same for both containers.
        self.transformer_blocks = nn.ModuleList([
            EncoderBlock(
                d_k,
                d_model,
                n_heads,
                max_len,
                dropout_prob) for _ in range(n_layers)])
        self.ln = nn.LayerNorm(d_model)

    def forward (self, x, pad_mask=None):
        """Encode token ids.

        Args:
            x: Token ids, (N, T).
            pad_mask: Optional (N, T) mask; positions equal to 0 are ignored
                as attention keys.

        Returns:
            One ``d_model``-wide vector per input position, (N, T, d_model).
        """
        x = self.embedding(x)
        x = self.pos_encoding(x)
        for block in self.transformer_blocks:
            x = block(x,pad_mask)

        x = self.ln(x)
        return x


In [None]:
class Decoder(nn.Module):
    """Transformer decoder: token embedding, positional encoding, a stack of
    ``DecoderBlock`` layers, a final LayerNorm and a projection to vocabulary
    logits.

    Args:
        vocab_size: Size of the embedding table and of the output logits.
        max_len: Maximum sequence length supported by the positional encoding
            and the causal masks.
        d_k: Per-head attention width.
        d_model: Embedding / feature width.
        n_heads: Number of attention heads per block.
        n_layers: Number of decoder blocks.
        dropout_prob: Dropout probability used throughout.
    """

    def __init__(self,
                 vocab_size,
                 max_len,
                 d_k,
                 d_model,
                 n_heads,
                 n_layers,
                 dropout_prob):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size,d_model)
        self.pos_encoding = PositionalEncoding(d_model,max_len,dropout_prob)
        # nn.ModuleList rather than nn.Sequential: each block takes four
        # arguments, so the stack is iterated manually in forward() and could
        # never be invoked as a Sequential. Sub-module names
        # ("transformer_blocks.0", ...) are the same for both containers.
        self.transformer_blocks = nn.ModuleList([
            DecoderBlock(
                d_k,
                d_model,
                n_heads,
                max_len,
                dropout_prob) for _ in range(n_layers)])
        self.ln = nn.LayerNorm(d_model)
        self.fc = nn.Linear(d_model,vocab_size)

    def forward (self,enc_output,dec_input,enc_mask=None,dec_mask=None):
        """Compute next-token logits.

        Args:
            enc_output: Encoder output attended to by every block.
            dec_input: Decoder token ids, (N, T).
            enc_mask: Padding mask for ``enc_output`` (0 = ignore).
            dec_mask: Padding mask for ``dec_input`` (0 = ignore).

        Returns:
            Logits of shape (N, T, vocab_size).
        """
        x = self.embedding(dec_input)
        x = self.pos_encoding(x)
        for block in self.transformer_blocks:
            x = block(enc_output,x,enc_mask,dec_mask)
        x = self.ln(x)
        x = self.fc(x)
        return x


In [None]:
class Transformer(nn.Module):
    """Encoder-decoder wrapper that chains ``encoder`` into ``decoder``."""

    def __init__(self,encoder,decoder):
        super().__init__()
        self.encoder, self.decoder = encoder, decoder

    def forward(self,enc_input, dec_input, enc_mask, dec_mask):
        """Encode ``enc_input`` and decode ``dec_input`` against the result.

        The encoder mask is used twice: by the encoder itself and by the
        decoder when it attends to the encoder output.
        """
        memory = self.encoder(enc_input, enc_mask)
        return self.decoder(memory, dec_input, enc_mask, dec_mask)

In [None]:
# Small throw-away model with arbitrary vocabulary sizes, used only to
# smoke-test the architecture on random inputs before the real data is loaded.
# d_k * n_heads == d_model here (16 * 4 == 64).
encoder = Encoder(
    vocab_size=20_000,
    max_len=512,
    d_k=16,
    d_model=64,
    n_heads=4,
    n_layers=2,
    dropout_prob=0.1
)

decoder = Decoder(
    vocab_size=10_000,
    max_len=512,
    d_k=16,
    d_model=64,
    n_heads=4,
    n_layers=2,
    dropout_prob=0.1
)

transformer = Transformer(encoder,decoder)

In [None]:
# Use the first GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
# `transformer` holds these same two module objects, so moving them moves the
# whole model. The last line's return value is what the cell displays.
encoder.to(device)
decoder.to(device)


cuda:0


Decoder(
  (embedding): Embedding(10000, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): DecoderBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln3): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha1): MultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (mha2): MultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
 

In [None]:
# Smoke test: random token ids for a batch of 8 sequences
# (encoder length 512, decoder length 256).
xe = np.random.randint(0,20_000,size=(8,512))
xe_t = torch.tensor(xe).to(device)

xd = np.random.randint(0,10_000,size=(8,256))
xd_t = torch.tensor(xd).to(device)

# Padding masks: 1 = real token, 0 = padding. The second half of every
# encoder and decoder sequence is marked as padding.
maske = np.ones((8,512))
maske[:,256:] = 0
maske_t = torch.tensor(maske).to(device)

maskd = np.ones((8,256))
maskd[:,128:] = 0
maskd_t = torch.tensor(maskd).to(device)

# Expect one logit vector per decoder position: (8, 256, 10_000).
out = transformer(xe_t,xd_t,maske_t,maskd_t)
out.shape


torch.Size([8, 256, 10000])

In [None]:
out

tensor([[[ 0.1596, -0.1098, -1.1206,  ..., -0.2964, -1.1945,  0.5427],
         [-0.1700, -0.8695, -0.4522,  ..., -0.2257, -0.6337, -0.0173],
         [ 0.1400,  0.3416, -0.2323,  ...,  0.6505,  0.1972,  0.6810],
         ...,
         [ 0.8121,  0.8171, -0.2462,  ..., -0.3110, -0.2752, -1.1337],
         [ 0.4589, -0.3363, -0.9042,  ..., -0.4517, -1.2919,  0.0859],
         [ 0.0511,  0.5722, -0.2062,  ...,  0.1535, -0.3495, -0.0922]],

        [[ 0.0532, -0.3018, -0.4368,  ..., -0.0891, -0.4766,  0.1321],
         [-0.3905, -0.1792, -0.1729,  ..., -0.5487, -1.3988, -0.1404],
         [ 0.8613,  0.2950, -0.2652,  ..., -0.2193, -0.9727, -0.0623],
         ...,
         [ 0.0950, -0.0636, -0.5539,  ..., -0.3589, -0.6688,  1.6543],
         [ 0.6173, -0.0767, -0.5181,  ...,  0.3933, -0.8661, -0.1873],
         [-0.3922,  0.9883, -0.5247,  ...,  0.6861, -1.0218,  1.0023]],

        [[-0.2075,  0.1944, -0.8772,  ..., -0.5914, -0.3050,  0.0079],
         [-0.2455,  0.5779,  0.0396,  ...,  0

In [None]:
import pandas as pd
df = pd.read_csv(file_path)
df.head()

In [None]:
df.shape

(130476, 2)

In [None]:
df= df.iloc[:100000] # keep only the first 100,000 of the 130,476 rows (not the full dataset)

In [None]:
#df = pd.read_csv(file_path)
df.to_csv('hien.csv',index=None)


In [None]:
#!head hien.csv

In [None]:
from datasets import load_dataset
raw_dataset = load_dataset('csv',data_files='hien.csv')

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 100000
    })
})

In [None]:
split = raw_dataset['train'].train_test_split(test_size=0.3,seed=42)
split

DatasetDict({
    train: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 70000
    })
    test: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 30000
    })
})

In [None]:
en_sentence = split["train"][0]["English"]
hi_sentence = split["train"][0]["Hindi"]

inputs = tokenizer(en_sentence)
targets = tokenizer(text_target=hi_sentence)

tokenizer.convert_ids_to_tokens(targets['input_ids'])

['▁इसके',
 '▁साथ',
 '▁ही',
 '▁उसे',
 '▁एक',
 '▁दंड',
 '▁धारण',
 '▁करने',
 '▁के',
 '▁लिए',
 '▁दिया',
 '▁जाता',
 '▁है',
 '▁और',
 '▁',
 '?',
 '▁',
 'दर्भ',
 '▁',
 '?',
 '▁नामक',
 '▁विशेष',
 '▁घास',
 '▁की',
 '▁एक',
 '▁मुद्',
 'रिका',
 '▁पहना',
 'ई',
 '▁जाती',
 '▁है',
 '▁जो',
 '▁वह',
 '▁अपने',
 '▁दाहिने',
 '▁हाथ',
 '▁की',
 '▁अनाम',
 'िका',
 '▁में',
 '▁पहनता',
 '▁है',
 '▁',
 '.',
 '</s>']

In [None]:
hi_sentence

'इसके साथ ही उसे एक दंड धारण करने के लिए दिया जाता है और ? दर्भ ? नामक विशेष घास की एक मुद्रिका पहनाई जाती है जो वह अपने दाहिने हाथ की अनामिका में पहनता है .'

In [None]:
max_input_length = 256
max_target_length = 256


In [None]:
def preprocess_function(batch):
    """Tokenize a batch of English/Hindi sentence pairs into fixed-length ids.

    Args:
        batch: Mapping with parallel ``'English'`` and ``'Hindi'`` columns, as
            passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the English side (``input_ids`` and
        ``attention_mask``) with the Hindi token ids attached under
        ``'labels'``. Both sides are truncated/padded to
        ``max_input_length`` / ``max_target_length``.
    """
    # Ensure that the input is in string format
    english_texts = [str(text) for text in batch['English']]
    hindi_texts = [str(text) for text in batch['Hindi']]

    # Tokenize the inputs (source language)
    model_inputs = tokenizer(
        english_texts,
        max_length=max_input_length,
        truncation=True,
        padding='max_length',  # Add padding to ensure all sequences have the same length
        return_tensors="pt"    # Return PyTorch tensors
    )

    # Tokenize the labels/targets. They must go through `text_target=` so the
    # tokenizer segments them as target-language text: this checkpoint ships
    # separate source.spm and target.spm models, and passing Hindi as ordinary
    # (source) text mis-segments it. The inference cells already encode
    # decoder-side text with `text_target=`.
    labels = tokenizer(
        text_target=hindi_texts,
        max_length=max_target_length,
        truncation=True,
        padding='max_length',  # Add padding to ensure all sequences have the same length
        return_tensors="pt"    # Return PyTorch tensors
    )

    # Attach the labels to the model inputs
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs


In [None]:
tokenized_datasets = split.map(
    preprocess_function,
    batched=True,
    remove_columns=split["train"].column_names,
)

Map:   0%|          | 0/70000 [00:00<?, ? examples/s]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 70000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 30000
    })
})

In [None]:
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer)

2024-04-13 17:17:24.058174: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-13 17:17:24.058320: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-13 17:17:24.196716: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(0,5)])
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [None]:
batch['input_ids']

tensor([[ 8024,    44,     2,  ..., 61949, 61949, 61949],
        [  285,     4, 20338,  ..., 61949, 61949, 61949],
        [ 1089,  4718,    16,  ..., 61949, 61949, 61949],
        [   81,   319,  1199,  ..., 61949, 61949, 61949],
        [  211,   308,  2303,  ..., 61949, 61949, 61949]])

In [None]:
batch['attention_mask']

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

In [None]:
batch['labels']

tensor([[   44,  7173,     1,  ..., 61949, 61949, 61949],
        [   44,   680,   428,  ..., 61949, 61949, 61949],
        [   44,  4499,  1400,  ..., 61949, 61949, 61949],
        [   44, 35004,   800,  ..., 61949, 61949, 61949],
        [   44,  3605,  2703,  ..., 61949, 61949, 61949]])

In [None]:
tokenizer.all_special_ids

[0, 1, 61949]

In [None]:
tokenizer.all_special_tokens

['</s>', '<unk>', '<pad>']

In [None]:
tokenizer('<pad>')

{'input_ids': [61949, 0], 'attention_mask': [1, 1]}

In [None]:
from torch.utils.data import DataLoader

# Training batches: 16 examples each, order reshuffled by the loader.
train_loader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator
)

# Validation batches: same batch size, no shuffling requested.
valid_loader = DataLoader(
    tokenized_datasets["test"],
    batch_size=16,
    collate_fn=data_collator
)

In [None]:
for batch in train_loader:
    for k,v in batch.items():
        print("k: ",k,"v.shape: ",v.shape)
    break

k:  input_ids v.shape:  torch.Size([16, 256])
k:  attention_mask v.shape:  torch.Size([16, 256])
k:  labels v.shape:  torch.Size([16, 256])


In [None]:
tokenizer.vocab_size

61950

In [None]:
tokenizer.decode([0])

'</s>'

In [None]:
# Register "<s>" as an extra special token; the training and decoding code
# below uses it as the start-of-sequence token fed to the decoder.
tokenizer.add_special_tokens({"cls_token": "<s>"})

1

In [None]:
tokenizer("<s>")

{'input_ids': [61950, 0], 'attention_mask': [1, 1]}

In [None]:
tokenizer.vocab_size

61950

In [None]:
# Real model. The embedding tables must cover every id the tokenizer can emit,
# including the "<s>" token added after loading. `tokenizer.vocab_size` does
# not count added tokens, so size the tables with `len(tokenizer)` (base
# vocabulary + added tokens) instead of a hand-maintained `vocab_size + 1`.
encoder = Encoder(
    vocab_size=len(tokenizer),
    max_len=512,
    d_k=16,
    d_model=64,
    n_heads=4,
    n_layers=2,
    dropout_prob=0.1
)

decoder = Decoder(
    vocab_size=len(tokenizer),
    max_len=512,
    d_k=16,
    d_model=64,
    n_heads=4,
    n_layers=2,
    dropout_prob=0.1)

transformer = Transformer(encoder,decoder)

In [None]:
# Use the first GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
# `transformer` holds these same two module objects, so moving them moves the
# whole model. The last line's return value is what the cell displays.
encoder.to(device)
decoder.to(device)


cuda:0


Decoder(
  (embedding): Embedding(61951, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): DecoderBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln3): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha1): MultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (mha2): MultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
 

In [None]:
# Loss and optimizer
# Targets are padded with the tokenizer's pad id; those positions must not
# contribute to the loss. Read the id from the tokenizer instead of
# hard-coding it.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.Adam(transformer.parameters())

In [None]:
from datetime import datetime


def _build_decoder_inputs(targets, start_token_id, pad_token_id, ignore_index):
    """Build teacher-forcing decoder inputs and their padding mask from targets.

    The targets are shifted right by one position and ``start_token_id`` is
    placed in the first slot. Any ``ignore_index`` value is replaced by
    ``pad_token_id`` so every id can be embedded. The returned mask holds 1 for
    real tokens and 0 for padding.
    """
    # torch.roll returns a new tensor, so `targets` itself is left untouched.
    # The token that wraps around into slot 0 is overwritten by the start token.
    dec_input = torch.roll(targets, shifts=1, dims=1)
    dec_input[:, 0] = start_token_id
    dec_input = dec_input.masked_fill(dec_input == ignore_index, pad_token_id)
    dec_mask = (dec_input != pad_token_id).to(dec_input.dtype)
    return dec_input, dec_mask


def _batch_loss(model, criterion, batch, batch_device, start_token_id, pad_token_id):
    """Move one batch to ``batch_device``, run the model and return the loss."""
    batch = {k: v.to(batch_device) for k, v in batch.items()}
    targets = batch['labels']
    ignore_index = getattr(criterion, 'ignore_index', pad_token_id)
    dec_input, dec_mask = _build_decoder_inputs(
        targets, start_token_id, pad_token_id, ignore_index)

    outputs = model(batch['input_ids'], dec_input, batch['attention_mask'], dec_mask)
    # The loss expects the class dimension second: (N, T, V) -> (N, V, T).
    return criterion(outputs.transpose(2, 1), targets)


def train(model, criterion, optimizer, train_loader, valid_loader, epochs,
          start_token_id=None, pad_token_id=None):
    """Train ``model`` with teacher forcing and evaluate it after every epoch.

    Args:
        model: Callable as ``model(enc_input, dec_input, enc_mask, dec_mask)``
            returning logits of shape (N, T, V).
        criterion: Loss taking logits of shape (N, V, T) and targets (N, T).
        optimizer: Optimizer over the parameters of ``model``.
        train_loader: Iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        valid_loader: Same format as ``train_loader``; used for evaluation only.
        epochs: Number of passes over ``train_loader``.
        start_token_id: Id fed to the decoder at position 0. Defaults to the
            global tokenizer's ``cls_token_id`` (the added "<s>" token).
        pad_token_id: Padding id on the decoder side. Defaults to the global
            tokenizer's ``pad_token_id``.

    Returns:
        ``(train_losses, test_losses)``: two NumPy arrays of length ``epochs``
        holding the mean batch loss of each epoch.
    """
    if start_token_id is None:
        start_token_id = tokenizer.cls_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id

    # Run on whatever device the model already lives on.
    model_device = next(model.parameters()).device

    train_losses = np.zeros(epochs)
    test_losses = np.zeros(epochs)

    for it in range(epochs):
        model.train()
        t0 = datetime.now()
        train_loss = []
        for batch in train_loader:
            optimizer.zero_grad()

            # forward
            loss = _batch_loss(
                model, criterion, batch, model_device, start_token_id, pad_token_id)

            # backward
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())

        train_loss = np.mean(train_loss)

        model.eval()
        test_loss = []
        # No gradients are needed for evaluation; skipping the autograd graph
        # saves memory and time.
        with torch.no_grad():
            for batch in valid_loader:
                loss = _batch_loss(
                    model, criterion, batch, model_device, start_token_id, pad_token_id)
                test_loss.append(loss.item())
        test_loss = np.mean(test_loss)

        train_losses[it] = train_loss
        test_losses[it] = test_loss

        dt = datetime.now() - t0
        # Adjacent f-strings are concatenated; a backslash continuation inside
        # the literal would inject the next line's indentation into the output.
        print(f'Epoch{it+1}/{epochs}, Train Loss: {train_loss:.4f}, '
              f'Test Loss: {test_loss:.4f}, Duration: {dt}')
    return train_losses, test_losses




In [None]:
train_losses, test_losses = train(transformer, criterion, optimizer, train_loader, valid_loader, epochs=25)

Epoch1/25, Train Loss: 2.6164,               Test Loss: 2.2359, Duration: 0:22:21.558546
Epoch2/25, Train Loss: 2.2488,               Test Loss: 2.0521, Duration: 0:22:21.571902
Epoch3/25, Train Loss: 2.1266,               Test Loss: 1.9623, Duration: 0:22:21.909780
Epoch4/25, Train Loss: 2.0587,               Test Loss: 1.9052, Duration: 0:22:22.576786
Epoch5/25, Train Loss: 2.0135,               Test Loss: 1.8655, Duration: 0:22:20.690815
Epoch6/25, Train Loss: 1.9803,               Test Loss: 1.8378, Duration: 0:22:20.108449
Epoch7/25, Train Loss: 1.9536,               Test Loss: 1.8109, Duration: 0:22:21.293557
Epoch8/25, Train Loss: 1.9334,               Test Loss: 1.7905, Duration: 0:22:20.147807
Epoch9/25, Train Loss: 1.9151,               Test Loss: 1.7758, Duration: 0:22:19.849807
Epoch10/25, Train Loss: 1.9005,               Test Loss: 1.7626, Duration: 0:22:21.044682
Epoch11/25, Train Loss: 1.8871,               Test Loss: 1.7517, Duration: 0:22:20.962966
Epoch12/25, Train L

In [None]:
input_sentence = split['test'][10]['English']
input_sentence

In [None]:
enc_input = tokenizer(input_sentence, return_tensors='pt')
enc_input

{'input_ids': tensor([[  238,  1685, 17614, 10283,  2411,     4,  6227,     8, 48899, 20542,
           916,    23,   747, 48899, 19268,  6439,     3,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
dec_input_str = '<s>'

dec_input = tokenizer(text_target=dec_input_str, return_tensors='pt')
dec_input

{'input_ids': tensor([[61950,     0]]), 'attention_mask': tensor([[1, 1]])}

In [None]:
enc_input.to(device)
dec_input.to(device)
output = transformer(
    enc_input['input_ids'],
    dec_input['input_ids'][:, :-1],
    enc_input['attention_mask'],
    dec_input['attention_mask'][:, :-1],
)
output

tensor([[[ -1.8439,  -2.4967,  -3.4305,  ..., -14.9776, -15.0811, -15.0501]]],
       device='cuda:0', grad_fn=<ViewBackward0>)

In [None]:
output.shape

torch.Size([1, 1, 61951])

In [None]:
enc_output = encoder(enc_input['input_ids'], enc_input['attention_mask'])
enc_output.shape

torch.Size([1, 18, 64])

In [None]:
dec_output = decoder(
    enc_output,
    dec_input['input_ids'][:,:-1],
    enc_input['attention_mask'],
    dec_input['attention_mask'][:, :-1]
    )
dec_output.shape

torch.Size([1, 1, 61951])

In [None]:
torch.allclose(output, dec_output)

True

In [None]:
# Greedy decoding by hand. Start from the "<s>" token only: the tokenizer
# appended "</s>" (id 0) to the prompt, so drop the last position.
dec_input_ids = dec_input['input_ids'][:, :-1]
dec_attn_mask = dec_input['attention_mask'][:, :-1]

# Generate at most 32 tokens, one decoder pass per token.
for _ in range(32):
    dec_output = decoder(
        enc_output,
        dec_input_ids,
        enc_input['attention_mask'],
        dec_attn_mask,
    )

    # Take the highest-scoring token at the last position and append it.
    prediction_id = torch.argmax(dec_output[:, -1, :], axis=-1)
    dec_input_ids = torch.hstack((dec_input_ids, prediction_id.view(1,1)))

    # A single generated sequence contains no padding: attend to everything.
    dec_attn_mask = torch.ones_like(dec_input_ids)

    # Id 0 decodes to "</s>" (end of sequence): stop generating.
    if prediction_id == 0:
        break

In [None]:
tokenizer.decode(dec_input_ids[0])

'<s> <unk> ्<unk> दो दो दो दो महान के <unk> ्<unk> का <unk>'

In [None]:
split['test'][10]['Hindi']

'ताज महल के दक्षिण में स्थित एक छोटी बस्ती को ताजगंज कहते हैं।'

In [None]:
def translate(input_sentence, max_new_tokens=32):
    """Greedily translate one English sentence and print the result.

    Uses the global ``tokenizer``, ``encoder``, ``decoder`` and ``device``.
    Dropout is only disabled when the models are in eval mode; this function
    does not change their mode.

    Args:
        input_sentence: English text to translate.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded translation (also printed), without the start token and
        without a trailing end-of-sequence token.
    """
    # Special ids come from the tokenizer instead of being hard-coded:
    # "<s>" (added as cls_token) starts the sequence, "</s>" ends it.
    start_id = tokenizer.cls_token_id
    eos_id = tokenizer.eos_token_id

    # Inference only: do not build autograd graphs.
    with torch.no_grad():
        # Encode the source sentence once; it is reused at every step.
        enc_input = tokenizer(input_sentence, return_tensors='pt').to(device)
        enc_output = encoder(enc_input['input_ids'], enc_input['attention_mask'])

        dec_input_ids = torch.tensor([[start_id]], device=device)

        for _ in range(max_new_tokens):
            # No padding in a single generated sequence: attend to everything.
            dec_attn_mask = torch.ones_like(dec_input_ids)
            dec_output = decoder(
                enc_output,
                dec_input_ids,
                enc_input['attention_mask'],
                dec_attn_mask,
            )

            # Greedy choice at the last position, appended to the input.
            prediction_id = torch.argmax(dec_output[:, -1, :], dim=-1)
            dec_input_ids = torch.hstack((dec_input_ids, prediction_id.view(1,1)))

            if prediction_id.item() == eos_id:
                break

    # Drop the start token, and the end-of-sequence token if one was produced,
    # so the printed text does not end in a literal "</s>".
    generated_ids = dec_input_ids[0, 1:]
    if generated_ids.numel() > 0 and generated_ids[-1].item() == eos_id:
        generated_ids = generated_ids[:-1]

    translation = tokenizer.decode(generated_ids)
    print(translation)
    return translation




In [None]:
translate("How are you")

तुम कैसे हैं</s>
