In [179]:

# NOTE: If you are running this notebook on Google Colab,
#       then uncomment the two lines below and then run this cell!

!pip install datasets evaluate --upgrade -q
!python -m spacy download de_core_news_sm -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [180]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import spacy
import datasets
import torchtext
import tqdm
import evaluate

In [181]:
seed = 1234
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

## Load Dataset in correct format

In [182]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [183]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/Hindi_English_Truncated_Corpus 1.csv", index_col = False)


  df = pd.read_csv("/content/drive/MyDrive/Hindi_English_Truncated_Corpus 1.csv", index_col = False)


In [184]:
df = df[['english_sentence','hindi_sentence']]

In [185]:
if df['english_sentence'].dtype == float:
  df = df.dropna(subset=[['english_sentence', 'hindi_sentence']])

In [186]:
df.head()

Unnamed: 0,english_sentence,hindi_sentence
0,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [187]:
data = df.rename(columns={'english_sentence':'en', 'hindi_sentence':'hi'})
data.head()

Unnamed: 0,en,hi
0,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [188]:
from datasets import Dataset, DatasetDict, Features

In [189]:
data.reset_index(drop = True,inplace = True)

In [190]:
data = data.dropna()

In [191]:
data = data[:10000]

In [192]:
from sklearn.model_selection import train_test_split
# Split the data into training, testing, and validation sets
train_data, test_data = train_test_split(data, test_size=0.2)
train_data, val_data = train_test_split(train_data, test_size=0.1)


In [193]:
# Create Datasets from the pandas DataFrames
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)
val_dataset = Dataset.from_pandas(val_data)

In [194]:
# Create a DatasetDict with train, validation, and test splits
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

In [195]:
# Print the dataset dictionary
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['en', 'hi', '__index_level_0__'],
        num_rows: 7200
    })
    validation: Dataset({
        features: ['en', 'hi', '__index_level_0__'],
        num_rows: 800
    })
    test: Dataset({
        features: ['en', 'hi', '__index_level_0__'],
        num_rows: 2000
    })
})


In [196]:
train_data, valid_data, test_data = (
    dataset_dict["train"],
    dataset_dict["validation"],
    dataset_dict["test"],
)

In [197]:
train_data[0]

{'en': 'Much of the glucose stays in the bloodstream , rather than being metabolised or stored , and the body does not get all the energy that it should .',
 'hi': 'इसके पलसस्वरूप अधिक ग़्लूकोज कोशिकाओं द्वारा इस्तेमाल होने या संग्रहित होने की बजाय रक़्त में ही रहता है तथा शरीर को उतनी उर्जा नहीं मिल पाती जितनी उसे मिलनी चाहिए .',
 '__index_level_0__': 713}

## Store Data

In [198]:
# Open a text file for writing
with open('english.txt', 'w') as file:
    # Iterate over the values in the specified column
    for value in data['en']:
        # Write each value followed by a newline character
        file.write(str(value) + '\n')

In [199]:
# Open a text file for writing
with open('hindi.txt', 'w') as file:
    # Iterate over the values in the specified column
    for value in data['hi']:
        # Write each value followed by a newline character
        file.write(str(value) + '\n')

In [200]:
# Open the text file for reading
with open('english.txt', 'r') as file:
    # Read all lines into a list
    lines = file.readlines()

# Print the contents of the file
i = 0
for line in lines:
    print(line.strip())  # .strip() removes trailing newline characters
    i += 1
    if i == 5:
      break

politicians do not have permission to do what needs to be done.
I'd like to tell you about one such child,
This percentage is even greater than the percentage in India.
what we really mean is that they're bad at not paying attention.
.The ending portion of these Vedas is called Upanishad.


In [201]:
with open('hindi.txt', 'r') as file:
    # Read all lines into a list
    lines = file.readlines()

# Print the contents of the file
i = 0
for line in lines:
    print(line.strip())  # .strip() removes trailing newline characters
    i += 1
    if i == 5:
      break

राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है .
मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी,
यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [202]:
# Initialize an empty string to store the file contents
input_lang = ''

# Open the text file for reading
with open('english.txt', 'r') as file:
    # Read the entire file contents and store them in input_lang
    input_lang = file.readlines()

# Now, input_lang contains the contents of the file
print(len(input_lang))

10000


## Tokenize Data

In [203]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

In [204]:
# Initialize a trainer
trainer = BpeTrainer(special_tokens=[
    "<PAD>",
    "<SOS>",
    "<EOS>",
    "<UNK>",
    "<BOS>",
])

In [205]:
# Initialize a tokenizer
tokenizer_eng = Tokenizer(BPE())
tokenizer_hin = Tokenizer(BPE())
# Initialize a pre-tokenizer
tokenizer_eng.pre_tokenizer = Whitespace()
tokenizer_hin.pre_tokenizer = Whitespace()

In [206]:
# Initialize a trainer
trainer = BpeTrainer(special_tokens=[
    "<PAD>",
    "<SOS>",
    "<EOS>",
    "<UNK>",
    "<BOS>",

])

In [207]:
# Training files
eng_text = ["english.txt"]
hin_text = ["hindi.txt"]
tokenizer_eng.train(eng_text, trainer)
tokenizer_hin.train(hin_text, trainer)

In [208]:
eng_vocab = tokenizer_eng.get_vocab()
hin_vocab = tokenizer_hin.get_vocab()

In [209]:
def tokenize(lang, tokenizer):
    lang_tokenized = []
    for sentence in lang:
        encoded = tokenizer.encode(sentence)
        lang_tokenized.append(encoded.ids)
    return lang_tokenized

In [210]:
test_data[0]['en']

'Where Lord Shiv explaining this story to Parvati there was a nest of crow and a crow inthat nest was also listening the story.'

In [211]:
train_data[0]['en']

'Much of the glucose stays in the bloodstream , rather than being metabolised or stored , and the body does not get all the energy that it should .'

In [212]:
def tokenize_example(example, tokenizer_eng, tokenizer_hin, max_length, lower, sos_token, eos_token):
    #print(example['hi'])
    if example['hi'] is None:
      return {"en_tokens": "", "hi_tokens": ""}
    en_tokens = tokenizer_eng.encode(example['en']).tokens
    if lower:
      en_tokens = [token.lower() for token in en_tokens]
    hi_tokens = tokenizer_hin.encode(example["hi"]).tokens

    en_tokens = [sos_token] + en_tokens + [eos_token]
    hi_tokens = [sos_token] + hi_tokens + [eos_token]
    return {"en_tokens": en_tokens, "hi_tokens": hi_tokens}

In [213]:
max_length = 1_000
lower = True
sos_token = "<sos>"
eos_token = "<eos>"

fn_kwargs = {
    "tokenizer_eng": tokenizer_eng,
    "tokenizer_hin": tokenizer_hin,
    "max_length": max_length,
    "lower": lower,
    "sos_token": sos_token,
    "eos_token": eos_token
}
train_data = train_data.map(tokenize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(tokenize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(tokenize_example, fn_kwargs=fn_kwargs)

Map:   0%|          | 0/7200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

## Creating Vocab

In [214]:
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"

special_tokens = [
    unk_token,
    pad_token,
    sos_token,
    eos_token,
]

In [215]:
en_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["en_tokens"],
    min_freq = min_freq,
    specials = special_tokens
)

In [216]:
hi_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["hi_tokens"],
    min_freq = min_freq,
    specials = special_tokens
)

In [217]:
en_vocab.get_itos()[:10]

['<unk>', '<pad>', '<sos>', '<eos>', 'the', '.', ',', 'of', 'and', 'to']

In [218]:
hi_vocab.get_itos()[:10]

['<unk>', '<pad>', '<sos>', '<eos>', 'के', '.', 'है', 'में', ',', 'की']

In [219]:
en_vocab.get_stoi()['the']

4

In [220]:
len(en_vocab),len(hi_vocab)

(6822, 7839)

In [221]:
"The" in en_vocab

False

In [222]:
unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

In [223]:
en_vocab.set_default_index(unk_index)
hi_vocab.set_default_index(unk_index)

In [224]:
en_vocab["The"]

0

In [225]:
en_vocab.get_itos()[0]

'<unk>'

In [226]:
tokens = ["i", "love", "watching", "crime", "shows"]
en_vocab.lookup_indices(tokens)
en_vocab.lookup_tokens(en_vocab.lookup_indices(tokens))

['i', 'love', 'watching', 'crime', 'shows']

In [227]:
def numericalize_example(example, en_vocab, hi_vocab):
    en_ids = en_vocab.lookup_indices(example["en_tokens"])
    hi_ids = hi_vocab.lookup_indices(example["hi_tokens"])
    return {"en_ids": en_ids, "hi_ids": hi_ids}

In [228]:
fn_kwargs = {"en_vocab": en_vocab, "hi_vocab": hi_vocab}

train_data = train_data.map(numericalize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(numericalize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(numericalize_example, fn_kwargs=fn_kwargs)

Map:   0%|          | 0/7200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [229]:
data_type = "torch"
format_columns = ['en_ids','hi_ids']

train_data = train_data.with_format(
    type = data_type, columns = format_columns, output_all_columns = True
)
valid_data = valid_data.with_format(
    type = data_type, columns = format_columns, output_all_columns = True
)
test_data = test_data.with_format(
    type = data_type, columns = format_columns, output_all_columns = True
)

In [230]:
type(train_data[0]['en_ids'])

torch.Tensor

## Creating Dataloader

In [231]:
def get_collate_fn(pad_index): # Closure Function
  def collate_fn(batch):
    batch_en_ids = [example['en_ids'] for example in batch]
    batch_hi_ids = [example['hi_ids'] for example in batch]
    batch_en_ids = nn.utils.rnn.pad_sequence(batch_en_ids, padding_value=pad_index)
    batch_hi_ids = nn.utils.rnn.pad_sequence(batch_hi_ids, padding_value=pad_index)
    batch = {
        'en_ids': batch_en_ids,
        'hi_ids': batch_hi_ids
    }
    return batch
  return collate_fn

In [232]:
def get_data_loader(dataset, batch_size, pad_index, shuffle = False):
  collate_fn = get_collate_fn(pad_index)
  data_loader = torch.utils.data.DataLoader(
      dataset = dataset,
      batch_size = batch_size,
      collate_fn = collate_fn,
      shuffle = shuffle
  )
  return data_loader

In [233]:
batch_size = 128

train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

## Building Model

In [234]:
class Encoder(nn.Module):
  def __init__(self, input_dim, embed_dim, hidden_dim, n_layers, p):
    super().__init__()
    self.hidden_dim = hidden_dim
    self.n_layers = n_layers
    self.embedding = nn.Embedding(input_dim,embed_dim)
    self.rnn = nn.LSTM(embed_dim,hidden_dim,n_layers,dropout = p)
    self.dropout = nn.Dropout(p)

  def forward(self,src):
    # src = [src,batch_size]
    embedded = self.dropout(self.embedding(src))
    outputs, (hidden, cell) = self.rnn(embedded)
    return hidden, cell

In [235]:
class Decoder(nn.Module):
  def __init__(self, output_dim, embed_dim, hidden_dim, n_layers, p):
    super().__init__()
    self.output_dim = output_dim
    self.hidden_dim = hidden_dim
    self.n_layers = n_layers
    self.embedding = nn.Embedding(output_dim, embed_dim)
    self.rnn = nn.LSTM(embed_dim, hidden_dim, n_layers, dropout = p)
    self.fc_out = nn.Linear(hidden_dim, output_dim)
    self.dropout = nn.Dropout(p)

  def forward(self, input, hidden, cell):
    input = input.unsqueeze(0)
    embedded = self.dropout(self.embedding(input))
    output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
    prediction = self.fc_out(output.squeeze(0))

    return prediction, hidden, cell

In [236]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert (
            encoder.hidden_dim == decoder.hidden_dim
        ), "Hidden dimensions of encoder and decoder must be equal!"
        assert (
            encoder.n_layers == decoder.n_layers
        ), "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio):
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        # teacher_forcing_ratio is probability to use teacher forcing
        # e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        batch_size = trg.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        # tensor to store decoder outputs
        outputs = torch.zeros(trg_length, batch_size, trg_vocab_size).to(self.device)
        # last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # first input to the decoder is the <sos> tokens
        input = trg[0, :]
        # input = [batch size]
        for t in range(1, trg_length):
            # insert input token embedding, previous hidden and previous cell states
            # receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)
            # output = [batch size, output dim]
            # hidden = [n layers, batch size, hidden dim]
            # cell = [n layers, batch size, hidden dim]
            # place predictions in a tensor holding predictions for each token
            outputs[t] = output
            # decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            # get the highest predicted token from our predictions
            top1 = output.argmax(1)
            # if teacher forcing, use actual next token as next input
            # if not, use predicted token
            input = trg[t] if teacher_force else top1
            # input = [batch size]
        return outputs

In [237]:
input_dim = len(en_vocab)
output_dim = len(hi_vocab)
encoder_embedding_dim = 256
decoder_embedding_dim = 256
hidden_dim = 512
n_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = Encoder(
    input_dim,
    encoder_embedding_dim,
    hidden_dim,
    n_layers,
    encoder_dropout,
)

decoder = Decoder(
    output_dim,
    decoder_embedding_dim,
    hidden_dim,
    n_layers,
    decoder_dropout,
)

model = Seq2Seq(encoder, decoder, device).to(device)

## Training Data

In [238]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)


model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(6822, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(7839, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=7839, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [239]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 15,131,039 trainable parameters


In [240]:
optimizer = optim.Adam(model.parameters())

In [241]:
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

In [242]:

def train_fn(
    model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device
):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):
        src = batch["en_ids"].to(device)
        trg = batch["hi_ids"].to(device)
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        optimizer.zero_grad()
        output = model(src, trg, teacher_forcing_ratio)
        # output = [trg length, batch size, trg vocab size]
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        # output = [(trg length - 1) * batch size, trg vocab size]
        trg = trg[1:].view(-1)
        # trg = [(trg length - 1) * batch size]
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

In [243]:
def evaluate_fn(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            src = batch["en_ids"].to(device)
            trg = batch["hi_ids"].to(device)
            # src = [src length, batch size]
            # trg = [trg length, batch size]
            output = model(src, trg, 0)  # turn off teacher forcing
            # output = [trg length, batch size, trg vocab size]
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            # output = [(trg length - 1) * batch size, trg vocab size]
            trg = trg[1:].view(-1)
            # trg = [(trg length - 1) * batch size]
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

In [244]:
n_epochs = 10
clip = 1.0
teacher_forcing_ratio = 0.5
best_valid_loss = float("inf")

for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_fn(
        model,
        train_data_loader,
        optimizer,
        criterion,
        clip,
        teacher_forcing_ratio,
        device,
    )
    valid_loss = evaluate_fn(
        model,
        valid_data_loader,
        criterion,
        device,
    )
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "tut1-model.pt")
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

 10%|█         | 1/10 [01:25<12:52, 85.86s/it]

	Train Loss:   6.712 | Train PPL: 822.377
	Valid Loss:   6.003 | Valid PPL: 404.805


 20%|██        | 2/10 [02:52<11:31, 86.50s/it]

	Train Loss:   6.356 | Train PPL: 575.725
	Valid Loss:   5.985 | Valid PPL: 397.453


 30%|███       | 3/10 [04:13<09:47, 83.95s/it]

	Train Loss:   6.236 | Train PPL: 510.945
	Valid Loss:   6.004 | Valid PPL: 404.946


 40%|████      | 4/10 [05:36<08:21, 83.51s/it]

	Train Loss:   6.145 | Train PPL: 466.581
	Valid Loss:   6.027 | Valid PPL: 414.515


 50%|█████     | 5/10 [07:01<06:59, 83.85s/it]

	Train Loss:   6.042 | Train PPL: 420.592
	Valid Loss:   5.989 | Valid PPL: 399.073


 60%|██████    | 6/10 [08:23<05:33, 83.30s/it]

	Train Loss:   5.977 | Train PPL: 394.284
	Valid Loss:   5.989 | Valid PPL: 398.868


 70%|███████   | 7/10 [09:48<04:11, 83.80s/it]

	Train Loss:   5.903 | Train PPL: 366.006
	Valid Loss:   5.997 | Valid PPL: 402.223


 80%|████████  | 8/10 [11:11<02:47, 83.81s/it]

	Train Loss:   5.836 | Train PPL: 342.313
	Valid Loss:   6.017 | Valid PPL: 410.266


 90%|█████████ | 9/10 [12:29<01:21, 81.87s/it]

	Train Loss:   5.792 | Train PPL: 327.652
	Valid Loss:   6.094 | Valid PPL: 443.031


100%|██████████| 10/10 [13:52<00:00, 83.26s/it]

	Train Loss:   5.764 | Train PPL: 318.626
	Valid Loss:   6.061 | Valid PPL: 428.892





## Evaluate Model

In [245]:
model.load_state_dict(torch.load("tut1-model.pt"))

test_loss = evaluate_fn(model, test_data_loader, criterion, device)

print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")

| Test Loss: 6.038 | Test PPL: 418.900 |


In [246]:
def translate_sentence(
    sentence,
    model,
    tokenizer_hin,
    tokenizer_eng,
    hi_vocab,
    en_vocab,
    lower,
    sos_token,
    eos_token,
    device,
    max_output_length=25,
):
    model.eval()
    with torch.no_grad():
        if isinstance(sentence, str):
            tokens =  tokenizer_eng.encode(sentence).tokens
        else:
            tokens = [token for token in sentence]
        if lower:
            tokens = [token.lower() for token in tokens]
        tokens = [sos_token] + tokens + [eos_token]
        ids = en_vocab.lookup_indices(tokens)
        tensor = torch.LongTensor(ids).unsqueeze(-1).to(device)
        hidden, cell = model.encoder(tensor)
        inputs = hi_vocab.lookup_indices([sos_token])

        for _ in range(max_output_length):
            inputs_tensor = torch.LongTensor([inputs[-1]]).to(device)
            output, hidden, cell = model.decoder(inputs_tensor, hidden, cell)

            predicted_token = output.argmax(-1).item()
            inputs.append(predicted_token)
            if predicted_token == hi_vocab[eos_token]:
                break

        tokens = hi_vocab.lookup_tokens(inputs)
    return tokens

In [247]:
sentence = "A man is watching a film"
translation = translate_sentence(
    sentence,
    model,
    tokenizer_hin,
    tokenizer_eng,
    hi_vocab,
    en_vocab,
    lower,
    sos_token,
    eos_token,
    device,
)