In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import numpy as np

from torchtext.data import Field, TabularDataset, BucketIterator
import spacy
import sys
from indicnlp import common
from indicnlp.tokenize import indic_tokenize
import math

In [2]:
INDIC_NPL_LIB_HOME = r"indic_nlp_library"
INDIC_NLP_RESOURCES = r"indic_nlp_resources"

sys.path.append(r'{}/src'.format(INDIC_NPL_LIB_HOME))
common.set_resources_path(INDIC_NLP_RESOURCES)

In [3]:
class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads
        """ Dividing word's embedding into 'H' different heads
            For ex: embed_size = 512 & heads = 8
            Then 8 heads of 64 size are created
        """
        assert (self.embed_size % self.heads == 0), "Embed size should be in multiple of heads"

        self.values = nn.Linear(in_features=self.embed_size, out_features=self.embed_size, bias=False)
        self.keys = nn.Linear(in_features=self.embed_size, out_features=self.embed_size, bias=False)
        self.queries = nn.Linear(in_features=self.embed_size, out_features=self.embed_size, bias=False)
        self.fc_out = nn.Linear(in_features=self.head_dim * self.heads, out_features=self.embed_size)

    def forward(self, values, keys, query, mask=None):
        N = query.shape[0]  # Number of training examples
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        values = self.values(values)  # (N, value_len, heads, head_dim)
        keys = self.keys(keys)  # (N, key_len, heads, head_dim)
        queries = self.queries(query)  # (N, query_len, heads, heads_dim)

        # Splitting embeddings into 'H' heads for creating multi-head attention
        # V, K, Q reshape = num_samp, seq_len, heads, heads_dim

        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = queries.reshape(N, query_len, self.heads, self.head_dim)

        # Attention calculation
        # attention score = softmax(Q*t(K))/sqrt(Q.shape[-1])
        score = torch.einsum("nqhd,nkhd -> nhqk", queries, keys)
        """
        einsum explained: "nqhd,nkhd -> nhqk"
        1. nqhd -> nhqd : queries.transpose(-2,-3) & nkhd -> nhkd : keys.transpose(-2,-3)
        2. nhqk : (torch.bmm(nhqd.reshape(n*h,q,d), nhkd.reshape(n*h,k,d).transpose(-1,-2)).reshape(n,h,q,k)  
        """

        if mask is not None:
            """
            Masking is very critical for implementing decoder side self attention
            Since in decoding side we want to have attention scores with previous time steps elements only
            So for this we use upper triangular masked matrix 
            """
            score = score.masked_fill(mask == 0, float('-1e20'))
        attention_score = torch.softmax(score / math.sqrt(self.head_dim), dim=-1)  # N, heads, query_len, key_len
        out = torch.einsum("nhql,nlhd->nqhd", attention_score, values).reshape(N, query_len, self.heads*self.head_dim)
        # out.shape >> N, query_len, embed_size

        out = self.fc_out(out)
        return out


class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        self.attention = SelfAttention(embed_size=embed_size, heads=heads)
        self.norm1 = nn.LayerNorm(embed_size)  # Normalization for each example for each embed dim across seq_len
        self.norm2 = nn.LayerNorm(embed_size)

        self.feed_fwd = nn.Sequential(
            nn.Linear(embed_size, forward_expansion*embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask=None):
        attention = self.attention(value, key, query, mask)
        x = self.dropout(self.norm1(attention + query))  # Layernorm1 + Skip connection
        forward = self.feed_fwd(x)
        out = self.dropout(self.norm2(forward + x))  # Layernorm2 + Skip connection
        return out


class PositionalEncoding(nn.Module):
    """
    PE(pos,2i) = sin(pos/10000^(2i/emb_size))
    PE(cos,2i+1) = cos(pos/10000^(2i+1/emb_size))
    """
    def __init__(self, max_len, embed_size, dropout, device):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, embed_size)
        position = torch.arange(0, max_len).unsqueeze(1)  # column data : [max_len, 1]
        div_term = torch.exp(torch.arange(0, embed_size, 2) * -(math.log(10000.0) / embed_size))
        pe[:, 0::2] = torch.sin(position * div_term)  # even place in emb_dim get sin wavelength
        pe[:, 1::2] = torch.cos(position * div_term)  # odd place in emb_dim get cos wavelength
        pe = pe.unsqueeze(0).to(device)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)


class DecoderBlock(nn.Module):
    def __init__(self, embed_size, heads, forward_expansion, dropout, device):
        super(DecoderBlock, self).__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm = nn.LayerNorm(embed_size)
        self.transformer_block = TransformerBlock(embed_size=embed_size,
                                                  heads=heads,
                                                  dropout=dropout,
                                                  forward_expansion=forward_expansion)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, value, key, trg_mask, src_mask=None):
        attention = self.attention(x, x, x, mask=trg_mask)
        query = self.dropout(self.norm(attention + x))  # LayerNorm + Skip connection
        out = self.transformer_block(value=value, key=key, query=query, mask=src_mask)
        return out

In [4]:
class Encoder(nn.Module):
    def __init__(self, src_vocab_size, embed_size, num_layers, heads, device, forward_expansion, dropout, max_len):
        super(Encoder, self).__init__()
        self.embed_size = embed_size
        self.device = device
        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.position_embedding = PositionalEncoding(max_len, embed_size, dropout, device=self.device)

        self.layers = nn.ModuleList(
            [
                TransformerBlock(embed_size=embed_size,
                                 heads=heads,
                                 dropout=dropout,
                                 forward_expansion=forward_expansion)
                for _ in range(num_layers)
            ]
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        x = self.word_embedding(x)
        out = self.position_embedding(x)

        for layer in self.layers:
            out = layer(value=out, key=out, query=out, mask=mask)
        return out


class Decoder(nn.Module):
    def __init__(self, trg_vocab_size, embed_size, num_layers, heads, device, forward_expansion, dropout, max_len):
        super(Decoder, self).__init__()
        self.embed_size = embed_size
        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.position_embedding = PositionalEncoding(max_len, embed_size, dropout, device=self.device)

        self.layers = nn.ModuleList(
            [
                DecoderBlock(embed_size=embed_size,
                             heads=heads,
                             dropout=dropout,
                             forward_expansion=forward_expansion,
                             device=self.device)
                for _ in range(num_layers)
            ]
        )

        self.fc_out = nn.Linear(embed_size, trg_vocab_size)

    def forward(self, x, enc_out, trg_mask, src_mask=None):
        x = self.word_embedding(x)
        x = self.position_embedding(x)

        """
        In decoder part key & value comes from the encoder output 
        while query comes from the self attention layer's output of the decoder         
        """
        for layer in self.layers:
            x = layer(x=x, value=enc_out, key=enc_out, trg_mask=trg_mask, src_mask=src_mask)

        dec_out = self.fc_out(x)
        return dec_out

In [5]:

class Transformer(nn.Module):
    def __init__(self,
                 src_vocab_size,
                 trg_vocab_size,
                 src_pad_idx,
                 trg_pad_idx,
                 embed_size,
                 num_layers,
                 forward_expansion,
                 heads,
                 dropout,
                 device="cuda",
                 max_len=500):
        super(Transformer, self).__init__()
        self.encoder = Encoder(src_vocab_size=src_vocab_size,
                               embed_size=embed_size,
                               num_layers=num_layers,
                               heads=heads,
                               device=device,
                               forward_expansion=forward_expansion,
                               dropout=dropout,
                               max_len=max_len)

        self.decoder = Decoder(trg_vocab_size=trg_vocab_size,
                               embed_size=embed_size,
                               num_layers=num_layers,
                               heads=heads,
                               device=device,
                               forward_expansion=forward_expansion,
                               dropout=dropout,
                               max_len=max_len)

        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask.to(self.device)

    def make_trg_mask(self, trg):
        N, trg_len = trg.shape
        trg_mask = torch.tril(torch.ones((trg_len, trg_len)))
        return trg_mask.to(self.device)

    def forward(self, src, trg):
        src_mask = self.make_src_mask(src=src)
        trg_mask = self.make_trg_mask(trg=trg)
        enc_src = self.encoder(x=src, mask=src_mask)
        out = self.decoder(x=trg, enc_out=enc_src, trg_mask=trg_mask, src_mask=src_mask)
        return out

In [6]:
spacy_eng = spacy.load("en_core_web_sm")

In [7]:
def tokenize_eng(text):
    return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]

def tokenize_hindi(text):
    return [tok for tok in indic_tokenize.trivial_tokenize(text)]

In [8]:
#defin field
english_txt = Field(tokenize=tokenize_eng, lower=True, init_token="<sos>", eos_token="<eos>")
hindi_txt = Field(tokenize=tokenize_hindi, init_token="<sos>", eos_token="<eos>")

In [9]:
data_fields = [('eng_text', english_txt), ('hindi_text', hindi_txt)]
train_dt, val_dt = TabularDataset.splits(path='./', train='train_sm.csv', validation='val_sm.csv', format='csv', fields=data_fields)

In [10]:
english_txt.build_vocab(train_dt, max_size=10000, min_freq = 2)
hindi_txt.build_vocab(train_dt, max_size=10000, min_freq=2)

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
save_model = True

In [12]:
num_epochs = 10
learning_rate = 3e-4
batch_size = 64


In [13]:
# Defining Iterator
train_iter = BucketIterator(train_dt, batch_size=batch_size, sort_key=lambda x: len(x.eng_text), shuffle=True)
val_iter = BucketIterator(val_dt, batch_size=batch_size, sort_key=lambda x: len(x.eng_text), shuffle=True)

# Model hyper-parameters
src_vocab_size = len(english_txt.vocab)
trg_vocab_size = len(hindi_txt.vocab)
embedding_size = 512
num_heads = 8
num_layers = 3
dropout = 0.10
max_len = 10000
forward_expansion = 4
src_pad_idx = english_txt.vocab.stoi["<pad>"]
trg_pad_idx = 0


In [14]:
# Defining model & optimizer attributes
model = Transformer(src_vocab_size=src_vocab_size,
                    trg_vocab_size=trg_vocab_size,
                    src_pad_idx=src_pad_idx,
                    trg_pad_idx=trg_pad_idx,
                    embed_size=embedding_size,
                    num_layers=num_layers,
                    forward_expansion=forward_expansion,
                    heads=num_heads,
                    dropout=dropout,
                    device=device,
                    max_len=max_len).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=10, verbose=True)

pad_idx = hindi_txt.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
loss_tracker = []



In [15]:

def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)


def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])

In [16]:
for epoch in range(num_epochs):
    model.train()
    losses = []
    loop = tqdm(enumerate(train_iter), total=len(train_iter))
    for batch_idx, batch in loop:
        # Get input and targets and move to GPU if available
        # Switching axis because bucket-iterator gives output of size(seq_len,bs)
        inp_data = batch.eng_text.permute(-1, -2).to(device)
        target = batch.hindi_text.permute(-1, -2).to(device)

        # Forward prop
        output = model(inp_data, target[:, :-1])

        optimizer.zero_grad()
        loss = criterion(output.reshape(-1, trg_vocab_size), target[:, 1:].reshape(-1))
        losses.append(loss.item())

        # Checking GPU uses
        if device.type == "cuda":
            total_mem = torch.cuda.get_device_properties(0).total_memory/1024/1024
            allocated_mem = torch.cuda.memory_allocated(0)/1024/1024
            reserved_mem = torch.cuda.memory_reserved(0)/1024/1024
        else:
            total_mem = 0
            allocated_mem = 0
            reserved_mem = 0

        # Back prop
        loss.backward()

        # Clipping exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Update progress bar
        loop.set_postfix(loss=loss.item(), total_gpu_mem=str(total_mem), gpu_allocated_mem=str(allocated_mem), gpu_reserved_mem=str(reserved_mem))

    train_mean_loss = sum(losses) / len(losses)
    scheduler.step(train_mean_loss)

    model.eval()
    val_losses = []
    with torch.no_grad():
        for val_batch_idx, val_batch in tqdm(enumerate(val_iter), total=len(val_iter)):
            val_inp_data = val_batch.eng_text.permute(-1, -2).to(device)
            val_target = val_batch.hindi_text.permute(-1, -2).to(device)
            val_output = model(val_inp_data, val_target[:, :-1])
            val_loss = criterion(val_output.reshape(-1, trg_vocab_size), val_target[:, 1:].reshape(-1))
            val_losses.append(val_loss.item())
        val_mean_loss = sum(val_losses)/len(val_losses)

    loss_tracker.append(val_mean_loss)

    if epoch % 1 == 0:
        if save_model and val_mean_loss == np.min(loss_tracker):
            checkpoint = {
                "state_dict": model.state_dict(),
                "optimizer": optimizer.state_dict(),
            }
            save_checkpoint(checkpoint)

    print(f"Epoch [{epoch + 1}/{num_epochs}]: train_loss= {train_mean_loss}; val_loss= {val_mean_loss}")

  0%|          | 0/1641 [00:00<?, ?it/s]

  3%|▎         | 44/1641 [00:05<03:24,  7.82it/s, gpu_allocated_mem=759.47705078125, gpu_reserved_mem=1280.0, loss=6.19, total_gpu_mem=3720.9375]


KeyboardInterrupt: 

# Loading model 

In [14]:
import torch

def load_model_for_inference(model_class, model_path, device, **model_params):
    """
    Load a saved PyTorch model for inference.

    Args:
    model_class (torch.nn.Module): The class of the model to be loaded.
    model_path (str): Path to the saved checkpoint file.
    device (torch.device): The device to load the model onto (CPU or GPU).
    **model_params: Additional parameters required to initialize the model.

    Returns:
    torch.nn.Module: The loaded model set to evaluation mode.
    """
    # Initialize the model
    model = model_class(**model_params).to(device)

    # Load the checkpoint
    checkpoint = torch.load(model_path, map_location=device)

    # Load the state dict into the model
    model.load_state_dict(checkpoint['state_dict'])

    # Set the model to evaluation mode
    model.eval()

    return model

# Example usage
if __name__ == "__main__":
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = load_model_for_inference(
        model_class=Transformer,
        model_path='my_checkpoint.pth.tar',
        device=device,
        src_vocab_size=src_vocab_size,
        trg_vocab_size=trg_vocab_size,
        src_pad_idx=src_pad_idx,
        trg_pad_idx=trg_pad_idx,
        embed_size=embedding_size,
        num_layers=num_layers,
        forward_expansion=forward_expansion,
        heads=num_heads,
        dropout=dropout,
        max_len=max_len
    )

    print("Model loaded successfully and ready for inference!")

Model loaded successfully and ready for inference!


In [15]:
import torch
import copy

def beam_search(sentence, model, src_field, src_tokenizer, trg_field, trg_vcb_sz, k, max_ts=50, device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
    # Ensure model is on the correct device
    model = model.to(device)

    # Tokenize the input sentence
    sentence_tok = src_tokenizer(sentence)

    # Add <sos> and <eos> in beginning and end respectively
    sentence_tok.insert(0, src_field.init_token)
    sentence_tok.append(src_field.eos_token)

    # Converting text to indices
    src_tok = torch.tensor([src_field.vocab.stoi[token] for token in sentence_tok], dtype=torch.long).unsqueeze(0).to(device)
    trg_tok = torch.tensor([trg_field.vocab.stoi[trg_field.init_token]], dtype=torch.long).unsqueeze(0).to(device)

    # Setting 'eos' flag for target sentence
    eos = trg_field.vocab.stoi[trg_field.eos_token]

    # Store for top 'k' translations
    trans_store = {}

    store_seq_id = None
    store_seq_prob = None
    for ts in range(max_ts):
        if ts == 0:
            with torch.no_grad():
                out = model(src_tok, trg_tok)  # [1, trg_vcb_sz]
            topk = torch.topk(torch.log(torch.softmax(out, dim=-1)), dim=-1, k=k)
            seq_id = torch.empty(size=(k, ts + 2), dtype=torch.long, device=device)
            seq_id[:, :ts + 1] = trg_tok
            seq_id[:, ts + 1] = topk.indices
            seq_prob = topk.values.squeeze()
            
            if eos in seq_id[:, ts + 1]:
                trans_store[seq_prob[seq_id[:, ts + 1] == eos].squeeze().item()] = seq_id[seq_id[:, ts + 1] == eos, :].squeeze()
                store_seq_id = seq_id[seq_id[:, ts + 1] != eos, :].clone().to(device)
                store_seq_prob = seq_prob[seq_id[:, ts + 1] != eos].clone().to(device)
            else:
                store_seq_id = seq_id.clone().to(device)
                store_seq_prob = seq_prob.clone().to(device)
        else:
            src_tok = src_tok.squeeze()
            src = src_tok.expand(size=(store_seq_id.shape[-2], len(src_tok))).to(device)
            with torch.no_grad():
                out = model(src, store_seq_id)
            out = torch.log(torch.softmax(out[:, -1, :], dim=-1))  # [k, trg_vcb_sz]
            all_comb = (store_seq_prob.view(-1, 1) + out).view(-1)
            all_comb_idx = torch.tensor([(x, y) for x in range(store_seq_id.shape[-2]) for y in range(trg_vcb_sz)], device=device)
            topk = torch.topk(all_comb, dim=-1, k=k)
            top_seq_id = all_comb_idx[topk.indices.squeeze()]
            top_seq_prob = topk.values
            seq_id = torch.empty(size=(k, ts + 2), dtype=torch.long, device=device)
            seq_id[:, :ts + 1] = torch.tensor([store_seq_id[i.tolist()].tolist() for i, y in top_seq_id], device=device)
            seq_id[:, ts + 1] = torch.tensor([y.tolist() for i, y in top_seq_id], device=device)
            seq_prob = top_seq_prob
            
            if eos in seq_id[:, ts + 1]:
                for i, prob in enumerate(seq_prob[seq_id[:, ts + 1] == eos]):
                    trans_store[prob.item()] = seq_id[seq_id[:, ts + 1] == eos][i].cpu()
                store_seq_id = seq_id[seq_id[:, ts + 1] != eos, :].clone().to(device)
                store_seq_prob = seq_prob[seq_id[:, ts + 1] != eos].clone().to(device)
            else:
                store_seq_id = seq_id.clone().to(device)
                store_seq_prob = seq_prob.clone().to(device)
        
        if len(trans_store) == k:
            break

    if len(trans_store) == 0:
        best_translation = store_seq_id[0].cpu()
    else:
        best_translation = trans_store[max(trans_store)]
    
    return " ".join([trg_field.vocab.itos[w] for w in best_translation[1:]])

In [16]:
sentence = "How are you?"
src_field = english_txt
src_tokenizer = tokenize_eng
trg_field = hindi_txt
trg_vcb_sz = 10000

tr = beam_search(sentence=sentence, model=model, src_field=src_field,src_tokenizer=src_tokenizer, trg_field=hindi_txt, trg_vcb_sz=trg_vcb_sz, k=5)
print(tr)

आप कैसे हो ? <eos>


In [62]:
import math
from collections import Counter
import re

def tokenize(text, language='en'):
    if isinstance(text, list):
        return text  # Already tokenized
    if language == 'en':
        # Simple English tokenization
        return text.lower().split()
    elif language == 'hi':
        # Simple Hindi tokenization
        return re.findall(r'\w+|[^\w\s]', text, re.UNICODE)
    else:
        # Default to simple whitespace tokenization
        return text.split()

def calculate_bleu(reference, candidate, max_n=4, language='en'):
    """
    Calculate BLEU score for a single sentence.
    
    :param reference: Reference translation (string or list of tokens)
    :param candidate: Candidate translation (string or list of tokens)
    :param max_n: Maximum n-gram size to consider (default: 4)
    :param language: Language of the translations ('en' for English, 'hi' for Hindi, etc.)
    :return: BLEU score
    """
    reference_tokens = tokenize(reference, language)
    candidate_tokens = tokenize(candidate, language)

    def ngrams(tokens, n):
        return [tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]

    if len(candidate_tokens) == 0:
        return 0

    candidate_len = len(candidate_tokens)
    reference_len = len(reference_tokens)

    # Calculate brevity penalty
    if candidate_len > reference_len:
        bp = 1
    else:
        bp = math.exp(1 - reference_len / candidate_len)

    # Calculate n-gram precisions
    precisions = []
    for n in range(1, min(max_n, candidate_len) + 1):
        candidate_ngrams = Counter(ngrams(candidate_tokens, n))
        reference_ngrams = Counter(ngrams(reference_tokens, n))
        
        matches = sum(min(candidate_ngrams[ngram], reference_ngrams[ngram]) for ngram in candidate_ngrams)
        total = sum(candidate_ngrams.values())
        
        if total > 0:
            precisions.append(matches / total)
        else:
            precisions.append(0)

    # Calculate geometric mean of precisions
    if all(p > 0 for p in precisions):
        geo_mean = math.exp(sum(math.log(p) for p in precisions) / len(precisions))
    else:
        geo_mean = 0

    return bp * geo_mean

def calculate_corpus_bleu(references, candidates, language='en'):
    """
    Calculate corpus-level BLEU score for multiple sentences.
    
    :param references: List of reference translations (strings or lists of tokens)
    :param candidates: List of candidate translations (strings or lists of tokens)
    :param language: Language of the translations
    :return: Corpus-level BLEU score
    """
    total_score = 0
    for ref, cand in zip(references, candidates):
        total_score += calculate_bleu(ref, cand, language=language)
    return total_score / len(references)

In [77]:
def translate_and_evaluate(sentence, reference, model, src_field, src_tokenizer, trg_field, trg_vcb_sz, k, max_ts=50, target_language='hi'):
    # Perform beam search
    translation = beam_search(sentence, model, src_field, src_tokenizer, trg_field, trg_vcb_sz, k, max_ts)
    
    # Remove <eos> token if present
    translation = translation.replace(" <eos>", "").strip()
    
    # Calculate BLEU score
    bleu_score = calculate_bleu(reference, translation, language=target_language)
    
    return translation, bleu_score

# Example usage
sentence = "I love to eat pizza."
reference = "मुझे पिज्जा खाना पसंद है।"  # Hindi reference
translation, bleu_score = translate_and_evaluate(sentence, reference, model, src_field, src_tokenizer, trg_field, trg_vcb_sz, k=5, target_language='hi')
print(f"Input: {sentence}")
print(f"Translation: {translation}")
print(f"Reference: {reference}")
print(f"BLEU score: {bleu_score}")

Input: I love to eat pizza.
Translation: मुझे <unk> फोन हमारा पसंद है । .
Reference: मुझे पिज्जा खाना पसंद है।
BLEU score: 0.4109080290971358
