In [2]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install dependencies
!pip install bert-score seaborn

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.

In [None]:
# Imports
import json
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from datetime import datetime
import string
import collections
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, get_linear_schedule_with_warmup
from bert_score import score



In [None]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Common Parameters
params = {
    'seed': 42,
    'max_length': 512,
    'batch_size': 2,
    'epochs': 20,
    'learning_rate': 2e-5,
    'weight_decay': 0.01,
    'dropout_rate': 0.3,
    'train_split': 0.8,
    'data_path': "/content/drive/MyDrive/Final_FYP_Implementations/Datasets and EDA/TaQuAD_final.json"
}

# Model names
model_names = {
    'muril': "google/muril-base-cased",
    'indicbert': "ai4bharat/indic-bert",
    'mt5': "google/mt5-small"
}


In [None]:
# Set random seeds
torch.manual_seed(params['seed'])
np.random.seed(params['seed'])
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(params['seed'])

In [None]:
# Load the Tamil QA dataset
with open(params['data_path'], 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Prepare examples list to be used later by each tokenizer
raw_examples = []
for entry in data["data"]:
    for paragraph in entry["paragraphs"]:
        context = paragraph["context"]
        for qa in paragraph["qas"]:
            if qa["is_impossible"]:
                continue
            question = qa["question"]
            for answer in qa["answers"]:
                raw_examples.append({
                    "question": question,
                    "context": context,
                    "answer_start": answer["answer_start"],
                    "answer_text": answer["text"],
                    "domain": answer.get("domain", "unknown")
                })

print(f"Total raw examples loaded: {len(raw_examples)}")

Total raw examples loaded: 776


In [None]:
# Split dataset once
total_size = len(raw_examples)
train_size = int(0.8 * total_size)
val_size = int(0.1 * total_size)
test_size = total_size - train_size - val_size

from sklearn.model_selection import train_test_split

train_val_data, test_data = train_test_split(raw_examples, test_size=test_size, random_state=params['seed'])
train_data, val_data = train_test_split(train_val_data, test_size=val_size / (val_size + train_size), random_state=params['seed'])

print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")

Train: 620, Val: 77, Test: 79


In [None]:
# Dataset Class
class QADataset(torch.utils.data.Dataset):
    def __init__(self, examples, tokenizer, max_length):
        self.encodings = tokenizer(
            [ex['question'] for ex in examples],
            [ex['context'] for ex in examples],
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_offsets_mapping=True,
            return_tensors="pt"
        )
        self.start_positions = []
        self.end_positions = []

        for i, ex in enumerate(examples):
            offset = self.encodings["offset_mapping"][i]
            start_char = ex["answer_start"]
            end_char = start_char + len(ex["answer_text"])

            input_ids = self.encodings["input_ids"][i]
            context = ex["context"]
            found_start = found_end = False

            for idx, (start, end) in enumerate(offset):
                if start <= start_char < end:
                    start_pos = idx
                    found_start = True
                if start < end_char <= end:
                    end_pos = idx
                    found_end = True
                    break

            if found_start and found_end:
                self.start_positions.append(start_pos)
                self.end_positions.append(end_pos)
            else:
                self.start_positions.append(0)
                self.end_positions.append(0)

        self.encodings.pop("offset_mapping")

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["start_positions"] = torch.tensor(self.start_positions[idx])
        item["end_positions"] = torch.tensor(self.end_positions[idx])
        return item

    def __len__(self):
        return len(self.start_positions)

## MuRIL

In [None]:
# Load MuRIL tokenizer and model
muril_tokenizer = AutoTokenizer.from_pretrained(model_names['muril'])
muril_model = AutoModelForQuestionAnswering.from_pretrained(model_names['muril']).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/953M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Prepare Datasets and Dataloaders
muril_train_dataset = QADataset(train_data, muril_tokenizer, params['max_length'])
muril_val_dataset = QADataset(val_data, muril_tokenizer, params['max_length'])
muril_test_dataset = QADataset(test_data, muril_tokenizer, params['max_length'])

muril_train_loader = DataLoader(muril_train_dataset, batch_size=params['batch_size'], shuffle=True)
muril_val_loader = DataLoader(muril_val_dataset, batch_size=params['batch_size'])
muril_test_loader = DataLoader(muril_test_dataset, batch_size=1)

In [None]:
# Optimizer and Scheduler
muril_optimizer = AdamW(muril_model.parameters(), lr=params['learning_rate'], weight_decay=params['weight_decay'])
total_steps = len(muril_train_loader) * params['epochs']
muril_scheduler = get_linear_schedule_with_warmup(muril_optimizer, num_warmup_steps=0, num_training_steps=total_steps)


In [None]:
import os

# Define the path for saving the model
model_save_path = '/content/drive/MyDrive/Tamil-Question-Answering-System/MuRIL/same_split_model'

# Ensure the directory exists
os.makedirs(model_save_path, exist_ok=True)

# Initialize variables
train_losses = []
val_losses = []
best_val_loss = float('inf')
patience = 2
patience_counter = 0

for epoch in range(params['epochs']):
    print(f"\nEpoch {epoch + 1}/{params['epochs']}")

    # ---- TRAINING PHASE ----
    muril_model.train()
    total_train_loss = 0
    start_time = time.time()

    train_loop = tqdm(muril_train_loader, desc="Training", leave=True)
    for batch in train_loop:
        muril_optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        outputs = muril_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions
        )

        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        muril_optimizer.step()
        muril_scheduler.step()

    avg_train_loss = total_train_loss / len(muril_train_loader)
    train_losses.append(avg_train_loss)

    # ---- VALIDATION PHASE ----
    muril_model.eval()
    total_val_loss = 0

    val_loop = tqdm(muril_val_loader, desc="Validation", leave=True)
    with torch.no_grad():
        for batch in val_loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            outputs = muril_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                start_positions=start_positions,
                end_positions=end_positions
            )

            total_val_loss += outputs.loss.item()

    avg_val_loss = total_val_loss / len(muril_val_loader)
    val_losses.append(avg_val_loss)

    # Print losses in the requested format
    print(f"Train_loss = {avg_train_loss:.3f} | Val_loss = {avg_val_loss:.3f}")

    # ---- EARLY STOPPING ----
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        # Save the model inside the defined directory with the correct filename
        torch.save(muril_model.state_dict(), os.path.join(model_save_path, 'same_split_muril_model.pth'))
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break



Epoch 1/20


Training: 100%|██████████| 310/310 [01:22<00:00,  3.76it/s]
Validation: 100%|██████████| 39/39 [00:02<00:00, 16.04it/s]


Train_loss = 5.121 | Val_loss = 4.880

Epoch 2/20


Training: 100%|██████████| 310/310 [01:22<00:00,  3.75it/s]
Validation: 100%|██████████| 39/39 [00:02<00:00, 15.97it/s]


Train_loss = 4.578 | Val_loss = 4.414

Epoch 3/20


Training: 100%|██████████| 310/310 [01:23<00:00,  3.73it/s]
Validation: 100%|██████████| 39/39 [00:02<00:00, 15.93it/s]


Train_loss = 3.966 | Val_loss = 3.893

Epoch 4/20


Training: 100%|██████████| 310/310 [01:24<00:00,  3.67it/s]
Validation: 100%|██████████| 39/39 [00:02<00:00, 15.92it/s]


Train_loss = 3.361 | Val_loss = 3.639

Epoch 5/20


Training: 100%|██████████| 310/310 [01:24<00:00,  3.68it/s]
Validation: 100%|██████████| 39/39 [00:02<00:00, 15.25it/s]


Train_loss = 2.746 | Val_loss = 3.198

Epoch 6/20


Training: 100%|██████████| 310/310 [01:24<00:00,  3.67it/s]
Validation: 100%|██████████| 39/39 [00:02<00:00, 15.90it/s]


Train_loss = 2.235 | Val_loss = 3.105

Epoch 7/20


Training: 100%|██████████| 310/310 [01:23<00:00,  3.72it/s]
Validation: 100%|██████████| 39/39 [00:02<00:00, 15.99it/s]


Train_loss = 1.804 | Val_loss = 3.033

Epoch 8/20


Training: 100%|██████████| 310/310 [01:22<00:00,  3.74it/s]
Validation: 100%|██████████| 39/39 [00:02<00:00, 15.91it/s]


Train_loss = 1.408 | Val_loss = 3.135

Epoch 9/20


Training: 100%|██████████| 310/310 [01:23<00:00,  3.71it/s]
Validation: 100%|██████████| 39/39 [00:02<00:00, 16.10it/s]


Train_loss = 1.126 | Val_loss = 3.029

Epoch 10/20


Training: 100%|██████████| 310/310 [01:23<00:00,  3.73it/s]
Validation: 100%|██████████| 39/39 [00:02<00:00, 16.04it/s]


Train_loss = 0.963 | Val_loss = 3.083

Epoch 11/20


Training: 100%|██████████| 310/310 [01:22<00:00,  3.74it/s]
Validation: 100%|██████████| 39/39 [00:02<00:00, 16.00it/s]

Train_loss = 0.769 | Val_loss = 3.182
Early stopping triggered.





## IndicBERT

In [None]:
# Load indicbert tokenizer and model
indicbert_tokenizer = AutoTokenizer.from_pretrained(model_names['indicbert'])
indicbert_model = AutoModelForQuestionAnswering.from_pretrained(model_names['indicbert']).to(device)

config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Prepare Datasets and Dataloaders
indicbert_train_dataset = QADataset(train_data, indicbert_tokenizer, params['max_length'])
indicbert_val_dataset = QADataset(val_data, indicbert_tokenizer, params['max_length'])
indicbert_test_dataset = QADataset(test_data, indicbert_tokenizer, params['max_length'])

indicbert_train_loader = DataLoader(indicbert_train_dataset, batch_size=params['batch_size'], shuffle=True)
indicbert_val_loader = DataLoader(indicbert_val_dataset, batch_size=params['batch_size'])
indicbert_test_loader = DataLoader(indicbert_test_dataset, batch_size=1)

model.safetensors:   0%|          | 0.00/135M [00:00<?, ?B/s]

In [None]:
# Optimizer and Scheduler
indicbert_optimizer = AdamW(indicbert_model.parameters(), lr=params['learning_rate'], weight_decay=params['weight_decay'])
total_steps = len(indicbert_train_loader) * params['epochs']
indicbert_scheduler = get_linear_schedule_with_warmup(indicbert_optimizer, num_warmup_steps=0, num_training_steps=total_steps)


In [None]:
# Initialize tracking variables
train_losses = []
val_losses = []
best_val_loss = float('inf')
patience = 2
patience_counter = 0

for epoch in range(params['epochs']):
    print(f"\nEpoch {epoch + 1}/{params['epochs']}")

    # ---- TRAINING PHASE ----
    indicbert_model.train()
    total_train_loss = 0
    start_time = time.time()

    train_loop = tqdm(indicbert_train_loader, desc="Training", leave=True)
    for batch in train_loop:
        indicbert_optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        outputs = indicbert_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions
        )

        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        indicbert_optimizer.step()
        indicbert_scheduler.step()

    avg_train_loss = total_train_loss / len(indicbert_train_loader)
    train_losses.append(avg_train_loss)

    # ---- VALIDATION PHASE ----
    indicbert_model.eval()
    total_val_loss = 0

    val_loop = tqdm(indicbert_val_loader, desc="Validation", leave=True)
    with torch.no_grad():
        for batch in val_loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            outputs = indicbert_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                start_positions=start_positions,
                end_positions=end_positions
            )

            total_val_loss += outputs.loss.item()

    avg_val_loss = total_val_loss / len(indicbert_val_loader)
    val_losses.append(avg_val_loss)

    # ---- PRINT LOSSES ----
    print(f"Train_loss = {avg_train_loss:.3f} | Val_loss = {avg_val_loss:.3f}")

    # ---- EARLY STOPPING LOGIC ----
    print(f"[DEBUG] Current val loss: {avg_val_loss:.4f}, Best val loss: {best_val_loss:.4f}")
    if avg_val_loss < best_val_loss:
        print("Validation loss improved — saving model.")
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(indicbert_model.state_dict(), os.path.join(model_save_path, 'same_split_indicbert_model.pth'))
    else:
        patience_counter += 1
        print(f"No improvement. Patience counter: {patience_counter}/{patience}")
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break


Epoch 1/20


Training: 100%|██████████| 310/310 [01:00<00:00,  5.15it/s]
Validation: 100%|██████████| 39/39 [00:02<00:00, 16.25it/s]


Train_loss = 2.681 | Val_loss = 2.973
[DEBUG] Current val loss: 2.9726, Best val loss: 3.0288
Validation loss improved — saving model.

Epoch 2/20


Training: 100%|██████████| 310/310 [01:01<00:00,  5.06it/s]
Validation: 100%|██████████| 39/39 [00:02<00:00, 16.19it/s]


Train_loss = 2.507 | Val_loss = 3.092
[DEBUG] Current val loss: 3.0918, Best val loss: 2.9726
No improvement. Patience counter: 1/2

Epoch 3/20


Training: 100%|██████████| 310/310 [01:01<00:00,  5.06it/s]
Validation: 100%|██████████| 39/39 [00:02<00:00, 15.94it/s]

Train_loss = 2.380 | Val_loss = 3.010
[DEBUG] Current val loss: 3.0101, Best val loss: 2.9726
No improvement. Patience counter: 2/2
Early stopping triggered.





## mt5

In [None]:
# Load mt5 tokenizer and model
mt5_tokenizer = AutoTokenizer.from_pretrained(model_names['mt5'])
mt5_model = AutoModelForQuestionAnswering.from_pretrained(model_names['mt5']).to(device)

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Some weights of MT5ForQuestionAnswering were not initialized from the model checkpoint at google/mt5-small and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Prepare Datasets and Dataloaders
mt5_train_dataset = QADataset(train_data, mt5_tokenizer, params['max_length'])
mt5_val_dataset = QADataset(val_data, mt5_tokenizer, params['max_length'])
mt5_test_dataset = QADataset(test_data, mt5_tokenizer, params['max_length'])

mt5_train_loader = DataLoader(mt5_train_dataset, batch_size=params['batch_size'], shuffle=True)
mt5_val_loader = DataLoader(mt5_val_dataset, batch_size=params['batch_size'])
mt5_test_loader = DataLoader(mt5_test_dataset, batch_size=1)

In [None]:
# Optimizer and Scheduler
mt5_optimizer = AdamW(mt5_model.parameters(), lr=params['learning_rate'], weight_decay=params['weight_decay'])
total_steps = len(mt5_train_loader) * params['epochs']
mt5_scheduler = get_linear_schedule_with_warmup(mt5_optimizer, num_warmup_steps=0, num_training_steps=total_steps)


In [None]:
train_losses = []
val_losses = []
best_val_loss = float('inf')  # <-- reset before each model's training loop
patience = 2
patience_counter = 0

for epoch in range(params['epochs']):
    print(f"\nEpoch {epoch + 1}/{params['epochs']}")

    # ---- TRAINING PHASE ----
    mt5_model.train()
    total_train_loss = 0
    start_time = time.time()

    train_loop = tqdm(mt5_train_loader, desc="Training", leave=True)
    for batch in train_loop:
        mt5_optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        outputs = mt5_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions
        )

        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        mt5_optimizer.step()
        mt5_scheduler.step()

    avg_train_loss = total_train_loss / len(mt5_train_loader)
    train_losses.append(avg_train_loss)

    # ---- VALIDATION PHASE ----
    mt5_model.eval()
    total_val_loss = 0

    val_loop = tqdm(mt5_val_loader, desc="Validation", leave=True)
    with torch.no_grad():
        for batch in val_loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            outputs = mt5_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                start_positions=start_positions,
                end_positions=end_positions
            )

            total_val_loss += outputs.loss.item()

    avg_val_loss = total_val_loss / len(mt5_val_loader)
    val_losses.append(avg_val_loss)

    # ---- PRINT LOSSES ----
    print(f"Train_loss = {avg_train_loss:.3f} | Val_loss = {avg_val_loss:.3f}")

    # ---- EARLY STOPPING LOGIC ----
    print(f"[DEBUG] Current val loss: {avg_val_loss:.4f}, Best val loss: {best_val_loss:.4f}")
    if avg_val_loss < best_val_loss:
        print("Validation loss improved — saving model.")
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(mt5_model.state_dict(), os.path.join(model_save_path, 'same_split_mt5_model.pth'))
    else:
        patience_counter += 1
        print(f"No improvement. Patience counter: {patience_counter}/{patience}")
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break


Epoch 1/20


Training: 100%|██████████| 310/310 [01:10<00:00,  4.41it/s]
Validation: 100%|██████████| 39/39 [00:02<00:00, 18.32it/s]


Train_loss = 5.621 | Val_loss = 5.368
[DEBUG] Current val loss: 5.3676, Best val loss: inf
Validation loss improved — saving model.

Epoch 2/20


Training: 100%|██████████| 310/310 [01:11<00:00,  4.34it/s]
Validation: 100%|██████████| 39/39 [00:02<00:00, 17.83it/s]


Train_loss = 5.599 | Val_loss = 5.321
[DEBUG] Current val loss: 5.3213, Best val loss: 5.3676
Validation loss improved — saving model.

Epoch 3/20


Training: 100%|██████████| 310/310 [01:11<00:00,  4.33it/s]
Validation: 100%|██████████| 39/39 [00:02<00:00, 18.00it/s]


Train_loss = 5.555 | Val_loss = 5.325
[DEBUG] Current val loss: 5.3245, Best val loss: 5.3213
No improvement. Patience counter: 1/2

Epoch 4/20


Training: 100%|██████████| 310/310 [01:08<00:00,  4.51it/s]
Validation: 100%|██████████| 39/39 [00:02<00:00, 18.04it/s]

Train_loss = 5.548 | Val_loss = 5.331
[DEBUG] Current val loss: 5.3313, Best val loss: 5.3213
No improvement. Patience counter: 2/2
Early stopping triggered.





Evaluation

## Evaluation

In [None]:
def compute_exact(a_gold, a_pred):
    return int(a_gold.strip() == a_pred.strip())

In [None]:
import collections

def compute_f1(a_gold, a_pred):
    gold_toks = a_gold.strip().split()
    pred_toks = a_pred.strip().split()
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())

    if len(gold_toks) == 0 or len(pred_toks) == 0:
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0

    precision = num_same / len(pred_toks)
    recall = num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


In [None]:
def evaluate_qa_model(model, test_loader, tokenizer, device):
    model.eval()
    em_scores = []
    f1_scores = []
    all_preds = []
    all_trues = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                start_positions=start_positions,
                end_positions=end_positions
            )

            start_logits = outputs.start_logits
            end_logits = outputs.end_logits

            start_preds = torch.argmax(start_logits, dim=1).cpu().numpy()
            end_preds = torch.argmax(end_logits, dim=1).cpu().numpy()

            for i in range(len(start_preds)):
                start_idx = start_preds[i]
                end_idx = end_preds[i]

                input_id = batch['input_ids'][i]
                tokens = tokenizer.convert_ids_to_tokens(input_id[start_idx:end_idx + 1])
                prediction = tokenizer.convert_tokens_to_string(tokens)

                true_start = batch['start_positions'][i].cpu().item()
                true_end = batch['end_positions'][i].cpu().item()
                true_tokens = tokenizer.convert_ids_to_tokens(input_id[true_start:true_end + 1])
                true_answer = tokenizer.convert_tokens_to_string(true_tokens)

                em = compute_exact(true_answer, prediction)
                f1 = compute_f1(true_answer, prediction)

                em_scores.append(em)
                f1_scores.append(f1)

                all_preds.append((start_idx, end_idx))
                all_trues.append((true_start, true_end))

    avg_em = np.mean(em_scores) * 100
    avg_f1 = np.mean(f1_scores) * 100

    return avg_em, avg_f1


In [None]:
muril_em, muril_f1 = evaluate_qa_model(muril_model, muril_test_loader, muril_tokenizer, device)
indicbert_em, indicbert_f1 = evaluate_qa_model(indicbert_model, indicbert_test_loader, indicbert_tokenizer, device)
mt5_em, mt5_f1 = evaluate_qa_model(mt5_model, mt5_test_loader, mt5_tokenizer, device)

print("\n===== Evaluation Results =====")
print(f"MuRIL     => EM: {muril_em:.2f}% | F1: {muril_f1:.2f}%")
print(f"IndicBERT => EM: {indicbert_em:.2f}% | F1: {indicbert_f1:.2f}%")
print(f"mT5       => EM: {mt5_em:.2f}% | F1: {mt5_f1:.2f}%")



===== Evaluation Results =====
MuRIL     => EM: 27.85% | F1: 50.74%
IndicBERT => EM: 13.92% | F1: 15.63%
mT5       => EM: 0.00% | F1: 7.60%
