## SQuAD dataset

In [None]:
import kagglehub

# Kaggle handle of the Stanford Question Answering Dataset.
squad_kaggle_handle = "stanfordu/stanford-question-answering-dataset"

# dataset_download returns the local directory that holds the downloaded files.
path = kagglehub.dataset_download(squad_kaggle_handle)
print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/stanfordu/stanford-question-answering-dataset/versions/2


In [None]:
import os

# Report the download location, then the names of the files found there.
print("Dataset downloaded to:", path)
print("Files inside the folder:", os.listdir(path), sep="\n")


Dataset downloaded to: /root/.cache/kagglehub/datasets/stanfordu/stanford-question-answering-dataset/versions/2
Files inside the folder:
['dev-v1.1.json', 'train-v1.1.json']


In [None]:
import shutil

# Google Drive folder that keeps a persistent copy of the SQuAD files.
destination_path = "/content/drive/MyDrive/Tamil-Question-Answering-System/cross-lingual/squad"
os.makedirs(destination_path, exist_ok=True)

# Mirror every entry of the download directory into the Drive folder.
for entry_name in os.listdir(path):
    source_file = os.path.join(path, entry_name)
    shutil.copy(source_file, destination_path)

print("✅ Dataset copied to:", destination_path)


✅ Dataset copied to: /content/drive/MyDrive/Tamil-Question-Answering-System/cross-lingual/squad


## Training

In [None]:
!pip install --upgrade transformers



In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.18-py311-none-any.whl.metadata (7.5 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m 

In [None]:
!pip install datasets



In [None]:
import os
import torch
from datasets import load_dataset
import evaluate
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import default_data_collator
import numpy as np
from transformers.trainer_utils import get_last_checkpoint

In [None]:
# MODEL AND TOKENIZER SETUP
# Both the tokenizer and the question-answering model are loaded from the same
# pretrained checkpoint name. The recorded output of this cell reports that the
# QA head (`qa_outputs.*`) is newly initialised, i.e. it still has to be trained.
model_checkpoint = "google/muril-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/953M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Folder on Google Drive that holds the SQuAD v1.1 JSON files.
# NOTE(review): this is a different folder from `destination_path` used by the
# copy cell earlier in the notebook — confirm both hold the same files.
squad_data_dir = "/content/drive/MyDrive/Final_FYP_Implementations/Datasets and EDA/squad-20250421T132538Z-001/squad"
squad_data_files = {
    "train": f"{squad_data_dir}/train-v1.1.json",
    "validation": f"{squad_data_dir}/dev-v1.1.json",
}

# Read the records stored under the top-level "data" key of each file.
squad = load_dataset("json", data_files=squad_data_files, field="data")

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [None]:
from datasets import Dataset

def flatten_squad_dataset(dataset_split):
    """Turn nested SQuAD articles into one row per answered question.

    Args:
        dataset_split: Iterable of records, each with a "paragraphs" list whose
            items carry a "context" string and a "qas" list.

    Returns:
        A `Dataset` built from dicts with "context", "question" and "answers";
        "answers" keeps only the first answer, as one-element "text" and
        "answer_start" lists. Questions with an empty answer list are dropped.
    """

    def answered_rows():
        for article in dataset_split:
            for paragraph in article["paragraphs"]:
                passage = paragraph["context"]
                for qa in paragraph["qas"]:
                    if len(qa["answers"]) > 0:
                        yield {
                            "context": passage,
                            "question": qa["question"],
                            "answers": {
                                "text": [qa["answers"][0]["text"]],
                                "answer_start": [qa["answers"][0]["answer_start"]],
                            },
                        }

    return Dataset.from_list(list(answered_rows()))

# Flatten each split in place. The guard keeps this cell idempotent: once a
# split has been flattened it no longer has a "paragraphs" column, and calling
# flatten_squad_dataset on it again would fail.
for squad_split_name in ("train", "validation"):
    if "paragraphs" in squad[squad_split_name].column_names:
        squad[squad_split_name] = flatten_squad_dataset(squad[squad_split_name])


In [None]:
def preprocess_function(examples):
    """Tokenize a batch of question/context pairs and label the answer span.

    Uses the module-level `tokenizer`. Each (question, context) pair is encoded
    to a fixed length of 384 tokens; only the context (second sequence) is
    truncated, and the overflow is returned as additional features created
    with a stride of 128, so one example can yield several features.

    Args:
        examples: Batch with "question", "context" and "answers" columns, where
            each answer holds "text" and "answer_start" lists.

    Returns:
        The tokenizer output with "overflow_to_sample_mapping" and
        "offset_mapping" removed and "start_positions" / "end_positions" added
        (one token index per feature). Features whose window does not fully
        contain the answer, or whose example has no answer, are labelled with
        the index of the CLS token for both positions.
    """
    questions = examples["question"]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # sample_mapping[i] is the index of the source example for feature i;
    # offset_mapping[i] holds the (start_char, end_char) pair of every token.
    sample_mapping = inputs.pop("overflow_to_sample_mapping")
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = inputs["input_ids"][i]
        # The CLS position is the label used when the answer is not in this feature.
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = inputs.sequence_ids(i)
        sample_index = sample_mapping[i]
        answer = answers[sample_index]
        if len(answer["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            # Character span of the first answer inside the context.
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])
            # First token with sequence id 1 (the second tokenizer argument,
            # i.e. the context).
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1
            # Last token with sequence id 1 (skips trailing special/pad tokens).
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1
            # The answer must lie entirely within this feature's context window.
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                start_positions.append(cls_index)
                end_positions.append(cls_index)
            else:
                # Advance past every token that starts at or before start_char,
                # then step back one to land on the token containing the start.
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                start_positions.append(token_start_index - 1)
                # Walk backwards past every token that ends at or after end_char,
                # then step forward one to land on the token containing the end.
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_positions.append(token_end_index + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# 4. APPLY TO SQuAD DATASETS
# The raw text columns are dropped so only the outputs of preprocess_function
# remain in the tokenized splits.
raw_squad_columns = squad["train"].column_names
tokenized_squad = squad.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_squad_columns,
)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
# Sizes of the flattened (untokenized) SQuAD splits.
print("\n--- SQuAD Dataset Statistics ---")
squad_train_count = len(squad["train"])
squad_validation_count = len(squad["validation"])
print(f"Total SQuAD train examples: {squad_train_count}")
print(f"Total SQuAD validation examples: {squad_validation_count}")
print(f"Total SQuAD examples: {squad_train_count + squad_validation_count}")


--- SQuAD Dataset Statistics ---
Total SQuAD train examples: 87599
Total SQuAD validation examples: 10570
Total SQuAD examples: 98169


In [None]:
# Feature counts after tokenization (long contexts produce several features).
print("\n--- SQuAD Tokenized Dataset Statistics ---")
for squad_split_label in ("train", "validation"):
    print(f"Tokenized SQuAD {squad_split_label} examples: {len(tokenized_squad[squad_split_label])}")


--- SQuAD Tokenized Dataset Statistics ---
Tokenized SQuAD train examples: 88706
Tokenized SQuAD validation examples: 10814


In [None]:
import inspect
from transformers import TrainingArguments

# Exploratory check: print the constructor signature of the installed
# TrainingArguments (presumably to confirm which keyword names it accepts).
training_arguments_signature = inspect.signature(TrainingArguments.__init__)
print(training_arguments_signature)



In [None]:
# 5. TRAINING ARGUMENTS - FIRST STAGE (ENGLISH SQUAD)
# Collected as a plain mapping first so the hyperparameters read as one table.
squad_training_config = dict(
    output_dir="qa-finetuned-squad",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    report_to=[],  # Disable wandb, TensorBoard, etc.
)
args = TrainingArguments(**squad_training_config)

In [None]:
# Stage-one trainer: fine-tune on the tokenized English SQuAD splits.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
    # `tokenizer=` is deprecated in recent transformers releases (it triggers a
    # FutureWarning); `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=default_data_collator,
)

  trainer = Trainer(


In [None]:
# Run the English SQuAD fine-tuning stage.
trainer.train()

# Save checkpoint after SQuAD training; the Tamil stage reloads it from here.
squad_finetuned_dir = "/content/drive/MyDrive/Tamil-Question-Answering-System/cross-lingual/squad_finetuned_model"
trainer.save_model(squad_finetuned_dir)

Epoch,Training Loss,Validation Loss
1,1.131,1.123657
2,0.8376,0.948054


## Tamil Dataset

In [None]:
from datasets import DatasetDict

In [None]:
from datasets import load_dataset, Dataset

# Tamil QA file; its records sit under the top-level "data" key and are still
# in the nested article -> paragraphs -> qas layout at this point.
tamil_json_path = "/content/drive/MyDrive/Tamil-Question-Answering-System/Dataset/formatted_finalized_data.json"
raw_tamil = load_dataset("json", data_files=tamil_json_path, field="data")

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Step 2: Flatten the nested structure
def flatten_tamil_dataset(dataset_split):
    """Turn nested Tamil QA articles into one row per answerable question.

    Args:
        dataset_split: Iterable of records, each with a "paragraphs" list whose
            items carry a "context" string and a "qas" list.

    Returns:
        A `Dataset` built from dicts with "context", "question" and "answers";
        "answers" keeps only the first answer, as one-element "text" and
        "answer_start" lists. Questions flagged `is_impossible`, and questions
        without any answer, are dropped.
    """
    flat_data = []
    for example in dataset_split:
        for paragraph in example["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                # A missing "is_impossible" key is treated as answerable, the
                # same way flatten_dataset handles it, instead of raising KeyError.
                if qa.get("is_impossible", False):
                    continue
                # Covers both an empty list and a missing/None "answers" field.
                answers = qa.get("answers") or []
                if len(answers) == 0:
                    continue
                first_answer = answers[0]
                flat_data.append({
                    "context": context,
                    "question": qa["question"],
                    "answers": {
                        "text": [first_answer["text"]],
                        "answer_start": [first_answer["answer_start"]]
                    }
                })
    return Dataset.from_list(flat_data)

In [None]:
# Step 3: Flatten the full dataset
# A single data file is exposed by load_dataset as the "train" split.
raw_tamil_records = raw_tamil["train"]
flattened_tamil = flatten_tamil_dataset(raw_tamil_records)

In [None]:
# Step 4: Split into train and validation sets (e.g. 90%/10%)
# The held-out part is exposed under the key "test".
tamil_holdout_fraction = 0.1
tamil_split_seed = 42
tamil_dataset = flattened_tamil.train_test_split(
    test_size=tamil_holdout_fraction,
    seed=tamil_split_seed,
)

In [None]:
# Show the split names, columns and row counts of the Tamil train/test split.
print(tamil_dataset)


DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answers'],
        num_rows: 698
    })
    test: Dataset({
        features: ['context', 'question', 'answers'],
        num_rows: 78
    })
})


In [None]:
# Tokenize every Tamil split with the same preprocessing used for SQuAD.
# The result is a plain dict keyed by split name ("train" / "test").
tokenized_tamil = {}
for tamil_split_name in tamil_dataset:
    tamil_split = tamil_dataset[tamil_split_name]
    tokenized_tamil[tamil_split_name] = tamil_split.map(
        preprocess_function,
        batched=True,
        remove_columns=tamil_split.column_names,
    )


Map:   0%|          | 0/698 [00:00<?, ? examples/s]

Map:   0%|          | 0/78 [00:00<?, ? examples/s]

In [None]:
# Sizes of the raw (untokenized) Tamil splits.
print("\n--- Tamil Dataset Statistics ---")
tamil_train_count = len(tamil_dataset["train"])
tamil_test_count = len(tamil_dataset["test"])
print(f"Total Tamil train examples: {tamil_train_count}")
print(f"Total Tamil test examples: {tamil_test_count}")
print(f"Total Tamil examples: {tamil_train_count + tamil_test_count}")

# Feature counts after tokenization.
print("\n--- Tamil Tokenized Dataset Statistics ---")
for tamil_split_label in ("train", "test"):
    print(f"Tokenized Tamil {tamil_split_label} examples: {len(tokenized_tamil[tamil_split_label])}")


--- Tamil Dataset Statistics ---
Total Tamil train examples: 698
Total Tamil test examples: 78
Total Tamil examples: 776

--- Tamil Tokenized Dataset Statistics ---
Tokenized Tamil train examples: 698
Tokenized Tamil test examples: 78


In [None]:
# Start the Tamil stage from the weights saved at the end of SQuAD fine-tuning.
squad_finetuned_model_dir = "/content/drive/MyDrive/Tamil-Question-Answering-System/cross-lingual/squad_finetuned_model"
model = AutoModelForQuestionAnswering.from_pretrained(squad_finetuned_model_dir)

In [None]:
# TRAINING ARGUMENTS - SECOND STAGE (TAMIL)
tamil_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Tamil-Question-Answering-System/cross-lingual/qa-finetuned-tamil",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="epoch",  # kept equal to eval_strategy, as load_best_model_at_end requires
    logging_dir="./logs",
    logging_steps=50,
    report_to=[],
    # In the recorded run the validation loss is lowest after the first epoch
    # and rises afterwards, so the last-epoch weights are the most overfit.
    # Reload the checkpoint with the lowest validation loss when training ends,
    # so that a subsequent trainer.save_model() stores that model instead.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Stage-two trainer: continue fine-tuning on the tokenized Tamil splits.
trainer = Trainer(
    model=model,
    args=tamil_args,
    train_dataset=tokenized_tamil["train"],
    eval_dataset=tokenized_tamil["test"],  # held-out split from train_test_split
    # `tokenizer=` is deprecated in recent transformers releases (it triggers a
    # FutureWarning); `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=default_data_collator,
)

  trainer = Trainer(


In [None]:
# Run the Tamil fine-tuning stage and display the resulting TrainOutput.
tamil_train_output = trainer.train()
tamil_train_output

Epoch,Training Loss,Validation Loss
1,No log,1.183585
2,1.114200,1.243976
3,0.721200,1.248988
4,0.541600,1.388566
5,0.402300,1.479727
6,0.291300,1.648902
7,0.264400,1.60627
8,0.213500,1.655365
9,0.213500,1.752766
10,0.174300,1.781871


TrainOutput(global_step=440, training_loss=0.4385417266325517, metrics={'train_runtime': 865.0634, 'train_samples_per_second': 8.069, 'train_steps_per_second': 0.509, 'total_flos': 1367888521512960.0, 'train_loss': 0.4385417266325517, 'epoch': 10.0})

In [None]:
# Final export directory for the Tamil QA model.
model_path = "/content/drive/MyDrive/Tamil-Question-Answering-System/cross-lingual/final_tamil_qa_model"

# Write the model first, then the tokenizer, into the same directory.
trainer.save_model(model_path)
saved_tokenizer_files = tokenizer.save_pretrained(model_path)
saved_tokenizer_files


('/content/drive/MyDrive/Tamil-Question-Answering-System/cross-lingual/final_tamil_qa_model/tokenizer_config.json',
 '/content/drive/MyDrive/Tamil-Question-Answering-System/cross-lingual/final_tamil_qa_model/special_tokens_map.json',
 '/content/drive/MyDrive/Tamil-Question-Answering-System/cross-lingual/final_tamil_qa_model/vocab.txt',
 '/content/drive/MyDrive/Tamil-Question-Answering-System/cross-lingual/final_tamil_qa_model/added_tokens.json',
 '/content/drive/MyDrive/Tamil-Question-Answering-System/cross-lingual/final_tamil_qa_model/tokenizer.json')

In [None]:
import evaluate

# Load the SQuAD metric (reports exact match and F1) from the `evaluate` library.
squad_metric = evaluate.load("squad")

def compute_metrics(p):
    """Score a prediction object with the SQuAD metric.

    NOTE(review): no Trainer in this notebook is given `compute_metrics=`, so
    this function appears to be unused. The evaluation cell further down feeds
    the same metric lists of {"id", "prediction_text"} and {"id", "answers"}
    dicts; confirm that `p.predictions` / `p.label_ids` would arrive in that
    form before wiring this function into a Trainer.
    """
    return squad_metric.compute(predictions=p.predictions, references=p.label_ids)


In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

# Reload the exported Tamil QA model and its tokenizer from Google Drive.
# NOTE(review): the "LOAD MODEL" cell further down loads the same directory
# again via `trained_model_path`; this cell looks redundant — confirm.
model_path = "/content/drive/MyDrive/Tamil-Question-Answering-System/cross-lingual/final_tamil_qa_model"

model = AutoModelForQuestionAnswering.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)


In [None]:
import os
import json
import torch
from tqdm import tqdm
from datasets import load_dataset, Dataset
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
import evaluate
from datetime import datetime

# Root of the project folder on Google Drive; the paths below hang off it.
project_root = "/content/drive/MyDrive/Tamil-Question-Answering-System"

trained_model_path = f"{project_root}/cross-lingual/final_tamil_qa_model"
test_dataset_path = f"{project_root}/Dataset/squadTamil.json"  # change for new test data
output_dir = f"{project_root}/cross-lingual/test_results"
json_field_name = "data"  # set to None if your test file doesn't have a root field



In [None]:
# ====== LOAD MODEL ======
tokenizer = AutoTokenizer.from_pretrained(trained_model_path)
model = AutoModelForQuestionAnswering.from_pretrained(trained_model_path)

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(197285, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# ====== LOAD TEST DATASET ======
# First try to read the records under `json_field_name`; if that fails, fall
# back to loading the file without a root field. Only ordinary exceptions are
# caught, so Ctrl-C / kernel interrupts still propagate, and the reason for the
# fallback is reported instead of being silently discarded.
try:
    raw_test_data = load_dataset("json", data_files=test_dataset_path, field=json_field_name)
except Exception as load_error:
    print(f"⚠️ Loading with field={json_field_name!r} failed ({load_error!r}); retrying without a root field.")
    raw_test_data = load_dataset("json", data_files=test_dataset_path)

def flatten_dataset(dataset_split):
    """Turn nested SQuAD-style test articles into one row per answerable question.

    Args:
        dataset_split: Iterable of records, each with a "paragraphs" list whose
            items carry a "context" string and a "qas" list.

    Returns:
        A `Dataset` built from dicts with "context", "question" and "answers";
        "answers" keeps only the first answer, as one-element "text" and
        "answer_start" lists. Questions flagged `is_impossible` (when the key
        is present) and questions with no answers are dropped.
    """

    def answerable_rows():
        for article in dataset_split:
            for paragraph in article["paragraphs"]:
                passage = paragraph["context"]
                for qa in paragraph["qas"]:
                    if qa.get("is_impossible", False):
                        continue
                    if not qa["answers"]:
                        continue
                    yield {
                        "context": passage,
                        "question": qa["question"],
                        "answers": {
                            "text": [qa["answers"][0]["text"]],
                            "answer_start": [qa["answers"][0]["answer_start"]],
                        },
                    }

    return Dataset.from_list(list(answerable_rows()))

# Flatten, shuffle with a fixed seed, and keep at most MAX_TEST_EXAMPLES rows.
from random import seed
seed(42)  # Seeds Python's global RNG; the dataset shuffle below passes its own seed.

MAX_TEST_EXAMPLES = 2000
flattened_test = flatten_dataset(raw_test_data["train"])
# Cap the sample size at the dataset size so a smaller test file does not make
# select() fail with an out-of-range index.
num_test_examples = min(MAX_TEST_EXAMPLES, len(flattened_test))
test_dataset = flattened_test.shuffle(seed=42).select(range(num_test_examples))
print(f"✅ Test dataset loaded and randomly selected: {len(test_dataset)} examples")


✅ Test dataset loaded and randomly selected: 2000 examples


In [None]:
# ====== EVALUATE ======
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
metric = evaluate.load("squad")

predictions = []  # {"id", "prediction_text"} dicts for the SQuAD metric
references = []   # {"id", "answers"} dicts for the SQuAD metric
correct = 0       # predictions whose raw string equals the first gold answer

for example_index, example in enumerate(tqdm(test_dataset, desc="Evaluating")):
    pred = qa_pipeline({
        "context": example["context"],
        "question": example["question"]
    })

    pred_text = pred["answer"]
    true_text = example["answers"]["text"][0]

    # For metric computation: use the running index as the id. It is unique
    # per example, whereas a hash of context + question would give duplicate
    # (context, question) pairs the same id.
    q_id = str(example_index)
    predictions.append({"id": q_id, "prediction_text": pred_text})
    references.append({"id": q_id, "answers": example["answers"]})

    if pred_text == true_text:
        correct += 1


Device set to use cuda:0
Evaluating: 100%|██████████| 2000/2000 [00:39<00:00, 51.19it/s]


In [None]:
# ====== METRICS ======
results = metric.compute(predictions=predictions, references=references)
# Share of predictions that match the first gold answer as a raw string.
accuracy = correct / len(test_dataset)

metric_report_lines = [
    "Evaluation Metrics:",
    f"→ Exact Match (EM): {results['exact_match']:.2f}",
    f"→ F1 Score: {results['f1']:.2f}",
    f"→ Accuracy (Exact Match): {accuracy:.4f}",
]
print("\n".join(metric_report_lines))


📊 Evaluation Metrics:
→ Exact Match (EM): 24.85
→ F1 Score: 47.18
→ Accuracy (Exact Match): 0.2285


In [None]:
# ====== SAVE RESULTS ======
# Skipped entirely when output_dir is empty/None.
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
    metrics_file_path = os.path.join(output_dir, "metrics.json")
    with open(metrics_file_path, "w") as metrics_handle:
        metrics_summary = {
            "exact_match": results["exact_match"],
            "f1": results["f1"],
            "accuracy": accuracy,
            "total_examples": len(test_dataset),
            "evaluated_on": test_dataset_path,
            "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        }
        json.dump(metrics_summary, metrics_handle, indent=2)

    print(f"Metrics saved to: {output_dir}/metrics.json")

Metrics saved to: /content/drive/MyDrive/Tamil-Question-Answering-System/cross-lingual/test_results/metrics.json


In [None]:
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering

# ====== LOAD MODEL ======
# NOTE(review): this directory differs from the export path used by the
# earlier evaluation cells — confirm it holds the same final model.
trained_model_path = "/content/drive/MyDrive/Final_FYP_Implementations/Case 6_Cross-Lingual/final_tamil_qa_model-20250421T143508Z-001"
tokenizer = AutoTokenizer.from_pretrained(trained_model_path)
model = AutoModelForQuestionAnswering.from_pretrained(trained_model_path)

# Pipeline device index: 0 selects the first GPU, -1 selects the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Build the question-answering pipeline around the loaded model.
qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

# ====== CUSTOM INFERENCE ======

# Paragraph and question to try; edit these two values for your own input.
context = """
சோழர் காலத்தில் கல்வி, கலை மற்றும் இலக்கியம் மிகவும் வளர்ச்சியடைந்தது. தஞ்சாவூரிலுள்ள பிரகதீஸ்வரர் கோயில் என்பது அந்தக் கால கட்டடக் கலைக்கு எடுத்துக்காட்டாகும்.
"""
question = "சோழர் கால கட்டடக் கலைக்கு எடுத்துக்காட்டாக என்ன குறிப்பிடப்பட்டுள்ளது?"

# Run the pipeline on a single context/question pair.
qa_input = {"context": context, "question": question}
result = qa_pipeline(qa_input)

# Report the extracted answer together with the pipeline's score.
predicted_answer = result["answer"]
confidence_score = result["score"]
print(f"Question: {question}")
print(f"Answer: {predicted_answer}")
print(f"Confidence Score: {confidence_score:.4f}")


Device set to use cuda:0


Question: சோழர் கால கட்டடக் கலைக்கு எடுத்துக்காட்டாக என்ன குறிப்பிடப்பட்டுள்ளது?
Answer: பிரகதீஸ்வரர் கோயில்
Confidence Score: 0.5330
