## Install Dependencies

In [None]:
!pip install -q transformers torch datasets rouge-score sacrebleu grapheme

## Import libraries

In [None]:
import os
import torch
import string
import numpy as np
import pandas as pd
import collections
from pathlib import Path
from datetime import datetime
from rouge_score import rouge_scorer
import grapheme
import seaborn as sns
from sacrebleu.metrics import BLEU
from sklearn.metrics import f1_score
from transformers import Trainer, TrainingArguments, default_data_collator, EarlyStoppingCallback, AutoTokenizer, AutoModelForQuestionAnswering
import tensorflow as tf
from datasets import Dataset
from tqdm.auto import tqdm
from rouge_score import rouge_scorer
import nltk
import matplotlib.pyplot as plt
import json
import os
from transformers import TrainerCallback
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from google.colab import drive

In [None]:
drive.mount('/content/drive')

## Load datasets

In [None]:
# Read the train/dev/test CSV splits from the mounted Drive folder.
# The dev split is bound to `val`; the full training split to `train_full`.
train_full = pd.read_csv('/content/drive/MyDrive/Research_folder/data/train.csv')
val = pd.read_csv('/content/drive/MyDrive/Research_folder/data/dev.csv')
test = pd.read_csv('/content/drive/MyDrive/Research_folder/data/test.csv')

In [None]:
# Subsample the training set for this run. Later cells read `train`
# (`train.head`, `train.shape`, `train.sample`), so this assignment has to
# execute before them; with it commented out they fail with NameError.
percentage = 0.75
train = train_full.sample(frac=percentage, random_state=42)  # random_state for reproducibility
print(f"Loaded {len(train)} rows ({percentage*100}% of {len(train_full)} total)")

In [None]:
# translated training set
train.head(4)

In [None]:
# translated validation set
val.head(4)

In [None]:
# translated test set
test.head(4)

In [None]:
# Report the (rows, columns) shape of each split.
print(f"Translated Train set shape:{train.shape}")
print(f"Translated Validation set shape:{val.shape}")
print(f"Translated Test set shape:{test.shape}")

## Model loading

In [None]:
# Checkpoint identifier handed to `from_pretrained` for both model and tokenizer.
model_name = "deepset/xlm-roberta-large-squad2"
# Load the model and tokenizer
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
import transformers
# Fail early unless the tokenizer is a fast one: the feature-preparation
# functions in this notebook request `return_offsets_mapping=True`, which
# only fast tokenizers support.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

## Feature Extraction

In [None]:
# Set up configuration for the tokenization
# NOTE(review): `batch_size` is not referenced by the TrainingArguments in this
# notebook (they hard-code 8) — confirm whether it is still needed.
batch_size = 16
max_length = 512  # Passed as `max_length` to the tokenizer for every feature
doc_stride = 128  # Stride value to handle document overflow
pad_on_right = tokenizer.padding_side == "right"  # Check if padding is applied on the right side

In [None]:
def prepare_train_features(examples):
    """Tokenize a batch of QA examples and label each feature with its answer span.

    Because `return_overflowing_tokens=True` is used with a stride, one example
    can produce several features. Each feature receives token-level
    "start_positions"/"end_positions"; features that do not fully contain the
    answer are labelled with the CLS token index for both.

    Relies on the module-level `tokenizer`, `max_length`, `doc_stride` and
    `pad_on_right`.

    Args:
        examples: Batch with "question", "context" and "answers" columns. Each
            entry of "answers" is a dict with "answer_start" and "text" lists;
            only the first element of each list is used. The "question" column
            is overwritten in place with left-stripped questions.

    Returns:
        The tokenizer output (without "overflow_to_sample_mapping" and
        "offset_mapping") plus "start_positions" and "end_positions" lists,
        one entry per feature.
    """
    # Remove unnecessary left whitespace from questions to avoid truncation issues
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize the examples, applying truncation, padding, and managing overflows with stride.
    # The argument order and the truncated side both follow the padding side,
    # so that only the context is ever truncated.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Map features back to their corresponding examples.
    # Both entries are popped, so they are not part of the returned features.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Initialize lists to store start and end positions for answers
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # Find the CLS token index, used as the label when the feature has no answer
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        # sequence_ids tells, per token, which input sequence it belongs to
        sequence_ids = tokenized_examples.sequence_ids(i)
        # Index of the original example this feature was cut from
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]

        # If no answer, label with CLS token index
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Character span of the (first) answer inside the context
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # First and last token of the context inside this feature.
            # The `i` inside the generator expression is local to it and does
            # not overwrite the loop variable `i`.
            token_start_index = next(i for i, s in enumerate(sequence_ids) if s == (1 if pad_on_right else 0))
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Label with CLS index if answer is out of span
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Walk forward past the last token starting at or before
                # start_char; the step back by one yields the answer's first token.
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)

                # Walk backward past the first token ending at or after
                # end_char; the step forward by one yields the answer's last token.
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [None]:
def convert_answers(r):
    """Build a SQuAD-style ``answers`` dict from an (answer_start, answer_text) row.

    Args:
        r: Two-element row whose first item is the answer's start offset and
            whose second item is the answer text. May be a pandas Series (as
            passed by ``DataFrame.apply(..., axis=1)``) or a plain sequence
            such as a list or tuple.

    Returns:
        dict: ``{'answer_start': [start], 'text': [text]}`` — each value is a
        single-element list.
    """
    # A row Series is indexed by column labels, and integer keys on such a
    # Series are a deprecated positional fallback in recent pandas. Use
    # `.iloc` when the object has it; plain sequences are indexed directly.
    positional = getattr(r, "iloc", r)
    start = positional[0]
    text = positional[1]
    return {
        'answer_start': [start],  # Start index of the answer as a list.
        'text': [text]           # The text of the answer as a list.
    }

In [None]:
# Shuffle both splits (frac=1 keeps every row, random_state fixes the order).
# The shuffled dev split is bound to `valid`; `val` itself is left untouched.
train = train.sample(frac=1, random_state=42)
valid = val.sample(frac=1, random_state=42)

# Combines 'answer_start' and 'answer_text' into a structured dictionary.
train['answers'] = train[['answer_start', 'answer_text']].apply(convert_answers, axis=1)
valid['answers'] = valid[['answer_start', 'answer_text']].apply(convert_answers, axis=1)

# Convert DataFrame into Hugging Face's Dataset
train_dataset = Dataset.from_pandas(train)
valid_dataset = Dataset.from_pandas(valid)

In [None]:
train_dataset[0]

In [None]:
# Tokenize the dataset
# Each split drops its own original columns, so only the tokenizer outputs and
# the start/end labels remain even if the two splits differ in schema.
tokenized_train_ds = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)
tokenized_valid_ds = valid_dataset.map(prepare_train_features, batched=True, remove_columns=valid_dataset.column_names)

## Training Process

In [None]:
class ComputeMetricsCallback(TrainerCallback):
    """Trainer callback that accumulates loss/accuracy histories during training.

    Histories are plain lists read after training:
    ``epoch_train_losses`` (last logged training loss of each epoch that logged
    at least one), ``val_losses`` / ``val_accuracies`` (every logged
    "eval_loss" / "eval_accuracy"), and ``train_accuracies`` (filled only when
    the trainer state exposes ``train_metrics['accuracy']``).
    """

    def __init__(self):
        # Histories kept across the whole run
        self.train_accuracies, self.val_accuracies = [], []
        self.epoch_train_losses = []
        self.val_losses = []
        # Training losses logged during the epoch currently in progress
        self.current_epoch_train_losses = []
        # Number of `on_epoch_end` calls seen so far
        self.completed_epochs = 0

    def on_epoch_begin(self, args, state, control, **kwargs):
        """Start the new epoch with an empty list of in-epoch training losses."""
        self.current_epoch_train_losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Route the logged values this callback tracks into their histories."""
        if logs is None:
            return

        # (log key, destination list) pairs; a key absent from `logs` is skipped
        destinations = (
            ("loss", self.current_epoch_train_losses),
            ("eval_loss", self.val_losses),
            ("eval_accuracy", self.val_accuracies),
        )
        for log_key, history in destinations:
            if log_key in logs:
                history.append(logs[log_key])

    def on_epoch_end(self, args, state, control, **kwargs):
        """Record the epoch's last logged training loss and count the epoch."""
        in_epoch_losses = self.current_epoch_train_losses
        if in_epoch_losses:
            self.epoch_train_losses.append(in_epoch_losses[-1])

            # Training accuracy is recorded only when the state carries it
            has_train_accuracy = hasattr(state, 'train_metrics') and 'accuracy' in state.train_metrics
            if has_train_accuracy:
                self.train_accuracies.append(state.train_metrics['accuracy'])

        self.completed_epochs += 1

In [None]:
# Initialize callback
metrics_callback = ComputeMetricsCallback()

# Define training arguments
# `%env` is a notebook magic; it sets WANDB_DISABLED before the arguments are built.
%env WANDB_DISABLED=True
args = TrainingArguments(
    f"sinhala-qa",  # first positional argument: the output directory
    save_total_limit=2,
    eval_strategy="epoch",  # evaluation and checkpointing both happen once per epoch
    save_strategy="epoch",
    learning_rate=5e-5,
    warmup_ratio=0.1,
    gradient_accumulation_steps=8,  # 8 accumulation steps x per-device batch 8 = 64 examples per optimizer step per device
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.1,
    logging_dir='./logs',
    logging_steps=50,  # training "loss" is logged every 50 steps; the metrics callback reads these logs
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",  # "best" means lowest eval_loss (greater_is_better=False)
    greater_is_better=False,
)

In [None]:
# Initialize the data collator
data_collator = default_data_collator

# Load a fresh copy of the checkpoint so training starts from its published weights
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Set up early stopping
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Trainer setup
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_valid_ds,
    data_collator=data_collator,  # use the collator chosen above rather than repeating the function name
    tokenizer=tokenizer,
    callbacks=[early_stopping, metrics_callback],  # early stopping + loss/accuracy history tracking
)

In [None]:
trainer.train()  # Start training the model

In [None]:
# Function to save metrics
def save_metrics(metrics_data, filename):
    """Write ``metrics_data`` to ``filename`` as JSON.

    Args:
        metrics_data: JSON-serializable object to store.
        filename: Destination path. Missing parent directories are created;
            a bare file name is written to the current directory.
    """
    parent_dir = os.path.dirname(filename)
    # A bare file name has an empty dirname and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(filename, "w") as f:
        json.dump(metrics_data, f)

# Function to load metrics
def load_metrics(filename="training_metrics/metrics.json"):
    """Return the JSON content of ``filename``, or ``None`` when the path does not exist."""
    if not os.path.exists(filename):
        return None
    with open(filename, "r") as metrics_file:
        stored_metrics = json.load(metrics_file)
    return stored_metrics

In [None]:
# Collect the histories recorded by the callback into one dict.
# This cell only builds the dict; it is written to disk by a later
# `save_metrics` call.
metrics_data = {
    "train_losses": metrics_callback.epoch_train_losses,
    "val_losses": metrics_callback.val_losses,
    "train_accuracies": metrics_callback.train_accuracies,
    "val_accuracies": metrics_callback.val_accuracies,
}

In [None]:
def plot_loss(train_losses, val_losses, save_path=None):
    """
    Plot training and validation loss per epoch on a shared epoch axis.

    ``val_losses`` holds one value per epoch. ``train_losses`` may hold one or
    several values per epoch: it is divided into ``len(val_losses)`` equal
    chunks and the last value of each chunk is plotted.

    Args:
        train_losses: Sequence of logged training losses.
        val_losses: Sequence of validation losses, one per epoch.
        save_path: Optional file path; when given, the figure is also saved there.

    Returns:
        None. When either series is empty nothing is plotted.
    """
    n_epochs = len(val_losses)
    if n_epochs == 0 or len(train_losses) == 0:
        # The per-epoch division below needs at least one value on each side.
        print("plot_loss: empty loss history, nothing to plot.")
        return

    # Calculate how many training logs per epoch. Clamped to 1 so that having
    # fewer training values than epochs does not produce a chunk size of 0
    # (which would make every epoch read index -1, i.e. the final loss).
    logs_per_epoch = max(1, len(train_losses) // n_epochs)

    # Extract the last training loss of each epoch
    n_train_points = min(n_epochs, len(train_losses) // logs_per_epoch)
    epoch_train_losses = [train_losses[(i + 1) * logs_per_epoch - 1]
                          for i in range(n_train_points)]
    if n_train_points < n_epochs:
        print(
            f"plot_loss: only {n_train_points} training loss value(s) "
            f"for {n_epochs} epoch(s); plotting the available ones."
        )

    plt.figure(figsize=(8, 5))

    # Plot with aligned epochs. A short training series is drawn against the
    # first epochs — which epochs it really belongs to is not recorded.
    epochs = range(1, n_epochs + 1)
    plt.plot(epochs[:n_train_points], epoch_train_losses, label="Training Loss", marker='o')
    plt.plot(epochs, val_losses, label="Validation Loss", marker='o')

    plt.xticks(epochs)
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.title(f"Training/Validation Loss (Epochs 1-{n_epochs})")
    plt.legend()
    plt.grid()
    if save_path:
        plt.savefig(save_path)
    plt.show()

def plot_accuracy(train_accuracies, val_accuracies, save_path=None):
    """
    Plot training and validation accuracy per epoch.

    Each series is expected to hold one value per epoch. The series may differ
    in length (or one may be empty): each is drawn against as many epochs as it
    has values, and an empty series is skipped.

    Args:
        train_accuracies: Sequence of per-epoch training accuracies.
        val_accuracies: Sequence of per-epoch validation accuracies.
        save_path: Optional file path; when given, the figure is also saved there.

    Returns:
        None. When both series are empty nothing is plotted.
    """
    n_train, n_val = len(train_accuracies), len(val_accuracies)
    n_epochs = max(n_train, n_val)
    if n_epochs == 0:
        print("plot_accuracy: empty accuracy history, nothing to plot.")
        return

    plt.figure(figsize=(8, 5))
    epochs = range(1, n_epochs + 1)

    # x and y must have equal length for plt.plot, so every series gets an
    # epoch axis cut to its own length.
    if n_train:
        plt.plot(epochs[:n_train], train_accuracies, label="Training Accuracy", marker='o')
    if n_val:
        plt.plot(epochs[:n_val], val_accuracies, label="Validation Accuracy", marker='o')

    plt.xticks(epochs)
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.title(f"Training/Validation Accuracy (Epochs 1-{n_epochs})")
    plt.legend()
    plt.grid()
    if save_path:
        plt.savefig(save_path)
    plt.show()

In [None]:
# Destination for this run's loss plot. The run tag is "75", matching the
# metrics JSON, the saved model directory and the results-CSV row written
# elsewhere in this notebook (update all of them together for another run).
save_path_loss="/content/drive/MyDrive/Research_folder/outputs/Results/XLM-R-75-metrics_loss_plot.png"

In [None]:
# Report the last recorded training and validation loss, and their gap
# (validation minus training). Indexing with [-1] requires both histories to
# be non-empty.
print(f"Final Training Loss: {metrics_callback.epoch_train_losses[-1]:.4f}")
print(f"Final Validation Loss: {metrics_callback.val_losses[-1]:.4f}")
print(f"Gap: {metrics_callback.val_losses[-1] - metrics_callback.epoch_train_losses[-1]:.4f}")

In [None]:
# Plot the per-epoch training and validation loss (accuracy is not plotted here).
# NOTE(review): no `save_path` is passed, so the figure is only shown, not
# written to disk — confirm this is intended.
plot_loss(metrics_callback.epoch_train_losses, metrics_callback.val_losses)

In [None]:
save_metrics(metrics_data, "/content/drive/MyDrive/Research_folder/outputs/metrics/size/XLM-R-75-metrics.json") # update path

In [None]:
trainer.save_model("/content/drive/MyDrive/Research_folder/models/XLM-R-75-metrics")  # Save the trained model # update path

## Validation process

In [None]:
def prepare_validation_features(examples):
    """Tokenize a batch of QA examples for inference-time post-processing.

    Long contexts are cut into overlapping windows, so one example may yield
    several features. Every feature is tagged with the id of the example it
    came from, and its offset mapping keeps entries only for context tokens
    (all other positions become ``None``).

    Relies on the module-level `tokenizer`, `max_length`, `doc_stride` and
    `pad_on_right`. The "question" column of ``examples`` is overwritten in
    place with left-stripped questions.

    Args:
        examples: Batch with "id", "question" and "context" columns.

    Returns:
        The tokenizer output (including "offset_mapping") with an added
        "example_id" list, one entry per feature.
    """
    # Leading whitespace in a question only wastes token budget
    examples["question"] = [question.lstrip() for question in examples["question"]]

    # Argument order, truncated side and the sequence id that marks context
    # tokens all follow the tokenizer's padding side.
    if pad_on_right:
        first_column, second_column = "question", "context"
        truncation_mode = "only_second"
        context_index = 1
    else:
        first_column, second_column = "context", "question"
        truncation_mode = "only_first"
        context_index = 0

    # Sliding-window tokenization: only the context is truncated, consecutive
    # windows overlap by `doc_stride` tokens, everything is padded to `max_length`.
    tokenized_examples = tokenizer(
        examples[first_column],
        examples[second_column],
        truncation=truncation_mode,
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # For each feature, the index of the example it was produced from
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    num_features = len(tokenized_examples["input_ids"])

    # Example id of every feature, in feature order
    tokenized_examples["example_id"] = [
        examples["id"][sample_mapping[feature_idx]] for feature_idx in range(num_features)
    ]

    for feature_idx in range(num_features):
        sequence_ids = tokenized_examples.sequence_ids(feature_idx)
        context_only_offsets = []
        for token_idx, offset in enumerate(tokenized_examples["offset_mapping"][feature_idx]):
            # Non-context tokens get None so post-processing can reject them
            keep = sequence_ids[token_idx] == context_index
            context_only_offsets.append(offset if keep else None)
        tokenized_examples["offset_mapping"][feature_idx] = context_only_offsets

    return tokenized_examples

In [None]:
validation_features = valid_dataset.map(
    prepare_validation_features,  # Preprocess validation data
    batched=True,  # Process the dataset in batches
    remove_columns=valid_dataset.column_names  # Removes original columns
)

In [None]:
# Print the number of processed validation features
print(f"Number of validation features: {len(validation_features)}")

In [None]:
valid_dataset

In [None]:
# Create a smaller version of the validation features dataset.
# `remove_columns` returns a new dataset without the two bookkeeping columns,
# which avoids running an identity `map` over every row just to drop them.
valid_feats_small = validation_features.remove_columns(['example_id', 'offset_mapping'])

In [None]:
valid_feats_small

In [None]:
# Make predictions on the validation dataset
raw_predictions = trainer.predict(valid_feats_small)

In [None]:
# Get the raw predictions
raw_predictions[0]

In [None]:
# Module-level aliases for the validation examples and their tokenized features.
# NOTE(review): `postprocess_qa_predictions` builds the same two mappings
# internally from its own arguments, so these globals look redundant — confirm
# nothing else reads them.
examples = valid_dataset
features = validation_features

# Map example_id to its index in the examples dataset
example_id_to_index = {k: i for i, k in enumerate(examples["id"])}

# Initialize defaultdict to store feature indices per example
features_per_example = collections.defaultdict(list)

# Populate the dictionary with feature indices corresponding to each example_id
for i, feature in enumerate(features):
    features_per_example[example_id_to_index[feature["example_id"]]].append(i)


In [None]:
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size=20, max_answer_length=30):
    """Turn start/end logits into one predicted answer string per example.

    For every example, all of its features are scanned; the ``n_best_size``
    highest start and end logits of each feature are paired, invalid pairs are
    dropped, and the text of the highest-scoring remaining span
    (start logit + end logit) is returned.

    Args:
        examples: Original examples. Must support ``examples["id"]`` and yield
            dicts with "id" and "context" when iterated.
        features: Tokenized features, row-aligned with the logits. Each has an
            "example_id" and an "offset_mapping" whose entries are ``None`` for
            tokens outside the context.
        raw_predictions: Pair ``(all_start_logits, all_end_logits)``.
        n_best_size: Number of top start/end candidates considered per feature.
        max_answer_length: Maximum answer length in tokens.

    Returns:
        collections.OrderedDict: example id -> predicted answer text, in
        example order. An example without any valid span maps to ``""``.
    """
    all_start_logits, all_end_logits = raw_predictions

    # Map example to its corresponding feature indices
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    predictions = collections.OrderedDict()
    print(f"Post-processing {len(examples)} examples with {len(features)} features.")

    # Iterate over all examples
    for example_index, example in enumerate(tqdm(examples)):
        context = example["context"]
        valid_answers = []

        # Process each feature associated with the current example
        for feature_index in features_per_example[example_index]:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            offset_mapping = features[feature_index]["offset_mapping"]
            num_tokens = len(offset_mapping)

            # Indices of the top-n start and end logits, best first
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

            # Evaluate each possible answer span
            for start_index in start_indexes:
                # A start outside the context cannot open a valid span
                if start_index >= num_tokens or offset_mapping[start_index] is None:
                    continue
                for end_index in end_indexes:
                    if end_index >= num_tokens or offset_mapping[end_index] is None:
                        continue
                    # Reject reversed spans and spans longer than allowed
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    # Extract the answer text
                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append({
                        "score": start_logits[start_index] + end_logits[end_index],
                        "text": context[start_char:end_char],
                    })

        # Select the best answer; max() returns the first of equally scored
        # candidates, the same one a stable descending sort would put first.
        if valid_answers:
            best_answer = max(valid_answers, key=lambda x: x["score"])
        else:
            best_answer = {"text": "", "score": 0.0}
        predictions[example["id"]] = best_answer["text"]

    return predictions

In [None]:
# Post-process raw predictions to extract the final answers for each example in the validation dataset
final_predictions = postprocess_qa_predictions(valid_dataset, validation_features, raw_predictions.predictions)

In [None]:
# Create a DataFrame with questions and their corresponding predicted answers
# NOTE(review): the next cell assigns `prediction` again with an additional
# "answer_text" column, replacing this value — this cell looks redundant.
prediction = pd.DataFrame([{"questions": x1['question'], "pred_answer": x2} for x1, x2 in zip(valid_dataset, [i for i in final_predictions.values()])])

In [None]:
# One row per validation example: question, predicted answer and reference
# answer. Examples and predictions are paired purely by position.
# NOTE(review): assumes `final_predictions` has exactly one entry per example,
# in `valid_dataset` order — verify ids are unique.
prediction = pd.DataFrame(
    [
        {
            "questions": example['question'],
            "pred_answer": predicted_text,
            "answer_text": example['answer_text'],  # reference answer
        }
        for example, predicted_text in zip(valid_dataset, final_predictions.values())
    ]
)

In [None]:
# Print dataframe with questions predicted answer
prediction

## Evaluation metric

In [None]:
# Function to remove punctuation from a string
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    deletion_table = str.maketrans('', '', string.punctuation)
    stripped = text.translate(deletion_table)
    return stripped

In [None]:
# Function to compute Exact Match
def compute_exact_match(pred, truth):
    """Return 1 if prediction and reference are equal after normalization, else 0.

    Normalization: strip surrounding whitespace, lowercase, remove punctuation.
    """
    normalized_pred, normalized_truth = (
        remove_punctuation(answer.strip().lower()) for answer in (pred, truth)
    )
    return 1 if normalized_pred == normalized_truth else 0

In [None]:
# Function to compute F1 Score
def compute_f1(pred, truth):
    """Compute the token-level F1 score between a prediction and a reference.

    Both strings are stripped, lowercased, cleared of punctuation and split on
    whitespace. Overlap is counted as a multiset intersection, so a token that
    occurs k times in both answers contributes k matches; identical answers
    therefore score 1.0 even when they contain repeated tokens.

    Args:
        pred: Predicted answer text.
        truth: Reference answer text.

    Returns:
        float: F1 in [0, 1]; 0.0 when either side has no tokens or nothing overlaps.
    """
    pred_tokens = remove_punctuation(pred.strip().lower()).split()
    truth_tokens = remove_punctuation(truth.strip().lower()).split()

    if not pred_tokens or not truth_tokens:
        return 0.0

    # Counter & Counter keeps, per token, the smaller of the two counts.
    # Counting unique tokens would under-count matches whenever a token repeats.
    overlap = collections.Counter(pred_tokens) & collections.Counter(truth_tokens)
    num_common = sum(overlap.values())
    if num_common == 0:
        return 0.0

    precision = num_common / len(pred_tokens)
    recall = num_common / len(truth_tokens)
    return 2 * (precision * recall) / (precision + recall)

In [None]:
def tokenize_xlm_r(text):
    """Split ``text`` with the module-level tokenizer and return the pieces as one space-separated string."""
    return " ".join(tokenizer.tokenize(text))

In [None]:
class _WhitespaceTokenizer:
    """Tokenizer for ``RougeScorer`` that splits on whitespace and nothing else."""

    def tokenize(self, text):
        """Return the whitespace-separated pieces of ``text``."""
        return text.split()


# Initialize ROUGE scorer
# rouge_score's default tokenizer lowercases the text and discards every
# character outside [a-z0-9]. That removes Sinhala subword tokens entirely and
# drives the scores to zero, so the pre-tokenized, space-joined input is split
# on whitespace instead.
scorer = rouge_scorer.RougeScorer(
    ["rouge2", "rouge3", "rouge4", "rougeL"],
    use_stemmer=False,
    tokenizer=_WhitespaceTokenizer(),
)

def compute_rouge(pred, truth):
    """Compute ROUGE-2, ROUGE-3, ROUGE-4 and ROUGE-L F-measures for one prediction.

    Both texts are first split into subword tokens with ``tokenize_xlm_r``;
    the n-grams are therefore n-grams of those tokens.

    Args:
        pred: Predicted answer text.
        truth: Reference answer text.

    Returns:
        tuple: ``(rouge2, rouge3, rouge4, rougeL)`` F-measures.
    """
    pred_tokens = tokenize_xlm_r(pred)
    truth_tokens = tokenize_xlm_r(truth)
    # RougeScorer.score takes (target, prediction) in that order
    scores = scorer.score(truth_tokens, pred_tokens)
    return scores['rouge2'].fmeasure, scores['rouge3'].fmeasure, scores['rouge4'].fmeasure, scores['rougeL'].fmeasure

In [None]:
# Splits Sinhala text into graphemes using the `grapheme` library.
def tokenize_graphemes(text):
    """Return the grapheme clusters of ``text`` as a list."""
    # grapheme.graphemes produces its items lazily and can be consumed only
    # once; materialize it so callers get a real list.
    return list(grapheme.graphemes(text))

In [None]:
def compute_bleu(pred, truth, n_gram):
    """Sentence-level BLEU between prediction and reference over graphemes.

    Args:
        pred: Predicted answer text.
        truth: Reference answer text.
        n_gram: Maximum n-gram order used by the BLEU scorer.

    Returns:
        The ``score`` attribute of sacrebleu's sentence-level result.
    """
    # Space-separate the graphemes so each grapheme is scored as one token
    hypothesis = " ".join(tokenize_graphemes(pred))
    reference = " ".join(tokenize_graphemes(truth))

    # effective_order=True: per the original author's note, this addresses
    # the warnings sacrebleu raises otherwise
    sentence_scorer = BLEU(max_ngram_order=n_gram, smooth_method="exp", effective_order=True)

    # sentence_score expects the references as a list
    return sentence_scorer.sentence_score(hypothesis, [reference]).score

In [None]:
# add metric per model
def update_metrics_csv(model_name, metrics_dict, csv_file):
    """
    Append one row of model metrics to a CSV file, preserving existing rows.

    The row holds the model name, the current local timestamp (ISO format) and
    every entry of ``metrics_dict`` as its own column. The file, and any
    missing parent directory, is created on first use.

    Args:
        model_name (str): Name of the model
        metrics_dict (dict): Dictionary of metrics to save (column name -> scalar)
        csv_file (str): Path to CSV file
    """
    csv_path = Path(csv_file)

    # Create new single-row DataFrame with current metrics
    new_df = pd.DataFrame({
        'model': [model_name],
        'timestamp': [datetime.now().isoformat()],
        **metrics_dict,
    })

    if csv_path.exists():
        # If file exists, load and append new data
        existing_df = pd.read_csv(csv_path)
        updated_df = pd.concat([existing_df, new_df], ignore_index=True)
    else:
        # First write: to_csv does not create missing directories
        csv_path.parent.mkdir(parents=True, exist_ok=True)
        updated_df = new_df

    # Save to CSV
    updated_df.to_csv(csv_path, index=False)
    print(f"Metrics saved to {csv_file}")

### Validation set evaluation

In [None]:
# Calculate Exact Match for each row in the DataFrame
prediction['exact_match'] = prediction.apply(lambda row: compute_exact_match(row['pred_answer'], row['answer_text']), axis=1)
exact_match_score = prediction['exact_match'].mean() # mean of 0/1 flags: a fraction in [0, 1], not a percentage

# Calculate F1 Score for each row in the DataFrame
prediction['f1_scores'] = prediction.apply(lambda row: compute_f1(row['pred_answer'], row['answer_text']), axis=1)
f1_score_value = sum(prediction['f1_scores']) / len(prediction['f1_scores'])  # average F1 over all rows

# Print the results for the validation set
print(f"Exact Match Score for val set: {exact_match_score:.4f}")
print(f"F1 Score for val set: {f1_score_value:.4f}")

In [None]:
# Calculate ROUGE-2, ROUGE-3, ROUGE-4 and ROUGE-L for each row.
# compute_rouge returns a 4-tuple; result_type="expand" spreads it over the four columns.
prediction[["rouge_2","rouge_3", "rouge_4", "rouge_L"]] = prediction.apply(
    lambda row: compute_rouge(row['pred_answer'], row['answer_text']), axis=1, result_type="expand"
)
# Average each ROUGE variant over all rows
avg_rouge_2 = prediction['rouge_2'].mean()
avg_rouge_3 = prediction['rouge_3'].mean()
avg_rouge_4 = prediction['rouge_4'].mean()
avg_rouge_L = prediction['rouge_L'].mean()

print(f"Average ROUGE-2 for Val set: {avg_rouge_2:.4f}")
print(f"Average ROUGE-3 for Val set: {avg_rouge_3:.4f}")
print(f"Average ROUGE-4 for Val set: {avg_rouge_4:.4f}")
print(f"Average ROUGE-L for Val set: {avg_rouge_L:.4f}")

In [None]:
# Grapheme-level sentence BLEU per row, with maximum n-gram order 1 and 2
prediction['bleu_1'] = prediction.apply(lambda row: compute_bleu(row['pred_answer'], row['answer_text'], n_gram=1), axis=1)
prediction['bleu_2'] = prediction.apply(lambda row: compute_bleu(row['pred_answer'], row['answer_text'], n_gram=2), axis=1)

# Average each BLEU variant over all rows
avg_bleu_1 = prediction['bleu_1'].mean()
avg_bleu_2 = prediction['bleu_2'].mean()

print(f"Average BLEU-1: {avg_bleu_1:.4f}")
print(f"Average BLEU-2: {avg_bleu_2:.4f}")

In [None]:
# Gather the validation-set averages computed above into one row of metrics
metrics = {
    'avg_bleu_1': avg_bleu_1,
    'avg_bleu_2': avg_bleu_2,
    'avg_rouge_2': avg_rouge_2,
    'avg_rouge_3': avg_rouge_3,
    'avg_rouge_4': avg_rouge_4,
    'avg_rouge_L': avg_rouge_L,
    'exact_match_score': exact_match_score,
    'f1_score_value': f1_score_value
}
# Append the row to the shared validation results CSV under this run's model name
update_metrics_csv("XLM-R-75-metrics", metrics, csv_file = "/content/drive/MyDrive/Research_folder/outputs/Results/Main_Metric_Results_val.csv") # update the model name for each run