<a href="https://colab.research.google.com/github/kaglet/afrikaans_sem_rel/blob/main/multi_run_sem_rel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch torchvision torchaudio transformers datasets scikit-learn scipy matplotlib seaborn lime

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)


In [None]:
import os
import json
import torch
import random
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr, spearmanr
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import matplotlib.pyplot as plt
import seaborn as sns
import lime.lime_text

In [5]:
# === CONFIG ===
# Hugging Face Hub id of the pretrained checkpoint that every run fine-tunes.
MODEL_NAME = "Davlan/afro-xlmr-large"
# Number of training epochs per run.
NUM_EPOCHS = 4
# Per-device batch size, used for both training and evaluation.
BATCH_SIZE = 32
# Learning rate handed to TrainingArguments.
LEARNING_RATE = 2e-5
# Sentence pairs are padded/truncated to this many tokens by tokenize_fn.
MAX_LENGTH = 128
# Number of independent runs; run i is seeded with i (seeds 0 .. NUM_RUNS - 1).
NUM_RUNS = 10
# Root folder for checkpoints, logs, the results CSV and the visualizations.
# NOTE(review): this looks like a mounted Google Drive path, but no mount call
# is visible in this notebook - confirm the folder exists before running.
SAVE_DIR = "/content/drive/MyDrive/COS760/project"
# Dataset CSV, resolved relative to the current working directory.
CSV_PATH = os.path.join("./", "combined_dataset_cleaned.csv")
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === SETUP ===
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (including all CUDA devices when CUDA is available), and records the seed
    in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

def compute_metrics(eval_preds):
    """Compute regression metrics for a Hugging Face ``Trainer``.

    Args:
        eval_preds: A ``(predictions, labels)`` pair as supplied by the
            Trainer. ``predictions`` may be an array or a tuple whose first
            element holds the regression outputs.

    Returns:
        Dict with ``mse``, ``mae``, ``pearson`` and ``spearman``.
    """
    preds, labels = eval_preds
    # When the model returns extra outputs (e.g. attentions), the Trainer
    # hands over a tuple; the regression outputs are its first element.
    if isinstance(preds, (tuple, list)):
        preds = preds[0]
    # reshape(-1) instead of squeeze(): a single-example evaluation must stay
    # one-dimensional rather than collapse to a 0-d scalar.
    preds = np.asarray(preds).reshape(-1)
    labels = np.asarray(labels).reshape(-1)
    return {
        "mse": mean_squared_error(labels, preds),
        "mae": mean_absolute_error(labels, preds),
        "pearson": pearsonr(labels, preds)[0],
        "spearman": spearmanr(labels, preds)[0],
    }

def tokenize_fn(examples, tokenizer):
    """Tokenize a batch of sentence pairs into fixed-length model inputs.

    Args:
        examples: Mapping with ``"sentence1"`` and ``"sentence2"`` entries.
        tokenizer: Callable tokenizer applied to the two sentence columns.

    Returns:
        Whatever ``tokenizer`` returns for the pair, padded and truncated to
        ``MAX_LENGTH``.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    return tokenizer(
        first_sentences,
        second_sentences,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )

def split_df(df, seed):
    """Split a DataFrame into train / validation / test ``Dataset`` objects.

    30% of the rows are held out first, and that hold-out is then halved into
    validation and test. Both splits use ``seed`` as their random state.

    Args:
        df: DataFrame holding the full dataset.
        seed: Random state passed to both ``train_test_split`` calls.

    Returns:
        Tuple ``(train, val, test)`` of ``datasets.Dataset`` objects.
    """
    from sklearn.model_selection import train_test_split

    train_part, holdout = train_test_split(df, test_size=0.3, random_state=seed)
    val_part, test_part = train_test_split(holdout, test_size=0.5, random_state=seed)
    return tuple(
        Dataset.from_pandas(part) for part in (train_part, val_part, test_part)
    )

def create_attention_heatmap(model, tokenizer, sent1, sent2, case_name, output_dir):
    """Plot and save the head-averaged last-layer attention for a sentence pair.

    Args:
        model: Model whose forward pass can return ``attentions``.
        tokenizer: Tokenizer matching ``model``.
        sent1: First sentence of the pair.
        sent2: Second sentence of the pair.
        case_name: Label used in the plot title and the output file name
            (spaces are replaced by underscores in the file name).
        output_dir: Root folder; the image goes to ``<output_dir>/visualizations``.

    Raises:
        RuntimeError: If the model returns no attention weights.
    """
    print(f"\n🔍 Generating Attention Heatmap for: {case_name}")
    # Create the target folder here so the function works when called on its own.
    viz_dir = os.path.join(output_dir, "visualizations")
    os.makedirs(viz_dir, exist_ok=True)

    inputs = tokenizer(sent1, sent2, return_tensors="pt", padding=True, truncation=True).to(model.device)

    # Run in eval mode so dropout does not perturb the attention weights, then
    # restore whatever mode the caller had the model in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Request attentions explicitly instead of relying on the model config.
            outputs = model(**inputs, output_attentions=True)
    finally:
        if was_training:
            model.train()

    attentions = outputs.attentions
    if not attentions:
        raise RuntimeError("Model returned no attention weights; cannot draw the heatmap.")
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    # Last layer, first (only) batch item, averaged over the attention heads.
    last_layer_attn = attentions[-1][0]
    mean_attention = last_layer_attn.mean(dim=0).cpu().numpy()

    fig = plt.figure(figsize=(12, 10))
    sns.heatmap(mean_attention, xticklabels=tokens, yticklabels=tokens, cmap="viridis")
    plt.title(f'Attention Heatmap - Last Layer ({case_name})')
    plt.xticks(rotation=90)
    plt.tight_layout()
    save_path = os.path.join(viz_dir, f"attention_{case_name.replace(' ', '_')}.png")
    plt.savefig(save_path)
    plt.show()
    plt.close(fig)  # release the figure so repeated calls do not accumulate
    print(f"✅ Attention map saved to {save_path}")

class AfroXLMRLIMEVisualizer:
    """LIME explanations for a sentence-pair similarity regression model.

    LIME perturbs a single string, so the two sentences are joined with
    ``PAIR_SEPARATOR`` for LIME and split again inside ``predictor`` so that
    the model receives a proper tokenizer-encoded sentence pair.

    Args:
        model: Model whose output exposes ``.logits`` with one score per input.
        tokenizer: Tokenizer matching ``model``.
        device: Device the tokenized inputs are moved to.
        batch_size: Number of perturbed texts scored per forward pass.
        max_length: Optional truncation length forwarded to the tokenizer;
            ``None`` keeps the tokenizer's default.
    """

    # The separator holds only non-word characters: LIME's default tokenizer
    # splits on ``\W+`` and perturbs word tokens only, so this marker survives
    # every perturbation and is never reported as a feature (a literal "[SEP]"
    # would contribute the removable word "SEP").
    PAIR_SEPARATOR = " ||| "

    def __init__(self, model, tokenizer, device, batch_size=32, max_length=None):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.batch_size = batch_size
        self.max_length = max_length
        self.model.eval()

    def _split_pair(self, text):
        """Split a joined text back into its ``(sentence1, sentence2)`` parts."""
        first, _, second = text.partition(self.PAIR_SEPARATOR.strip())
        return first.strip(), second.strip()

    def predictor(self, texts):
        """Score joined sentence pairs for LIME.

        Args:
            texts: Iterable of strings of the form
                ``sentence1 + PAIR_SEPARATOR + sentence2``.

        Returns:
            Array of shape ``(len(texts), 2)`` with columns ``[1 - p, p]``.
            NOTE(review): ``p`` is the sigmoid of the raw regression output,
            used only to give LIME a value in (0, 1); it is not a calibrated
            probability.
        """
        texts = list(texts)
        extra = {} if self.max_length is None else {"max_length": self.max_length}
        scores = []
        # LIME submits all of its perturbed samples in one call; score them in
        # mini-batches so memory use stays bounded.
        for start in range(0, len(texts), self.batch_size):
            pairs = [self._split_pair(t) for t in texts[start:start + self.batch_size]]
            firsts = [first for first, _ in pairs]
            seconds = [second for _, second in pairs]
            inputs = self.tokenizer(
                firsts, seconds, return_tensors="pt", padding=True, truncation=True, **extra
            ).to(self.device)
            with torch.no_grad():
                logits = self.model(**inputs).logits
            # reshape(-1) keeps a batch of one as a length-1 vector.
            scores.append(torch.sigmoid(logits.reshape(-1)).cpu().numpy())
        if not scores:
            return np.empty((0, 2))
        probs = np.concatenate(scores)
        return np.column_stack([1 - probs, probs])

    def visualize_lime_explanation(self, sentence1, sentence2, case_name, output_dir):
        """Create, show and save a LIME bar chart for one sentence pair.

        Args:
            sentence1: First sentence of the pair.
            sentence2: Second sentence of the pair.
            case_name: Label used in the title and the output file name.
            output_dir: Root folder; the image goes to ``<output_dir>/visualizations``.
        """
        print(f"\n🔍 Generating LIME Explanation for: {case_name}")
        viz_dir = os.path.join(output_dir, "visualizations")
        os.makedirs(viz_dir, exist_ok=True)

        text_to_explain = f"{sentence1}{self.PAIR_SEPARATOR}{sentence2}"
        explainer = lime.lime_text.LimeTextExplainer(class_names=['dissimilar', 'similar'])
        explanation = explainer.explain_instance(
            text_to_explain,
            self.predictor,
            num_features=10
        )
        fig = explanation.as_pyplot_figure(label=1)
        plt.title(f'LIME Explanation for Similarity ({case_name})')
        plt.tight_layout()
        save_path = os.path.join(viz_dir, f"lime_{case_name.replace(' ', '_')}.png")
        plt.savefig(save_path)
        plt.show()
        plt.close(fig)  # release the figure once it has been saved and shown
        print(f"✅ LIME explanation saved to {save_path}")

def train_and_evaluate(seed):
    """Fine-tune and test one model for a single random seed.

    Reads the dataset from ``CSV_PATH``, splits and tokenizes it, fine-tunes
    ``MODEL_NAME`` as a single-output regression model and scores it on the
    test split.

    Args:
        seed: Seed used for the RNGs and for the data split.

    Returns:
        Dict with ``seed``, the test metrics under unprefixed names (``loss``,
        ``mse``, ``mae``, ``pearson``, ``spearman`` plus the Trainer's runtime
        statistics), ``predictions_df`` (test rows with ``predicted``,
        ``true`` and ``abs_error`` columns), ``model_path``, ``tokenizer``
        and ``model``.
    """
    print(f"\n🚀 Starting run with seed {seed}")
    set_seed(seed)
    df = pd.read_csv(CSV_PATH, encoding="ISO-8859-1")
    if 'Unnamed: 0' in df.columns:
        df = df.drop(columns='Unnamed: 0')
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    def encode(batch):
        return tokenize_fn(batch, tokenizer)

    train_ds, val_ds, test_ds = (
        ds.map(encode, batched=True) for ds in split_df(df, seed)
    )
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1, problem_type="regression", output_attentions=True)
    model.to(DEVICE)

    model_path = os.path.join(SAVE_DIR, f"checkpoints/seed_{seed}")
    args = TrainingArguments(
        output_dir=model_path,
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        save_strategy="epoch",
        # Keep only the newest checkpoint so per-epoch saves stay bounded on disk.
        save_total_limit=1,
        logging_dir=os.path.join(SAVE_DIR, f"logs/seed_{seed}"),
        logging_strategy="epoch",
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics
    )

    # While attentions are enabled the Trainer collects them for every test
    # example and returns predictions as a tuple. Switch them off for training
    # and inference, then restore the setting so the returned model still
    # serves the attention visualizations.
    attention_setting = model.config.output_attentions
    model.config.output_attentions = False
    prefix = "test"
    try:
        trainer.train()
        # predict() returns both the raw predictions and the metrics, so the
        # test set goes through the model only once.
        predictions = trainer.predict(test_ds, metric_key_prefix=prefix)
    finally:
        model.config.output_attentions = attention_setting

    # The Trainer prefixes metric names ("test_pearson", ...); strip the prefix
    # because the rest of the notebook looks metrics up by their bare names.
    tag = f"{prefix}_"
    results = {
        (key[len(tag):] if key.startswith(tag) else key): value
        for key, value in predictions.metrics.items()
    }
    print(f"✅ Done with seed {seed} | Pearson: {results['pearson']:.3f}")

    # reshape(-1) keeps a one-row test set one-dimensional.
    preds = np.asarray(predictions.predictions).reshape(-1)
    labels = np.asarray(predictions.label_ids).reshape(-1)
    test_df = test_ds.to_pandas()
    test_df['predicted'] = preds
    test_df['true'] = labels
    test_df['abs_error'] = np.abs(preds - labels)

    return {"seed": seed, **results, "predictions_df": test_df, "model_path": model_path, "tokenizer": tokenizer, "model": model}

In [None]:
import gc

# Run one fine-tuning job per seed and collect the per-run result dicts.
all_results = []
best_so_far = None  # run dict currently holding the highest Pearson
for seed in range(NUM_RUNS):
    run_result = train_and_evaluate(seed)
    all_results.append(run_result)

    # Only the best run's model is used afterwards (for the LIME and attention
    # plots), so drop every other model as soon as it is beaten instead of
    # keeping NUM_RUNS large models in memory. The strict ">" keeps the first
    # of several tied runs, matching max(all_results, key=...).
    if best_so_far is None or run_result["pearson"] > best_so_far["pearson"]:
        superseded, best_so_far = best_so_far, run_result
    else:
        superseded = run_result
    if superseded is not None:
        superseded["model"] = None
    del run_result, superseded
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Make sure the output folder exists before writing the summary CSV.
os.makedirs(SAVE_DIR, exist_ok=True)
results_df = pd.DataFrame([{k: v for k, v in r.items() if k not in ["predictions_df", "model_path", "model", "tokenizer"]} for r in all_results])
results_df.to_csv(os.path.join(SAVE_DIR, "multi_seed_results.csv"), index=False)


🚀 Starting run with seed 0


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Map:   0%|          | 0/4375 [00:00<?, ? examples/s]

Map:   0%|          | 0/937 [00:00<?, ? examples/s]

Map:   0%|          | 0/938 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/714 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def create_comprehensive_visualizations(all_results, output_dir):
    """Plot a 2x2 summary of the metrics across all runs and save it as a PNG.

    Panels: box plots of the correlation metrics, box plots of the error
    metrics, correlation per seed, and a table of summary statistics.

    Args:
        all_results: List of per-run dicts holding at least ``seed``,
            ``pearson``, ``spearman``, ``mse`` and ``mae``. Runs with a truthy
            ``failed`` entry are skipped.
        output_dir: Root folder; the image is written to
            ``<output_dir>/visualizations/run_comparison.png``.
    """
    print("\n--- Creating Comprehensive Visualizations ---")
    viz_dir = os.path.join(output_dir, "visualizations")
    os.makedirs(viz_dir, exist_ok=True)

    # Copy only the scalar columns that are plotted; the run dicts also carry
    # models, tokenizers and DataFrames that do not belong in a table cell.
    columns = ['seed', 'pearson', 'spearman', 'mse', 'mae']
    df_results = pd.DataFrame(
        [{name: run[name] for name in columns} for run in all_results if not run.get('failed')],
        columns=columns,
    )
    if df_results.empty:
        print("No successful runs to visualize.")
        return

    fig, axes = plt.subplots(2, 2, figsize=(18, 10))
    (ax1, ax2), (ax3, ax4) = axes
    fig.suptitle('Multi-Run Experiment Performance Analysis', fontsize=18, fontweight='bold')

    sns.boxplot(data=df_results[['pearson', 'spearman']], ax=ax1)
    ax1.set_title('Distribution of Correlation Metrics')
    ax1.set_ylabel('Coefficient')

    sns.boxplot(data=df_results[['mse', 'mae']], ax=ax2)
    ax2.set_title('Distribution of Error Metrics')
    ax2.set_ylabel('Error Value')

    ax3.plot(df_results['seed'], df_results['pearson'], 'o-', label='Pearson', markersize=8)
    ax3.plot(df_results['seed'], df_results['spearman'], 's-', label='Spearman', markersize=8)
    ax3.set_title('Performance vs. Random Seed')
    ax3.set_xlabel('Random Seed')
    ax3.set_ylabel('Correlation Coefficient')
    ax3.legend()
    ax3.grid(True, alpha=0.5)

    summary_stats = df_results[['pearson', 'spearman', 'mse', 'mae']].describe().round(4)
    ax4.axis('off')
    ax4.table(cellText=summary_stats.values, colLabels=summary_stats.columns, rowLabels=summary_stats.index, loc='center', cellLoc='center')
    ax4.set_title('Summary Statistics Across Runs')

    # Leave room at the top for the suptitle.
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    save_path = os.path.join(viz_dir, 'run_comparison.png')
    fig.savefig(save_path, dpi=300)
    plt.show()
    plt.close(fig)  # release the figure once it has been saved and shown
    print(f"✅ Comparison visualization saved to {save_path}")

# Summary figure across all runs.
create_comprehensive_visualizations(all_results, SAVE_DIR)

# Pick the run with the highest Pearson correlation.
best_run = max(all_results, key=lambda run: run["pearson"])
print(f"Best model: Seed {best_run['seed']} with Pearson {best_run['pearson']:.4f}")

best_model, best_tokenizer = best_run['model'], best_run['tokenizer']
lime_viz = AfroXLMRLIMEVisualizer(best_model, best_tokenizer, DEVICE)

# Test rows of the best run with the smallest and the largest absolute error.
best_run_predictions = best_run['predictions_df']
best_pred = best_run_predictions.sort_values(by='abs_error').iloc[0]
worst_pred = best_run_predictions.sort_values(by='abs_error', ascending=False).iloc[0]
explained_cases = (("best_case", best_pred), ("worst_case", worst_pred))

# LIME explanations first, then the attention heatmaps, for both cases.
for case_name, row in explained_cases:
    lime_viz.visualize_lime_explanation(row['sentence1'], row['sentence2'], case_name, SAVE_DIR)

for case_name, row in explained_cases:
    create_attention_heatmap(best_model, best_tokenizer, row['sentence1'], row['sentence2'], case_name, SAVE_DIR)
