<a href="https://colab.research.google.com/github/kaglet/B-Trees-Tool/blob/main/multi_run_sem_rel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch torchvision torchaudio transformers datasets scikit-learn scipy matplotlib seaborn lime



In [None]:
import os
import json
import torch
import random
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr, spearmanr
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import matplotlib.pyplot as plt
import seaborn as sns
import lime.lime_text
import gc

In [None]:
# === CONFIG ===
# Pretrained checkpoint that every run fine-tunes.
MODEL_NAME = "Davlan/afro-xlmr-large"

# Optimisation hyper-parameters shared by all runs.
NUM_EPOCHS = 4
BATCH_SIZE = 16  # Reduced to lower memory usage
LEARNING_RATE = 2e-5
MAX_LENGTH = 128  # sentence pairs are padded/truncated to this many tokens

# One training run is launched per seed in range(NUM_RUNS).
NUM_RUNS = 10

# Output root for checkpoints, logs and result tables.
# NOTE(review): this looks like a Colab Google Drive mount point, but no visible
# cell mounts Drive - confirm it is mounted before running.
SAVE_DIR = "/content/drive/MyDrive/COS760/project"

# Input dataset, resolved relative to the current working directory.
CSV_PATH = os.path.join("./", "combined_dataset_cleaned.csv")

# Prefer the GPU whenever PyTorch can see one.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# === SETUP ===
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU, plus
    all CUDA devices when CUDA is available), then exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to each generator.
    """
    for seed_generator in (random.seed, np.random.seed, torch.manual_seed):
        seed_generator(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

def compute_metrics(eval_preds):
    """Score regression predictions against their reference labels.

    Args:
        eval_preds: A ``(predictions, labels)`` pair. Predictions are squeezed
            to drop singleton dimensions; labels are converted to float32.

    Returns:
        dict with the keys ``"mse"``, ``"mae"``, ``"pearson"`` and ``"spearman"``.
    """
    raw_preds, raw_labels = eval_preds
    y_pred = np.asarray(raw_preds).squeeze()
    y_true = np.asarray(raw_labels, dtype=np.float32)

    # Error metrics first, then the two correlation coefficients (index 0 of
    # each scipy result is the statistic; the p-value is discarded).
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    pearson_r = pearsonr(y_true, y_pred)[0]
    spearman_rho = spearmanr(y_true, y_pred)[0]

    return {"mse": mse, "mae": mae, "pearson": pearson_r, "spearman": spearman_rho}

def tokenize_fn(tokenizer):
    """Build a batch-tokenization callback bound to ``tokenizer``.

    Args:
        tokenizer: Callable that accepts two sequences of texts plus the
            ``padding``, ``truncation`` and ``max_length`` keyword arguments.

    Returns:
        A function that takes a batch mapping with ``"sentence1"`` and
        ``"sentence2"`` entries and returns whatever ``tokenizer`` returns for
        the pair, padded/truncated to ``MAX_LENGTH``.
    """
    def tokenize_batch(examples):
        first_texts = examples["sentence1"]
        second_texts = examples["sentence2"]
        # Fixed-length encoding: every pair is padded or cut to MAX_LENGTH.
        encode_options = {
            "padding": "max_length",
            "truncation": True,
            "max_length": MAX_LENGTH,
        }
        return tokenizer(first_texts, second_texts, **encode_options)

    return tokenize_batch

def split_df(df, seed):
    """Split ``df`` into train/validation/test ``Dataset`` objects.

    30% of the rows are held out and then halved, giving a 70/15/15 split.
    Both splits use ``seed`` as ``random_state``, so the partition is
    reproducible for a given seed.

    Args:
        df: pandas DataFrame to partition.
        seed: Random state passed to both ``train_test_split`` calls.

    Returns:
        Tuple ``(train, val, test)`` of ``Dataset`` objects.
    """
    from sklearn.model_selection import train_test_split

    train_part, holdout_part = train_test_split(df, test_size=0.3, random_state=seed)
    val_part, test_part = train_test_split(holdout_part, test_size=0.5, random_state=seed)
    return tuple(Dataset.from_pandas(part) for part in (train_part, val_part, test_part))

def train_and_evaluate(seed, label_column=None):
    """Fine-tune ``MODEL_NAME`` as a regressor for one seed and score it on the test split.

    Args:
        seed: Seed used for the RNGs and for the train/val/test split.
        label_column: Name of the CSV column holding the regression target.
            When omitted, the first of ``"labels"``, ``"label"`` or ``"score"``
            present in the CSV is used.

    Returns:
        dict containing ``"seed"``, the test-set metrics with their prefix
        stripped (``"mse"``, ``"mae"``, ``"pearson"``, ``"spearman"``, ``"loss"``
        and the Trainer's timing entries), ``"predictions_df"`` (one row per
        test pair with ``sentence1``, ``sentence2``, ``predicted``, ``true`` and
        ``abs_error``), plus ``"model_path"`` and ``"tokenizer_path"``.

    Raises:
        ValueError: If the regression target column cannot be located.
    """
    print(f"\n🚀 Starting run with seed {seed}")
    set_seed(seed)
    df = pd.read_csv(CSV_PATH, encoding="ISO-8859-1")
    if 'Unnamed: 0' in df.columns:
        df = df.drop(columns='Unnamed: 0')

    # The target must survive preprocessing under the name "labels", otherwise
    # the Trainer has nothing to compute a loss against.
    # NOTE(review): the CSV schema is not visible in this notebook; the
    # candidate names below are a guess - pass label_column explicitly if the
    # target is called something else.
    if label_column is None:
        label_column = next(
            (name for name in ("labels", "label", "score") if name in df.columns),
            None,
        )
    if label_column is None or label_column not in df.columns:
        raise ValueError(
            "Could not find the regression target column; pass label_column=... "
            f"(available columns: {list(df.columns)})"
        )
    df = df.rename(columns={label_column: "labels"})
    # float32 targets match the dtype of the model's regression output.
    df["labels"] = df["labels"].astype("float32")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    train_ds, val_ds, test_ds = split_df(df, seed)

    # Keep the raw test sentences so every prediction can be traced back to
    # the sentence pair that produced it.
    test_sentence1 = list(test_ds["sentence1"])
    test_sentence2 = list(test_ds["sentence2"])

    preprocess = tokenize_fn(tokenizer)

    def _tokenize_split(split):
        # Drop every raw column except the target to keep memory usage down.
        drop_cols = [col for col in split.column_names if col != "labels"]
        return split.map(preprocess, batched=True, remove_columns=drop_cols)

    # The datasets stay in their default (Python) format: the Trainer's default
    # collator builds the tensors itself.
    # NOTE(review): the recorded run crashed with a NumPy 2 `copy=False`
    # ValueError that appears to originate from torch-formatted dataset access;
    # confirm that skipping set_format("torch") avoids it in your environment.
    train_ds = _tokenize_split(train_ds)
    val_ds = _tokenize_split(val_ds)
    test_ds = _tokenize_split(test_ds)

    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1, problem_type="regression")
    model.to(DEVICE)

    args = TrainingArguments(
        output_dir=os.path.join(SAVE_DIR, f"checkpoints/seed_{seed}"),
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        save_strategy="no",
        logging_dir=os.path.join(SAVE_DIR, f"logs/seed_{seed}"),
        logging_strategy="epoch",
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics
    )

    trainer.train()

    # One pass over the test set yields both the predictions and the metrics.
    # The Trainer prefixes metric names, so strip the prefix to expose plain
    # "pearson", "mse", ... keys to the rest of the notebook.
    metric_prefix = "test"
    predictions = trainer.predict(test_ds, metric_key_prefix=metric_prefix)
    strip = f"{metric_prefix}_"
    results = {
        (key[len(strip):] if key.startswith(strip) else key): value
        for key, value in predictions.metrics.items()
    }
    print(f"✅ Done with seed {seed} | Pearson: {results['pearson']:.3f}")

    preds = np.asarray(predictions.predictions).reshape(-1)
    labels = np.asarray(predictions.label_ids, dtype=np.float32).reshape(-1)
    test_df = pd.DataFrame({
        "sentence1": test_sentence1,
        "sentence2": test_sentence2,
        "predicted": preds,
        "true": labels,
        "abs_error": np.abs(preds - labels),
    })
    model_path = os.path.join(SAVE_DIR, f"checkpoints/seed_{seed}")

    # Save model and tokenizer side by side so "tokenizer_path" is loadable.
    trainer.save_model(model_path)
    tokenizer.save_pretrained(model_path)

    # Cleanup to reduce memory: drop references and collect first so the CUDA
    # cache can actually be released.
    del trainer, model, train_ds, val_ds, test_ds
    gc.collect()
    torch.cuda.empty_cache()

    return {"seed": seed, **results, "predictions_df": test_df, "model_path": model_path, "tokenizer_path": model_path}

# Run one experiment per seed. The summary table is rewritten after every run
# so that a failure in a later run does not discard the completed ones.
os.makedirs(SAVE_DIR, exist_ok=True)
summary_path = os.path.join(SAVE_DIR, "multi_seed_results.csv")


def _write_summary(runs):
    """Persist per-run scalar results (everything except predictions_df) to CSV."""
    summary = pd.DataFrame([{k: v for k, v in r.items() if k != "predictions_df"} for r in runs])
    summary.to_csv(summary_path, index=False)
    return summary


all_results = []
for seed in range(NUM_RUNS):
    run_result = train_and_evaluate(seed)
    all_results.append(run_result)
    _write_summary(all_results)

# Final table; also covers NUM_RUNS == 0, where the loop body never executes.
results_df = _write_summary(all_results)

# Visualization code and best run logic could be kept in a separate notebook for clarity and memory safety



🚀 Starting run with seed 0


Map:   0%|          | 0/4375 [00:00<?, ? examples/s]

Map:   0%|          | 0/937 [00:00<?, ? examples/s]

Map:   0%|          | 0/938 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: Unable to avoid copy while creating an array as requested.
If using `np.array(obj, copy=False)` replace it with `np.asarray(obj)` to allow a copy when needed (no behavior change in NumPy 1.x).
For more details, see https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword.

In [None]:
def create_comprehensive_visualizations(all_results, output_dir):
    """Render and save a 2x2 dashboard comparing the metrics of all successful runs.

    Args:
        all_results: Per-run result dicts. Entries with a truthy ``'failed'``
            value are skipped; the rest must provide ``'seed'``, ``'pearson'``,
            ``'spearman'``, ``'mse'`` and ``'mae'``.
        output_dir: Directory under which ``visualizations/run_comparison.png``
            is written (the sub-directory is created when missing).

    Returns:
        None. Prints a notice and returns early when no successful run exists.
    """
    print("\n--- Creating Comprehensive Visualizations ---")
    viz_dir = os.path.join(output_dir, "visualizations")
    os.makedirs(viz_dir, exist_ok=True)

    successful_runs = [run for run in all_results if not run.get('failed')]
    runs_df = pd.DataFrame(successful_runs)
    if runs_df.empty:
        print("No successful runs to visualize.")
        return

    fig = plt.figure(figsize=(18, 10))
    fig.suptitle('Multi-Run Experiment Performance Analysis', fontsize=18, fontweight='bold')

    # Top row: box plots of the correlation metrics and of the error metrics.
    box_panels = (
        (1, ['pearson', 'spearman'], 'Distribution of Correlation Metrics', 'Coefficient'),
        (2, ['mse', 'mae'], 'Distribution of Error Metrics', 'Error Value'),
    )
    for position, metric_cols, panel_title, y_label in box_panels:
        box_ax = fig.add_subplot(2, 2, position)
        sns.boxplot(data=runs_df[metric_cols], ax=box_ax)
        box_ax.set_title(panel_title)
        box_ax.set_ylabel(y_label)

    # Bottom-left: correlation per seed.
    trend_ax = fig.add_subplot(2, 2, 3)
    for metric, line_fmt, legend_label in (('pearson', 'o-', 'Pearson'), ('spearman', 's-', 'Spearman')):
        trend_ax.plot(runs_df['seed'], runs_df[metric], line_fmt, label=legend_label, markersize=8)
    trend_ax.set_title('Performance vs. Random Seed')
    trend_ax.set_xlabel('Random Seed')
    trend_ax.set_ylabel('Correlation Coefficient')
    trend_ax.legend()
    trend_ax.grid(True, alpha=0.5)

    # Bottom-right: descriptive statistics rendered as a table on a blank axis.
    table_ax = fig.add_subplot(2, 2, 4)
    summary_stats = runs_df[['pearson', 'spearman', 'mse', 'mae']].describe().round(4)
    table_ax.axis('off')
    table_ax.table(
        cellText=summary_stats.values,
        colLabels=summary_stats.columns,
        rowLabels=summary_stats.index,
        loc='center',
        cellLoc='center',
    )
    table_ax.set_title('Summary Statistics Across Runs')

    # Leave room at the top for the figure-level title.
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    save_path = os.path.join(viz_dir, 'run_comparison.png')
    fig.savefig(save_path, dpi=300)
    plt.show()
    print(f"✅ Comparison visualization saved to {save_path}")

create_comprehensive_visualizations(all_results, SAVE_DIR)

# Rank only successful runs (same filter the visualization helper applies);
# failed runs carry no metrics.
successful_runs = [run for run in all_results if not run.get('failed')]
if not successful_runs:
    raise RuntimeError("No successful runs available; cannot select a best model.")
best_run = max(successful_runs, key=lambda x: x["pearson"])
print(f"Best model: Seed {best_run['seed']} with Pearson {best_run['pearson']:.4f}")

# train_and_evaluate returns the location of the saved weights ("model_path"),
# not live objects, so reload the fine-tuned model from disk.
best_model = AutoModelForSequenceClassification.from_pretrained(best_run['model_path']).to(DEVICE)
best_model.eval()
# The tokenizer is loaded from MODEL_NAME for training and never modified, so
# the base checkpoint's tokenizer matches the fine-tuned model.
best_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

best_predictions = best_run['predictions_df']
if not {'sentence1', 'sentence2'}.issubset(best_predictions.columns):
    # Recover the sentence pairs by rebuilding the run's test split, which is
    # deterministic for a given seed.
    # NOTE(review): this relies on predictions_df rows being in test-split order - confirm.
    source_df = pd.read_csv(CSV_PATH, encoding="ISO-8859-1")
    _, _, best_test_split = split_df(source_df, best_run['seed'])
    best_predictions = best_predictions.assign(
        sentence1=list(best_test_split["sentence1"]),
        sentence2=list(best_test_split["sentence2"]),
    )

best_pred = best_predictions.sort_values(by='abs_error').iloc[0]
worst_pred = best_predictions.sort_values(by='abs_error', ascending=False).iloc[0]

# NOTE(review): AfroXLMRLIMEVisualizer and create_attention_heatmap are not
# defined in the cells visible here - make sure the cell that defines them is
# present and has been run before this one.
lime_viz = AfroXLMRLIMEVisualizer(best_model, best_tokenizer, DEVICE)

lime_viz.visualize_lime_explanation(best_pred['sentence1'], best_pred['sentence2'], "best_case", SAVE_DIR)
lime_viz.visualize_lime_explanation(worst_pred['sentence1'], worst_pred['sentence2'], "worst_case", SAVE_DIR)

create_attention_heatmap(best_model, best_tokenizer, best_pred['sentence1'], best_pred['sentence2'], "best_case", SAVE_DIR)
create_attention_heatmap(best_model, best_tokenizer, worst_pred['sentence1'], worst_pred['sentence2'], "worst_case", SAVE_DIR)
