<a href="https://colab.research.google.com/github/kaglet/afrikaans_sem_rel/blob/main/afrikaans_sem_rel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence-transformers torch torchvision scikit-learn scipy pandas numpy

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

KeyboardInterrupt: 

In [None]:
import json
import time
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import torch
import pandas as pd
import random
import os
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr, spearmanr
import matplotlib.pyplot as plt
import seaborn as sns

# Mount Google Drive when the Colab helper module is importable; otherwise just report it.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    print("Google Drive mounted successfully!")
except ImportError:
    print("Not running in Google Colab or Drive mount not needed")

# Project folder on Drive; the dataset CSV is read from here.
drive_root = "/content/drive/MyDrive/COS760/project"

# Report the PyTorch build and the accelerator it can see.
print(f"PyTorch version: {torch.__version__}")
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if not cuda_ready:
    print("GPU not available - make sure to enable GPU in Runtime > Change runtime type")
else:
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU memory: {total_bytes / 1e9:.1f} GB")

# Configuration
SEED = 42
BATCH_SIZE = 32
NUM_EPOCHS = 4
LEARNING_RATE = 2e-5

# Load the sentence-pair dataset and drop the stray index column if the CSV carries one.
csv_path = os.path.join(drive_root, "combined_dataset_cleaned.csv")
df = pd.read_csv(csv_path, encoding='ISO-8859-1')
print("✅ Loaded dataset")
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns='Unnamed: 0')
dataset = Dataset.from_pandas(df)
print(f"Dataset shape: {df.shape}")
print(f"Dataset columns: {df.columns}")
print("Sample data:")
print(df.head(3))

# Tokenizer of the checkpoint that is fine-tuned further down.
model_name = "Davlan/afro-xlmr-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)

Mounted at /content/drive
Google Drive mounted successfully!
PyTorch version: 2.6.0+cu124
CUDA available: True
CUDA version: 12.4
GPU device: Tesla T4
GPU memory: 15.8 GB
✅ Loaded dataset
Dataset shape: (6250, 3)
Dataset columns: Index(['sentence1', 'sentence2', 'label'], dtype='object')
Sample data:
                                           sentence1  \
0  Hoe, dink jy, is die pikkewyn aangepas om in w...   
1  Indien moontlik, vra ons vir donasies uit die ...   
2  Jy ry die eerste hoepel antikloksgewys tot by ...   

                                           sentence2  label  
0   Hoe, dink jy, stoor dit water vir lang tydperke?   0.56  
1  Indien die gemeenskap vir ons van enige donasi...   0.84  
2  Internet is die ander tegnologie wat dit moont...   0.19  


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [None]:
def preprocess(ds):
    """Tokenize sentence pairs with the notebook-level `tokenizer`.

    ds: mapping with "sentence1" and "sentence2" entries.
    Every pair is truncated and padded to exactly 128 tokens.
    Returns whatever the tokenizer call returns.
    """
    first, second = ds["sentence1"], ds["sentence2"]
    return tokenizer(
        first,
        second,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

"""Set seeds for all random number generators to ensure reproducibility"""
def set_all_seeds(seed=SEED):
    """Seed every random number generator the notebook uses.

    Seeds Python's `random`, NumPy and PyTorch (CPU, plus all CUDA devices when
    CUDA is available), and switches cuDNN to deterministic, non-benchmarking
    mode when CUDA is available.

    Args:
        seed: integer seed; defaults to the notebook-level SEED.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # Trade cuDNN autotuning speed for repeatable kernel selection.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    # Python reads PYTHONHASHSEED at interpreter start-up, so this only affects
    # child processes launched afterwards, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    print(f"All seeds set to {seed}")

def compute_metrics(preds):
    """Compute regression metrics for a set of predictions.

    Args:
        preds: object that unpacks into (predictions, labels).

    Returns:
        dict with "mse", "mae", "pearson" and "spearman" as plain Python floats.
    """
    predictions, labels = preds
    # reshape(-1) rather than squeeze(): squeeze() turns a single-example
    # (1, 1) array into a 0-d array, which the metric functions reject.
    predictions = np.asarray(predictions).reshape(-1)
    labels = np.asarray(labels).reshape(-1)
    return {
        "mse": float(mean_squared_error(labels, predictions)),
        "mae": float(mean_absolute_error(labels, predictions)),
        "pearson": float(pearsonr(labels, predictions)[0]),
        "spearman": float(spearmanr(labels, predictions)[0]),
    }

def split_dataset(df, train_size=0.7, val_size=0.15, test_size=0.15, seed=SEED):
    """Shuffle `df` and cut it into train, validation and test frames.

    The cut happens in two passes: first the combined validation+test share is
    held out, then that hold-out is divided between validation and test.
    `train_size` is accepted but not read; the training share is whatever
    remains after the hold-out.

    Returns:
        (train_df, val_df, test_df)
    """
    holdout_fraction = val_size + test_size
    train_df, holdout_df = train_test_split(
        df, test_size=holdout_fraction, random_state=seed, shuffle=True
    )

    # Share of the hold-out that belongs to validation; the rest is test.
    val_share = val_size / holdout_fraction
    val_df, test_df = train_test_split(
        holdout_df, test_size=1 - val_share, random_state=seed, shuffle=True
    )
    return train_df, val_df, test_df

In [None]:
# Seed Python, NumPy and PyTorch with the notebook-level SEED.
set_all_seeds(SEED)

All seeds set to 42


In [None]:
# `load_from_disk` is needed by the cached branch below and is not among the
# names the notebook's import cell pulls from `datasets`.
from datasets import load_from_disk

# Local cache folders for the tokenized splits (relative to the working directory).
train_path = "train_afro-xlmr-final-ft"
eval_path = "eval_afro-xlmr-final-ft"
test_path = "test_afro-xlmr-final-ft"

if all(os.path.isdir(split_dir) for split_dir in (train_path, eval_path, test_path)):
    print("✅ Tokenized datasets already exist. Loading from disk...")
    tokenized_train = load_from_disk(train_path)
    tokenized_eval = load_from_disk(eval_path)
    tokenized_test = load_from_disk(test_path)
    # The tokenized splits were produced by `map` without dropping columns, so
    # they still carry sentence1/sentence2/label and can stand in for the raw
    # splits that later cells expect (e.g. `test_ds`).
    train_ds, eval_ds, test_ds = tokenized_train, tokenized_eval, tokenized_test
else:
    print("Tokenized datasets not found. Creating splits and preprocessing...")

    # Split dataset
    train_df, eval_df, test_df = split_dataset(df)
    train_ds = Dataset.from_pandas(train_df)
    eval_ds = Dataset.from_pandas(eval_df)
    test_ds = Dataset.from_pandas(test_df)
    print("✅ Dataset splitting complete")

    # Tokenize
    tokenized_train = train_ds.map(preprocess, batched=True)
    tokenized_eval = eval_ds.map(preprocess, batched=True)
    tokenized_test = test_ds.map(preprocess, batched=True)
    print("✅ Tokenization complete")

    # Save so the next run can take the cached branch above
    print("Saving tokenized splits to disk")
    tokenized_train.save_to_disk(train_path)
    tokenized_eval.save_to_disk(eval_path)
    tokenized_test.save_to_disk(test_path)
    print("✅ Save complete")

Tokenized datasets not found. Creating splits and preprocessing...
✅ Dataset splitting complete


Map:   0%|          | 0/4375 [00:00<?, ? examples/s]

Map:   0%|          | 0/937 [00:00<?, ? examples/s]

Map:   0%|          | 0/938 [00:00<?, ? examples/s]

✅ Tokenization complete
Saving tokenized splits to disk


Saving the dataset (0/1 shards):   0%|          | 0/4375 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/937 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/938 [00:00<?, ? examples/s]

✅ Save complete


In [None]:
def full_train(model_ckpt, tokenized_train, tokenized_eval, tokenized_test, drive_root):
    """Fine-tune `model_ckpt`, save it, and write evaluation metrics to disk.

    Args:
        model_ckpt: the model instance to fine-tune (trained in place).
        tokenized_train: tokenized training split.
        tokenized_eval: tokenized validation split, used for the main metrics.
        tokenized_test: tokenized test split; when not None its metrics are
            added to the results under a "test_" key prefix.
        drive_root: folder that receives "checkpoint/" and "eval_results.json".

    Returns:
        (model, results): the trained model and the metrics dict.
    """
    args = TrainingArguments(
        output_dir="afro-xlmr-final-ft",
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        num_train_epochs=NUM_EPOCHS,
        save_strategy="epoch",
        report_to="none",
        logging_dir="./logs",
        logging_steps=10,  # Show logging every 10 steps
        logging_strategy="steps",
    )

    # Train the model that was passed in, not whatever `model` happens to be
    # bound to at notebook level.
    trainer = Trainer(
        model=model_ckpt,
        args=args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        compute_metrics=compute_metrics
    )

    print("✅ Training started")
    trainer.train()
    print("✅ Training finished")

    print("✅ Saving model to disk")
    trainer.save_model(os.path.join(drive_root, "checkpoint"))
    print("✅ Save complete!")

    print("✅ Evaluating by metrics...")
    results = trainer.evaluate()
    if tokenized_test is not None:
        # Held-out metrics are stored next to the validation ones; the prefix
        # keeps the two sets of keys apart.
        results.update(
            trainer.evaluate(eval_dataset=tokenized_test, metric_key_prefix="test")
        )
    print("✅ Evaluation by metrics done!")

    print("✅ Saving metrics to disk")
    results_path = os.path.join(drive_root, "eval_results.json")
    with open(results_path, "w") as f:
        json.dump(results, f, indent=4)
    print("✅ Save complete!")

    return model_ckpt, results

In [None]:
# Get device
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [None]:
# Get model: reuse the fine-tuned checkpoint saved under drive_root when the
# folder exists, otherwise fine-tune from the base checkpoint.
results_path = os.path.join(drive_root, "checkpoint")
config_file = os.path.join(results_path, "config.json")

if os.path.isdir(results_path):
    # Move the loaded model to `device` as well: code further down sends its
    # inputs to `device`, so a model left on the CPU would not match them.
    model = AutoModelForSequenceClassification.from_pretrained(results_path).to(device)
    print("✅ Model found, safe to load.")
else:
    print("❌ Model folder does not exist.")
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1, problem_type="regression").to(device)
    model, results = full_train(model, tokenized_train, tokenized_eval, tokenized_test, drive_root)
    print("Results:")
    print(results)

✅ Model found, safe to load.


In [None]:
def get_predictions_and_errors(model, dataset, tokenizer, device):
    """Score every sentence pair in `dataset` and tabulate the errors.

    Each entry is tokenized on its own (padded/truncated to 128 tokens), moved
    to `model.device`, and run through the model without gradients; the single
    value in `logits` is taken as the prediction. `device` is accepted but not
    read.

    Returns:
        DataFrame with columns sentence1, sentence2, true_label, predicted,
        abs_error and squared_error, one row per entry.
    """
    model.eval()
    sentence_pairs, gold_scores, predicted_scores = [], [], []

    for example in dataset:
        first, second = example["sentence1"], example["sentence2"]
        encoded = tokenizer(
            first,
            second,
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )
        encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

        with torch.no_grad():
            logits = model(**encoded).logits
        predicted_scores.append(logits.squeeze().cpu().item())
        gold_scores.append(example["label"])
        sentence_pairs.append((first, second))

    frame = pd.DataFrame(sentence_pairs, columns=["sentence1", "sentence2"])
    frame["true_label"] = gold_scores
    frame["predicted"] = predicted_scores
    residual = frame["predicted"] - frame["true_label"]
    frame["abs_error"] = np.abs(residual)
    frame["squared_error"] = residual ** 2
    return frame

In [None]:
# Cached test-set predictions live next to the other project outputs on Drive.
results_path = os.path.join(drive_root, "afroxlmr_predictions.csv")

if os.path.exists(results_path):
    # Read with the same encoding the file is written with below.
    results_df = pd.read_csv(results_path, encoding="utf-8")
else:
    results_df = get_predictions_and_errors(model, test_ds, tokenizer, device)
    # Write to the path that is checked above so the cache is found next run.
    results_df.to_csv(results_path, index=False, encoding="utf-8")

# Get result cases to inspect further. Each of these is the full row (a Series)
# for the selected example, not a position or index label.
high_similarity_pos = results_df.loc[results_df["predicted"].idxmax()]
low_similarity_pos = results_df.loc[results_df["predicted"].idxmin()]
largest_error_pos = results_df.loc[results_df["abs_error"].idxmax()]
smallest_error_pos = results_df.loc[results_df["abs_error"].idxmin()]

Top 5 Best AfroXLMR Predictions
                                             sentence1  \
841       Daar is baie visse in die see .. hou aan kyk   
671  My ma het net vir my gesê dat sy en my tante v...   
702                Dit werk nie met u telefoonlyn nie.   
130                        Man moeg slaap op die bank.   
436  @RealdonaldTrump Ek beveel @SeanHannity aan as...   

                                             sentence2  true_label  predicted  \
841        Onthou, daar is baie meer visse in die see.       0.812   0.811928   
671  Fabulous @hillaryclinton! U slaan op elke onde...       0.469   0.469090   
702           U telefoonlyn sal dit nie ondersteun nie       0.812   0.811908   
130                        'N Kat byt 'n mens se neus.       0.188   0.188337   
436  #Greta Trump is nou absoluut aan die brand! Hy...       0.375   0.375339   

     abs_error  
841   0.000072  
671   0.000090  
702   0.000092  
130   0.000337  
436   0.000339  

Top 5 Worst AfroXLMR Predicti

In [None]:
# Each entry pairs an already-selected result row (a Series) with a label.
cases_to_analyze = [
    (high_similarity_pos, "High Similarity Case"),
    (low_similarity_pos, "Low Similarity Case"),
    (largest_error_pos, "Largest Prediction Error Case"),
    (smallest_error_pos, "Smallest Prediction Error Case")
]

# Choose layers to visualize attention for (indices into the attentions tuple)
layers_to_plot = [11, 12, 16, 23]

for i, (case_row, case_name) in enumerate(cases_to_analyze):
    # The case is already the row itself; indexing results_df with it would fail.
    row = case_row
    sent1 = row["sentence1"]
    sent2 = row["sentence2"]
    true_score = row["true_label"]
    pred_score = row["predicted"]
    error = row["abs_error"]

    print(f"\n🔍 Example {i + 1}")
    print(f"Case: {case_name}")
    print(f"Sentence 1: {sent1}")
    print(f"Sentence 2: {sent2}")
    print(f"True Label: {true_score:.3f}, Predicted: {pred_score:.3f}, Error: {error:.3f}")

    # Tokenize input pair
    inputs = tokenizer(sent1, sent2, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Forward pass with attention
    with torch.no_grad():
        outputs = model(**inputs, output_attentions=True)

    attentions = outputs.attentions  # Tuple of layer attention tensors

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    for layer_j in layers_to_plot:
        if layer_j >= len(attentions):
            print(f"Skipping layer {layer_j}: model returned {len(attentions)} attention layers")
            continue

        layer_attn = attentions[layer_j][0]  # first (only) item of the batch
        # Average across heads, then hand seaborn a CPU NumPy array; a tensor
        # that lives on the GPU cannot be plotted directly.
        mean_attention = layer_attn.mean(dim=0).cpu().numpy()

        fig = plt.figure(figsize=(10, 8))
        sns.heatmap(mean_attention, xticklabels=tokens, yticklabels=tokens, cmap="viridis")
        plt.title(f"Attention Heatmap - Layer {layer_j} (Example {i + 1})")
        plt.xlabel("Key (Attended To)")
        plt.ylabel("Query (Paying Attention)")
        plt.xticks(rotation=90)
        plt.yticks(rotation=0)
        plt.tight_layout()
        plt.show()
        plt.close(fig)  # many figures are produced in this loop; release each one


NameError: name 'results_df' is not defined

In [None]:
import lime
import lime.lime_text
import matplotlib.pyplot as plt
import numpy as np
import torch
import re
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class AfroXLMRLIMEVisualizer:
    """Create LIME explanations for Afro-XLMR semantic similarity predictions.

    The wrapped model is read as a single-output scorer: `predict_similarity`
    takes one scalar from `logits`. LIME's text explainer expects class
    probabilities, so each score is clipped to [0, 1] and presented as
    [P(dissimilar), P(similar)].
    """

    def __init__(self, model, tokenizer, device):
        """Store the model, tokenizer and device, and put the model in eval mode."""
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.model.eval()  # Set model to evaluation mode

    def create_lime_explainer(self):
        """Create LIME text explainer"""
        return lime.lime_text.LimeTextExplainer(
            class_names=['Dissimilar', 'Similar'],
            feature_selection='forward_selection',
            verbose=True
        )

    def predict_similarity(self, sentence1, sentence2):
        """Return the model's similarity score for one sentence pair as a float.

        The pair is padded/truncated to 128 tokens and moved to `self.device`.
        """
        inputs = self.tokenizer(
            sentence1,
            sentence2,
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )

        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            output = self.model(**inputs).logits.squeeze().cpu().item()

        return output

    @staticmethod
    def _to_class_probabilities(similarity):
        """Map a raw score to [P(dissimilar), P(similar)].

        The score is clipped to [0, 1]. Clipping keeps the mapping monotonic
        and continuous: a slightly negative score stays "very dissimilar"
        instead of jumping to about 0.5, and a score above 1 cannot produce a
        negative probability.
        """
        prob_similar = min(max(float(similarity), 0.0), 1.0)
        return [1.0 - prob_similar, prob_similar]

    def _make_perturbation_scorer(self, fixed_sentence, perturb_first):
        """Build the classifier function LIME calls with perturbed texts.

        Args:
            fixed_sentence: the sentence of the pair that is left untouched.
            perturb_first: True when the perturbed text takes the sentence1
                slot, False when it takes the sentence2 slot.
        """
        def score(perturbed_texts):
            rows = []
            for text in perturbed_texts:
                if perturb_first:
                    similarity = self.predict_similarity(text, fixed_sentence)
                else:
                    similarity = self.predict_similarity(fixed_sentence, text)
                rows.append(self._to_class_probabilities(similarity))
            return np.array(rows)

        return score

    def explain_prediction(self, sentence1, sentence2, num_features=10, num_samples=1000):
        """Explain a single prediction using LIME.

        Two explanations are produced: one perturbing sentence1 while
        sentence2 is held fixed, and one the other way round.

        Args:
            sentence1, sentence2: the sentence pair.
            num_features: number of features requested from LIME.
            num_samples: number of perturbed samples LIME scores per sentence.

        Returns:
            (explanation1, explanation2)
        """
        explainer = self.create_lime_explainer()

        # How sentence1 contributes to similarity with a fixed sentence2
        explanation1 = explainer.explain_instance(
            sentence1,
            self._make_perturbation_scorer(sentence2, perturb_first=True),
            num_features=num_features,
            num_samples=num_samples
        )

        # How sentence2 contributes to similarity with a fixed sentence1
        explanation2 = explainer.explain_instance(
            sentence2,
            self._make_perturbation_scorer(sentence1, perturb_first=False),
            num_features=num_features,
            num_samples=num_samples
        )

        return explanation1, explanation2

    @staticmethod
    def _plot_feature_bars(ax, features, title):
        """Draw a horizontal bar chart of (word, importance) pairs; return the words."""
        words = [word for word, _ in features]
        importance = [weight for _, weight in features]

        colors = ['green' if weight > 0 else 'red' for weight in importance]
        bars = ax.barh(words, importance, color=colors, alpha=0.7)
        ax.set_xlabel('Feature Importance')
        ax.set_title(title)
        ax.grid(True, alpha=0.3)

        # Add value labels on bars, just outside the bar end
        for bar, weight in zip(bars, importance):
            width = bar.get_width()
            ax.text(width + (0.01 if width >= 0 else -0.01), bar.get_y() + bar.get_height()/2,
                    f'{weight:.3f}', ha='left' if width >= 0 else 'right', va='center')

        return words

    @staticmethod
    def _highlight_text(text, word_importance, ax, y_pos, title):
        """Write `text` word by word, shading each word by its importance."""
        # LIME keys keep the original casing, so match case-insensitively.
        lookup = {key.lower(): weight for key, weight in word_importance.items()}

        x_pos = 0.05
        ax.text(x_pos, y_pos + 0.1, title, fontweight='bold', fontsize=12, transform=ax.transAxes)

        for word in text.split():
            # Strip punctuation so "see." matches the feature "see"
            clean_word = re.sub(r'[^\w]', '', word.lower())
            importance = lookup.get(clean_word, 0)

            # Color based on importance sign; opacity grows with magnitude
            if importance > 0:
                color = 'green'
                alpha = min(0.3 + abs(importance) * 2, 1.0)
            elif importance < 0:
                color = 'red'
                alpha = min(0.3 + abs(importance) * 2, 1.0)
            else:
                color = 'gray'
                alpha = 0.1

            bbox = dict(boxstyle="round,pad=0.1", facecolor=color, alpha=alpha)
            ax.text(x_pos, y_pos, word, bbox=bbox, fontsize=10, transform=ax.transAxes)

            # Update x position (approximate) and wrap to the next line
            x_pos += len(word) * 0.01 + 0.02
            if x_pos > 0.95:
                x_pos = 0.05
                y_pos -= 0.08

    def visualize_lime_explanation(self, sentence1, sentence2, save_path=None):
        """Create a four-panel LIME figure for one sentence pair.

        Panels: per-word importance for each sentence, a side-by-side
        comparison of the top 10 words, and the two sentences with words
        shaded by importance. The figure is saved to `save_path` when given,
        then shown.

        Returns:
            (explanation1, explanation2)
        """
        print(f"🔍 Analyzing sentence pair with LIME...")
        print(f"Sentence 1: {sentence1}")
        print(f"Sentence 2: {sentence2}")

        # Get actual prediction
        actual_similarity = self.predict_similarity(sentence1, sentence2)
        print(f"Actual similarity score: {actual_similarity:.4f}")

        # Get LIME explanations
        explanation1, explanation2 = self.explain_prediction(sentence1, sentence2)

        # Create visualization
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle(f'LIME Explanation for Afro-XLMR Semantic Similarity\nActual Score: {actual_similarity:.4f}',
                    fontsize=16, fontweight='bold')

        # Plots 1 and 2: per-sentence feature importance
        features1 = explanation1.as_list()
        features2 = explanation2.as_list()
        words1 = self._plot_feature_bars(
            axes[0, 0], features1, 'Sentence 1: Word Contributions to Similarity')
        words2 = self._plot_feature_bars(
            axes[0, 1], features2, 'Sentence 2: Word Contributions to Similarity')

        # Plot 3: Combined importance comparison
        ax3 = axes[1, 0]
        word_importance1 = dict(features1)
        word_importance2 = dict(features2)

        # dict.fromkeys de-duplicates while keeping first-seen order, so ties
        # in the sort below resolve the same way on every run.
        all_words = list(dict.fromkeys(words1 + words2))
        combined_importance = [
            (word, word_importance1.get(word, 0), word_importance2.get(word, 0))
            for word in all_words
        ]

        # Sort by absolute combined importance and keep the top 10 words
        combined_importance.sort(key=lambda item: abs(item[1]) + abs(item[2]), reverse=True)
        top_words = combined_importance[:10]
        words = [item[0] for item in top_words]
        imp1_values = [item[1] for item in top_words]
        imp2_values = [item[2] for item in top_words]

        x = np.arange(len(words))
        width = 0.35

        ax3.bar(x - width/2, imp1_values, width, label='Sentence 1', alpha=0.7)
        ax3.bar(x + width/2, imp2_values, width, label='Sentence 2', alpha=0.7)

        ax3.set_xlabel('Words')
        ax3.set_ylabel('Importance')
        ax3.set_title('Top 10 Word Importance Comparison')
        ax3.set_xticks(x)
        ax3.set_xticklabels(words, rotation=45, ha='right')
        ax3.legend()
        ax3.grid(True, alpha=0.3)

        # Plot 4: Highlighted text visualization
        ax4 = axes[1, 1]
        ax4.axis('off')
        self._highlight_text(sentence1, word_importance1, ax4, 0.8, "Sentence 1 (Green=Positive, Red=Negative)")
        self._highlight_text(sentence2, word_importance2, ax4, 0.4, "Sentence 2 (Green=Positive, Red=Negative)")

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')

        plt.show()
        plt.close(fig)

        return explanation1, explanation2

def create_afroxlmr_lime_visualizations(model, tokenizer, device, results_df, output_dir="./results"):
    """Create LIME visualizations for Afro-XLMR model.

    Picks three rows from `results_df` (highest prediction, lowest prediction,
    largest absolute error), explains each with LIME and saves one PNG per
    case under `<output_dir>/lime_explanations_afroxlmr`. A failure on one
    case is reported and the remaining cases are still processed.

    Args:
        model, tokenizer, device: passed to AfroXLMRLIMEVisualizer.
        results_df: DataFrame with sentence1, sentence2, true_label, predicted
            and abs_error columns.
        output_dir: parent folder for the LIME output folder.
    """
    print("\n🔍 Creating LIME visualizations for Afro-XLMR model interpretability...")

    # Create LIME visualizer
    lime_viz = AfroXLMRLIMEVisualizer(model, tokenizer, device)

    # Create LIME output directory
    lime_dir = os.path.join(output_dir, "lime_explanations_afroxlmr")
    os.makedirs(lime_dir, exist_ok=True)

    # Find interesting cases for visualization (idxmax/idxmin return index LABELS)
    high_similarity_idx = results_df['predicted'].idxmax()
    low_similarity_idx = results_df['predicted'].idxmin()
    largest_error_idx = results_df['abs_error'].idxmax()

    cases_to_analyze = [
        (high_similarity_idx, "High Similarity Case"),
        (low_similarity_idx, "Low Similarity Case"),
        (largest_error_idx, "Largest Prediction Error Case")
    ]

    print(f"\n🎯 Analyzing {len(cases_to_analyze)} interesting cases...")

    for idx, (case_idx, case_name) in enumerate(cases_to_analyze):
        # Label-based lookup: .iloc would pick the wrong row (or fail) whenever
        # the frame's index is not the default 0..n-1 range.
        row = results_df.loc[case_idx]
        sentence1 = row['sentence1']
        sentence2 = row['sentence2']
        actual_label = row['true_label']
        predicted_score = row['predicted']
        error = row['abs_error']

        print(f"\n📝 Case {idx + 1}: {case_name}")
        print(f"   Actual: {actual_label:.4f}, Predicted: {predicted_score:.4f}")
        print(f"   Error: {error:.4f}")
        print(f"   Sentence 1: {sentence1}")
        print(f"   Sentence 2: {sentence2}")

        # Create LIME visualization
        save_path = os.path.join(lime_dir, f"lime_explanation_afroxlmr_case_{idx + 1}.png")

        try:
            explanation1, explanation2 = lime_viz.visualize_lime_explanation(
                sentence1, sentence2, save_path=save_path
            )
            print(f"   ✅ LIME visualization saved to {save_path}")

            # Print top features
            print(f"   🔍 Top features for Sentence 1:")
            for word, importance in explanation1.as_list()[:5]:
                print(f"      '{word}': {importance:.4f}")

            print(f"   🔍 Top features for Sentence 2:")
            for word, importance in explanation2.as_list()[:5]:
                print(f"      '{word}': {importance:.4f}")

        except Exception as e:
            # Deliberately best-effort: report and move on to the next case.
            print(f"   ❌ Failed to create LIME visualization: {e}")

    print(f"\n✅ LIME analysis completed! Visualizations saved in {lime_dir}")

# Run the LIME analysis on the current model and its prediction table.

# NOTE(review): this install runs after `import lime` at the top of this cell, so on a fresh kernel the import fails before this line is reached; lime has to be installed in an earlier cell.
!pip install lime

# Explains three selected rows of results_df; figures are saved under drive_root/lime_explanations_afroxlmr.
create_afroxlmr_lime_visualizations(model, tokenizer, device, results_df, drive_root)

# You can also analyze specific examples manually:
def analyze_specific_example(model, tokenizer, device, sentence1, sentence2):
    """Run the LIME visualization for one hand-picked sentence pair.

    Builds a fresh AfroXLMRLIMEVisualizer and returns the two explanations it
    produces (sentence1 first, sentence2 second). Nothing is saved to disk.
    """
    visualizer = AfroXLMRLIMEVisualizer(model, tokenizer, device)
    first_explanation, second_explanation = visualizer.visualize_lime_explanation(
        sentence1, sentence2
    )
    return first_explanation, second_explanation

In [None]:
# CLS Embeddings to Linear Regression Baseline
import joblib
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import spearmanr, pearsonr

# Pick the compute device as a plain string.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Re-read the sentence-pair dataset from Drive.
csv_path = os.path.join(drive_root, "combined_dataset_cleaned.csv")
df = pd.read_csv(csv_path, encoding='ISO-8859-1')

# Load the base AfroXLM-R encoder and its tokenizer for feature extraction.
# Note: this rebinds the notebook-level names `tokenizer` and `model`.
baseline_checkpoint = "Davlan/afro-xlmr-base"
tokenizer = AutoTokenizer.from_pretrained(baseline_checkpoint)
model = AutoModel.from_pretrained(baseline_checkpoint).to(device)
model.eval()

# Get CLS summary embeddings for batches of sentences
def get_embeddings(texts, tokenizer, model, batch_size=32):
    """Return one embedding per text, taken from the first token position.

    Texts are encoded in batches of `batch_size` (padded per batch, truncated
    to 512 tokens) and run through `model` without gradients. For each text
    the row of `last_hidden_state` at position 0 is kept.

    Args:
        texts: list of strings.
        tokenizer: callable that encodes a list of strings to tensors.
        model: encoder exposing `.device` and returning `last_hidden_state`.
        batch_size: number of texts per forward pass.

    Returns:
        2-D NumPy array with one row per text; for empty `texts` an array with
        zero rows.
    """
    # Send inputs to wherever the given model lives instead of relying on a
    # notebook-level `device` variable.
    target_device = model.device

    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(
            batch, padding=True, truncation=True, max_length=512, return_tensors="pt"
        ).to(target_device)
        with torch.no_grad():
            outputs = model(**inputs)
        all_embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())

    if not all_embeddings:
        # np.vstack raises on an empty list; return an empty matrix instead.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    return np.vstack(all_embeddings)

# Encode each sentence column on its own; missing sentences become empty strings.
embeddings_1 = get_embeddings(df["sentence1"].fillna("").tolist(), tokenizer, model)
embeddings_2 = get_embeddings(df["sentence2"].fillna("").tolist(), tokenizer, model)

# Features: the two embeddings side by side. Target: the similarity label.
X = np.concatenate([embeddings_1, embeddings_2], axis=1)
y = df["label"].values

# Hold out 30% of the pairs for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Ordinary least-squares regression on the concatenated embeddings.
regressor = LinearRegression().fit(X_train, y_train)

# Persist the fitted regressor on Drive.
results_path = os.path.join(drive_root, "regression_model.joblib")
joblib.dump(regressor, results_path)
# To load it back later:
# regressor = joblib.load(results_path)

# Score the held-out pairs.
y_pred = regressor.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
pearson_corr = pearsonr(y_test, y_pred)[0]
spearman_corr = spearmanr(y_test, y_pred)[0]
r2 = r2_score(y_test, y_pred)

for metric_name, metric_value in (
    ("Mean Squared Error", mse),
    ("Mean Absolute Error", mae),
    ("Pearson Correlation", pearson_corr),
    ("Spearman Correlation", spearman_corr),
    ("R2 Score", r2),
):
    print(f"{metric_name}: {metric_value:.4f}")


In [None]:
# TFIDF Baseline
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr, spearmanr

# Load dataset. A copy in the working directory is used when present; otherwise
# fall back to the Drive copy that the other cells read, so this cell does not
# fail just because the CSV was never copied next to the notebook.
local_csv = './combined_dataset_cleaned.csv'
if os.path.exists(local_csv):
    csv_path = local_csv
else:
    csv_path = os.path.join(drive_root, "combined_dataset_cleaned.csv")
df = pd.read_csv(csv_path, encoding='ISO-8859-1')
# .copy() so the column added below is written to an independent frame rather
# than to a slice of the frame that was just read.
df = df.dropna(subset=['sentence1', 'sentence2', 'label']).copy()

# Combine sentence pairs into one input string
df['combined_text'] = df['sentence1'] + ' ' + df['sentence2']
X = df['combined_text']
y = df['label'].astype(float)

# Split into train/test
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF: unigrams + bigrams. The vocabulary is fitted on the training text
# only and then applied to the test text.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Train basic linear regression model
reg = LinearRegression()
reg.fit(X_train, y_train)

# Predict and evaluate
y_pred = reg.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
pearson_corr, _ = pearsonr(y_test, y_pred)
spearman_corr, _ = spearmanr(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"Mean Absolute Error: {mae:.4f}")
print(f"Pearson Correlation: {pearson_corr:.4f}")
print(f"Spearman Correlation: {spearman_corr:.4f}")
print(f"R2 Score: {r2:.4f}")