# MindLens-AI — 05: Robustness Testing (RQ3)

**RQ3:** Are model predictions sensitive to small input perturbations (e.g., keyword removal or synonym replacement), indicating reliance on spurious correlations?

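**Tests:** keyword removal and synonym replacement. **Metric:** prediction flip rate — the fraction of tested samples whose prediction changes after the perturbation is applied.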

In [None]:
# Setup
import sys, os
sys.path.insert(0, os.path.abspath(".."))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from src.model import load_model
from src.robustness import (
    remove_keywords, synonym_replace,
    robustness_test, TRIGGER_KEYWORDS,
)
from src.trust import compute_trust_score, categorize_trust, generate_trust_report

sns.set_theme(style="whitegrid")

# Restore the persisted model/vectorizer pair and the saved test split.
model, vectorizer = load_model("../data/processed/model_artifacts.joblib")
split = joblib.load("../data/processed/test_split.joblib")
X_test, y_test, texts_test = (split[key] for key in ("X_test", "y_test", "texts_test"))

# Robustness runs use only the leading 300 test texts; a smaller split is used whole.
n_samples = min(300, len(texts_test))
sample_texts = texts_test[:n_samples]

# Only the first ten keywords (alphabetically) are shown to keep the output short.
keyword_preview = sorted(TRIGGER_KEYWORDS)[:10]
print(f"Testing robustness on {n_samples} samples ✓")
print(f"Trigger keywords ({len(TRIGGER_KEYWORDS)}): {keyword_preview}...")

## 1. Perturbation Tests

In [None]:
def _flip_summary(results):
    """Format the indented "Flip rate" line for one robustness_test result dict.

    Reads the "flip_rate", "n_flips" and "n_total" entries of ``results``.
    """
    return f"  Flip rate: {results['flip_rate']:.4f} ({results['n_flips']}/{results['n_total']})"


# Perturbation 1: keyword removal
print("Running keyword removal test...")
kw_results = robustness_test(model, vectorizer, sample_texts, remove_keywords)
print(_flip_summary(kw_results))

# Perturbation 2: synonym replacement.
# n and rng_seed are given to robustness_test as extra keyword arguments
# (presumably forwarded to synonym_replace — confirm in src.robustness).
print("\nRunning synonym replacement test...")
syn_results = robustness_test(
    model, vectorizer, sample_texts, synonym_replace, n=3, rng_seed=42
)
print(_flip_summary(syn_results))

## 2. Flip Rate Comparison

In [None]:
# Flip rate above which a perturbation is treated as destabilising
# (drawn as the dashed reference line on the bar chart).
FLIP_THRESHOLD = 0.15

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_rates, ax_conf = axes

# --- Left panel: flip rate per perturbation type -------------------------
perturbations = ["Keyword Removal", "Synonym Replace"]
flip_rates = [kw_results["flip_rate"], syn_results["flip_rate"]]
colors = ["#F44336", "#FF9800"]

bars = ax_rates.bar(perturbations, flip_rates, color=colors, edgecolor="black")
for bar, val in zip(bars, flip_rates):
    # Value label centred just above each bar.
    ax_rates.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.005,
                  f"{val:.3f}", ha="center", fontweight="bold", fontsize=12)
ax_rates.axhline(FLIP_THRESHOLD, color="gray", linestyle="--", alpha=0.7,
                 label=f"Threshold ({FLIP_THRESHOLD:.0%})")
ax_rates.set_ylabel("Flip Rate")
ax_rates.set_title("Prediction Flip Rate by Perturbation", fontsize=13)
ax_rates.legend()

# --- Right panel: original vs perturbed confidence (keyword removal) -----
# Stable and flipped samples are drawn as two explicitly coloured series.
# A single colormapped scatter autoscales its colour limits to the data, so a
# run where every sample flips (or none does) would paint all points with the
# same end-of-map colour; fixed colours plus a legend keep the meaning stable.
orig_conf = np.asarray(kw_results["original_conf"])
pert_conf = np.asarray(kw_results["perturbed_conf"])
flipped = np.asarray(kw_results["flip_flags"]).astype(bool)

series = (
    (~flipped, "#006837", "Prediction unchanged"),
    (flipped, "#A50026", "Prediction flipped"),  # drawn last so flips stay visible
)
for mask, color, label in series:
    ax_conf.scatter(orig_conf[mask], pert_conf[mask],
                    alpha=0.3, s=10, c=color, edgecolors="none", label=label)

# Diagonal = confidence unchanged by the perturbation.
ax_conf.plot([0, 1], [0, 1], "k--", alpha=0.5)
ax_conf.set_xlabel("Original Confidence")
ax_conf.set_ylabel("Perturbed Confidence")
ax_conf.set_title("Confidence Stability (Keyword Removal)", fontsize=13)
ax_conf.legend()

plt.tight_layout()
plt.show()

## 3. Trust Score Demo

In [None]:
# Load bias FPR gap from notebook 04
bias_data = joblib.load("../data/processed/bias_results.joblib")
bias_fpr_gap = bias_data["bias_fpr_gap"]

# Longest text excerpt shown per report.
PREVIEW_CHARS = 100

# Generate trust reports for the first 5 sample texts (fewer if the sample is smaller).
# Iterating over the slice is positional, so it does not depend on how
# sample_texts is indexed.
print("=== Trust Score Reports ===\n")
for sample_no, text in enumerate(sample_texts[:5], start=1):
    report = generate_trust_report(text, model, vectorizer, bias_fpr_gap)

    # Add the ellipsis only when the text was actually cut.
    preview = text if len(text) <= PREVIEW_CHARS else f"{text[:PREVIEW_CHARS]}..."

    print(f"Sample {sample_no}:")
    print(f"  Prediction:  {report['prediction_label']}")
    print(f"  Confidence:  {report['confidence']:.3f}")
    # Three decimals, matching the other scores; one decimal would show 0.04 as 0.0.
    print(f"  Flip Rate:   {report['flip_rate']:.3f}")
    print(f"  Trust Score: {report['trust_score']:.3f} ({report['trust_label']})")
    print(f"  Text: {preview}")
    print()

## 4. RQ3 Conclusion

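**H3:** Small perturbations in input text will significantly affect predictions. Here, "significantly" is operationalized as a prediction flip rate above 15%; the verdict below is based on the keyword-removal test, with synonym replacement reported alongside it.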

In [None]:
kw_flip = kw_results["flip_rate"]
syn_flip = syn_results["flip_rate"]

# Decision threshold for "significant" instability, named once so the printed
# text and the comparisons cannot drift apart.
FLIP_THRESHOLD = 0.15
threshold_pct = f"{FLIP_THRESHOLD:.0%}"  # renders as "15%"

kw_unstable = kw_flip > FLIP_THRESHOLD
syn_unstable = syn_flip > FLIP_THRESHOLD

print("=" * 50)
print("RQ3 CONCLUSION")
print("=" * 50)
print(f"  Keyword Removal Flip Rate:  {kw_flip:.4f} ({kw_flip*100:.1f}%)")
print(f"  Synonym Replace Flip Rate:  {syn_flip:.4f} ({syn_flip*100:.1f}%)")
print(f"  Threshold:                  {threshold_pct}")
print()

# The H3 verdict is driven by the keyword-removal test.
if kw_unstable:
    print(f"✓ H3 SUPPORTED: Keyword removal causes >{threshold_pct} prediction flips.")
    print("  Model relies significantly on surface-level trigger words.")
else:
    # The comparison is strict, so this branch covers flip rates up to and
    # including the threshold.
    print(f"✗ H3 NOT SUPPORTED: Model is relatively robust to keyword removal (≤{threshold_pct} flips).")

# Synonym replacement is reported alongside the verdict. "also" is only used
# when keyword removal was unstable too; otherwise the line is phrased as a
# contrast so the two statements do not contradict each other.
if syn_unstable and kw_unstable:
    print("  Synonym replacement also causes significant instability.")
elif syn_unstable:
    print("  However, synonym replacement causes significant instability.")
else:
    print("  Model is robust to synonym replacement.")

# Save robustness results for Streamlit
joblib.dump({
    "kw_flip_rate": kw_flip,
    "syn_flip_rate": syn_flip,
}, "../data/processed/robustness_results.joblib")

print("\nRobustness testing notebook complete ✓")