# Model Evaluation & Visualization Notebook

## Quick Start Guide

### For Google Colab Users:
1. **Run the "Setup & Imports" cell** - It will auto-detect Colab
2. **Run the "Upload Result Files" cell** - Upload your 4 JSON result files:
 - `approach1_entity_ner_results.json`
 - `approach2_claim_ner_results.json`
 - `approach3_hybrid_llm_results.json`
 - `approach4_contrastive_results.json`
3. **Run remaining cells** to generate visualizations

### For Local Users:
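1. Make sure the result files are in the following directories (when run locally, the notebook looks for the `experiments/` folder one level above its working directory, i.e. `../experiments`):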
 - `experiments/approach1_entity_ner/results.json`
 - `experiments/approach2_claim_ner/results.json`
 - `experiments/approach3_hybrid_llm/results.json`
 - `experiments/approach4_contrastive/results.json`
2. Run all cells

---

# SMS Phishing Detection: Model Evaluation & Visualization

This notebook provides comprehensive evaluation and visualization of four different approaches:
1. **Approach 1**: Entity-First NER Pipeline
2. **Approach 2**: Claim-Phrase NER Pipeline
3. **Approach 3**: Hybrid NER + LLM Pipeline
4. **Approach 4**: Contrastive Learning Pipeline

**Dataset**: Mendeley SMS Phishing Dataset
- Training: 510 smishing messages
- Test: 128 smishing messages
- Note: This is an in-distribution (ID) evaluation only; no out-of-distribution (OOD) test set is available yet

## Setup & Imports

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from typing import Dict, List, Tuple
import warnings
import os

# Silence library warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')

# Set plotting style.
# The 'seaborn-v0_8-*' names only exist in matplotlib >= 3.6; older releases
# ship the same style sheet as 'seaborn-darkgrid', so try both before giving up.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    print(" Warning: seaborn darkgrid style not available; using matplotlib defaults")
sns.set_palette("husl")

# Detect environment (Colab vs local).
# Only a failed import means "not Colab"; anything else (e.g. KeyboardInterrupt)
# must propagate instead of being swallowed by a bare except.
try:
    import google.colab  # noqa: F401  (imported only to probe for Colab)
except ImportError:
    IN_COLAB = False
    print(" Running locally")
else:
    IN_COLAB = True
    print(" Running in Google Colab")

# Configuration
# Colab works from the current directory; a local run expects the
# experiments folder one level above the working directory.
EXPERIMENTS_DIR = Path('./experiments') if IN_COLAB else Path('../experiments')
OUTPUT_DIR = EXPERIMENTS_DIR / 'evaluation_outputs'

# Create directories with parents=True so a fresh checkout works out of the box.
EXPERIMENTS_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(" Imports complete")
print(f" Experiments directory: {EXPERIMENTS_DIR.absolute()}")
print(f" Output directory: {OUTPUT_DIR.absolute()}")

: 

## Upload Result Files (For Google Colab)

In [None]:
# Colab only: ask the user for the four result files and file each one under
# EXPERIMENTS_DIR/<approach>/results.json, which is where the loader looks.
if not IN_COLAB:
    print("⏭ Skipping upload (running locally - files already in place)")
else:
    from google.colab import files
    import shutil

    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print(heavy_rule)
    print("UPLOAD YOUR RESULT FILES")
    print(heavy_rule)
    print("\nPlease upload the 4 result JSON files:")
    print(" 1. approach1_entity_ner_results.json")
    print(" 2. approach2_claim_ner_results.json")
    print(" 3. approach3_hybrid_llm_results.json")
    print(" 4. approach4_contrastive_results.json")
    print("\nClick 'Choose Files' below and select all 4 files at once:")
    print(light_rule)

    # Shows the Colab upload widget; the result is keyed by uploaded file name.
    uploaded = files.upload()

    print("\n" + heavy_rule)
    print(f" Uploaded {len(uploaded)} file(s)")
    print(heavy_rule)

    # Expected upload name -> approach directory it belongs to.
    file_mapping = {
        f'{approach}_results.json': approach
        for approach in (
            'approach1_entity_ner',
            'approach2_claim_ner',
            'approach3_hybrid_llm',
            'approach4_contrastive',
        )
    }

    for upload_name, approach_folder in file_mapping.items():
        if upload_name not in uploaded:
            print(f" Warning: {upload_name} not uploaded")
            continue

        # Create the approach directory, then move the upload into place.
        approach_dir = EXPERIMENTS_DIR / approach_folder
        approach_dir.mkdir(parents=True, exist_ok=True)

        dest_file = approach_dir / 'results.json'
        shutil.move(upload_name, str(dest_file))
        print(f" Moved {upload_name} → {dest_file}")

    print("\n" + heavy_rule)
    print(" File organization complete!")
    print(heavy_rule)

## Load Experimental Results

In [None]:
def load_results(approach_dir: Path) -> Dict:
    """Load ``results.json`` from an approach directory.

    Args:
        approach_dir: Directory containing a ``results.json`` file. A plain
            string path is accepted as well as a ``Path``.

    Returns:
        The parsed JSON document.

    Raises:
        OSError: If the file is missing or cannot be read.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    results_file = Path(approach_dir) / 'results.json'
    # Read as UTF-8 explicitly: the default text encoding of open() depends on
    # the platform locale, which can corrupt or reject non-ASCII JSON content.
    with open(results_file, 'r', encoding='utf-8') as f:
        return json.load(f)

# Experiment directory name -> label used in every table and plot below.
approaches = {
    'approach1_entity_ner': 'Entity NER',
    'approach2_claim_ner': 'Claim NER',
    'approach3_hybrid_llm': 'Hybrid NER+LLM',
    'approach4_contrastive': 'Contrastive Learning'
}

# Best-effort load: an approach whose file is missing or unreadable is
# reported and left out, so the remaining approaches can still be evaluated.
results = {}
for folder_name, label in approaches.items():
    try:
        loaded = load_results(EXPERIMENTS_DIR / folder_name)
    except Exception as e:
        print(f" Failed to load {label}: {e}")
    else:
        results[label] = loaded
        print(f" Loaded: {label}")

print(f"\nTotal approaches loaded: {len(results)}")

: 

## Extract Key Metrics

In [None]:
def extract_metrics(results_dict: Dict) -> pd.DataFrame:
    """Extract key detection metrics from all approaches into a DataFrame.

    Each result is expected to hold its detection metrics under
    ``test_metrics`` in one of two sections, checked in this order:
    ``end_to_end_phishing_detection`` or ``in_distribution_performance``.
    Both sections share the same layout, so they are read by the same code.

    Args:
        results_dict: Mapping of approach display name to its loaded result
            document.

    Returns:
        One row per approach with the columns ``Model``, ``Accuracy``,
        ``Precision``, ``Recall``, ``F1``, ``Support``, ``TP``, ``FP``, ``TN``
        and ``FN``. The columns are present even when no approach yields a
        row, so callers can sort or select on them without a KeyError.

    Raises:
        KeyError: If a detection section exists but lacks an expected field.
    """
    columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1',
               'Support', 'TP', 'FP', 'TN', 'FN']
    detection_sections = ('end_to_end_phishing_detection',
                          'in_distribution_performance')

    metrics_data = []

    for approach_name, result in results_dict.items():
        test_metrics = result.get('test_metrics', {})

        # First section name that this approach actually provides, if any.
        section = next((key for key in detection_sections if key in test_metrics), None)
        if section is None:
            # Report instead of dropping the approach silently.
            print(f" Warning: no detection metrics found for {approach_name}; skipping")
            continue

        detection = test_metrics[section]
        confusion = detection['confusion_matrix']

        metrics_data.append({
            'Model': approach_name,
            'Accuracy': detection['accuracy'],
            'Precision': detection['precision'],
            'Recall': detection['recall'],
            'F1': detection['f1'],
            'Support': detection['support'],
            'TP': confusion['true_positive'],
            'FP': confusion['false_positive'],
            'TN': confusion['true_negative'],
            'FN': confusion['false_negative'],
        })

    # Explicit columns keep the schema stable when metrics_data is empty.
    return pd.DataFrame(metrics_data, columns=columns)

# Rank the approaches by F1 (best first) and renumber the rows; every later
# cell reads this frame and relies on that ordering.
df_metrics = (
    extract_metrics(results)
    .sort_values('F1', ascending=False)
    .reset_index(drop=True)
)

summary_rule = "=" * 80
print("\n" + summary_rule)
print("MODEL PERFORMANCE SUMMARY (ID Test Set - Mendeley Dataset)")
print(summary_rule)
print(df_metrics.to_string(index=False))
print(summary_rule)

## 1. Main Performance Table: ID Precision & Recall

This table shows the core metrics for in-distribution (ID) performance.
Since we don't have OOD test data yet, OOD columns are marked as N/A.

In [None]:
# Render the ID performance table as a figure; the axes only host the table,
# so their frame and ticks are switched off.
fig, ax = plt.subplots(figsize=(14, 8))
ax.axis('tight')
ax.axis('off')

# Short identifiers shown in the first column.
model_ids = {
    'Entity NER': 'app1_ent_ner',
    'Claim NER': 'app2_clm_ner',
    'Hybrid NER+LLM': 'app3_hyb_llm',
    'Contrastive Learning': 'app4_con_lrn'
}

# Header row first, then one row per model in df_metrics order.
table_data = [['Model ID', 'Model Name', 'ID Precision (%)', 'ID Recall (%)',
               'ID F1 (%)', 'OOD Precision (%)', 'OOD Recall (%)']]
table_data.extend(
    [
        model_ids.get(record['Model'], 'unknown'),
        record['Model'],
        f"{record['Precision']*100:.2f}",
        f"{record['Recall']*100:.2f}",
        f"{record['F1']*100:.2f}",
        'N/A',  # OOD not available yet
        'N/A',  # OOD not available yet
    ]
    for _, record in df_metrics.iterrows()
)

# Cell background colours: dark header row, light grey data rows.
colors = [['#404040'] * 7] + [['#f0f0f0'] * 7 for _ in table_data[1:]]

table = ax.table(cellText=table_data, cellLoc='center', loc='center',
                 cellColours=colors, bbox=[0, 0, 1, 1])

table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1, 2.5)

# Header row: white bold text on the dark background.
for col in range(7):
    header_cell = table[(0, col)]
    header_cell.set_facecolor('#404040')
    header_cell.set_text_props(weight='bold', color='white')

# Data rows: white grid lines, bold model ID, greyed-out italic OOD placeholders.
for row_no in range(1, len(table_data)):
    for col in range(7):
        cell = table[(row_no, col)]
        cell.set_edgecolor('white')
        cell.set_linewidth(2)
        if col == 0:
            cell.set_text_props(weight='bold')
        if col >= 5:
            cell.set_text_props(style='italic', color='gray')

plt.title('ID Performance: Precision, Recall & F1 Comparison\n(OOD Results Pending)',
          fontsize=16, fontweight='bold', pad=20)

plt.figtext(0.5, 0.02,
            'Dataset: Mendeley SMS Phishing | Train: 510 smishing | Test: 128 smishing (ID only)',
            ha='center', fontsize=9, style='italic', color='#2c3e50')

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'performance_comparison_table.png', dpi=300, bbox_inches='tight', facecolor='white')
plt.show()

print("\n Table saved as 'performance_comparison_table.png'")
print("\nNote: OOD evaluation pending - current results are ID (in-distribution) only")

## 2. Performance Bar Charts

In [None]:
# One panel per metric in a 2x2 grid, one bar per model in each panel.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Model Performance Comparison (ID Test Set)', fontsize=18, fontweight='bold', y=0.995)

metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1']

# Fixed colour per model.
colors_map = {
    'Entity NER': '#3498db',
    'Claim NER': '#e74c3c',
    'Hybrid NER+LLM': '#2ecc71',
    'Contrastive Learning': '#f39c12'
}

# axes.flat walks the grid row by row, matching the idx // 2, idx % 2 layout.
for ax, metric in zip(axes.flat, metrics_to_plot):
    bars = ax.bar(
        df_metrics['Model'],
        df_metrics[metric],
        color=[colors_map[model] for model in df_metrics['Model']],
        alpha=0.8,
        edgecolor='black',
        linewidth=1.5,
    )

    # Print each bar's value just above it.
    for bar in bars:
        bar_top = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., bar_top,
                f'{bar_top:.3f}',
                ha='center', va='bottom', fontsize=11, fontweight='bold')

    ax.set_ylabel(metric, fontsize=12, fontweight='bold')
    ax.set_title(f'{metric} Comparison', fontsize=14, fontweight='bold', pad=10)
    ax.set_ylim([0, 1.1])
    ax.tick_params(axis='x', rotation=45)
    ax.grid(axis='y', alpha=0.3, linestyle='--')

    # Dashed reference line at 0.9.
    ax.axhline(y=0.9, color='red', linestyle='--', alpha=0.5, linewidth=1)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'performance_bars.png', dpi=300, bbox_inches='tight', facecolor='white')
plt.show()

print(" Bar charts saved as 'performance_bars.png'")

## 3. Confusion Matrix Visualization

In [None]:
# One confusion-matrix heatmap per model in a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(16, 14))
fig.suptitle('Confusion Matrices - All Approaches', fontsize=18, fontweight='bold', y=0.995)

cm_panels = list(axes.flat)

for ax, (_, row) in zip(cm_panels, df_metrics.iterrows()):
    # Rows are the ACTUAL class and columns the PREDICTED class, in the same
    # order as the tick labels below (smishing first, ham second):
    #   actual smishing -> [predicted smishing = TP, predicted ham = FN]
    #   actual ham      -> [predicted smishing = FP, predicted ham = TN]
    # dtype=int keeps the counts compatible with the integer format 'd'.
    cm = np.array([[row['TP'], row['FN']],
                   [row['FP'], row['TN']]], dtype=int)

    # Plot heatmap
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Predicted Smishing', 'Predicted Ham'],
                yticklabels=['Actual Smishing', 'Actual Ham'],
                ax=ax, cbar_kws={'label': 'Count'},
                square=True, linewidths=2, linecolor='black')

    ax.set_title(f'{row["Model"]}\n(Acc: {row["Accuracy"]:.3f}, F1: {row["F1"]:.3f})',
                 fontsize=13, fontweight='bold', pad=10)
    ax.set_ylabel('Actual', fontsize=11, fontweight='bold')
    ax.set_xlabel('Predicted', fontsize=11, fontweight='bold')

# Hide panels left over when fewer than four approaches were loaded.
for ax in cm_panels[len(df_metrics):]:
    ax.axis('off')

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'confusion_matrices.png', dpi=300, bbox_inches='tight', facecolor='white')
plt.show()

print(" Confusion matrices saved as 'confusion_matrices.png'")

## 4. Training Dynamics (Loss Curves)

In [None]:
# Training vs. validation loss per approach in a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Training Dynamics - Loss Curves', fontsize=18, fontweight='bold', y=0.995)

approach_names = list(results.keys())
loss_panels = list(axes.flat)

for ax, approach_name in zip(loss_panels, approach_names):
    # Tolerate results that carry no training metrics at all.
    train_metrics = results[approach_name].get('train_metrics', {})
    curves = train_metrics.get('learning_curves') or {}
    train_losses = curves.get('train_losses') or []
    val_losses = curves.get('val_losses') or []

    # Title every panel so a panel without curves is still identifiable.
    ax.set_title(f'{approach_name}', fontsize=13, fontweight='bold', pad=10)

    if not train_losses:
        # Nothing to plot: show an explicit placeholder, not an empty frame.
        ax.text(0.5, 0.5, 'No learning curves recorded',
                transform=ax.transAxes, ha='center', va='center',
                fontsize=11, style='italic', color='gray')
        ax.set_xticks([])
        ax.set_yticks([])
        continue

    epochs = range(1, len(train_losses) + 1)
    ax.plot(epochs, train_losses, marker='o', linewidth=2.5,
            label='Training Loss', color='#3498db', markersize=6)

    if val_losses:
        # Own epoch range: the validation series may differ in length from
        # the training series, which would otherwise make plot() fail.
        ax.plot(range(1, len(val_losses) + 1), val_losses, marker='s', linewidth=2.5,
                label='Validation Loss', color='#e74c3c', markersize=6)

    # Mark best epoch (falls back to the last epoch when not recorded).
    best_epoch = train_metrics.get('best_epoch', len(epochs))
    ax.axvline(x=best_epoch, color='green', linestyle='--',
               alpha=0.7, linewidth=2, label=f'Best Epoch ({best_epoch})')

    ax.set_xlabel('Epoch', fontsize=12, fontweight='bold')
    ax.set_ylabel('Loss', fontsize=12, fontweight='bold')
    ax.legend(loc='upper right', fontsize=10)
    ax.grid(True, alpha=0.3, linestyle='--')

    # Add early stopping info
    if train_metrics.get('early_stopped', False):
        ax.text(0.02, 0.98, 'Early Stopped',
                transform=ax.transAxes, fontsize=10,
                verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='yellow', alpha=0.5))

# Hide panels left over when fewer than four approaches were loaded.
for ax in loss_panels[len(approach_names):]:
    ax.axis('off')

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'training_loss_curves.png', dpi=300, bbox_inches='tight', facecolor='white')
plt.show()

print(" Loss curves saved as 'training_loss_curves.png'")

## 5. Entity/Claim-Level Performance (Where Applicable)

In [None]:
# Per-entity precision/recall/F1 for the Entity NER approach. The cell does
# nothing when that approach was not loaded or has no entity-level metrics.
entity_ner_results = results.get('Entity NER', {})
if 'entity_level' in entity_ner_results.get('test_metrics', {}):
    entity_metrics = entity_ner_results['test_metrics']['entity_level']['per_entity']

    # One record per entity type, best F1 first.
    entity_data = [
        {
            'Entity': entity_type,
            'Precision': scores['precision'],
            'Recall': scores['recall'],
            'F1': scores['f1'],
            'Support': scores['support'],
        }
        for entity_type, scores in entity_metrics.items()
    ]
    df_entities = pd.DataFrame(entity_data).sort_values('F1', ascending=False)

    # Grouped bars: three bars per entity type, side by side.
    fig, ax = plt.subplots(figsize=(14, 8))

    x = np.arange(len(df_entities))
    width = 0.25

    for shift, score_name, bar_colour in (
        (-width, 'Precision', '#3498db'),
        (0, 'Recall', '#e74c3c'),
        (width, 'F1', '#2ecc71'),
    ):
        ax.bar(x + shift, df_entities[score_name], width, label=score_name,
               color=bar_colour, alpha=0.8, edgecolor='black')

    ax.set_xlabel('Entity Type', fontsize=12, fontweight='bold')
    ax.set_ylabel('Score', fontsize=12, fontweight='bold')
    ax.set_title('Entity NER: Per-Entity Performance Breakdown', fontsize=16, fontweight='bold', pad=15)
    ax.set_xticks(x)
    ax.set_xticklabels(df_entities['Entity'], rotation=45, ha='right')
    ax.legend(fontsize=11)
    ax.grid(axis='y', alpha=0.3, linestyle='--')
    ax.set_ylim([0, 1.1])

    # Support count above each group.
    for position, support in enumerate(df_entities['Support']):
        ax.text(position, 1.05, f"n={support}",
                ha='center', fontsize=9, style='italic')

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'entity_ner_breakdown.png', dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()

    print(" Entity breakdown saved as 'entity_ner_breakdown.png'")
    print("\nEntity Performance Summary:")
    print(df_entities.to_string(index=False))

## 6. Claim NER Performance Breakdown

In [None]:
# Per-claim-type precision/recall/F1 for the Claim NER approach. The cell does
# nothing when that approach was not loaded or has no claim-level metrics.
claim_ner_results = results.get('Claim NER', {})
if 'claim_level' in claim_ner_results.get('test_metrics', {}):
    claim_metrics = claim_ner_results['test_metrics']['claim_level']['per_claim_type']

    # One record per claim type, best F1 first.
    claim_data = [
        {
            'Claim Type': claim_type,
            'Precision': scores['precision'],
            'Recall': scores['recall'],
            'F1': scores['f1'],
            'Support': scores['support'],
        }
        for claim_type, scores in claim_metrics.items()
    ]
    df_claims = pd.DataFrame(claim_data).sort_values('F1', ascending=False)

    # Grouped bars: three bars per claim type, side by side.
    fig, ax = plt.subplots(figsize=(14, 8))

    x = np.arange(len(df_claims))
    width = 0.25

    for shift, score_name, bar_colour in (
        (-width, 'Precision', '#9b59b6'),
        (0, 'Recall', '#e67e22'),
        (width, 'F1', '#1abc9c'),
    ):
        ax.bar(x + shift, df_claims[score_name], width, label=score_name,
               color=bar_colour, alpha=0.8, edgecolor='black')

    ax.set_xlabel('Claim Type', fontsize=12, fontweight='bold')
    ax.set_ylabel('Score', fontsize=12, fontweight='bold')
    ax.set_title('Claim NER: Per-Claim-Type Performance Breakdown', fontsize=16, fontweight='bold', pad=15)
    ax.set_xticks(x)
    ax.set_xticklabels(df_claims['Claim Type'], rotation=45, ha='right')
    ax.legend(fontsize=11)
    ax.grid(axis='y', alpha=0.3, linestyle='--')
    ax.set_ylim([0, 1.1])

    # Support count above each group.
    for position, support in enumerate(df_claims['Support']):
        ax.text(position, 1.05, f"n={support}",
                ha='center', fontsize=9, style='italic')

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'claim_ner_breakdown.png', dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()

    print(" Claim breakdown saved as 'claim_ner_breakdown.png'")
    print("\nClaim Performance Summary:")
    print(df_claims.to_string(index=False))

## 7. Inference Time Comparison

In [None]:
# Collect latency statistics. Approaches store their timings under one of
# three sections; approaches with none of them are left out of the chart.
inference_data = []

for approach_name, result in results.items():
    test_metrics = result['test_metrics']

    if 'inference_time_ms' in test_metrics:
        inf_time = test_metrics['inference_time_ms']
        mean_ms = inf_time.get('mean', 0)
    elif 'inference_pipeline' in test_metrics:
        inf_time = test_metrics['inference_pipeline']['total_time_ms']
        mean_ms = inf_time.get('mean', 0)
    elif 'inference_efficiency' in test_metrics:
        efficiency = test_metrics['inference_efficiency']
        inf_time = efficiency['encoding_time_ms']
        # Use the recorded total when it is a plain number; otherwise fall
        # back to the mean encoding time.
        total_time = efficiency.get('total_inference_ms', inf_time['mean'])
        mean_ms = total_time if isinstance(total_time, (int, float)) else inf_time['mean']
    else:
        continue

    inference_data.append({
        'Model': approach_name,
        'Mean (ms)': mean_ms,
        'P50 (ms)': inf_time.get('p50', 0),
        'P95 (ms)': inf_time.get('p95', 0),
        'P99 (ms)': inf_time.get('p99', 0),
    })

if inference_data:
    df_inference = pd.DataFrame(inference_data).sort_values('Mean (ms)')

    fig, ax = plt.subplots(figsize=(12, 7))

    x = np.arange(len(df_inference))
    width = 0.2

    # Four bars per model, centred on the model's tick position.
    for slot, stat, bar_colour in (
        (-1.5, 'Mean', '#3498db'),
        (-0.5, 'P50', '#2ecc71'),
        (0.5, 'P95', '#f39c12'),
        (1.5, 'P99', '#e74c3c'),
    ):
        ax.bar(x + slot*width, df_inference[f'{stat} (ms)'], width, label=stat,
               color=bar_colour, alpha=0.8, edgecolor='black')

    ax.set_xlabel('Model', fontsize=12, fontweight='bold')
    ax.set_ylabel('Inference Time (ms)', fontsize=12, fontweight='bold')
    ax.set_title('Inference Time Comparison (Lower is Better)', fontsize=16, fontweight='bold', pad=15)
    ax.set_xticks(x)
    ax.set_xticklabels(df_inference['Model'], rotation=45, ha='right')
    ax.legend(fontsize=10)
    ax.grid(axis='y', alpha=0.3, linestyle='--')

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'inference_time_comparison.png', dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()

    print(" Inference time comparison saved as 'inference_time_comparison.png'")
    print("\nInference Time Summary:")
    print(df_inference.to_string(index=False))

## 8. Radar Chart: Multi-Dimensional Comparison

In [None]:
from math import pi

# Radar chart: one closed polygon per model across the four headline metrics.
categories = ['Precision', 'Recall', 'F1', 'Accuracy']
N = len(categories)

# Evenly spaced spoke angles; the first is repeated to close each polygon.
angles = [n / float(N) * 2 * pi for n in range(N)]
angles += angles[:1]

fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection='polar'))

for _, row in df_metrics.iterrows():
    values = [row[category] for category in categories]
    values += values[:1]

    # Look the colour up by model NAME. df_metrics is sorted by F1, so picking
    # the colour by row position would give a model a different colour here
    # than in the bar charts, which also index colors_map by model name.
    model_color = colors_map[row['Model']]

    ax.plot(angles, values, 'o-', linewidth=2.5, label=row['Model'],
            color=model_color, markersize=8)
    ax.fill(angles, values, alpha=0.15, color=model_color)

ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories, fontsize=12, fontweight='bold')
ax.set_ylim(0, 1)
ax.set_yticks([0.2, 0.4, 0.6, 0.8, 1.0])
ax.set_yticklabels(['0.2', '0.4', '0.6', '0.8', '1.0'], fontsize=10)
ax.grid(True, linestyle='--', alpha=0.7)

# Legend sits outside the plot so it does not cover the polygons.
plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1), fontsize=11)
plt.title('Multi-Metric Radar Comparison', fontsize=16, fontweight='bold', pad=30)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'radar_chart.png', dpi=300, bbox_inches='tight', facecolor='white')
plt.show()

print(" Radar chart saved as 'radar_chart.png'")

## 9. Summary Statistics & Key Insights

In [None]:
# Text summary: best model per metric, per-model error rates, dataset facts.
heavy_rule = "=" * 80
light_rule = "-" * 80

print("\n" + heavy_rule)
print("KEY INSIGHTS & SUMMARY")
print(heavy_rule)

# df_metrics is already sorted by F1, so its first row is the F1 winner.
best_f1 = df_metrics.iloc[0]
print(f"\n Best F1 Score: {best_f1['Model']} ({best_f1['F1']:.4f})")

# The other metrics are looked up by their maximum value.
for metric_name in ('Precision', 'Recall', 'Accuracy'):
    leader = df_metrics.loc[df_metrics[metric_name].idxmax()]
    print(f" Best {metric_name}: {leader['Model']} ({leader[metric_name]:.4f})")

# Error analysis
print("\n" + light_rule)
print("ERROR ANALYSIS")
print(light_rule)

for _, row in df_metrics.iterrows():
    actual_negatives = row['FP'] + row['TN']
    actual_positives = row['FN'] + row['TP']
    # A rate is reported as 0 when its denominator is empty.
    fpr = row['FP'] / actual_negatives if actual_negatives > 0 else 0
    fnr = row['FN'] / actual_positives if actual_positives > 0 else 0

    print(f"\n{row['Model']}:")
    print(f" False Positives: {row['FP']} (wrongly flagged as smishing)")
    print(f" False Negatives: {row['FN']} (missed smishing)")
    print(f" False Positive Rate: {fpr:.4f}")
    print(f" False Negative Rate: {fnr:.4f}")

# Dataset info
print("\n" + light_rule)
print("DATASET INFORMATION")
print(light_rule)
for info_line in (
    "Dataset: Mendeley SMS Phishing",
    "Training Set: 510 smishing messages",
    "Test Set: 128 smishing messages",
    "Evaluation Type: In-Distribution (ID) only",
    "OOD Evaluation: Pending (not yet available)",
):
    print(info_line)

print("\n" + heavy_rule)

## 10. Export Results Summary

In [None]:
# Write the metrics table as CSV.
summary_file = OUTPUT_DIR / 'model_comparison_summary.csv'
df_metrics.to_csv(summary_file, index=False)
print(f" Summary exported to: {summary_file}")

# Assemble the plain-text report: header, F1 ranking, then the full table.
heavy_rule = "=" * 80
report = [
    "SMS PHISHING DETECTION - MODEL EVALUATION REPORT",
    heavy_rule,
    f"\nDate: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "Dataset: Mendeley SMS Phishing",
    "Train Size: 510 smishing messages",
    "Test Size: 128 smishing messages",
    "Evaluation: In-Distribution (ID) only\n",
    "\nMODEL RANKINGS BY F1 SCORE",
    "-" * 80,
]
# df_metrics is sorted by F1, so the running position is the rank.
report.extend(
    f"{rank}. {row['Model']}: F1={row['F1']:.4f} (P={row['Precision']:.4f}, R={row['Recall']:.4f})"
    for rank, (_, row) in enumerate(df_metrics.iterrows(), 1)
)
report.extend([
    "\n\nDETAILED METRICS",
    heavy_rule,
    df_metrics.to_string(index=False),
])

report_text = "\n".join(report)
report_file = OUTPUT_DIR / 'evaluation_report.txt'
report_file.write_text(report_text)

print(f" Detailed report exported to: {report_file}")
print("\n" + heavy_rule)
print("ALL VISUALIZATIONS AND REPORTS GENERATED SUCCESSFULLY!")
print(heavy_rule)
print(f"\nOutput directory: {OUTPUT_DIR}")
print("\nGenerated files:")
for generated in sorted(OUTPUT_DIR.glob('*')):
    print(f" - {generated.name}")