# Day 4.2: Model Evaluation

**Goal:** Comprehensively evaluate the trained model on the test set

**What we'll do:**
1. Load the best trained model
2. Make predictions on test set
3. Generate confusion matrix
4. Compute classification metrics (precision, recall, F1-score)
5. Analyze per-class performance
6. Identify misclassifications
7. Save evaluation report

**Expected time:** 15-20 minutes

---

## 1. Import Libraries

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from datetime import datetime

# TensorFlow and Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Scikit-learn for metrics
from sklearn.metrics import (
    confusion_matrix, 
    classification_report, 
    accuracy_score,
    precision_recall_fscore_support
)

print(f"TensorFlow version: {tf.__version__}")
print(f"Libraries loaded successfully!")

## 2. Configuration

In [None]:
# --- Input / output locations (relative to this notebook) ---
TEST_CSV = "../../outputs/data_splits/test_split.csv"
MODEL_DIR = "../../outputs/models"
VIZ_DIR = "../../outputs/visualizations"
RESULTS_DIR = "../../outputs/evaluation_results"

# --- Image loading parameters used by the test generator ---
IMG_SIZE = (128, 128)
BATCH_SIZE = 32

# --- Tumor classes handled by the classifier ---
CLASS_NAMES = ['glioma', 'meningioma', 'pituitary']
NUM_CLASSES = len(CLASS_NAMES)

# Make sure the folders this notebook writes into exist before anything is saved.
for output_dir in (RESULTS_DIR, VIZ_DIR):
    os.makedirs(output_dir, exist_ok=True)

print("Configuration loaded!")

## 3. Find and Load Best Model

This loads the most recently modified `.keras` file in the models directory, which is expected to be the model saved by the Day 4.1 training notebook.

In [None]:
# Locate the most recently modified `.keras` file in MODEL_DIR and load it.
#
# MODEL_DIR is not created by this notebook, so a missing directory means the
# training notebook has not been run yet. Report that with the same guidance
# as the "no model files" case instead of a bare OS error from os.listdir().
if not os.path.isdir(MODEL_DIR):
    raise FileNotFoundError(
        f"Model directory not found: {MODEL_DIR}. "
        "Please run day4_01_full_training.ipynb first."
    )

# Find candidate model files
model_files = [f for f in os.listdir(MODEL_DIR) if f.endswith('.keras')]

if not model_files:
    raise FileNotFoundError("No trained model found! Please run day4_01_full_training.ipynb first.")

# Sort by modification time (most recent first)
model_files.sort(key=lambda x: os.path.getmtime(os.path.join(MODEL_DIR, x)), reverse=True)
latest_model = model_files[0]
model_path = os.path.join(MODEL_DIR, latest_model)

print(f"Loading model: {latest_model}")
print(f"Path: {model_path}")

# Load model
model = keras.models.load_model(model_path)
print("\n✅ Model loaded successfully!")

# Display model summary
model.summary()

## 4. Load Test Data

In [None]:
# Load test CSV; labels are cast to str because class_mode='categorical'
# requires string class values in y_col.
test_df = pd.read_csv(TEST_CSV)
test_df['label'] = test_df['label'].astype(str)

print(f"Test set: {len(test_df)} images from {test_df['patient_id'].nunique()} patients")
print("\nClass distribution:")
print(test_df['label'].value_counts())

# Create test generator (no augmentation, only 0-1 rescaling)
test_datagen = ImageDataGenerator(rescale=1./255)

test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    x_col='filepath',
    y_col='label',
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    color_mode='grayscale',
    shuffle=False  # Important: keep order for predictions
)

# flow_from_dataframe validates filenames by default and silently drops rows
# whose image is missing or invalid. Later cells index test_df by prediction
# position, so test_df must contain exactly the rows the generator kept.
num_dropped = len(test_df) - len(test_generator.filenames)
if num_dropped > 0:
    kept_paths = set(test_generator.filenames)
    test_df = test_df[test_df['filepath'].isin(kept_paths)].reset_index(drop=True)
    print(f"\n⚠️ {num_dropped} test image(s) could not be loaded and were skipped.")
    print(f"Evaluating on {len(test_df)} images.")

if len(test_df) != len(test_generator.filenames):
    raise ValueError(
        f"test_df has {len(test_df)} rows but the generator yields "
        f"{len(test_generator.filenames)} images; they must match for "
        "per-sample analysis."
    )

print("\n✅ Test generator created!")

## 5. Make Predictions

In [None]:
print("🔮 Making predictions on test set...\n")

# Get predictions (probabilities for each class)
y_pred_probs = model.predict(test_generator, verbose=1)

# Get true class indices as an array so comparisons and masks are elementwise
y_true_classes = np.asarray(test_generator.classes)

# Get class labels ordered by their integer index, so class_labels[i] is the
# name of class index i regardless of dict ordering
class_labels = [
    name
    for name, _ in sorted(test_generator.class_indices.items(), key=lambda item: item[1])
]

# Fail early if the loaded model does not match the test data; otherwise the
# metrics below would be computed against the wrong class names or samples.
if len(y_pred_probs) != len(y_true_classes):
    raise ValueError(
        f"Got {len(y_pred_probs)} predictions for {len(y_true_classes)} test labels."
    )
if y_pred_probs.shape[-1] != len(class_labels):
    raise ValueError(
        f"Model outputs {y_pred_probs.shape[-1]} classes but the test set "
        f"defines {len(class_labels)}: {class_labels}"
    )

# Get predicted class indices
y_pred_classes = np.argmax(y_pred_probs, axis=1)

print("\n✅ Predictions completed!")
print(f"Shape of predictions: {y_pred_probs.shape}")
print(f"Number of samples: {len(y_pred_classes)}")
print(f"Class labels: {class_labels}")

## 6. Calculate Overall Accuracy

In [None]:
# Overall accuracy: fraction of test samples whose predicted class matches the true class
test_accuracy = accuracy_score(y_true_classes, y_pred_classes)

banner = "="*60
print(banner)
print(f"📊 TEST SET ACCURACY: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print(banner)

# Absolute counts behind the accuracy figure
total_predictions = len(y_pred_classes)
hits = y_pred_classes == y_true_classes
correct = np.sum(hits)
incorrect = total_predictions - correct

print(f"\nCorrect predictions: {correct} / {total_predictions}")
print(f"Incorrect predictions: {incorrect} / {total_predictions}")

## 7. Confusion Matrix

In [None]:
# Calculate confusion matrix (rows = true class, columns = predicted class).
# `labels` is passed explicitly so the matrix is always
# len(class_labels) x len(class_labels). Without it, scikit-learn sizes the
# matrix from the classes that actually occur, which would misalign the tick
# labels and break the DataFrame below whenever a class is absent.
label_ids = np.arange(len(class_labels))
cm = confusion_matrix(y_true_classes, y_pred_classes, labels=label_ids)

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    cbar_kws={'label': 'Count'}
)
plt.title('Confusion Matrix - Test Set', fontsize=16, fontweight='bold', pad=20)
plt.ylabel('True Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)
plt.tight_layout()

# Save figure (before plt.show(), which may clear the current figure)
cm_path = os.path.join(VIZ_DIR, 'day4_02_confusion_matrix.png')
plt.savefig(cm_path, dpi=300, bbox_inches='tight')
print(f"Confusion matrix saved to: {cm_path}")

plt.show()

# Print confusion matrix
print("\nConfusion Matrix:")
cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)
print(cm_df)

## 8. Normalized Confusion Matrix

In [None]:
# Calculate row-normalized confusion matrix: each row shows how the samples of
# one true class were distributed over the predicted classes.
# A class with no true samples has a row total of 0; leave that row at 0.0
# instead of dividing by zero, which would produce NaN cells and a warning.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm.astype('float'),
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

# Plot normalized confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm_normalized,
    annot=True,
    fmt='.2%',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    cbar_kws={'label': 'Percentage'}
)
plt.title('Normalized Confusion Matrix - Test Set', fontsize=16, fontweight='bold', pad=20)
plt.ylabel('True Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)
plt.tight_layout()

# Save figure
cm_norm_path = os.path.join(VIZ_DIR, 'day4_02_confusion_matrix_normalized.png')
plt.savefig(cm_norm_path, dpi=300, bbox_inches='tight')
print(f"Normalized confusion matrix saved to: {cm_norm_path}")

plt.show()

## 9. Classification Report (Precision, Recall, F1-Score)

In [None]:
# Generate classification report (precision, recall, F1 and support per class).
# `labels` pins the report to every known class; with `target_names` alone,
# scikit-learn raises when a class is absent from both y_true and y_pred.
# `zero_division=0` reports undefined metrics as 0 without emitting warnings.
report = classification_report(
    y_true_classes,
    y_pred_classes,
    labels=np.arange(len(class_labels)),
    target_names=class_labels,
    digits=4,
    zero_division=0
)

print("\n" + "="*70)
print("📊 CLASSIFICATION REPORT")
print("="*70)
print(report)

# Save report to file (explicit encoding so the output does not depend on the
# platform default)
report_path = os.path.join(RESULTS_DIR, 'classification_report.txt')
with open(report_path, 'w', encoding='utf-8') as f:
    f.write("Classification Report - Brain Tumor Classification\n")
    f.write("="*70 + "\n\n")
    f.write(report)
    f.write("\n" + "="*70)

print(f"\nClassification report saved to: {report_path}")

## 10. Per-Class Performance Analysis

In [None]:
# Calculate per-class metrics.
# The label ids are derived from class_labels (the names used for the table and
# the plot) so the metric arrays always have one entry per displayed class.
# `zero_division=0` reports undefined metrics as 0 without emitting warnings.
precision, recall, f1, support = precision_recall_fscore_support(
    y_true_classes,
    y_pred_classes,
    labels=np.arange(len(class_labels)),
    zero_division=0
)

# Create DataFrame
metrics_df = pd.DataFrame({
    'Class': class_labels,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'Support': support
})

print("\nPer-Class Performance:")
print(metrics_df.to_string(index=False))

# Plot per-class metrics as three side-by-side bars per class
fig, ax = plt.subplots(figsize=(12, 6))

x = np.arange(len(class_labels))
width = 0.25

ax.bar(x - width, precision, width, label='Precision', alpha=0.8)
ax.bar(x, recall, width, label='Recall', alpha=0.8)
ax.bar(x + width, f1, width, label='F1-Score', alpha=0.8)

ax.set_xlabel('Class', fontsize=12)
ax.set_ylabel('Score', fontsize=12)
ax.set_title('Per-Class Performance Metrics', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(class_labels)
ax.legend()
ax.grid(True, alpha=0.3, axis='y')
ax.set_ylim(0, 1.1)  # headroom above 1.0 so full-height bars are not clipped

plt.tight_layout()

# Save figure
metrics_path = os.path.join(VIZ_DIR, 'day4_02_per_class_metrics.png')
plt.savefig(metrics_path, dpi=300, bbox_inches='tight')
print(f"\nPer-class metrics plot saved to: {metrics_path}")

plt.show()

## 11. Identify Misclassifications

In [None]:
# Find misclassified samples (positions where prediction and truth disagree)
misclassified_indices = np.flatnonzero(
    np.asarray(y_pred_classes) != np.asarray(y_true_classes)
)

print(f"Total misclassifications: {len(misclassified_indices)} / {len(y_pred_classes)}")
print(f"Misclassification rate: {len(misclassified_indices)/len(y_pred_classes)*100:.2f}%")

# Column order of the misclassification table. It is passed explicitly so the
# saved CSV has a header even when there are no misclassifications; a DataFrame
# built from an empty list would otherwise have no columns at all.
misclassified_columns = [
    'index', 'filepath', 'patient_id',
    'true_label', 'predicted_label', 'confidence'
]

# Create one record per misclassified sample.
# NOTE(review): assumes row i of test_df is the i-th sample the generator
# yielded (shuffle=False and no rows dropped) - confirm.
misclassified_data = [
    {
        'index': idx,
        'filepath': test_df.iloc[idx]['filepath'],
        'patient_id': test_df.iloc[idx]['patient_id'],
        'true_label': class_labels[y_true_classes[idx]],
        'predicted_label': class_labels[y_pred_classes[idx]],
        # probability the model assigned to the (wrong) class it predicted
        'confidence': y_pred_probs[idx][y_pred_classes[idx]]
    }
    for idx in misclassified_indices
]

misclassified_df = pd.DataFrame(misclassified_data, columns=misclassified_columns)

# Save misclassifications
misclass_csv = os.path.join(RESULTS_DIR, 'misclassified_samples.csv')
misclassified_df.to_csv(misclass_csv, index=False)
print(f"\nMisclassified samples saved to: {misclass_csv}")

# Show first 10 misclassifications
print("\nFirst 10 misclassifications:")
print(misclassified_df.head(10).to_string(index=False))

## 12. Analyze Prediction Confidence

In [None]:
# Get confidence scores: the highest class probability for each sample
confidence_scores = np.max(y_pred_probs, axis=1)

# Separate correct and incorrect predictions
correct_mask = np.asarray(y_pred_classes) == np.asarray(y_true_classes)
correct_confidences = confidence_scores[correct_mask]
incorrect_confidences = confidence_scores[~correct_mask]

# Plot confidence distributions
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Histogram
ax1.hist(correct_confidences, bins=30, alpha=0.7, label='Correct', color='green')
ax1.hist(incorrect_confidences, bins=30, alpha=0.7, label='Incorrect', color='red')
ax1.set_xlabel('Confidence Score', fontsize=12)
ax1.set_ylabel('Frequency', fontsize=12)
ax1.set_title('Prediction Confidence Distribution', fontsize=14, fontweight='bold')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Box plot. Tick labels are set on the axis afterwards rather than through
# boxplot's `labels` keyword, which newer Matplotlib releases deprecate in
# favour of `tick_labels`; this form works with both old and new versions.
ax2.boxplot(
    [correct_confidences, incorrect_confidences],
    patch_artist=True,
    boxprops=dict(facecolor='lightblue', alpha=0.7)
)
ax2.set_xticklabels(['Correct', 'Incorrect'])
ax2.set_ylabel('Confidence Score', fontsize=12)
ax2.set_title('Confidence Score Comparison', fontsize=14, fontweight='bold')
ax2.grid(True, alpha=0.3, axis='y')

plt.tight_layout()

# Save figure
conf_path = os.path.join(VIZ_DIR, 'day4_02_confidence_analysis.png')
plt.savefig(conf_path, dpi=300, bbox_inches='tight')
print(f"Confidence analysis saved to: {conf_path}")

plt.show()

# Print statistics. A group can be empty (e.g. no incorrect predictions), in
# which case mean/std are undefined; say so instead of printing nan.
print("\nConfidence Statistics:")
for group_name, group_scores in (
    ("Correct", correct_confidences),
    ("Incorrect", incorrect_confidences),
):
    if group_scores.size:
        print(f"{group_name} predictions - Mean: {group_scores.mean():.4f}, Std: {group_scores.std():.4f}")
    else:
        print(f"{group_name} predictions - none")

## 13. Save Complete Evaluation Results

In [None]:
def _confidence_summary(scores):
    """Return mean/std/min/max of *scores* as plain floats.

    All four values are None when *scores* is empty (for example, when the
    model made no incorrect predictions). Calling min()/max() on an empty
    array raises ValueError, and NaN is not valid JSON.
    """
    scores = np.asarray(scores)
    if scores.size == 0:
        return {'mean': None, 'std': None, 'min': None, 'max': None}
    return {
        'mean': float(scores.mean()),
        'std': float(scores.std()),
        'min': float(scores.min()),
        'max': float(scores.max())
    }


# Compile all results into JSON-serializable built-in types
evaluation_results = {
    'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    'model_path': model_path,
    # number of samples actually scored by the model
    'test_samples': int(len(y_pred_classes)),
    'test_accuracy': float(test_accuracy),
    'correct_predictions': int(correct),
    'incorrect_predictions': int(incorrect),
    # one entry per class name, paired positionally with the metric arrays
    'per_class_metrics': {
        class_name: {
            'precision': float(class_precision),
            'recall': float(class_recall),
            'f1_score': float(class_f1),
            'support': int(class_support)
        }
        for class_name, class_precision, class_recall, class_f1, class_support
        in zip(class_labels, precision, recall, f1, support)
    },
    'confusion_matrix': cm.tolist(),
    'confidence_stats': {
        'correct': _confidence_summary(correct_confidences),
        'incorrect': _confidence_summary(incorrect_confidences)
    }
}

# Save to JSON
results_json = os.path.join(RESULTS_DIR, 'evaluation_results.json')
with open(results_json, 'w', encoding='utf-8') as f:
    json.dump(evaluation_results, f, indent=2)

print(f"Complete evaluation results saved to: {results_json}")

## 14. Summary

In [None]:
# Final recap: headline numbers, every artifact written by this notebook,
# per-class scores, and suggested follow-up work.
divider = "="*70

print("\n" + divider)
print("🎉 DAY 4.2 COMPLETE - EVALUATION FINISHED!")
print(divider)

# Headline numbers
print("\n📊 Summary:")
print(f"  Test Accuracy: {test_accuracy*100:.2f}%")
print(f"  Total Samples: {len(test_df)}")
print(f"  Correct: {correct}")
print(f"  Incorrect: {incorrect}")

# Output artifacts
print("\n📁 Files Created:")
print(f"  ✅ Confusion Matrix:           {cm_path}")
print(f"  ✅ Normalized Confusion Matrix: {cm_norm_path}")
print(f"  ✅ Per-Class Metrics Plot:     {metrics_path}")
print(f"  ✅ Confidence Analysis:         {conf_path}")
print(f"  ✅ Classification Report:       {report_path}")
print(f"  ✅ Misclassified Samples:       {misclass_csv}")
print(f"  ✅ Evaluation Results JSON:     {results_json}")

# Per-class scores, pairing each class name with its metrics positionally
print("\n🎯 Key Findings:")
for class_name, class_precision, class_recall, class_f1 in zip(class_labels, precision, recall, f1):
    print(f"  {class_name.capitalize()}:")
    print(f"    - Precision: {class_precision:.4f}")
    print(f"    - Recall:    {class_recall:.4f}")
    print(f"    - F1-Score:  {class_f1:.4f}")

# Suggested follow-ups
print("\n💡 Next Steps:")
for next_step in (
    "  1. Review confusion matrix to see which classes are confused",
    "  2. Check misclassified_samples.csv to analyze errors",
    "  3. Run day4_03_predictions_analysis.ipynb to visualize predictions",
):
    print(next_step)

print("\n" + divider)