In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    confusion_matrix, ConfusionMatrixDisplay, 
    roc_auc_score, roc_curve, auc
)
import time
import matplotlib.pyplot as plt
import os
from pathlib import Path
import numpy as np

# ------------------------
# GPU check
# ------------------------
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
if device.type == "cuda":
    # Report the name of the first CUDA device.
    print("GPU: " + torch.cuda.get_device_name(0))

# ------------------------
# Custom RSUD Dataset for Evaluation
# ------------------------
class RSUDEvalDataset(Dataset):
    """RSUD dataset for model evaluation.

    Each item is an ``(image, label)`` pair. The image is loaded from
    ``img_dir`` and converted to RGB; the label is the class id found at the
    start of the first line of the matching YOLO-format ``.txt`` file in
    ``label_dir``. Images without a label file, or with an empty one, get
    label 0.

    Args:
        img_dir: Directory containing the ``.jpg`` / ``.png`` images.
        label_dir: Directory containing one ``.txt`` label file per image.
        transform: Optional callable applied to the loaded PIL image.
    """

    # File-name endings that are treated as images.
    IMAGE_EXTENSIONS = ('.jpg', '.png')

    def __init__(self, img_dir, label_dir, transform=None):
        self.img_dir = Path(img_dir)
        self.label_dir = Path(label_dir)
        self.transform = transform

        # Sorted so that sample order is deterministic across runs.
        self.image_files = sorted(
            name for name in os.listdir(img_dir)
            if name.endswith(self.IMAGE_EXTENSIONS)
        )

        # RSUD class names (13 classes)
        self.classes = [
            'Dilarang Berhenti', 'Dilarang Parkir', 'Dilarang Masuk',
            'Bahaya', 'Lampu Lalu Lintas Merah', 'Batas Kecepatan',
            'Wajib', 'Larangan Belok', 'Zona Pejalan Kaki',
            'Petunjuk Arah', 'Rambu Informasi', 'Hati-hati',
            'Zona Khusus'
        ]

        print(f"Found {len(self.image_files)} test images")

    def __len__(self):
        """Return the number of images found in ``img_dir``."""
        return len(self.image_files)

    def _read_label(self, img_name):
        """Return the class id stored for ``img_name`` (0 if unavailable).

        Only the extension is swapped for ``.txt``; a chained ``str.replace``
        would also rewrite ``.jpg`` / ``.png`` occurring in the middle of a
        file name and point at the wrong label file.
        """
        label_path = self.label_dir / Path(img_name).with_suffix('.txt')
        try:
            with open(label_path, 'r') as f:
                first_line = f.readline().strip()
        except FileNotFoundError:
            return 0  # default label when the image has no label file
        # First whitespace-separated field of the first line is the class id.
        return int(first_line.split()[0]) if first_line else 0

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the sample at position ``idx``."""
        img_name = self.image_files[idx]

        # Close the underlying file promptly; convert() returns a loaded copy.
        with Image.open(self.img_dir / img_name) as img:
            image = img.convert('RGB')

        # Only the first annotation is used (single-label classification).
        label = self._read_label(img_name)

        if self.transform:
            image = self.transform(image)

        return image, label

# ------------------------
# Dataset Setup
# ------------------------
# Root folder of the RSUD20K dataset on the local machine.
base_path = "F:/skills-copilot-codespaces-vscode/thesis/rsuddataset/rsud20k"

# Evaluate on the validation split, since the test split might not have labels.
img_dir = base_path + "/images/val"
label_dir = base_path + "/labels/val"

# Input resolution (square) and evaluation batch size.
img_size = 224
batch_size = 32

# Resize -> tensor -> per-channel normalization
# (mean/std values are presumably the ImageNet statistics; confirm against training).
transform = transforms.Compose(
    [
        transforms.Resize((img_size, img_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

test_dataset = RSUDEvalDataset(img_dir, label_dir, transform=transform)
# Fixed order and in-process loading (no worker processes).
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
)

# Class metadata is taken from the dataset object.
class_names = test_dataset.classes
num_classes = len(class_names)

print("Number of classes:", num_classes)
print("Test samples:", len(test_dataset))

# ------------------------
# Helper function: evaluate model
# ------------------------
def evaluate_model(model, model_name, save_cm=True, *, loader=None, output_dir=None):
    """Evaluate a classification model and print a comprehensive metric report.

    Runs ``model`` over every batch of the evaluation loader, then prints and
    returns accuracy, weighted precision/recall/F1, macro one-vs-rest ROC-AUC
    and inference speed. Relies on the module-level ``device``,
    ``num_classes`` and ``class_names``.

    Args:
        model: Module called as ``model(images)``. It may return a logits
            tensor, or a dict holding the logits under ``'logits'`` or
            ``'pred'``.
        model_name: Label used in the printed report and in the confusion
            matrix file name.
        save_cm: When true, save a confusion-matrix PNG and print per-class
            accuracy.
        loader: Iterable of ``(images, labels)`` batches. Defaults to the
            module-level ``test_loader``.
        output_dir: Directory for the confusion-matrix PNG. Defaults to the
            project's ``checkpoints`` folder. Created if it does not exist.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``roc_auc``, ``inference_time`` (seconds per image) and ``fps``.
        ``roc_auc`` is 0.0 when it could not be computed.

    Raises:
        ValueError: If the loader yields no samples.
    """
    if loader is None:
        loader = test_loader
    if output_dir is None:
        output_dir = Path(r"F:\skills-copilot-codespaces-vscode\thesis\checkpoints")
    output_dir = Path(output_dir)

    model.eval()
    model.to(device)

    # CUDA kernels run asynchronously; without a synchronize() around the
    # forward pass the timer would only measure the kernel launch.
    sync_cuda = device.type == "cuda"

    pred_batches, label_batches, prob_batches = [], [], []
    total_time = 0.0

    with torch.no_grad():
        for images, labels in loader:
            images = images.to(device)

            if sync_cuda:
                torch.cuda.synchronize()
            start_time = time.perf_counter()
            outputs = model(images)
            if sync_cuda:
                torch.cuda.synchronize()
            # Only the forward pass is timed (data loading/transfer excluded).
            total_time += time.perf_counter() - start_time

            if isinstance(outputs, dict):  # Some models return dict
                outputs = outputs['logits'] if 'logits' in outputs else outputs['pred']

            # Class probabilities are needed for ROC-AUC.
            probs = torch.softmax(outputs, dim=1)
            _, preds = torch.max(outputs, 1)

            prob_batches.append(probs.cpu().numpy())
            pred_batches.append(preds.cpu().numpy())
            # Labels are only compared on the host, so they never go to the GPU.
            label_batches.append(torch.as_tensor(labels).cpu().numpy())

    if not label_batches:
        raise ValueError("evaluation loader produced no samples")

    all_preds = np.concatenate(pred_batches)
    all_labels = np.concatenate(label_batches)
    all_probs = np.concatenate(prob_batches)
    num_samples = len(all_labels)

    # Metric report
    print(f"\n{'='*60}")
    print(f"ðŸ“Š {model_name} - Comprehensive Evaluation")
    print(f"{'='*60}")

    # 1. Accuracy - Correct prediction ratio
    acc = accuracy_score(all_labels, all_preds)
    print(f"\nðŸ”¹ Accuracy: {acc*100:.2f}%")
    print(f"   â†’ Measures overall correct prediction ratio")

    # 2. Precision, Recall, F1 - support-weighted averages over classes
    precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)

    print(f"\nðŸ”¹ Precision: {precision:.4f}")
    print(f"   â†’ Measures reliability of positive predictions")
    print(f"ðŸ”¹ Recall: {recall:.4f}")
    print(f"   â†’ Measures ability to find all positive instances")
    print(f"ðŸ”¹ F1-Score: {f1:.4f}")
    print(f"   â†’ Harmonic mean of precision and recall")

    # 3. ROC-AUC - one-vs-rest per class, macro-averaged.
    # AUC is undefined for a class with no positive (or no negative) samples.
    # Such classes are skipped rather than scored 0.0, which would pull the
    # macro average below the 0.5 "random" baseline reported underneath.
    roc_auc_macro = 0.0
    roc_error = None
    per_class_auc = []
    try:
        for i in range(num_classes):
            is_class = all_labels == i
            if is_class.all() or not is_class.any():
                continue
            per_class_auc.append(roc_auc_score(is_class.astype(int), all_probs[:, i]))
    except (ValueError, IndexError) as e:
        # e.g. the model emits fewer columns than num_classes
        per_class_auc = []
        roc_error = e

    if per_class_auc:
        roc_auc_macro = float(np.mean(per_class_auc))
        print(f"\nðŸ”¹ ROC-AUC (Macro): {roc_auc_macro:.4f}")
        print(f"   â†’ Measures overall discrimination power across all classes")
        print(f"   â†’ Range: 0.5 (random) to 1.0 (perfect)")
    else:
        e = roc_error or "no class has both positive and negative samples"
        print(f"\nðŸ”¹ ROC-AUC: Could not calculate ({e})")

    # 4. Inference Speed
    avg_time = total_time / num_samples
    # Guard against a timer reading of exactly zero.
    fps = 1 / avg_time if avg_time > 0 else float('inf')
    print(f"\nðŸ”¹ Inference Speed:")
    print(f"   â†’ Avg Time: {avg_time*1000:.2f} ms/image")
    print(f"   â†’ FPS: {fps:.2f} frames/second")

    # 5. Confusion Matrix - Class-wise performance (best effort)
    if save_cm:
        try:
            output_dir.mkdir(parents=True, exist_ok=True)
            cm = confusion_matrix(all_labels, all_preds, labels=list(range(num_classes)))

            # Plot confusion matrix
            fig, ax = plt.subplots(figsize=(14, 12))
            try:
                disp = ConfusionMatrixDisplay(cm, display_labels=[f"C{i}" for i in range(num_classes)])
                disp.plot(cmap=plt.cm.Blues, ax=ax, values_format='d')
                plt.title(f"{model_name} Confusion Matrix\n(Shows class-wise performance)", fontsize=14, fontweight='bold')
                plt.xlabel("Predicted Label", fontsize=12)
                plt.ylabel("True Label", fontsize=12)
                plt.tight_layout()
                plt.savefig(output_dir / f"{model_name}_confusion_matrix.png", dpi=150)
            finally:
                # Release the figure even when plotting or saving fails.
                plt.close(fig)

            print(f"\nðŸ”¹ Confusion Matrix: Saved to {model_name}_confusion_matrix.png")
            print(f"   â†’ Shows class-wise prediction performance")
            print(f"   â†’ Diagonal = correct predictions, off-diagonal = errors")

            # Per-class accuracy: diagonal entry over the row total.
            print(f"\n   Per-Class Accuracy:")
            for i in range(num_classes):
                class_correct = cm[i, i]
                class_total = cm[i, :].sum()
                class_acc = (class_correct / class_total * 100) if class_total > 0 else 0
                print(f"      Class {i} ({class_names[i][:15]:<15}): {class_acc:5.1f}% ({class_correct}/{class_total})")

        except Exception as e:
            # Reporting must not abort the evaluation; the metrics are still returned.
            print(f"\nðŸ”¹ Confusion Matrix: Could not generate ({e})")

    print(f"\n{'='*60}\n")

    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc_macro,
        'inference_time': avg_time,
        'fps': fps
    }

# ------------------------
# CNN Definition (MUST match cnn.ipynb!)
# ------------------------
class SimpleCNN(nn.Module):
    """Four-stage convolutional classifier.

    Each stage is Conv3x3 -> BatchNorm -> ReLU -> MaxPool(2), with 64, 128,
    256 and 512 output channels, followed by a three-layer fully connected
    head. The layer order inside ``features`` and ``classifier`` determines
    the ``state_dict`` keys, so it must not change if existing checkpoints
    are to keep loading.

    Args:
        num_classes: Number of output logits.
        img_size: Side length of the square input images. The default of 224
            reproduces the previously hard-coded value.

    Raises:
        ValueError: If ``img_size`` is smaller than 16, which would leave no
            spatial features after the four 2x poolings.
    """

    # (in_channels, out_channels) of the four convolutional stages.
    _STAGES = ((3, 64), (64, 128), (128, 256), (256, 512))

    def __init__(self, num_classes=13, img_size=224):
        super().__init__()
        if img_size < 16:
            raise ValueError(f"img_size must be at least 16, got {img_size}")

        # Stages are flattened into ONE Sequential so that module indices
        # (features.0 ... features.15) stay the same as before.
        layers = []
        for in_ch, out_ch in self._STAGES:
            layers += self._conv_stage(in_ch, out_ch)
        self.features = nn.Sequential(*layers)

        # Four 2x poolings shrink each side by a factor of 16 (floored).
        flattened_size = 512 * (img_size // 16) ** 2

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flattened_size, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(512, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )

    @staticmethod
    def _conv_stage(in_ch, out_ch):
        """Return the four layers of one Conv-BN-ReLU-Pool stage."""
        return [
            nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
        ]

    def forward(self, x):
        """Return class logits for a batch of images ``x``."""
        x = self.features(x)
        x = self.classifier(x)
        return x

# ------------------------
# Load and Evaluate Models
# ------------------------
# Banner for the evaluation run.
print(f"\n{'=' * 60}")
print("Model Evaluation on RSUD Dataset")
print("=" * 60)

# Collected results, keyed by model display name.
results = {}

# Option 1: YOLO. Only the presence of the weights file is checked here;
# the entry stored in `results` is a placeholder pointing at `yolo val`.
print("\n--- Evaluating YOLO Model ---")
try:
    from ultralytics import YOLO

    yolo_path = "F:/skills-copilot-codespaces-vscode/thesis/runs/detect/rsud20k_yolo114/weights/best.pt"
    if not os.path.exists(yolo_path):
        print(f"âš  YOLO model not found")
    else:
        print(f"âœ“ YOLO model found at: {yolo_path}")
        print("Note: Run YOLO validation separately for full metrics:")
        print(f"  yolo val model={yolo_path} data=F:/skills-copilot-codespaces-vscode/thesis/rsuddataset/rsud20k/images/data.yaml")

        results["YOLO"] = {
            "Accuracy": "See YOLO val",
            "F1-Score": "See YOLO val",
            "Inference Time": "~30 FPS",
        }
except Exception as e:
    print(f"âš  Could not load YOLO: {e}")

# Option 2: CNN. Rebuild the architecture, load the checkpoint, evaluate.
print("\n--- Evaluating CNN Model ---")
cnn_checkpoint = "F:/skills-copilot-codespaces-vscode/thesis/checkpoints/cnn_best.pth"
if not os.path.exists(cnn_checkpoint):
    print(f"âš  CNN checkpoint not found (train with cnn.ipynb first)")
else:
    try:
        cnn_model = SimpleCNN(num_classes)
        cnn_model.load_state_dict(
            torch.load(cnn_checkpoint, map_location=device)
        )
        print("âœ“ CNN model loaded successfully")
        results["CNN"] = metrics = evaluate_model(cnn_model, "CNN")
    except Exception as e:
        # Any failure is reported and the run continues with the next model.
        print(f"âš  Could not evaluate CNN: {e}")

# Option 3: ViT. The timm architecture is created without pretrained weights
# and then populated from the checkpoint.
print("\n--- Evaluating ViT Model ---")
vit_checkpoint = "F:/skills-copilot-codespaces-vscode/thesis/checkpoints/vit_best.pth"
if not os.path.exists(vit_checkpoint):
    print(f"âš  ViT checkpoint not found")
else:
    try:
        from timm import create_model

        vit_model = create_model(
            'vit_base_patch16_224', pretrained=False, num_classes=num_classes
        )
        vit_model.load_state_dict(
            torch.load(vit_checkpoint, map_location=device)
        )
        print("âœ“ ViT model loaded successfully")
        results["ViT"] = metrics = evaluate_model(vit_model, "ViT")
    except Exception as e:
        print(f"âš  Could not evaluate ViT: {e}")

# Option 4: DINOv2. The architecture name is 'vit_base_patch16_224.dino'
# (not a patch14 DINOv2 variant), presumably to match the checkpoint written
# by the training notebook.
print("\n--- Evaluating DINOv2 Model ---")
dino_checkpoint = "F:/skills-copilot-codespaces-vscode/thesis/checkpoints/dinov2_best.pth"
if not os.path.exists(dino_checkpoint):
    print(f"âš  DINOv2 checkpoint not found (train with dino.ipynb first)")
else:
    try:
        from timm import create_model

        dinov2_model = create_model(
            'vit_base_patch16_224.dino', pretrained=False, num_classes=num_classes
        )
        dinov2_model.load_state_dict(
            torch.load(dino_checkpoint, map_location=device)
        )
        print("âœ“ DINOv2 model loaded successfully")
        results["DINOv2"] = metrics = evaluate_model(dinov2_model, "DINOv2")
    except Exception as e:
        print(f"âš  Could not evaluate DINOv2: {e}")

# ------------------------
# Print Summary Table
# ------------------------
# Summary report for every entry collected in `results`.
print("\n" + "="*80)
print("ðŸ“Š MODEL COMPARISON SUMMARY")
print("="*80)
print("\nðŸ”¹ Metrics Explained:")
print("  â€¢ Accuracy      : Correct prediction ratio (higher is better)")
print("  â€¢ Precision     : Reliability of positive predictions")
print("  â€¢ Recall        : Ability to find all positive instances")
print("  â€¢ F1-Score      : Harmonic mean of precision and recall")
print("  â€¢ ROC-AUC       : Overall discrimination power (0.5=random, 1.0=perfect)")
print("  â€¢ Confusion Matrix : Class-wise performance visualization")
print("="*80)

# Only entries carrying real classification metrics take part in the table;
# the YOLO entry is a placeholder and is reported separately below.
# Computed up front so the closing banner can tell whether anything was
# actually evaluated.
trained_models = {
    k: v for k, v in results.items()
    if k != "YOLO" and isinstance(v, dict) and 'accuracy' in v
}

if results:
    if trained_models:
        print(f"\n{'Model':<12}{'Accuracy':<12}{'Precision':<12}{'Recall':<12}{'F1-Score':<12}{'ROC-AUC':<12}{'FPS':<10}")
        print("-" * 82)

        # One fixed-width row per evaluated model.
        for model_name, metrics in trained_models.items():
            acc_str = f"{metrics['accuracy']*100:.2f}%"
            prec_str = f"{metrics['precision']:.4f}"
            rec_str = f"{metrics['recall']:.4f}"
            f1_str = f"{metrics['f1']:.4f}"
            roc_str = f"{metrics['roc_auc']:.4f}"
            fps_str = f"{metrics['fps']:.1f}"

            print(f"{model_name:<12}{acc_str:<12}{prec_str:<12}{rec_str:<12}{f1_str:<12}{roc_str:<12}{fps_str:<10}")

        print("\n" + "="*80)
        print("ðŸ“ˆ ANALYSIS")
        print("="*80)

        # Each "best" entry is a (name, metrics) pair.
        # Best model by accuracy
        best_acc_model = max(trained_models.items(), key=lambda x: x[1]['accuracy'])
        print(f"\nâœ“ Best Accuracy: {best_acc_model[0]} ({best_acc_model[1]['accuracy']*100:.2f}%)")

        # Best F1-Score
        best_f1_model = max(trained_models.items(), key=lambda x: x[1]['f1'])
        print(f"âœ“ Best F1-Score: {best_f1_model[0]} ({best_f1_model[1]['f1']:.4f})")

        # Best ROC-AUC
        best_roc_model = max(trained_models.items(), key=lambda x: x[1]['roc_auc'])
        print(f"âœ“ Best ROC-AUC: {best_roc_model[0]} ({best_roc_model[1]['roc_auc']:.4f})")

        # Fastest model
        fastest_model = max(trained_models.items(), key=lambda x: x[1]['fps'])
        print(f"âœ“ Fastest Model: {fastest_model[0]} ({fastest_model[1]['fps']:.1f} FPS)")

        # Balanced model: F1 multiplied by FPS (the /100 only rescales the
        # score and does not affect which model wins).
        balanced_scores = {k: v['f1'] * (v['fps'] / 100) for k, v in trained_models.items()}
        best_balanced = max(balanced_scores.items(), key=lambda x: x[1])
        print(f"âœ“ Best Balanced (F1Ã—Speed): {best_balanced[0]}")

    # Show YOLO info separately
    if "YOLO" in results:
        print("\n" + "-"*80)
        print("ðŸ“¦ YOLO Model (Object Detection)")
        print("-"*80)
        print("  Note: YOLO is an object detection model, not classification")
        print("  Run separate validation for complete metrics:")
        print("    yolo val model=F:/skills-copilot-codespaces-vscode/thesis/runs/detect/rsud20k_yolo114/weights/best.pt \\")
        print("             data=F:/skills-copilot-codespaces-vscode/thesis/rsuddataset/rsud20k/images/data.yaml")

else:
    print("\nâš  No models were evaluated")
    print("\nTo evaluate models, train them first:")
    print("  â€¢ CNN: Run cnn.ipynb")
    print("  â€¢ ViT: Run vit.ipynb")
    print("  â€¢ DINOv2: Run dino.ipynb")
    print("  â€¢ YOLO: Use trained model at runs/detect/rsud20k_yolo114/weights/best.pt")

print("\n" + "="*80)
if trained_models:
    print("âœ“ Evaluation complete! Results saved to checkpoints/")
    print("  â€¢ Confusion matrices: PNG files for visual analysis")
    print("  â€¢ All metrics calculated: Accuracy, Precision, Recall, F1, ROC-AUC")
else:
    # Do not claim saved results when no classification model was evaluated.
    print("No classification model was evaluated; no metrics or confusion matrices were produced")
print("="*80)

Using device: cuda
GPU: NVIDIA GeForce RTX 3060
Found 1004 test images
Number of classes: 13
Test samples: 1004

Model Evaluation on RSUD Dataset

--- Evaluating YOLO Model ---
âœ“ YOLO model found at: F:/skills-copilot-codespaces-vscode/thesis/runs/detect/rsud20k_yolo114/weights/best.pt
Note: Run YOLO validation separately for full metrics:
  yolo val model=F:/skills-copilot-codespaces-vscode/thesis/runs/detect/rsud20k_yolo114/weights/best.pt data=F:/skills-copilot-codespaces-vscode/thesis/rsuddataset/rsud20k/images/data.yaml

--- Evaluating CNN Model ---
âœ“ CNN model loaded successfully
âœ“ CNN model loaded successfully

ðŸ“Š CNN - Comprehensive Evaluation

ðŸ”¹ Accuracy: 16.24%
   â†’ Measures overall correct prediction ratio

ðŸ”¹ Precision: 0.1910
   â†’ Measures reliability of positive predictions
ðŸ”¹ Recall: 0.1624
   â†’ Measures ability to find all positive instances
ðŸ”¹ F1-Score: 0.1379
   â†’ Harmonic mean of precision and recall

ðŸ”¹ ROC-AUC (Macro): 0.5300
   â†’ Measu