## Import Libraries

In [None]:
import torch
import torch.nn as nn
from torchvision import models, transforms
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import os

## Setup Device and Data Transforms

In [None]:
# Select the compute device: the GPU when CUDA is usable, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
if device.type == 'cuda':
    # Query the device that was actually selected instead of a hard-coded index.
    gpu_properties = torch.cuda.get_device_properties(device)
    print(f'GPU: {torch.cuda.get_device_name(device)}')
    # total_memory is the card's full capacity, not the amount currently free,
    # so the label says "Total" rather than "Available".
    print(f'Total Memory: {gpu_properties.total_memory / 1e9:.2f} GB')

In [None]:
# Preprocessing pipeline applied to every test image: resize to 224x224,
# convert to a tensor, then normalise each channel with fixed statistics
# (these values match the commonly used ImageNet mean/std).
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std),
]
test_transform = transforms.Compose(preprocessing_steps)

## Load Test Dataset

In [None]:
# Build the test dataset from a directory tree with one sub-folder per class
test_dir = 'Rust_Dataset/test'
test_dataset = ImageFolder(root=test_dir, transform=test_transform)

# Class names come from the dataset; their count sizes each model's output layer
class_names = test_dataset.classes
num_classes = len(class_names)

# Fixed-order batches (no shuffling) so predictions line up across models
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=4,
)

print(f'Test Dataset Size: {len(test_dataset)}')
print(f'Number of Classes: {num_classes}')
print(f'Class Names: {class_names}')

## Define Model Loading Function

In [None]:
# Location of the final classification layer for each supported architecture:
# (attribute name on the model, index inside that attribute or None when the
# attribute itself is the linear layer).
_CLASSIFIER_HEADS = {
    'resnet50': ('fc', None),
    'densenet121': ('classifier', None),
    'efficientnet_b0': ('classifier', 1),
    'mobilenet_v2': ('classifier', 1),
    'mobilenet_v3_large': ('classifier', 3),
    'mobilenet_v3_small': ('classifier', 3),
}


def load_model(model_name, model_path):
    """Build a torchvision classifier and load fine-tuned weights into it.

    The network is created without pre-trained weights, its final linear
    layer is replaced by one with ``num_classes`` outputs (module-level
    value), and the state dict stored at ``model_path`` is loaded. The model
    is moved to the module-level ``device`` and switched to eval mode.

    Args:
        model_name: Architecture key; one of the keys of ``_CLASSIFIER_HEADS``
            (each is also the name of the torchvision constructor).
        model_path: Path to a checkpoint containing the model's state dict.

    Returns:
        The ready-to-use model, or ``None`` (after printing a warning) when
        no file exists at ``model_path``.

    Raises:
        ValueError: If ``model_name`` is not a supported architecture.
    """
    if model_name not in _CLASSIFIER_HEADS:
        raise ValueError(f'Unknown model: {model_name}')

    # Check for the checkpoint first so a missing file does not cost a full
    # network construction.
    if not os.path.exists(model_path):
        print(f'Warning: Model weights not found at {model_path}')
        return None

    # Calling the constructor with no arguments yields randomly initialised
    # weights; this avoids the deprecated ``pretrained=False`` keyword.
    model = getattr(models, model_name)()

    # Swap the final layer for one sized to this dataset's classes.
    head_attr, head_index = _CLASSIFIER_HEADS[model_name]
    if head_index is None:
        old_head = getattr(model, head_attr)
        setattr(model, head_attr, nn.Linear(old_head.in_features, num_classes))
    else:
        head_container = getattr(model, head_attr)
        old_head = head_container[head_index]
        head_container[head_index] = nn.Linear(old_head.in_features, num_classes)

    # NOTE: torch.load unpickles the file; only load checkpoints from a
    # trusted source.
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    print(f'Loaded weights from {model_path}')

    model = model.to(device)
    model.eval()
    return model

## Define Evaluation Function

In [None]:
def evaluate_model(model, data_loader):
    """Run ``model`` over ``data_loader`` and compute classification metrics.

    The model is put into eval mode and run without gradient tracking on the
    module-level ``device``. The predicted class for each sample is the index
    of the largest output along dimension 1.

    Args:
        model: Network to evaluate; called as ``model(inputs)``.
        data_loader: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        dict with keys ``'accuracy'``, ``'precision'``, ``'recall'``, ``'f1'``
        (precision/recall/F1 are support-weighted averages),
        ``'confusion_matrix'``, and the raw ``'predictions'`` and ``'labels'``
        lists.
    """
    # Ensure dropout / batch-norm layers use inference behaviour even when the
    # caller hands over a model that is still in training mode.
    model.eval()

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in tqdm(data_loader, desc='Testing'):
            inputs = inputs.to(device)

            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)

            all_preds.extend(preds.cpu().numpy())
            # Labels are only consumed on the host by scikit-learn, so they
            # are not copied to the device and back.
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='weighted'
    )
    conf_matrix = confusion_matrix(all_labels, all_preds)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': conf_matrix,
        'predictions': all_preds,
        'labels': all_labels,
    }

## Define Visualization Functions

In [None]:
def plot_confusion_matrix(conf_matrix, model_name):
    """Show an annotated heatmap of ``conf_matrix`` titled with ``model_name``.

    Rows are labelled as true classes and columns as predicted classes, both
    using the module-level ``class_names`` for tick labels.
    """
    _, axis = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=axis,
    )
    axis.set_title(f'Confusion Matrix - {model_name}')
    axis.set_ylabel('True Label')
    axis.set_xlabel('Predicted Label')
    plt.tight_layout()
    plt.show()

## Test All Models

In [None]:
# Models to evaluate. Each entry is a 3-tuple:
#   (display name used in printed output and plot titles,
#    architecture key passed to load_model,
#    path to the checkpoint file holding the trained weights)
models_to_test = [
    ('ResNet50', 'resnet50', 'rust_resnet50.pth'),
    ('DenseNet121', 'densenet121', 'rust_densenet121.pth'),
    ('EfficientNetB0', 'efficientnet_b0', 'rust_efficientnetb0.pth'),
    ('MobileNetV2', 'mobilenet_v2', 'rust_mobilenetv2.pth'),
    ('MobileNetV3-Large', 'mobilenet_v3_large', 'mobile_net_versions/rust_mobilenetv3_large.pth'),
    ('MobileNetV3-Small', 'mobilenet_v3_small', 'mobile_net_versions/rust_mobilenetv3_small.pth')
]

# Metrics for each evaluated model, keyed by display name; filled in by the
# evaluation loop and read by the comparison cells.
results = {}

In [None]:
# Evaluate every configured checkpoint in turn and record its metrics
banner = '=' * 60
for display_name, model_name, model_path in models_to_test:
    print(f'\n{banner}')
    print(f'Testing {display_name}')
    print(banner)

    model = load_model(model_name, model_path)

    # load_model hands back None when the checkpoint file is missing
    if model is None:
        print(f'Skipping {display_name} due to loading error')
        continue

    metrics = evaluate_model(model, test_loader)
    results[display_name] = metrics

    # Headline numbers for this model
    print(f'\nResults for {display_name}:')
    print(f'Accuracy:  {metrics["accuracy"]*100:.2f}%')
    print(f'Precision: {metrics["precision"]*100:.2f}%')
    print(f'Recall:    {metrics["recall"]*100:.2f}%')
    print(f'F1-Score:  {metrics["f1"]*100:.2f}%')

    plot_confusion_matrix(metrics['confusion_matrix'], display_name)

    # Drop the model and release cached GPU memory before loading the next one
    del model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

## Compare All Models

In [None]:
# Summarise the headline metrics of every evaluated model in one table
import pandas as pd

comparison_data = [
    {
        'Model': model_name,
        'Accuracy (%)': f"{metrics['accuracy']*100:.2f}",
        'Precision (%)': f"{metrics['precision']*100:.2f}",
        'Recall (%)': f"{metrics['recall']*100:.2f}",
        'F1-Score (%)': f"{metrics['f1']*100:.2f}",
    }
    for model_name, metrics in results.items()
]
comparison_df = pd.DataFrame(comparison_data)

table_rule = '='*80
print('\n' + table_rule)
print('MODEL COMPARISON SUMMARY')
print(table_rule)
print(comparison_df.to_string(index=False))
print(table_rule)

## Visualize Model Comparison

In [None]:
# Grouped bar chart: one cluster per model, one bar per metric
model_names = list(results)
metrics_to_plot = ['accuracy', 'precision', 'recall', 'f1']
width = 0.2
x = np.arange(len(model_names))

fig, ax = plt.subplots(figsize=(12, 6))

for i, metric in enumerate(metrics_to_plot):
    # Shift each metric's bars sideways by one bar width within the cluster
    values = [scores[metric]*100 for scores in results.values()]
    ax.bar(x + i*width, values, width, label=metric.capitalize())

ax.set(
    xlabel='Models',
    ylabel='Score (%)',
    title='Model Performance Comparison',
)
# Centre the tick under the four bars of each cluster
ax.set_xticks(x + width * 1.5)
ax.set_xticklabels(model_names, rotation=45, ha='right')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## Per-Class Performance Analysis

In [None]:
# Break each model's results down into per-class precision, recall and F1
section_rule = '=' * 60
for model_name, metrics in results.items():
    print(f'\n{section_rule}')
    print(f'Per-Class Metrics for {model_name}')
    print(section_rule)

    # Without an `average` argument the scores come back as one value per class
    per_class_scores = precision_recall_fscore_support(
        metrics['labels'],
        metrics['predictions'],
        labels=range(num_classes),
    )
    precision, recall, f1, support = per_class_scores

    table_columns = {'Class': class_names}
    table_columns['Precision (%)'] = [f"{p*100:.2f}" for p in precision]
    table_columns['Recall (%)'] = [f"{r*100:.2f}" for r in recall]
    table_columns['F1-Score (%)'] = [f"{f*100:.2f}" for f in f1]
    table_columns['Support'] = support
    class_metrics = pd.DataFrame(table_columns)

    print(class_metrics.to_string(index=False))
    print()

## Identify Best Model

In [None]:
# Report the top model for each headline metric.
# `results` is empty when every checkpoint was skipped (e.g. no weight files
# were found); max() would raise ValueError on an empty sequence, so that
# case is reported instead of crashing.
if not results:
    print('No models were evaluated; skipping best-model summary.')
else:
    # Each best_* value is a (display name, metrics dict) pair
    best_accuracy = max(results.items(), key=lambda x: x[1]['accuracy'])
    best_precision = max(results.items(), key=lambda x: x[1]['precision'])
    best_recall = max(results.items(), key=lambda x: x[1]['recall'])
    best_f1 = max(results.items(), key=lambda x: x[1]['f1'])

    print('\n' + '='*60)
    print('BEST PERFORMING MODELS')
    print('='*60)
    print(f'Best Accuracy:  {best_accuracy[0]} ({best_accuracy[1]["accuracy"]*100:.2f}%)')
    print(f'Best Precision: {best_precision[0]} ({best_precision[1]["precision"]*100:.2f}%)')
    print(f'Best Recall:    {best_recall[0]} ({best_recall[1]["recall"]*100:.2f}%)')
    print(f'Best F1-Score:  {best_f1[0]} ({best_f1[1]["f1"]*100:.2f}%)')
    print('='*60)