## Assignment Description

Please upload the following materials to the GitHub repository of your project work and submit the URL again:

- Python code that downloads, prepares and loads the data (this was the task of Milestone 1; now you only have to adapt it to the other parts of your code)
- Python code for the baseline model.
- Python code that trains a deep learning model,
- Python code that evaluates the results on a (separate) test set,
- Updated README.md file with instructions on how to run the code.

Please add as many comments to your code as are needed to make it easy to understand.

At this stage, it is not required to have good (or even reasonable) results, the only requirement is to have the data loading-preparation-training-evaluation pipeline ready.

## Our Model Architecture

### 1. Transfer Learning:
- Load pre-trained ResNet-50 or -101 model with FPN (better suited for drone images)
- Freeze or allow fine-tuning
- Remove original detection layers (head)

### 2. Region Proposal Network (RPN):
- Use built-in RPN layers to generate regions
- Anchor box tuning as needed (e.g. smaller sizes)

### 3. Region of Interest (ROI) Head:
- Customize ROI detection box head layers with dropout regularization
- Add final box predictor, specified for VisDrone data

### 4. Misc. Training Optimizations:
- Data augmentation
- Optimizers (SGD with momentum, L2 regularization)
- Learning rate scheduling
- Early stopping

In [1]:
# Import necessary libraries
#
# PyTorch and related libraries for deep learning
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Computer vision and image processing libraries
import torchvision
import torchvision.transforms.functional as F
import numpy as np
import cv2

# Model-specific imports for Faster R-CNN
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone

# Visualization and utility libraries
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from collections import defaultdict
import time
import random

import tensorflow as tf
print("Num GPUs Available:", len(tf.config.list_physical_devices("GPU")))

# Set device (GPU if available, otherwise CPU)
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#print(f"Using device: {device}")


Num GPUs Available: 1


In [2]:
# Data Loading and Preparation (adapted from MS1)
"""
This section reuses code from Milestone 1 to download, prepare, and load the VisDrone dataset.
The dataset consists of drone images with object annotations for 11 classes.
"""

#--------------------------
# Dataset Directory Setup
#--------------------------
# Root directory that receives the downloaded archives and the extracted VisDrone splits.
# NOTE(review): "/content" looks like a Google Colab path -- confirm. On other machines
# the makedirs call below can fail (e.g. OSError on a read-only file system), so point
# dataset_dir at a writable location before running this cell elsewhere.
dataset_dir = "/content/VisDrone"
os.makedirs(dataset_dir, exist_ok=True)  # exist_ok=True keeps re-running the cell harmless

#--------------------------
# Download Functions
#--------------------------
# Function to download and extract training data
def download_train_data():
    """
    Download and extract the VisDrone training split.

    The archive is fetched with the ``gdown`` command-line tool, stored as
    ``VisDrone2019-DET-train.zip`` inside ``dataset_dir`` and then unpacked
    into ``dataset_dir`` with ``unzip``.

    Note:
        The lines starting with ``!`` are IPython shell escapes, so this
        function only works when the cell is executed in a notebook / IPython
        session where the ``gdown`` and ``unzip`` commands are available.
    """
    # Identifier passed to `gdown --id` to select the archive to download
    file_id = '1a2oHjcEcwXP8oUF95qiwrqzACb2YlUhn'
    # Local path the archive is written to
    output = f'{dataset_dir}/VisDrone2019-DET-train.zip'

    !gdown --id {file_id} -O {output}
    !unzip -q {output} -d {dataset_dir}

# Function to download and extract validation data
def download_val_data():
    """
    Download and extract the VisDrone validation split.

    The archive is fetched with the ``gdown`` command-line tool, stored as
    ``VisDrone2019-DET-val.zip`` inside ``dataset_dir`` and then unpacked
    into ``dataset_dir`` with ``unzip``.

    Note:
        The lines starting with ``!`` are IPython shell escapes, so this
        function only works when the cell is executed in a notebook / IPython
        session where the ``gdown`` and ``unzip`` commands are available.
    """
    # Identifier passed to `gdown --id` to select the archive to download
    file_id = '1bxK5zgLn0_L8x276eKkuYA_FzwCIjb59'
    # Local path the archive is written to
    output = f'{dataset_dir}/VisDrone2019-DET-val.zip'

    !gdown --id {file_id} -O {output}
    !unzip -q {output} -d {dataset_dir}

# Function to download and extract test data
def download_test_data():
    """
    Download and extract the VisDrone test split (test-dev).

    Unlike the train/val helpers, the archive is unpacked into its own
    ``VisDrone2019-DET-test-dev`` sub-folder of ``dataset_dir``, which is
    created first if it does not exist yet.

    Note:
        The lines starting with ``!`` are IPython shell escapes, so this
        function only works when the cell is executed in a notebook / IPython
        session where the ``gdown`` and ``unzip`` commands are available.
    """
    subfolder = "VisDrone2019-DET-test-dev"
    # The archive is saved next to the extraction folder, named after it
    output_zip = f"{dataset_dir}/{subfolder}.zip"
    extract_path = f"{dataset_dir}/{subfolder}"
    # NOTE(review): presumably the test archive has no top-level folder of its own,
    # which is why a dedicated extraction folder is created here -- confirm.
    os.makedirs(extract_path, exist_ok=True)

    # Identifier passed to `gdown --id` to select the archive to download
    file_id = '1PFdW_VFSCfZ_sTSZAGjQdifF_Xd5mf0V'

    !gdown --id {file_id} -O {output_zip}
    !unzip -q {output_zip} -d {extract_path}

# Uncomment to download data
# download_train_data()
# download_val_data()
# download_test_data()

#--------------------------
# Data Augmentation Classes
#--------------------------
# These classes implement data augmentation techniques for object detection
# Each class follows the same interface: __init__ for parameters and __call__ for transformation

# Compose class to apply multiple transformations sequentially
class Compose:
    """
    Chain several detection transforms into one callable.

    Args:
        transforms (list): Callables that accept ``(image, target)`` and
            return the transformed ``(image, target)`` pair. They are applied
            in list order.
    """
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, image, target):
        # Feed the output pair of every step into the next one
        sample = (image, target)
        for step in self.transforms:
            new_image, new_target = step(*sample)
            sample = (new_image, new_target)
        return sample

# Random horizontal flip augmentation
class RandomHorizontalFlip:
    """
    Randomly flip an image and its bounding boxes horizontally.

    The inputs are never modified in place: a flipped sample is returned as a
    new image tensor and a shallow copy of ``target`` holding a new ``"boxes"``
    tensor, so data shared with the caller (e.g. cached annotations) stays
    intact.

    Args:
        prob (float): Probability of flipping the image.
    """
    def __init__(self, prob=0.5):
        self.prob = prob

    def __call__(self, image, target):
        """
        Apply the flip with probability ``self.prob``.

        Args:
            image (torch.Tensor): Image whose last dimension is the width.
            target (dict): Annotation dict with a ``"boxes"`` tensor in
                ``[x1, y1, x2, y2]`` format.

        Returns:
            tuple: ``(image, target)``; the untouched inputs when no flip is
            applied, otherwise the flipped copies.
        """
        if random.random() >= self.prob:
            return image, target

        width = image.shape[-1]
        flipped_image = image.flip(-1)

        # Mirror the x coordinates: x_min becomes (width - x_max) and
        # x_max becomes (width - x_min). Work on a clone so the caller's
        # tensor is left untouched.
        original_boxes = target["boxes"]
        flipped_boxes = original_boxes.clone()
        flipped_boxes[:, [0, 2]] = width - original_boxes[:, [2, 0]]

        flipped_target = dict(target)
        flipped_target["boxes"] = flipped_boxes
        return flipped_image, flipped_target

# Random brightness adjustment augmentation
class RandomBrightness:
    """
    Randomly adjust the brightness of the image.

    Args:
        brightness (float): Maximum relative brightness change. The factor is
            drawn uniformly from ``[max(0, 1 - brightness), 1 + brightness]``.
        prob (float): Probability of applying the adjustment. Defaults to 0.5,
            the value that used to be hard-coded.
    """
    def __init__(self, brightness=0.2, prob=0.5):
        self.brightness = brightness
        self.prob = prob

    def __call__(self, image, target):
        """
        Apply the brightness change with probability ``self.prob``.

        The target is passed through unchanged because a photometric change
        does not move any bounding box.
        """
        if random.random() < self.prob:
            # Draw the factor around 1.0 (= unchanged); the lower bound is
            # clamped at 0 so the factor can never become negative.
            lower = max(0, 1 - self.brightness)
            upper = 1 + self.brightness
            brightness_factor = random.uniform(lower, upper)
            image = torchvision.transforms.functional.adjust_brightness(image, brightness_factor)
        return image, target

# Random contrast adjustment augmentation
class RandomContrast:
    """
    Randomly adjust the contrast of the image.

    Args:
        contrast (float): Maximum relative contrast change. The factor is
            drawn uniformly from ``[max(0, 1 - contrast), 1 + contrast]``.
        prob (float): Probability of applying the adjustment. Defaults to 0.5,
            the value that used to be hard-coded.
    """
    def __init__(self, contrast=0.2, prob=0.5):
        self.contrast = contrast
        self.prob = prob

    def __call__(self, image, target):
        """
        Apply the contrast change with probability ``self.prob``.

        The target is passed through unchanged because a photometric change
        does not move any bounding box.
        """
        if random.random() < self.prob:
            # Draw the factor around 1.0 (= unchanged); the lower bound is
            # clamped at 0 so the factor can never become negative.
            lower = max(0, 1 - self.contrast)
            upper = 1 + self.contrast
            contrast_factor = random.uniform(lower, upper)
            image = torchvision.transforms.functional.adjust_contrast(image, contrast_factor)
        return image, target

#--------------------------
# VisDrone Dataset Class
#--------------------------
# Custom PyTorch Dataset class for the VisDrone dataset
class VisDroneDataset(Dataset):
    """
    VisDrone dataset class for object detection.

    Each sample is produced from an in-memory image / annotation pair: the
    image is resized and converted to a float tensor in [0, 1], the boxes are
    rescaled to the resized image, and optional augmentation / transforms are
    applied. Image and target tensors are returned on the same device.
    """
    def __init__(self, images_np, annotations_np, resize_to=(640, 640), transforms=None, device='cpu', augment=False):
        """
        Initialize the VisDrone dataset.

        Args:
            images_np (list): List of numpy arrays containing images (H x W x C)
            annotations_np (list): List of dictionaries with the keys 'boxes'
                ([x1, y1, x2, y2] rows) and 'labels'
            resize_to (tuple): Tuple of (height, width) to resize images to
            transforms (callable, optional): Extra ``(image, target)`` transform
                applied after the built-in augmentation
            device (str): Device the returned image and target tensors are moved to
            augment (bool): Whether to apply data augmentation (only for training)
        """
        self.images = images_np
        self.annotations = annotations_np
        self.resize_to = resize_to
        self.transforms = transforms
        self.device = device
        self.augment = augment

        # Build the augmentation pipeline only when it is actually requested
        if self.augment:
            self.data_augmentation = Compose([
                RandomHorizontalFlip(0.5),
                RandomBrightness(0.2),
                RandomContrast(0.2)
            ])
        else:
            self.data_augmentation = None

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.images)

    def __getitem__(self, idx):
        """
        Get a sample from the dataset.

        Args:
            idx (int): Index of the sample to get

        Returns:
            tuple: (image, target) where image is a CHW float tensor and target
            is a dictionary with 'boxes', 'labels' and 'image_id' tensors
        """
        image = self.images[idx]
        ann = self.annotations[idx]

        # Original size is needed to rescale the boxes after resizing
        orig_h, orig_w = image.shape[:2]

        #--------------------------
        # Image Preprocessing
        #--------------------------
        # cv2.resize expects the target size as (width, height)
        resized_image = cv2.resize(image, (self.resize_to[1], self.resize_to[0]))

        # HWC uint8 -> CHW float in [0, 1]
        resized_image = torch.from_numpy(resized_image).permute(2, 0, 1).float() / 255.0

        #--------------------------
        # Bounding Box Preprocessing
        #--------------------------
        scale_x = self.resize_to[1] / orig_w
        scale_y = self.resize_to[0] / orig_h

        # Work on copies so the stored annotations are never rescaled twice
        boxes = ann['boxes'].copy()
        boxes[:, [0, 2]] = boxes[:, [0, 2]] * scale_x  # Scale x coordinates
        boxes[:, [1, 3]] = boxes[:, [1, 3]] * scale_y  # Scale y coordinates
        labels = ann['labels'].copy()

        #--------------------------
        # Convert to PyTorch Tensors
        #--------------------------
        target = {
            'boxes': torch.as_tensor(boxes, dtype=torch.float32),
            'labels': torch.as_tensor(labels, dtype=torch.int64),
            'image_id': torch.tensor([idx])
        }

        #--------------------------
        # Apply Transformations
        #--------------------------
        if self.augment and self.data_augmentation:
            resized_image, target = self.data_augmentation(resized_image, target)

        if self.transforms:
            resized_image, target = self.transforms(resized_image, target)

        # Move the image AND the target tensors to the requested device so a
        # sample never mixes devices (previously only the image was moved).
        target = {
            key: value.to(self.device) if isinstance(value, torch.Tensor) else value
            for key, value in target.items()
        }
        return resized_image.to(self.device), target

#--------------------------
# Data Loading and Processing
#--------------------------
def _parse_visdrone_annotations(ann_path):
    """Parse one VisDrone annotation file into ([x1, y1, x2, y2] boxes, class ids)."""
    boxes = []
    classes = []

    with open(ann_path, 'r') as f:
        for line_no, raw_line in enumerate(f, start=1):
            # Tolerate blank lines and a trailing comma at the end of a record
            line = raw_line.strip().rstrip(',')
            if not line:
                continue

            # VisDrone annotation format: <x, y, width, height, score, class, truncation, occlusion>
            fields = line.split(',')
            if len(fields) < 6:
                raise ValueError(
                    f"Malformed annotation in {ann_path} (line {line_no}): {raw_line.strip()!r}"
                )
            x, y, w, h, _score, obj_class = (int(value) for value in fields[:6])

            # Skip ignored regions (class 0)
            if obj_class == 0:
                continue
            # Skip zero-area boxes: detection models reject boxes without
            # positive width and height during training
            if w <= 0 or h <= 0:
                continue

            # Convert from [x, y, width, height] to [x1, y1, x2, y2] format
            boxes.append([x, y, x + w, y + h])
            classes.append(obj_class)

    return boxes, classes


def load_and_process_data(dataset_dir, subset, max_samples=None, include_negative=False):
    """
    Load and process VisDrone data from disk.

    This function reads images and their corresponding annotation files,
    converts the annotations into bounding boxes and class labels,
    and optionally includes images with no valid annotations.

    Files in the image folder that are not images are ignored, unreadable
    images are skipped with a warning, and annotation records that are blank,
    belong to ignored regions (class 0) or have a non-positive width/height
    are dropped.

    Args:
        dataset_dir (str): Directory containing the dataset
        subset (str): 'train', 'val', or 'test'
        max_samples (int, optional): Maximum number of image files to consider (for testing)
        include_negative (bool): Whether to include images with no valid annotations (negative examples)

    Returns:
        tuple: (images_np, annotations_np) where:
            - images_np is a list of RGB images as numpy arrays
            - annotations_np is a list of dictionaries with 'boxes' (float32, N x 4)
              and 'labels' (int32, N)

    Raises:
        ValueError: If ``subset`` is not one of the supported names or an
            annotation record has too few fields.
        FileNotFoundError: If the annotation file of an image is missing.
    """
    #--------------------------
    # Set up paths based on subset
    #--------------------------
    subset_folders = {
        'train': "VisDrone2019-DET-train",
        'val': "VisDrone2019-DET-val",
        'test': "VisDrone2019-DET-test-dev",
    }
    folder_name = subset_folders.get(subset) if isinstance(subset, str) else None
    if folder_name is None:
        raise ValueError("subset must be 'train', 'val', or 'test'")

    data_path = os.path.join(dataset_dir, folder_name)
    image_dir = os.path.join(data_path, "images")
    ann_dir = os.path.join(data_path, "annotations")

    #--------------------------
    # Get list of image files
    #--------------------------
    # Only keep real image files; stray files (README, .DS_Store, ...) would
    # otherwise be handed to the image reader and crash the loader
    image_extensions = ('.jpg', '.jpeg', '.png')
    image_files = sorted(
        name for name in os.listdir(image_dir)
        if name.lower().endswith(image_extensions)
    )
    if max_samples:
        # Limit the number of samples if specified
        image_files = image_files[:max_samples]

    #--------------------------
    # Process images and annotations
    #--------------------------
    images_np = []
    annotations_np = []
    negative_count = 0

    for img_file in image_files:
        img_path = os.path.join(image_dir, img_file)
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals an unreadable/corrupt file by returning None
            print(f"Warning: could not read image '{img_path}', skipping")
            continue
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert from BGR to RGB

        # The annotation file shares the image's base name
        ann_path = os.path.join(ann_dir, os.path.splitext(img_file)[0] + '.txt')
        boxes, classes = _parse_visdrone_annotations(ann_path)

        #--------------------------
        # Handle negative examples
        #--------------------------
        if not boxes:
            if not include_negative:
                continue
            # Negative example: keep the image with correctly shaped empty arrays
            negative_count += 1
            boxes_array = np.zeros((0, 4), dtype=np.float32)
            classes_array = np.zeros((0,), dtype=np.int32)
        else:
            boxes_array = np.array(boxes, dtype=np.float32)
            classes_array = np.array(classes, dtype=np.int32)

        images_np.append(img)
        annotations_np.append({
            'boxes': boxes_array,
            'labels': classes_array
        })

    #--------------------------
    # Print summary
    #--------------------------
    if include_negative:
        print(f"Loaded {len(images_np)} {subset} images ({negative_count} negative examples)")
    else:
        print(f"Loaded {len(images_np)} {subset} images with valid annotations")

    return images_np, annotations_np

#--------------------------
# DataLoader Creation
#--------------------------
def _collate_detection_batch(batch):
    """Regroup a list of (image, target) samples into an (images, targets) pair of tuples."""
    return tuple(zip(*batch))


def create_dataloaders(images_np_train, annotations_np_train, 
                       images_np_val, annotations_np_val, 
                       images_np_test, annotations_np_test,
                       batch_size=4, augment_train=True):
    """
    Build the train, validation and test DataLoaders.

    Every split is wrapped in a VisDroneDataset and then in a DataLoader.
    Detection samples carry a variable number of boxes, so batches are
    collated into tuples of images and tuples of targets instead of being
    stacked into a single tensor.

    Args:
        images_np_train (list): List of training images as numpy arrays
        annotations_np_train (list): List of training annotations as dictionaries
        images_np_val (list): List of validation images as numpy arrays
        annotations_np_val (list): List of validation annotations as dictionaries
        images_np_test (list): List of test images as numpy arrays
        annotations_np_test (list): List of test annotations as dictionaries
        batch_size (int): Batch size for dataloaders
        augment_train (bool): Whether to apply data augmentation to the training set

    Returns:
        tuple: (train_loader, valid_loader, test_loader) - DataLoader objects for each split
    """
    #--------------------------
    # Create Dataset Objects
    #--------------------------
    def build_dataset(images, annotations, use_augmentation):
        # No extra transforms are used; augmentation is the only optional step
        return VisDroneDataset(
            images,
            annotations,
            transforms=None,
            augment=use_augmentation
        )

    # Only the training split may be augmented
    train_dataset = build_dataset(images_np_train, annotations_np_train, augment_train)
    val_dataset = build_dataset(images_np_val, annotations_np_val, False)
    test_dataset = build_dataset(images_np_test, annotations_np_test, False)

    #--------------------------
    # Create DataLoader Objects
    #--------------------------
    def build_loader(dataset, shuffle):
        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=_collate_detection_batch
        )

    # Shuffle only the training data; validation and test keep their order
    train_loader = build_loader(train_dataset, True)
    valid_loader = build_loader(val_dataset, False)
    test_loader = build_loader(test_dataset, False)

    return train_loader, valid_loader, test_loader

# For demonstration, load a small subset of data
# In practice, you would load the full dataset
# images_np_train, annotations_np_train = load_and_process_data(dataset_dir, 'train', max_samples=100)
# images_np_val, annotations_np_val = load_and_process_data(dataset_dir, 'val', max_samples=50)
# images_np_test, annotations_np_test = load_and_process_data(dataset_dir, 'test', max_samples=50)

# train_loader, valid_loader, test_loader = create_dataloaders(
#     images_np_train, annotations_np_train,
#     images_np_val, annotations_np_val,
#     images_np_test, annotations_np_test,
#     batch_size=4
# )


OSError: [Errno 30] Read-only file system: '/content'

In [None]:
#--------------------------
# Baseline Model Implementation
#--------------------------
"""
This section implements a simple baseline model for object detection.
We use a pre-trained Faster R-CNN with ResNet-50 backbone from torchvision.
"""

def get_baseline_model(num_classes):
    """
    Build the baseline detector: a pre-trained Faster R-CNN (ResNet-50 + FPN).

    The torchvision model is loaded with its pre-trained weights and only the
    box predictor (classification + box regression head) is swapped for a
    fresh one sized for our label set.

    Args:
        num_classes (int): Number of classes to predict (including background)
                          For VisDrone, this is 12 (11 object classes + background)

    Returns:
        torch.nn.Module: Baseline Faster R-CNN model ready for fine-tuning
    """
    #--------------------------
    # Load Pre-trained Model
    #--------------------------
    detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

    #--------------------------
    # Replace Classifier Head
    #--------------------------
    # The new head must accept the same feature size as the one it replaces
    old_predictor = detector.roi_heads.box_predictor
    head_input_size = old_predictor.cls_score.in_features

    # FastRCNNPredictor provides both the class scores and the box regression outputs
    detector.roi_heads.box_predictor = FastRCNNPredictor(head_input_size, num_classes)

    return detector


In [None]:
#--------------------------
# Custom Deep Learning Model Implementation
#--------------------------
"""
This section implements our custom deep learning model based on the architecture description.
We use a pre-trained ResNet-50 or ResNet-101 with FPN as the backbone,
customize the RPN and ROI head, and add training optimizations.
"""

def get_custom_model(num_classes, backbone_name='resnet50', trainable_backbone_layers=3):
    """
    Create a custom Faster R-CNN model with ResNet+FPN backbone.

    This function builds a custom object detection model with several improvements:
    1. Transfer Learning: Uses pre-trained ResNet with FPN backbone
    2. Region Proposal Network: Customized anchor sizes for small objects in drone imagery
    3. ROI Head: Added dropout regularization to prevent overfitting

    Args:
        num_classes (int): Number of classes to predict (including background)
        backbone_name (str): Name of the backbone ('resnet50' or 'resnet101')
        trainable_backbone_layers (int): Number of backbone layers to train (others will be frozen)

    Returns:
        torch.nn.Module: Custom Faster R-CNN model
    """
    #--------------------------
    # 1. Transfer Learning: Backbone
    #--------------------------
    # Load pre-trained ResNet with Feature Pyramid Network (FPN)
    # FPN is particularly useful for detecting objects at different scales
    backbone = resnet_fpn_backbone(
        backbone_name=backbone_name,  # ResNet-50 or ResNet-101
        pretrained=True,              # Use pre-trained weights from ImageNet
        trainable_layers=trainable_backbone_layers  # Number of layers to fine-tune
    )

    #--------------------------
    # 2. Region Proposal Network (RPN)
    #--------------------------
    # The FPN backbone produces five feature maps, and AnchorGenerator needs
    # exactly one tuple of sizes and one tuple of aspect ratios PER feature
    # map. A single tuple holding all sizes describes only one feature map and
    # makes the forward pass fail with a size/feature-map mismatch.
    # Each pyramid level therefore gets one anchor size, half the torchvision
    # default (32..512), to better match the small objects in drone imagery.
    anchor_sizes = ((16,), (32,), (64,), (128,), (256,))
    aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)  # Standard aspect ratios on every level
    anchor_generator = AnchorGenerator(
        sizes=anchor_sizes,
        aspect_ratios=aspect_ratios
    )

    #--------------------------
    # Create the Faster R-CNN model
    #--------------------------
    model = FasterRCNN(
        backbone=backbone,
        num_classes=num_classes,
        rpn_anchor_generator=anchor_generator,
        # Explicit RPN proposal budgets
        rpn_pre_nms_top_n_train=2000,   # Number of proposals before NMS during training
        rpn_pre_nms_top_n_test=1000,    # Number of proposals before NMS during testing
        rpn_post_nms_top_n_train=1000,  # Number of proposals after NMS during training
        rpn_post_nms_top_n_test=500,    # Number of proposals after NMS during testing
    )

    #--------------------------
    # 3. ROI Head Customization
    #--------------------------
    # The replacement predictor must accept the same feature size as the default one
    in_features = model.roi_heads.box_predictor.cls_score.in_features

    class CustomFastRCNNPredictor(nn.Module):
        """
        Fast R-CNN predictor with dropout regularization.

        Dropout is applied to the ROI features before the final
        classification and box regression layers to reduce overfitting.
        """
        def __init__(self, in_channels, num_classes, dropout_prob=0.3):
            super().__init__()
            self.dropout = nn.Dropout(p=dropout_prob)  # Dropout layer
            self.cls_score = nn.Linear(in_channels, num_classes)  # Classification layer
            self.bbox_pred = nn.Linear(in_channels, num_classes * 4)  # One box (4 values) per class

        def forward(self, x):
            """Return (class scores, box regression outputs) for the ROI features ``x``."""
            x = self.dropout(x)
            cls_scores = self.cls_score(x)
            bbox_preds = self.bbox_pred(x)
            return cls_scores, bbox_preds

    # Replace the default box predictor with our custom one
    model.roi_heads.box_predictor = CustomFastRCNNPredictor(in_features, num_classes)

    return model


In [None]:
#--------------------------
# Training Function
#--------------------------
"""
This section implements the training function for our models.
It includes training optimizations like SGD with momentum, L2 regularization,
learning rate scheduling, and early stopping.
"""

def _mean_detection_loss(model, data_loader, device):
    """Return the average summed detection loss over ``data_loader`` without updating weights."""
    # Detection models return the loss dictionary only in training mode (in
    # eval mode they return predictions), so the losses are computed in
    # training mode. Dropout and batch-norm layers are switched to eval so the
    # measurement is deterministic and does not touch running statistics.
    model.train()
    stochastic_layers = (
        torch.nn.modules.dropout._DropoutNd,
        torch.nn.modules.batchnorm._BatchNorm,
    )
    for module in model.modules():
        if isinstance(module, stochastic_layers):
            module.eval()

    total_loss = 0.0
    with torch.no_grad():  # No gradients needed for validation
        for images, targets in data_loader:
            images = [image.to(device) for image in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)
            total_loss += sum(loss for loss in loss_dict.values()).item()

    # max(..., 1) avoids a division by zero for an empty loader
    return total_loss / max(len(data_loader), 1)


def train_model(model, train_loader, valid_loader, num_epochs=10, lr=0.005, device=None):
    """
    Train the object detection model with optimizations.

    This function handles the complete training process including:
    - Moving the model to the specified device
    - Setting up optimizer with momentum and L2 regularization
    - Implementing step-wise learning rate scheduling
    - Training for the specified number of epochs
    - Validating after each epoch
    - Implementing early stopping
    - Saving the best weights to 'best_model.pth' and restoring them at the end

    Args:
        model (torch.nn.Module): Model to train; called as ``model(images, targets)``
            it must return a dictionary of losses in training mode
        train_loader (DataLoader): DataLoader for training data
        valid_loader (DataLoader): DataLoader for validation data
        num_epochs (int): Number of epochs to train for
        lr (float): Initial learning rate
        device (torch.device, optional): Device to train on. When omitted, CUDA
            is used if available and the CPU otherwise.

    Returns:
        tuple: (model, train_loss_history, val_loss_history)
            - model: Trained model (best version based on validation loss), in eval mode
            - train_loss_history: List of training losses per epoch
            - val_loss_history: List of validation losses per epoch
    """
    #--------------------------
    # Setup and Initialization
    #--------------------------
    # Resolve the device lazily: a default that refers to a module-level
    # variable would break the definition whenever that variable is missing.
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model.to(device)

    # SGD with momentum and L2 regularization; frozen parameters are skipped
    optimizer = optim.SGD(
        [p for p in model.parameters() if p.requires_grad],
        lr=lr,                # Initial learning rate
        momentum=0.9,         # Momentum factor for SGD
        weight_decay=0.0005   # L2 regularization to prevent overfitting
    )

    # Step schedule: the learning rate is multiplied by 0.1 every 3 epochs
    # (fixed schedule, independent of the loss curve)
    lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=3,
        gamma=0.1
    )

    # Early stopping parameters to prevent overfitting
    patience = 3                    # Number of epochs to wait for improvement
    best_val_loss = float('inf')    # Best validation loss seen so far
    early_stop_counter = 0          # Epochs since the last improvement

    checkpoint_path = 'best_model.pth'
    checkpoint_saved = False        # True once THIS run has written the checkpoint

    train_loss_history = []
    val_loss_history = []

    #--------------------------
    # Training Loop
    #--------------------------
    for epoch in range(num_epochs):
        #--------------------------
        # Training Phase
        #--------------------------
        model.train()  # Also re-enables the layers switched off during validation
        epoch_loss = 0
        start_time = time.time()

        for images, targets in train_loader:
            # Move data to the specified device
            images = [image.to(device) for image in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            optimizer.zero_grad()

            # The model returns a dict of losses when targets are provided
            loss_dict = model(images, targets)

            # Sum all losses (classification, regression, etc.)
            losses = sum(loss for loss in loss_dict.values())

            losses.backward()
            optimizer.step()

            epoch_loss += losses.item()

        # Average training loss for the epoch (guarded against an empty loader)
        avg_train_loss = epoch_loss / max(len(train_loader), 1)
        train_loss_history.append(avg_train_loss)

        #--------------------------
        # Validation Phase
        #--------------------------
        avg_val_loss = _mean_detection_loss(model, valid_loader, device)
        val_loss_history.append(avg_val_loss)

        # Update learning rate according to the scheduler
        lr_scheduler.step()

        #--------------------------
        # Epoch Summary
        #--------------------------
        time_elapsed = time.time() - start_time
        print(f'Epoch {epoch+1}/{num_epochs}, '
              f'Train Loss: {avg_train_loss:.4f}, '
              f'Val Loss: {avg_val_loss:.4f}, '
              f'Time: {time_elapsed:.2f}s')

        #--------------------------
        # Early Stopping Check
        #--------------------------
        if avg_val_loss < best_val_loss:
            # Validation loss improved: remember it and save the weights
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
            early_stop_counter = 0
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print(f'Early stopping at epoch {epoch+1}')
                break  # No improvement for 'patience' epochs

    #--------------------------
    # Finalization
    #--------------------------
    # Restore the best weights, but only if this run produced them; otherwise a
    # stale checkpoint from an earlier run (or no file at all) would be loaded.
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))

    # Hand the model back ready for inference
    model.eval()

    return model, train_loss_history, val_loss_history


In [None]:
#--------------------------
# Evaluation Metrics and Functions
#--------------------------
"""
This section implements evaluation metrics and functions for our models.
It includes IoU calculation, Average Precision, Average Recall, and mAP.
"""

#--------------------------
# IoU Calculation
#--------------------------
def calculate_iou(box1, box2):
    """
    Compute the Intersection over Union (IoU) of two axis-aligned boxes.

    The IoU is the overlap area divided by the area covered by either box.
    It is 0 when the boxes do not overlap and 1 when they coincide.

    Args:
        box1 (array-like): Box given as [x1, y1, x2, y2], with (x1, y1) the
                          top-left and (x2, y2) the bottom-right corner
        box2 (array-like): Second box, same layout as box1

    Returns:
        float: IoU value between 0 and 1 (0 if the union area is not positive)
    """
    # Overlap extent along each axis: right/bottom-most start to left/top-most
    # end, clamped at 0 so disjoint boxes contribute no area
    overlap_w = max(0, min(box1[2], box2[2]) - max(box1[0], box2[0]))
    overlap_h = max(0, min(box1[3], box2[3]) - max(box1[1], box2[1]))
    intersection = overlap_w * overlap_h

    # Union = area(box1) + area(box2) - overlap (the overlap is counted twice)
    union = (
        (box1[2] - box1[0]) * (box1[3] - box1[1])
        + (box2[2] - box2[0]) * (box2[3] - box2[1])
        - intersection
    )

    # Degenerate (zero-area) boxes would otherwise divide by zero
    if union > 0:
        return intersection / union
    return 0

#--------------------------
# Precision and Recall Calculation
#--------------------------
def calculate_precision_recall(pred_boxes, pred_labels, pred_scores, gt_boxes, gt_labels, iou_threshold=0.5):
    """
    Calculate precision, recall, and Average Precision (AP) for a single image.

    Predictions are visited from highest to lowest confidence. Each one is
    compared with the not-yet-matched ground truth boxes of the same class; it
    is a true positive if the best of those reaches `iou_threshold`, otherwise
    a false positive. Every ground truth box can be matched at most once.

    Args:
        pred_boxes (numpy.ndarray): Predicted bounding boxes in format [x1, y1, x2, y2]
        pred_labels (numpy.ndarray): Predicted class labels
        pred_scores (numpy.ndarray): Predicted confidence scores
        gt_boxes (numpy.ndarray): Ground truth bounding boxes in format [x1, y1, x2, y2]
        gt_labels (numpy.ndarray): Ground truth class labels
        iou_threshold (float): Minimum IoU for a detection to count as a true positive

    Returns:
        tuple: (precision, recall, ap)
            - precision (numpy.ndarray): Cumulative precision after each prediction
            - recall (numpy.ndarray): Cumulative recall after each prediction
              (all zeros when there are no ground truth boxes)
            - ap (float): 11-point interpolated Average Precision
    """
    #--------------------------
    # Sort Predictions by Confidence
    #--------------------------
    # Highest confidence first, so confident detections claim ground truths first
    order = np.argsort(-pred_scores)
    pred_boxes = pred_boxes[order]
    pred_labels = pred_labels[order]

    #--------------------------
    # Initialize Evaluation Arrays
    #--------------------------
    tp = np.zeros(len(pred_boxes))  # 1 where the prediction is a true positive
    fp = np.zeros(len(pred_boxes))  # 1 where the prediction is a false positive

    # Ground truth boxes already claimed by an earlier (more confident) prediction
    gt_matched = np.zeros(len(gt_boxes), dtype=bool)

    #--------------------------
    # Evaluate Each Prediction
    #--------------------------
    for i, (pred_box, pred_label) in enumerate(zip(pred_boxes, pred_labels)):
        best_iou = -np.inf
        best_idx = -1

        # Only unmatched ground truths of the same class are candidates
        for idx in np.where(gt_labels == pred_label)[0]:
            if gt_matched[idx]:
                continue
            iou = calculate_iou(pred_box, gt_boxes[idx])
            if iou > best_iou:
                best_iou = iou
                best_idx = idx

        # best_idx stays -1 when no candidate exists (no such class, or all taken)
        if best_idx >= 0 and best_iou >= iou_threshold:
            tp[i] = 1
            gt_matched[best_idx] = True
        else:
            fp[i] = 1

    #--------------------------
    # Calculate Precision and Recall
    #--------------------------
    cum_tp = np.cumsum(tp)
    cum_fp = np.cumsum(fp)

    # Every prediction is either TP or FP, so the denominator is never 0
    precision = cum_tp / (cum_tp + cum_fp)

    # Recall is undefined without ground truth; report zeros in that case
    recall = cum_tp / len(gt_boxes) if len(gt_boxes) > 0 else np.zeros_like(cum_tp)

    #--------------------------
    # Calculate Average Precision
    #--------------------------
    # 11-point interpolation at recall levels 0.0, 0.1, ..., 1.0.
    # The levels are built as k / 10 rather than np.arange(0, 1.1, 0.1): the
    # latter produces 0.30000000000000004, 0.6000000000000001, ... which would
    # make a recall of exactly 3/10 or 6/10 fail the `recall >= level` test.
    recall_levels = np.arange(11) / 10
    ap = 0.0
    for level in recall_levels:
        reached = recall >= level
        if reached.any():
            # Interpolated precision: best precision at any recall >= level
            ap += np.max(precision[reached]) / 11

    return precision, recall, ap

#--------------------------
# Mean Average Precision (mAP) Calculation
#--------------------------
def _iou_one_to_many(box, boxes):
    """Return the IoU of one [x1, y1, x2, y2] box against each row of an (N, 4) array."""
    inter_w = np.clip(np.minimum(box[2], boxes[:, 2]) - np.maximum(box[0], boxes[:, 0]), 0, None)
    inter_h = np.clip(np.minimum(box[3], boxes[:, 3]) - np.maximum(box[1], boxes[:, 1]), 0, None)
    intersection = inter_w * inter_h

    box_area = (box[2] - box[0]) * (box[3] - box[1])
    boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    union = box_area + boxes_area - intersection

    # Pairs with a non-positive union (degenerate boxes) keep an IoU of 0
    ious = np.zeros(len(boxes), dtype=float)
    np.divide(intersection, union, out=ious, where=union > 0)
    return ious


def calculate_map(all_pred_boxes, all_pred_labels, all_pred_scores, all_gt_boxes, all_gt_labels, num_classes, iou_threshold=0.5):
    """
    Calculate mean Average Precision (mAP) across all classes.

    For every object class the detections of all images are ranked by
    confidence. Each detection is compared only with ground truth boxes of the
    SAME image, and each ground truth box can be matched by at most one
    detection; further detections of an already-matched object count as false
    positives. AP is the 11-point interpolated area under the resulting
    precision-recall curve, and mAP is the mean AP over classes 1..num_classes-1.

    Args:
        all_pred_boxes (list): Per-image arrays of predicted boxes [x1, y1, x2, y2]
        all_pred_labels (list): Per-image arrays of predicted class labels
        all_pred_scores (list): Per-image arrays of predicted confidence scores
        all_gt_boxes (list): Per-image arrays of ground truth boxes [x1, y1, x2, y2]
        all_gt_labels (list): Per-image arrays of ground truth class labels
        num_classes (int): Number of classes (including background at index 0)
        iou_threshold (float): Minimum IoU for a detection to count as correct

    Returns:
        tuple: (ap_per_class, mAP)
            - ap_per_class (numpy.ndarray): AP per class index; index 0
              (background) and classes without ground truth are 0
            - mAP (float): Mean of ap_per_class[1:], or 0.0 if there are no
              object classes
    """
    ap_per_class = np.zeros(num_classes)

    # Recall levels 0.0, 0.1, ..., 1.0 built as k / 10. np.arange(0, 1.1, 0.1)
    # yields values such as 0.30000000000000004, which a recall of exactly
    # 3/10 would fail to reach.
    recall_levels = np.arange(11) / 10

    per_image = list(zip(all_pred_boxes, all_pred_labels, all_pred_scores, all_gt_boxes, all_gt_labels))

    # Skip background class (index 0)
    for c in range(1, num_classes):
        #--------------------------
        # Collect Class-Specific Predictions and Ground Truths
        #--------------------------
        gt_per_image = []     # ground truth boxes of class c, one array per image
        det_boxes = []        # detections of class c, one array per image
        det_scores = []
        det_image_ids = []    # image index of every detection, to match within its image

        for img_idx, (p_boxes, p_labels, p_scores, g_boxes, g_labels) in enumerate(per_image):
            gt_mask = np.asarray(g_labels) == c
            # reshape(-1, 4) keeps empty inputs concatenable regardless of their shape
            gt_per_image.append(np.asarray(g_boxes).reshape(-1, 4)[gt_mask])

            pred_mask = np.asarray(p_labels) == c
            boxes_c = np.asarray(p_boxes).reshape(-1, 4)[pred_mask]
            det_boxes.append(boxes_c)
            det_scores.append(np.asarray(p_scores)[pred_mask])
            det_image_ids.append(np.full(len(boxes_c), img_idx, dtype=int))

        #--------------------------
        # Skip Classes with No Ground Truth
        #--------------------------
        # AP stays 0 for a class that never occurs in the ground truth
        num_gt = sum(len(boxes) for boxes in gt_per_image)
        if num_gt == 0:
            continue

        boxes = np.concatenate(det_boxes)
        scores = np.concatenate(det_scores)
        image_ids = np.concatenate(det_image_ids)

        #--------------------------
        # Evaluate Predictions
        #--------------------------
        # Most confident detections claim ground truths first
        order = np.argsort(-scores, kind='stable')

        gt_taken = [np.zeros(len(boxes_gt), dtype=bool) for boxes_gt in gt_per_image]
        tp = np.zeros(len(order))
        fp = np.zeros(len(order))

        for rank, det in enumerate(order):
            img_idx = image_ids[det]
            free = ~gt_taken[img_idx]

            # No unmatched ground truth of this class left in this image
            if not free.any():
                fp[rank] = 1
                continue

            # Already-claimed ground truths are excluded from the search
            ious = np.where(free, _iou_one_to_many(boxes[det], gt_per_image[img_idx]), -np.inf)
            best = int(np.argmax(ious))

            if ious[best] >= iou_threshold:
                tp[rank] = 1
                gt_taken[img_idx][best] = True
            else:
                fp[rank] = 1

        #--------------------------
        # Calculate Precision and Recall
        #--------------------------
        cum_tp = np.cumsum(tp)
        cum_fp = np.cumsum(fp)

        # Every detection is either TP or FP, so the denominator is never 0
        precision = cum_tp / (cum_tp + cum_fp)
        recall = cum_tp / num_gt

        #--------------------------
        # Calculate Average Precision
        #--------------------------
        ap = 0.0
        for level in recall_levels:
            reached = recall >= level
            if reached.any():
                # Interpolated precision: best precision at any recall >= level
                ap += np.max(precision[reached]) / 11

        ap_per_class[c] = ap

    #--------------------------
    # Calculate Mean Average Precision
    #--------------------------
    # Without any object class the mean would be taken over an empty slice (NaN)
    if num_classes <= 1:
        return ap_per_class, 0.0

    mAP = np.mean(ap_per_class[1:])

    return ap_per_class, mAP

#--------------------------
# Model Evaluation Function
#--------------------------
"""
This section implements the evaluation function for our models.
It calculates metrics like mAP (mean Average Precision) on the test set
and visualizes predictions compared to ground truth.
"""

def _plot_detections(image, gt_boxes, gt_labels, pred_boxes, pred_labels, pred_scores, class_names):
    """Show one HWC image with ground truth boxes in green and predicted boxes in red."""
    # Best IoU of every prediction against any ground truth box (0 if there is none)
    ious = [
        max((calculate_iou(pred_box, gt_box) for gt_box in gt_boxes), default=0)
        for pred_box in pred_boxes
    ]

    plt.figure(figsize=(10, 10))
    plt.imshow(image)
    axes = plt.gca()

    # Draw ground truth boxes (green)
    for box, label in zip(gt_boxes, gt_labels):
        x1, y1, x2, y2 = box
        axes.add_patch(patches.Rectangle(
            (x1, y1), x2 - x1, y2 - y1,
            linewidth=2,
            edgecolor='g',
            facecolor='none'
        ))
        class_name = class_names.get(int(label), f'Class {int(label)}')
        plt.text(
            x1, y1 - 5,
            f'GT: {class_name}',
            color='white',
            bbox=dict(facecolor='green', alpha=0.5)
        )

    # Draw predicted boxes (red) with confidence score and best IoU
    for box, label, score, iou in zip(pred_boxes, pred_labels, pred_scores, ious):
        x1, y1, x2, y2 = box
        axes.add_patch(patches.Rectangle(
            (x1, y1), x2 - x1, y2 - y1,
            linewidth=2,
            edgecolor='r',
            facecolor='none'
        ))
        class_name = class_names.get(int(label), f'Class {int(label)}')
        plt.text(
            x1, y1 + 15,
            f'Pred: {class_name}: {score:.2f}, IoU: {iou:.2f}',
            color='white',
            bbox=dict(facecolor='red', alpha=0.5)
        )

    plt.axis('off')
    plt.title('Model Predictions vs Ground Truth')
    plt.show()


def evaluate_model(model, test_loader, device=device, score_threshold=0.5):
    """
    Evaluate the object detection model on the test set.

    The function:
    1. Computes the average loss over the test batches
    2. Computes per-class AP and mAP from the predictions
    3. Plots predictions vs ground truth for the first image of the first batch

    torchvision detection models return the loss dictionary only in training
    mode and the detections only in evaluation mode, so every batch is passed
    through the model twice (once per mode), both times without gradients.

    Args:
        model (torch.nn.Module): Model to evaluate
        test_loader (DataLoader): DataLoader for test data
        device (torch.device): Device to evaluate on ('cpu' or 'cuda')
        score_threshold (float): Predictions with a confidence score not above
            this value are discarded before mAP calculation and plotting

    Returns:
        dict: Dictionary containing evaluation results:
            - test_loss: Average loss on test set
            - mAP: Mean Average Precision
            - ap_per_class: Average Precision for each class

    Raises:
        ValueError: If test_loader contains no batches
    """
    #--------------------------
    # Setup
    #--------------------------
    if len(test_loader) == 0:
        raise ValueError('test_loader is empty; nothing to evaluate')

    model.to(device)

    test_loss = 0.0

    # Predictions and ground truths of every image, for mAP calculation
    all_pred_boxes = []
    all_pred_labels = []
    all_pred_scores = []
    all_gt_boxes = []
    all_gt_labels = []

    # (image, prediction, target) of the first evaluated image, kept for plotting
    viz_sample = None

    #--------------------------
    # Collect Losses, Predictions and Ground Truths
    #--------------------------
    try:
        with torch.no_grad():
            for images, targets in test_loader:
                images = [image.to(device) for image in images]
                targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

                # Loss pass: the loss dict is only produced in training mode.
                # NOTE: training mode also enables dropout, so the reported
                # loss is not fully deterministic; batch-norm layers that are
                # not frozen would update their running statistics here
                # (TODO confirm the backbone uses frozen batch norm).
                model.train()
                loss_dict = model(images, targets)
                test_loss += sum(loss for loss in loss_dict.values()).item()

                # Prediction pass: detections are only produced in eval mode
                model.eval()
                predictions = model(images)

                predictions = [{k: v.cpu() for k, v in p.items()} for p in predictions]
                targets = [{k: v.cpu() for k, v in t.items()} for t in targets]

                if viz_sample is None:
                    viz_sample = (images[0].cpu(), predictions[0], targets[0])

                for pred, target in zip(predictions, targets):
                    # Keep only sufficiently confident predictions
                    keep = pred['scores'] > score_threshold
                    all_pred_boxes.append(pred['boxes'][keep].numpy())
                    all_pred_labels.append(pred['labels'][keep].numpy())
                    all_pred_scores.append(pred['scores'][keep].numpy())
                    all_gt_boxes.append(target['boxes'].numpy())
                    all_gt_labels.append(target['labels'].numpy())
    finally:
        # Never leave the model in training mode, even if a batch failed
        model.eval()

    #--------------------------
    # Calculate Metrics
    #--------------------------
    avg_test_loss = test_loss / len(test_loader)
    print(f'Test Loss: {avg_test_loss:.4f}')

    # Class names for reporting
    class_names = {
        1: 'Pedestrian',
        2: 'Person',
        3: 'Car',
        4: 'Van',
        5: 'Bus',
        6: 'Truck',
        7: 'Motor',
        8: 'Bicycle',
        9: 'Awning-tricycle',
        10: 'Tricycle',
        11: 'Other'
    }

    # Number of classes (11 object classes + background)
    num_classes = 12

    ap_per_class, mAP = calculate_map(
        all_pred_boxes, all_pred_labels, all_pred_scores,
        all_gt_boxes, all_gt_labels, num_classes
    )

    print(f'mAP: {mAP:.4f}')

    print("Average Precision per class:")
    for c in range(1, num_classes):
        print(f'{class_names[c]}: {ap_per_class[c]:.4f}')

    #--------------------------
    # Visualize Predictions
    #--------------------------
    viz_image, viz_pred, viz_target = viz_sample

    # Same confidence filter as used for the mAP calculation
    keep = (viz_pred['scores'] > score_threshold).numpy()
    _plot_detections(
        image=viz_image.permute(1, 2, 0).numpy(),  # CHW -> HWC for matplotlib
        gt_boxes=viz_target['boxes'].numpy(),
        gt_labels=viz_target['labels'].numpy(),
        pred_boxes=viz_pred['boxes'].numpy()[keep],
        pred_labels=viz_pred['labels'].numpy()[keep],
        pred_scores=viz_pred['scores'].numpy()[keep],
        class_names=class_names,
    )

    #--------------------------
    # Return Results
    #--------------------------
    return {
        'test_loss': avg_test_loss,
        'mAP': mAP,
        'ap_per_class': ap_per_class
    }


In [None]:
#--------------------------
# Main Execution
#--------------------------
"""
This section demonstrates how to use the functions defined above to:
1. Load and prepare the data
2. Create and train the baseline model
3. Create and train the custom model
4. Evaluate both models on the test set
5. Compare results and visualize performance
"""

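# Example usage. The whole pipeline below is wrapped in a triple-quoted string so
# that it is not executed when this cell/module runs; remove the surrounding
# triple quotes to run it.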
"""
#--------------------------
# 1. Data Loading and Preparation
#--------------------------
# Load training data with negative examples (images with no objects)
# Negative examples help the model learn to distinguish between objects and background
images_np_train, annotations_np_train = load_and_process_data(
    dataset_dir=dataset_dir,
    subset='train',
    max_samples=100,              # Limit samples for faster execution (remove for full training)
    include_negative=True         # Include images with no valid annotations
)

# Load validation data (no negative examples needed)
images_np_val, annotations_np_val = load_and_process_data(
    dataset_dir=dataset_dir,
    subset='val',
    max_samples=50,               # Limit samples for faster execution
    include_negative=False
)

# Load test data (no negative examples needed)
images_np_test, annotations_np_test = load_and_process_data(
    dataset_dir=dataset_dir,
    subset='test',
    max_samples=50,               # Limit samples for faster execution
    include_negative=False
)

#--------------------------
# Create DataLoaders
#--------------------------
# Create dataloaders with data augmentation for the training set
train_loader, valid_loader, test_loader = create_dataloaders(
    images_np_train=images_np_train,
    annotations_np_train=annotations_np_train,
    images_np_val=images_np_val,
    annotations_np_val=annotations_np_val,
    images_np_test=images_np_test,
    annotations_np_test=annotations_np_test,
    batch_size=4,                 # Adjust based on available GPU memory
    augment_train=True            # Apply data augmentation to the training set
)

#--------------------------
# 2. Baseline Model Training
#--------------------------
# Define number of classes (11 object classes + background)
num_classes = 12

# Create baseline model (pre-trained Faster R-CNN with ResNet-50)
baseline_model = get_baseline_model(num_classes)

# Train baseline model
baseline_model, baseline_train_loss, baseline_val_loss = train_model(
    model=baseline_model,
    train_loader=train_loader,
    valid_loader=valid_loader,
    num_epochs=5,                 # Train for 5 epochs
    lr=0.005                      # Initial learning rate
)

#--------------------------
# 3. Custom Model Training
#--------------------------
# Create custom model with improved architecture
custom_model = get_custom_model(
    num_classes=num_classes,
    backbone_name='resnet50',     # Can also try 'resnet101' for potentially better results
    trainable_backbone_layers=3   # Number of backbone layers to fine-tune
)

# Train custom model
custom_model, custom_train_loss, custom_val_loss = train_model(
    model=custom_model,
    train_loader=train_loader,
    valid_loader=valid_loader,
    num_epochs=10,                # Train for more epochs than baseline
    lr=0.005                      # Initial learning rate
)

#--------------------------
# 4. Model Evaluation
#--------------------------
# Evaluate baseline model on test set
baseline_results = evaluate_model(
    model=baseline_model,
    test_loader=test_loader
)

# Evaluate custom model on test set
custom_results = evaluate_model(
    model=custom_model,
    test_loader=test_loader
)

#--------------------------
# 5. Results Comparison
#--------------------------
# Print overall metrics
print("Baseline model test loss:", baseline_results['test_loss'])
print("Baseline model mAP:", baseline_results['mAP'])
print("Custom model test loss:", custom_results['test_loss'])
print("Custom model mAP:", custom_results['mAP'])

# Define class names for reporting
class_names = {
    1: 'Pedestrian',
    2: 'Person',
    3: 'Car',
    4: 'Van',
    5: 'Bus',
    6: 'Truck',
    7: 'Motor',
    8: 'Bicycle',
    9: 'Awning-tricycle',
    10: 'Tricycle',
    11: 'Other'
}

# Compare Average Precision per class
print("\nAverage Precision per class comparison:")
print("Class\t\tBaseline AP\tCustom AP\tImprovement")
print("-" * 60)

# Print AP for each class and the improvement
for c in range(1, num_classes):
    class_name = class_names[c]
    baseline_ap = baseline_results['ap_per_class'][c]
    custom_ap = custom_results['ap_per_class'][c]
    improvement = custom_ap - baseline_ap
    print(f"{class_name:<15}\t{baseline_ap:.4f}\t\t{custom_ap:.4f}\t\t{improvement:+.4f}")

#--------------------------
# 6. Visualization
#--------------------------
# Plot training and validation loss curves
plt.figure(figsize=(10, 5))

# Baseline model loss plot
plt.subplot(1, 2, 1)
plt.plot(baseline_train_loss, label='Train Loss')
plt.plot(baseline_val_loss, label='Val Loss')
plt.title('Baseline Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

# Custom model loss plot
plt.subplot(1, 2, 2)
plt.plot(custom_train_loss, label='Train Loss')
plt.plot(custom_val_loss, label='Val Loss')
plt.title('Custom Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()
"""
