# Necessary Imports
Import all required libraries including PyTorch, torchvision, matplotlib, numpy, PIL, and other utilities for web crawling, image processing, and model evaluation.

In [None]:
# Import necessary libraries for the project

# PyTorch and torchvision for deep learning
import torch
import torchvision
from torch import nn, optim
from torchvision import transforms, models

# Utilities for data handling and visualization
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from PIL import Image, ImageDraw

# Web crawling and image downloading
import os
import random
import requests
from icrawler.builtin import GoogleImageCrawler

# Utility for debugging and model summary
from torchsummary import summary

# Select the compute device once for the whole notebook: CUDA when PyTorch
# reports a usable GPU, otherwise the CPU. Later cells move the model and the
# batches onto this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Object Loading
Load character images (like Waldo, Wilma, and Wenda), remove their backgrounds, and prepare them as transparent PNG files. Visualize the processed objects to confirm proper preparation.

In [None]:
# Define directories for storing original and processed object images.
# `object_dir` receives the raw downloads, `processed_dir` the transparent
# PNGs; both are created up front so later cells can write into them.
object_dir = "objects/original"
processed_dir = "objects/processed"
os.makedirs(object_dir, exist_ok=True)
os.makedirs(processed_dir, exist_ok=True)

# URLs for character images, keyed by the character name that is also used
# for the output file names.
character_urls = {
    "waldo": "https://static.wikia.nocookie.net/waldo/images/9/9d/Character.Waldo.jpg",
    "wilma": "https://static.wikia.nocookie.net/waldo/images/8/86/Character.Wilma.jpg",
    "wenda": "https://static.wikia.nocookie.net/waldo/images/3/3e/Character.Wenda.jpg"
}

# Function to download and process character images
def download_and_process_character(character, url):
    """
    Download one character image and save a copy with a transparent background.

    The raw download is written to ``object_dir`` as ``<character>.jpg`` and the
    processed RGBA image to ``processed_dir`` as ``<character>.png``. Any failure
    is reported on stdout instead of being raised, so one broken URL does not
    abort the remaining downloads.

    Args:
        character (str): Name used for the output file names.
        url (str): Location of the source image.
    """
    try:
        # A timeout keeps an unresponsive host from hanging the notebook forever
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        img_path = os.path.join(object_dir, f"{character}.jpg")
        with open(img_path, "wb") as f:
            f.write(response.content)

        # Decode into an RGBA array; np.array copies, so the array is writable
        # and the image file can be closed right away
        with Image.open(img_path) as source_img:
            data = np.array(source_img.convert("RGBA"))

        # Remove white/light backgrounds: a pixel counts as background when all
        # three colour channels are brighter than 200; its alpha is set to 0
        white_areas = (data[..., :3] > 200).all(axis=-1)
        data[white_areas, 3] = 0

        # Save the processed image as a transparent PNG
        processed_img = Image.fromarray(data)
        processed_path = os.path.join(processed_dir, f"{character}.png")
        processed_img.save(processed_path)
        print(f"✅ Processed {character} successfully.")
    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller move on
        print(f"❌ Failed to process {character}: {e}")

# Download and process all characters. Each call reports its own success or
# failure, so a single bad URL does not stop the loop.
for character, url in character_urls.items():
    download_and_process_character(character, url)

# Visualize the processed objects
def visualize_processed_objects():
    """
    Show every PNG in ``processed_dir`` side by side, titled with its file stem.

    Prints a message and returns without plotting when the directory holds no
    PNG files (a subplot grid with zero columns cannot be created).
    """
    # List the directory once so images and titles cannot get out of step
    png_files = sorted(f for f in os.listdir(processed_dir) if f.endswith(".png"))
    if not png_files:
        print("❌ No processed objects found.")
        return

    plt.figure(figsize=(15, 5))
    for i, file_name in enumerate(png_files):
        # convert() returns an independent image, so the file can be closed
        with Image.open(os.path.join(processed_dir, file_name)) as img_file:
            img = img_file.convert("RGBA")
        plt.subplot(1, len(png_files), i + 1)
        plt.imshow(img)
        plt.title(os.path.splitext(file_name)[0])
        plt.axis("off")
    plt.tight_layout()
    plt.show()

# Visualize the objects
visualize_processed_objects()

# Web Crawling for Background Images
Use the icrawler library to download background images from Google. Create a function to display sample images from the collection to verify proper downloading.

In [None]:
# Define directory for storing background images. It is created up front so
# the crawler and the viewer below can rely on it existing.
background_images_dir = "backgrounds"
os.makedirs(background_images_dir, exist_ok=True)

# Function to download background images using GoogleImageCrawler
def download_background_images(keywords, max_images_per_keyword=50, output_dir=background_images_dir):
    """
    Downloads background images from Google using specified keywords.

    All keywords are saved into the same directory, so each crawl is told to
    continue the file numbering after the files that are already there.

    Args:
        keywords (list): List of search keywords for background images.
        max_images_per_keyword (int): Maximum number of images to download per keyword.
        output_dir (str): Directory to save the downloaded images.
    """
    for keyword in keywords:
        # A fresh crawler per keyword starts every crawl with clean download
        # counters instead of inheriting state from the previous keyword
        google_crawler = GoogleImageCrawler(storage={"root_dir": output_dir})
        # file_idx_offset="auto" numbers new files after the highest existing
        # index; with the default offset of 0 each keyword would start again at
        # the first file name and clash with images saved for earlier keywords
        google_crawler.crawl(
            keyword=keyword,
            max_num=max_images_per_keyword,
            file_idx_offset="auto",
        )
    print(f"✅ Downloaded background images for keywords: {keywords}")

# Function to visualize a sample of downloaded background images
def visualize_background_images(directory, num_samples=10):
    """
    Display a random selection of the images stored in a directory.

    Files are picked by extension (.jpg, .jpeg, .png, case-insensitive). When
    none are present a message is printed and nothing is plotted.

    Args:
        directory (str): Directory containing the background images.
        num_samples (int): Upper bound on the number of images shown.
    """
    image_suffixes = (".jpg", ".jpeg", ".png")
    candidates = []
    for entry in os.listdir(directory):
        if entry.lower().endswith(image_suffixes):
            candidates.append(os.path.join(directory, entry))

    if len(candidates) == 0:
        print("❌ No images found in the directory.")
        return

    # Never ask for more samples than there are files
    how_many = min(num_samples, len(candidates))
    chosen = random.sample(candidates, how_many)

    plt.figure(figsize=(15, 5))
    total = len(chosen)
    for position, path in enumerate(chosen, start=1):
        picture = Image.open(path)
        plt.subplot(1, total, position)
        plt.imshow(picture)
        plt.title(f"Image {position}")
        plt.axis("off")
    plt.tight_layout()
    plt.show()

# Define keywords for background images; each keyword is crawled separately
# into `background_images_dir`.
background_keywords = ["doodle background", "cluttered background", "abstract art background"]

# Download background images (uses the function's default of 50 per keyword)
download_background_images(background_keywords)

# Visualize a sample of the downloaded background images
visualize_background_images(background_images_dir)

# Creating a Synthetic Dataset
Generate a synthetic dataset by placing objects on backgrounds at random positions and scales. Save images with corresponding YOLO-format labels (class_id, x_center, y_center, width, height) for each object placement.

In [None]:
# Function to create a synthetic dataset
def create_synthetic_dataset(background_dir, object_dir, output_dir, split, img_size=(640, 640), num_images=1000):
    """
    Create a synthetic dataset by placing objects on backgrounds.

    Every generated image contains exactly one object. Images are written to
    ``<output_dir>/<split>/images`` and one-line YOLO labels
    (``class_id x_center y_center width height``, all normalised to [0, 1]) to
    ``<output_dir>/<split>/labels``. The class id is the position of the object
    file in the ``os.listdir(object_dir)`` listing (PNG files only).

    If either directory contains no usable images a message is printed and the
    function returns without generating anything.

    Args:
        background_dir (str): Directory containing background images.
        object_dir (str): Directory containing object images with transparency.
        output_dir (str): Root directory to save the dataset.
        split (str): Dataset split ('train', 'val', or 'test').
        img_size (tuple): Size of output images (width, height).
        num_images (int): Number of images to generate.
    """
    # Create directories for images and labels
    dataset_dir = os.path.join(output_dir, split)
    images_dir = os.path.join(dataset_dir, "images")
    labels_dir = os.path.join(dataset_dir, "labels")
    os.makedirs(images_dir, exist_ok=True)
    os.makedirs(labels_dir, exist_ok=True)

    # Collect background and object image paths
    background_paths = [
        os.path.join(background_dir, f) for f in os.listdir(background_dir)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ]
    object_paths = [
        os.path.join(object_dir, f) for f in os.listdir(object_dir)
        if f.lower().endswith(".png")
    ]

    if not background_paths or not object_paths:
        print("❌ No background or object images found.")
        return

    print(f"🏞️ Found {len(background_paths)} background images and {len(object_paths)} objects.")
    print(f"🎯 Generating {num_images} synthetic images for {split} set...")

    canvas_width, canvas_height = img_size

    for i in range(num_images):
        # Select a random background and object; the object's index is its class id
        bg_path = random.choice(background_paths)
        obj_idx = random.randrange(len(object_paths))
        obj_path = object_paths[obj_idx]

        # Load and resize background (convert/resize return new images, so the
        # source file can be closed immediately)
        with Image.open(bg_path) as bg_file:
            background = bg_file.convert("RGB").resize(img_size)

        # Load the object and scale it to a random fraction of the canvas width
        with Image.open(obj_path) as obj_file:
            obj_image = obj_file.convert("RGBA")
        scale_factor = random.uniform(0.1, 0.3)  # Object width as a fraction of image width
        aspect_ratio = obj_image.height / obj_image.width
        obj_width = max(1, int(canvas_width * scale_factor))
        obj_height = max(1, int(obj_width * aspect_ratio))

        # A very tall sprite can come out higher than the canvas, which would make
        # the range of valid y positions negative; shrink it to fit instead
        if obj_height > canvas_height:
            obj_width = max(1, int(canvas_height / aspect_ratio))
            obj_height = canvas_height
        obj_image = obj_image.resize((obj_width, obj_height), Image.LANCZOS)

        # Randomly position the object so that it lies fully inside the canvas
        x_pos = random.randint(0, canvas_width - obj_width)
        y_pos = random.randint(0, canvas_height - obj_height)

        # Paste the object onto the background, using its alpha channel as mask
        background.paste(obj_image, (x_pos, y_pos), obj_image)

        # Calculate YOLO-format bounding box (normalised centre and size)
        x_center = (x_pos + obj_width / 2) / canvas_width
        y_center = (y_pos + obj_height / 2) / canvas_height
        width = obj_width / canvas_width
        height = obj_height / canvas_height

        # Save the synthetic image
        img_filename = f"{i:05d}.jpg"
        background.save(os.path.join(images_dir, img_filename))

        # Save the label in YOLO format
        label_filename = f"{i:05d}.txt"
        with open(os.path.join(labels_dir, label_filename), "w") as f:
            f.write(f"{obj_idx} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}\n")

        # Print progress
        if (i + 1) % 100 == 0 or i == num_images - 1:
            print(f"  Progress: {i + 1}/{num_images} images created.")

    print(f"✅ Synthetic dataset for {split} set created successfully!")

# Create train, validation, and test datasets. All three splits draw from the
# same background and object pools and are written below `output_dir`.
output_dir = "synthetic_dataset"
create_synthetic_dataset(background_images_dir, processed_dir, output_dir, "train", num_images=5000)
create_synthetic_dataset(background_images_dir, processed_dir, output_dir, "val", num_images=1000)
create_synthetic_dataset(background_images_dir, processed_dir, output_dir, "test", num_images=500)

# Creating Data Loaders
Create PyTorch Dataset and DataLoader classes to handle the synthetic dataset. Implement transformations for training, validation, and test sets including normalization and data augmentation.

In [None]:
# Define the PyTorch Dataset class for object detection
class ObjectDetectionDataset(Dataset):
    """
    Single-object detection dataset stored as ``<root>/<split>/images`` and
    ``<root>/<split>/labels``.

    Each item is ``(image, bbox, class_id)``, where ``bbox`` is a tensor built
    from the numbers after the class id on the first line of the label file.
    """

    def __init__(self, root_dir, split, transform=None):
        """
        Index the image files of one split.

        Args:
            root_dir (str): Root directory of the dataset.
            split (str): Dataset split ('train', 'val', or 'test').
            transform (callable, optional): Transformations to apply to the images.
        """
        self.root_dir = root_dir
        self.split = split
        self.transform = transform

        # Images and labels live in sibling folders below the split folder
        split_root = os.path.join(root_dir, split)
        self.images_dir = os.path.join(split_root, "images")
        self.labels_dir = os.path.join(split_root, "labels")

        # Keep only image files; sorted so that indices are reproducible
        image_suffixes = (".jpg", ".jpeg", ".png")
        names = [
            entry for entry in os.listdir(self.images_dir)
            if entry.lower().endswith(image_suffixes)
        ]
        names.sort()
        self.image_files = names

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load the image at position ``idx`` together with its label."""
        file_name = self.image_files[idx]
        image = Image.open(os.path.join(self.images_dir, file_name)).convert("RGB")

        # The label file shares the image's stem; only its first line is read
        stem, _ = os.path.splitext(file_name)
        with open(os.path.join(self.labels_dir, stem + ".txt"), "r") as label_file:
            fields = label_file.readline().strip().split()
        class_id = int(fields[0])
        bbox = torch.tensor([float(value) for value in fields[1:]])

        # The transform is applied to the image only
        if self.transform:
            image = self.transform(image)

        return image, bbox, class_id

# Define transformations for training, validation, and test sets.
# The dataset applies these transforms to the image only, so the training
# pipeline must not contain geometric augmentations: a random horizontal flip
# would mirror the picture while the bounding-box label stays where it was,
# silently corrupting about half of the training targets. Only photometric
# augmentation (colour jitter), which leaves the box untouched, is used.
train_transform = transforms.Compose([
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    # ImageNet channel statistics, matching the pre-trained backbone
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Validation and test images are only converted and normalised
val_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Create dataset objects for each split, each with its own transform pipeline
dataset_dir = "synthetic_dataset"
train_dataset = ObjectDetectionDataset(dataset_dir, "train", transform=train_transform)
val_dataset = ObjectDetectionDataset(dataset_dir, "val", transform=val_transform)
test_dataset = ObjectDetectionDataset(dataset_dir, "test", transform=test_transform)

# Create DataLoaders for each split. Only the training loader shuffles, so
# validation and test batches come in a fixed order.
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

# Print dataset and DataLoader information (sample counts, then batch counts)
print(f"📊 Dataset Information:")
print(f"  • Training set: {len(train_dataset)} images")
print(f"  • Validation set: {len(val_dataset)} images")
print(f"  • Test set: {len(test_dataset)} images")

print(f"\n🔄 DataLoader Information:")
print(f"  • Training batches: {len(train_loader)}")
print(f"  • Validation batches: {len(val_loader)}")
print(f"  • Test batches: {len(test_loader)}")

# Visualizing Training Data
Create functions to visualize batches of training data with bounding boxes and class labels to verify correct dataset creation and loading.

In [None]:
# Function to visualize a batch of training data with bounding boxes and class labels
def visualize_training_data(dataloader, class_names, num_samples=8):
    """
    Plot the first samples of one batch with their ground-truth boxes.

    The images are expected to be normalised with the ImageNet statistics used
    by the transforms in this notebook; that normalisation is undone before
    plotting. Boxes are read as normalised (x_center, y_center, width, height).

    Args:
        dataloader (DataLoader): DataLoader for the dataset.
        class_names (list): List of class names corresponding to class IDs.
        num_samples (int): Number of samples to visualize.
    """
    # Only the first batch is inspected
    images, bboxes, class_ids = next(iter(dataloader))
    num_samples = min(num_samples, len(images))

    channel_std = np.array([0.229, 0.224, 0.225])
    channel_mean = np.array([0.485, 0.456, 0.406])

    plt.figure(figsize=(15, 5))
    for sample_idx in range(num_samples):
        # CHW tensor -> HWC array, then undo the normalisation and clamp to [0, 1]
        picture = images[sample_idx].permute(1, 2, 0).numpy()
        picture = np.clip(picture * channel_std + channel_mean, 0, 1)
        img_height, img_width = picture.shape[0], picture.shape[1]

        x_center, y_center, width, height = bboxes[sample_idx].numpy()
        label_id = class_ids[sample_idx].item()

        # Normalised centre/size -> pixel corner coordinates
        x_min = (x_center - width / 2) * img_width
        y_min = (y_center - height / 2) * img_height
        x_max = (x_center + width / 2) * img_width
        y_max = (y_center + height / 2) * img_height

        axis = plt.subplot(1, num_samples, sample_idx + 1)
        axis.imshow(picture)
        axis.set_title(f"Class: {class_names[label_id]}")
        axis.axis("off")

        # Outline the ground-truth box in red
        outline = plt.Rectangle((x_min, y_min), x_max - x_min, y_max - y_min,
                                fill=False, edgecolor='red', linewidth=2)
        axis.add_patch(outline)

    plt.tight_layout()
    plt.show()

# Visualize a batch of training data.
# The class id stored in every label file is the position of the object's PNG
# in the os.listdir() listing of the processed-object directory (that is how
# create_synthetic_dataset numbers its classes). Build the name list from the
# same listing, with the same filter, so id -> name lookups cannot drift from
# the labels; a hand-written order has no relation to the directory order.
# The original names are kept as a fallback for an empty directory.
class_names = [
    os.path.splitext(f)[0]
    for f in os.listdir(processed_dir)
    if f.lower().endswith(".png")
] or ["waldo", "wilma", "wenda"]
visualize_training_data(train_loader, class_names)

# Creating a Custom Object Detection Model
Implement a custom object detection model using a pre-trained ResNet backbone with custom classification and regression heads for object detection. Initialize model weights and output model summary.

In [None]:
# Define the custom object detection model
class CustomObjectDetectionModel(nn.Module):
    """
    Single-object detector: ResNet-18 feature extractor followed by two
    fully connected heads, one for class logits and one for a box.

    The heads are sized for a 512 x 20 x 20 feature map, i.e. the backbone
    output for the 640 x 640 inputs used in this notebook; other input sizes
    need the first Linear layer of each head to be resized.
    """

    def __init__(self, num_classes=3, pretrained=True):
        """
        Initialize the custom object detection model.

        Args:
            num_classes (int): Number of object classes.
            pretrained (bool): Whether to use a pre-trained ResNet backbone.
        """
        super(CustomObjectDetectionModel, self).__init__()

        # Use a pre-trained ResNet18 as the backbone
        resnet = models.resnet18(pretrained=pretrained)
        # Drop the last two children (global pooling and the fully connected
        # classifier) so the backbone returns a spatial feature map
        self.backbone = nn.Sequential(*list(resnet.children())[:-2])

        # Classification head
        self.classification_head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(512 * 20 * 20, 256),  # Adjust input size based on feature map dimensions
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

        # Regression head
        self.regression_head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(512 * 20 * 20, 256),  # Adjust input size based on feature map dimensions
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, 4),  # Output: [x_center, y_center, width, height]
            nn.Sigmoid()  # Normalize bounding box coordinates
        )

        # Initialize weights for custom layers
        self._initialize_weights()

    def _initialize_weights(self):
        """
        Initialize weights for custom layers.

        Only the two heads are touched. Walking ``self.modules()`` would also
        visit every convolution of the backbone and overwrite the pre-trained
        weights that were just loaded.
        """
        for head in (self.classification_head, self.regression_head):
            for m in head.modules():
                if isinstance(m, nn.Linear):
                    nn.init.xavier_uniform_(m.weight)
                    if m.bias is not None:
                        nn.init.constant_(m.bias, 0)
                elif isinstance(m, nn.Conv2d):
                    # Not used by the current heads; kept so convolutional
                    # layers added to a head later are initialised as well
                    nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                    if m.bias is not None:
                        nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """
        Forward pass of the model.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Class logits.
            torch.Tensor: Bounding box predictions.
        """
        features = self.backbone(x)
        class_logits = self.classification_head(features)
        bbox_preds = self.regression_head(features)
        return class_logits, bbox_preds

# Instantiate the model with one output per entry in `class_names` and move it
# to the selected device
num_classes = len(class_names)  # Number of object classes
model = CustomObjectDetectionModel(num_classes=num_classes, pretrained=True).to(device)

# Print model summary for a single 640x640 RGB input, the size the heads of
# the model are dimensioned for
input_size = (3, 640, 640)  # Input size: 3 channels (RGB), 640x640 resolution
summary(model, input_size=input_size, device=str(device))

# Defining Loss Functions and Optimizer
Define appropriate loss functions for classification (CrossEntropyLoss) and bounding box regression (SmoothL1Loss). Create a combined loss function and configure the Adam optimizer with learning rate scheduling.

In [None]:
# Define the loss functions
classification_loss_fn = nn.CrossEntropyLoss()  # For class probabilities
regression_loss_fn = nn.SmoothL1Loss()  # For bounding box regression

# Define the optimizer; it updates every parameter of the model, backbone
# included
learning_rate = 0.001  # Learning rate for Adam optimizer
weight_decay = 1e-4  # L2 regularization to prevent overfitting
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Define a learning rate scheduler: multiply the learning rate by 0.1 when the
# monitored value (the validation loss passed to scheduler.step) has not
# decreased for 5 epochs.
# NOTE(review): `verbose` is reported as deprecated by recent PyTorch releases
# — confirm against the installed version.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, 
    mode='min', 
    factor=0.1, 
    patience=5, 
    verbose=True
)

# Define a combined loss function
def combined_loss(class_pred, bbox_pred, class_target, bbox_target):
    """
    Compute the detection loss as the unweighted sum of its two parts.

    Args:
        class_pred (torch.Tensor): Predicted class scores [batch_size, num_classes].
        bbox_pred (torch.Tensor): Predicted bounding boxes [batch_size, 4].
        class_target (torch.Tensor): Ground truth class indices [batch_size].
        bbox_target (torch.Tensor): Ground truth bounding boxes [batch_size, 4].

    Returns:
        torch.Tensor: Total loss.
        torch.Tensor: Classification loss.
        torch.Tensor: Regression loss.
    """
    # Both terms use the module-level loss objects defined above
    classification_term = classification_loss_fn(class_pred, class_target)
    regression_term = regression_loss_fn(bbox_pred, bbox_target)
    return classification_term + regression_term, classification_term, regression_term

# Print the configuration chosen above so it is recorded with the run output
print("Loss Functions and Optimizer Configuration:")
print(f"  • Classification Loss: {classification_loss_fn}")
print(f"  • Regression Loss: {regression_loss_fn}")
print(f"  • Optimizer: {optimizer}")
print(f"  • Learning Rate: {learning_rate}")
print(f"  • Weight Decay: {weight_decay}")
print(f"  • LR Scheduler: ReduceLROnPlateau")

# Training the Custom Model
Implement a comprehensive training loop that handles both training and validation phases. Include early stopping, model checkpoint saving, and learning rate scheduling to prevent overfitting.

In [None]:
from tqdm import tqdm

def _run_epoch(model, loader, loss_fn, device, optimizer=None, desc=""):
    """
    Run one pass over ``loader`` and return the mean (total, cls, reg) loss per batch.

    With an optimizer the model is put in training mode and updated after every
    batch; without one it is put in evaluation mode and gradients are disabled.
    """
    training = optimizer is not None
    model.train(training)
    totals = [0.0, 0.0, 0.0]

    with torch.set_grad_enabled(training):
        for images, bboxes, class_ids in tqdm(loader, desc=desc, leave=False):
            images, bboxes, class_ids = images.to(device), bboxes.to(device), class_ids.to(device)
            if training:
                optimizer.zero_grad()
            class_pred, bbox_pred = model(images)
            losses = loss_fn(class_pred, bbox_pred, class_ids, bboxes)
            if training:
                losses[0].backward()
                optimizer.step()
            for k, value in enumerate(losses):
                totals[k] += value.item()

    # Guard against an empty loader, which would otherwise divide by zero
    num_batches = max(len(loader), 1)
    return tuple(total / num_batches for total in totals)


def train_model(model, train_loader, val_loader, optimizer, scheduler, loss_fn, num_epochs=30, patience=5, device=device):
    """
    Train the custom object detection model with early stopping and learning rate scheduling.

    After every epoch the model is validated; whenever the validation loss
    improves, the weights are saved to ``best_model.pth``. Training stops early
    after ``patience`` epochs without improvement. Before returning, the best
    saved weights are loaded back into ``model`` so that the caller evaluates
    the best checkpoint rather than the weights of the last epoch.

    Args:
        model (nn.Module): The object detection model.
        train_loader (DataLoader): DataLoader for the training set.
        val_loader (DataLoader): DataLoader for the validation set.
        optimizer (torch.optim.Optimizer): Optimizer for training.
        scheduler (torch.optim.lr_scheduler): Learning rate scheduler.
        loss_fn (function): Combined loss function for classification and regression.
        num_epochs (int): Maximum number of epochs.
        patience (int): Early stopping patience.
        device (torch.device): Device to train on.

    Returns:
        dict: Training history containing losses and metrics.
    """
    checkpoint_path = "best_model.pth"
    best_val_loss = float('inf')
    early_stop_counter = 0
    history = {'train_loss': [], 'val_loss': [], 'train_cls_loss': [], 'val_cls_loss': [], 'train_reg_loss': [], 'val_reg_loss': []}

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")

        # Training pass, then validation pass; both return per-batch averages
        train_loss, train_cls_loss, train_reg_loss = _run_epoch(
            model, train_loader, loss_fn, device, optimizer=optimizer, desc="Training")
        val_loss, val_cls_loss, val_reg_loss = _run_epoch(
            model, val_loader, loss_fn, device, desc="Validation")

        history['train_loss'].append(train_loss)
        history['train_cls_loss'].append(train_cls_loss)
        history['train_reg_loss'].append(train_reg_loss)
        history['val_loss'].append(val_loss)
        history['val_cls_loss'].append(val_cls_loss)
        history['val_reg_loss'].append(val_reg_loss)

        # Print epoch summary
        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
        print(f"Train CLS Loss: {train_cls_loss:.4f} | Val CLS Loss: {val_cls_loss:.4f}")
        print(f"Train REG Loss: {train_reg_loss:.4f} | Val REG Loss: {val_reg_loss:.4f}")

        # Check for improvement
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            early_stop_counter = 0
            torch.save(model.state_dict(), checkpoint_path)
            print("✅ Model improved. Saved best model.")
        else:
            early_stop_counter += 1
            print(f"⚠️ No improvement. Early stopping counter: {early_stop_counter}/{patience}")

        # Early stopping
        if early_stop_counter >= patience:
            print("⏹️ Early stopping triggered.")
            break

        # Step the scheduler
        scheduler.step(val_loss)

    # best_val_loss only leaves infinity once a checkpoint has been written
    if best_val_loss < float('inf'):
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
        print("Restored best model weights.")

    print("Training complete.")
    return history

# Train the model for at most 30 epochs, stopping early after 5 epochs without
# a lower validation loss; the per-epoch losses are kept in `history`
history = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    scheduler=scheduler,
    loss_fn=combined_loss,
    num_epochs=30,
    patience=5,
    device=device
)

# Plot training history
def plot_training_history(history):
    """
    Draw the recorded losses in two panels: total loss on the left, the
    classification and regression parts on the right.

    Args:
        history (dict): Per-epoch loss lists as returned by ``train_model``.
    """
    num_epochs_run = len(history['train_loss'])
    epochs = range(1, num_epochs_run + 1)
    plt.figure(figsize=(12, 6))

    # (subplot position, title, curves as (history key, legend label))
    panels = (
        (1, "Total Loss", (
            ('train_loss', "Train Loss"),
            ('val_loss', "Val Loss"),
        )),
        (2, "Classification and Regression Losses", (
            ('train_cls_loss', "Train CLS Loss"),
            ('val_cls_loss', "Val CLS Loss"),
            ('train_reg_loss', "Train REG Loss"),
            ('val_reg_loss', "Val REG Loss"),
        )),
    )

    for position, title, curves in panels:
        plt.subplot(1, 2, position)
        for key, label in curves:
            plt.plot(epochs, history[key], label=label)
        plt.xlabel("Epochs")
        plt.ylabel("Loss")
        plt.title(title)
        plt.legend()

    plt.tight_layout()
    plt.show()

plot_training_history(history)

# Visualizing Training Metrics
Plot training and validation losses over time including total loss, classification loss, and regression loss. Visualize metrics to analyze model convergence and performance.

In [None]:
# Function to visualize training and validation metrics
def plot_training_metrics(history):
    """
    Plot the per-epoch losses with markers and a grid.

    The left panel shows the total training and validation loss, the right
    panel the classification and regression components.

    Args:
        history (dict): Dictionary containing training and validation losses.
    """
    epochs = range(1, len(history['train_loss']) + 1)

    def draw_curve(key, label, color):
        # One marked line per loss series
        plt.plot(epochs, history[key], label=label, color=color, marker="o")

    def decorate(title):
        # Shared title, axis labels, legend and grid for both panels
        plt.title(title, fontsize=14)
        plt.xlabel("Epochs", fontsize=12)
        plt.ylabel("Loss", fontsize=12)
        plt.legend(fontsize=10)
        plt.grid(True)

    plt.figure(figsize=(14, 7))

    # Left panel: total loss
    plt.subplot(1, 2, 1)
    draw_curve('train_loss', "Training Loss", "blue")
    draw_curve('val_loss', "Validation Loss", "orange")
    decorate("Total Loss Over Epochs")

    # Right panel: classification and regression losses
    plt.subplot(1, 2, 2)
    draw_curve('train_cls_loss', "Train Classification Loss", "green")
    draw_curve('val_cls_loss', "Val Classification Loss", "red")
    draw_curve('train_reg_loss', "Train Regression Loss", "purple")
    draw_curve('val_reg_loss', "Val Regression Loss", "brown")
    decorate("Classification and Regression Losses")

    plt.tight_layout()
    plt.show()

# Call the function to visualize training metrics
plot_training_metrics(history)

# Running Inference on Test Data
Create an inference pipeline to run predictions on test data. Implement non-maximum suppression if needed and calculate performance metrics like precision, recall, and IoU.

In [None]:
from torchvision.ops import nms

def _cxcywh_to_xyxy(boxes):
    """Convert (x_center, y_center, width, height) boxes to (x_min, y_min, x_max, y_max)."""
    x_center, y_center, width, height = boxes.unbind(dim=-1)
    return torch.stack(
        (
            x_center - width / 2,
            y_center - height / 2,
            x_center + width / 2,
            y_center + height / 2,
        ),
        dim=-1,
    )


def run_inference(model, dataloader, iou_threshold=0.5, device=device):
    """
    Run the model over a dataset and collect predictions and ground truths.

    The model predicts exactly one box and one class per image. Boxes are
    converted from the normalised centre format used by the labels to corner
    format (x_min, y_min, x_max, y_max), which is what the torchvision box
    operations expect. Every stored tensor has a leading "number of boxes"
    dimension (here always 1) so the results can be matched box by box.

    Args:
        model (nn.Module): Trained object detection model.
        dataloader (DataLoader): DataLoader for the test dataset.
        iou_threshold (float): IoU threshold for non-maximum suppression.
        device (torch.device): Device to run inference on.

    Returns:
        dict: ``"predictions"`` is a list with one dict per image holding
        ``boxes`` [N, 4], ``scores`` [N] and ``labels`` [N]; ``"ground_truths"``
        is a parallel list of dicts with ``boxes`` [1, 4] and ``labels`` [1].
    """
    model.eval()
    all_predictions = []
    all_ground_truths = []

    with torch.no_grad():
        for images, bboxes, class_ids in tqdm(dataloader, desc="Running Inference"):
            images = images.to(device)
            bboxes = bboxes.to(device)
            class_ids = class_ids.to(device)

            # Forward pass
            class_logits, bbox_preds = model(images)

            # Convert logits to probabilities and take the best class per image
            # (dim=1 is the class dimension of the [batch, num_classes] tensor)
            class_probs = torch.softmax(class_logits, dim=1)
            scores, labels = class_probs.max(dim=1)

            pred_boxes = _cxcywh_to_xyxy(bbox_preds)
            gt_boxes = _cxcywh_to_xyxy(bboxes)

            for i in range(images.size(0)):
                # Slicing (i:i + 1) keeps the box dimension: [1, 4] and [1]
                boxes_i = pred_boxes[i:i + 1]
                scores_i = scores[i:i + 1]
                labels_i = labels[i:i + 1]

                # Apply non-maximum suppression; with a single box per image it
                # keeps that box, but the call stays correct if the model ever
                # produces several candidates
                keep = nms(boxes_i, scores_i, iou_threshold)

                # Store predictions and ground truths
                all_predictions.append({
                    "boxes": boxes_i[keep].cpu(),
                    "scores": scores_i[keep].cpu(),
                    "labels": labels_i[keep].cpu()
                })
                all_ground_truths.append({
                    "boxes": gt_boxes[i:i + 1].cpu(),
                    "labels": class_ids[i:i + 1].cpu()
                })

    return {"predictions": all_predictions, "ground_truths": all_ground_truths}

def calculate_metrics(predictions, ground_truths, iou_threshold=0.5):
    """
    Calculate precision, recall, and IoU for the predictions.

    Boxes must be in corner format (x_min, y_min, x_max, y_max). A prediction is
    a true positive when its best-overlapping ground-truth box has an IoU above
    the threshold, carries the same label, and has not already been claimed by
    another prediction of the same image. Ground-truth boxes left unclaimed
    count as false negatives.

    Args:
        predictions (list): List of predicted bounding boxes and labels.
        ground_truths (list): List of ground truth bounding boxes and labels.
        iou_threshold (float): IoU threshold for matching predictions to ground truths.

    Returns:
        dict: Dictionary containing precision, recall, and IoU.
    """
    tp, fp, fn = 0, 0, 0
    iou_scores = []

    for pred, gt in zip(predictions, ground_truths):
        # Normalise shapes so a single box/label without a leading dimension
        # is handled the same way as a batch of boxes
        pred_boxes = torch.as_tensor(pred["boxes"]).reshape(-1, 4)
        pred_labels = torch.as_tensor(pred["labels"]).reshape(-1)
        gt_boxes = torch.as_tensor(gt["boxes"]).reshape(-1, 4)
        gt_labels = torch.as_tensor(gt["labels"]).reshape(-1)

        matched_gt = set()  # ground-truth indices already claimed in this image

        if len(pred_boxes) > 0 and len(gt_boxes) > 0:
            ious = torchvision.ops.box_iou(pred_boxes, gt_boxes)
            for i in range(len(pred_boxes)):
                best_iou, best_idx = ious[i].max(dim=0)
                gt_idx = best_idx.item()
                is_match = (
                    best_iou.item() > iou_threshold
                    and pred_labels[i].item() == gt_labels[gt_idx].item()
                    and gt_idx not in matched_gt
                )
                if is_match:
                    matched_gt.add(gt_idx)
                    tp += 1
                    iou_scores.append(best_iou.item())
                else:
                    fp += 1
        else:
            # Nothing to match against: every prediction is a false positive
            fp += len(pred_boxes)

        # Count misses per image; subtracting the running tp total here would
        # mix in the matches of all previous images
        fn += len(gt_boxes) - len(matched_gt)

    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    mean_iou = sum(iou_scores) / len(iou_scores) if iou_scores else 0

    return {"precision": precision, "recall": recall, "mean_iou": mean_iou}

# Run inference on the test dataset with the custom model
inference_results = run_inference(model, test_loader)

# Calculate performance metrics from the collected predictions and ground
# truths (default IoU threshold of 0.5)
metrics = calculate_metrics(inference_results["predictions"], inference_results["ground_truths"])

# Print metrics
print(f"Precision: {metrics['precision']:.4f}")
print(f"Recall: {metrics['recall']:.4f}")
print(f"Mean IoU: {metrics['mean_iou']:.4f}")

# Fine-Tuning a YOLO Model
Load a pre-trained YOLOv5 model and fine-tune it on the same synthetic dataset. Configure the model for the specific detection task with the appropriate number of classes.

In [None]:
# Install YOLOv5 dependencies
!pip install -q ultralytics

# Import YOLOv5 from ultralytics
from ultralytics import YOLO

# Load the pre-trained YOLOv5 model
yolo_model = YOLO('yolov5s.pt')  # Use the small YOLOv5 model for fine-tuning

# Update the model configuration for the custom dataset
yolo_model.model.nc = len(class_names)  # Set the number of classes
yolo_model.model.names = class_names  # Set the class names

# Define paths for training, validation, and test datasets
train_data_path = os.path.join(dataset_dir, "train")
val_data_path = os.path.join(dataset_dir, "val")
test_data_path = os.path.join(dataset_dir, "test")

# Fine-tune the YOLOv5 model on the custom dataset
yolo_model.train(
    data={
        'train': train_data_path,
        'val': val_data_path,
        'names': class_names
    },
    epochs=30,  # Number of epochs for fine-tuning
    batch=16,  # Batch size
    imgsz=640,  # Image size
    device=device,  # Use GPU if available
    project='yolo_finetune',  # Directory to save results
    name='custom_yolo'  # Name of the experiment
)

# Evaluate the fine-tuned YOLOv5 model on the test dataset
results = yolo_model.val(data=test_data_path, imgsz=640, batch=16, device=device)

# Print evaluation metrics
print(f"Precision: {results['metrics']['precision']:.4f}")
print(f"Recall: {results['metrics']['recall']:.4f}")
print(f"mAP@0.5: {results['metrics']['map50']:.4f}")
print(f"mAP@0.5:0.95: {results['metrics']['map']:.4f}")

# Visualize predictions on a batch of test images
test_images = [os.path.join(test_data_path, "images", f) for f in os.listdir(os.path.join(test_data_path, "images"))]
predictions = yolo_model.predict(source=test_images[:8], imgsz=640, conf=0.25, device=device)

# Display predictions
for pred in predictions:
    pred.plot()  # Visualize predictions with bounding boxes and labels

# Evaluating the Models
Compare the performance of both models (custom and YOLO) using standard metrics like mAP, precision, recall, and F1-score. Visualize detection results from both models on test images.

In [None]:
from sklearn.metrics import precision_recall_fscore_support, average_precision_score
from torchvision.ops import box_iou

def evaluate_model(model, dataloader, class_names, iou_threshold=0.5, device=device):
    """
    Evaluate the performance of the object detection model using standard metrics.

    The model is expected to return ``(class_logits, bbox_preds)`` with one
    class distribution and one box per image. Classification quality is
    measured with weighted precision/recall/F1 and a mean average precision
    over the classes present in the targets; localisation quality with the
    mean IoU of the boxes whose IoU exceeds ``iou_threshold``.

    Args:
        model (nn.Module): Trained object detection model.
        dataloader (DataLoader): DataLoader yielding (images, bboxes, class_ids).
        class_names (list): List of class names (not used by the computation;
            kept so the call signature stays the same).
        iou_threshold (float): IoU threshold for matching predictions to ground truths.
        device (torch.device): Device to run inference on.

    Returns:
        dict: Dictionary containing "precision", "recall", "f1_score",
        "mean_iou" and "mAP". All values are 0 when the dataloader is empty.
        "mAP" is the macro-averaged one-vs-rest average precision of the
        class probabilities, not a COCO-style detection mAP.
    """
    # Imported here so the function does not depend on an extra cell-level import.
    from torchvision.ops import box_convert

    model.eval()
    y_true = []
    y_pred = []
    prob_batches = []
    iou_scores = []

    with torch.no_grad():
        for images, bboxes, class_ids in tqdm(dataloader, desc="Evaluating"):
            images = images.to(device)
            bboxes = bboxes.to(device)
            class_ids = class_ids.to(device)

            # Forward pass
            class_logits, bbox_preds = model(images)
            class_probs = torch.softmax(class_logits, dim=1)

            # box_iou needs (N, 4) corner-format boxes. Predictions are decoded
            # elsewhere in this notebook as (x_center, y_center, width, height).
            # NOTE(review): assumes the ground-truth boxes use that same
            # centre format - confirm against the Dataset class.
            pred_xyxy = box_convert(bbox_preds.reshape(-1, 4), in_fmt="cxcywh", out_fmt="xyxy")
            gt_xyxy = box_convert(bboxes.reshape(-1, 4), in_fmt="cxcywh", out_fmt="xyxy")

            # Image i is only compared with its own target: the diagonal of
            # the pairwise IoU matrix.
            pair_ious = box_iou(pred_xyxy, gt_xyxy).diagonal()

            y_true.extend(class_ids.reshape(-1).tolist())
            y_pred.extend(class_probs.argmax(dim=1).tolist())
            prob_batches.append(class_probs.cpu())
            iou_scores.extend(pair_ious[pair_ious > iou_threshold].tolist())

    # An empty dataloader leaves nothing to score; sklearn would raise on it.
    if not y_true:
        return {"precision": 0, "recall": 0, "f1_score": 0, "mean_iou": 0, "mAP": 0}

    # Calculate precision, recall and F1-score from the hard class decisions
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )
    mean_iou = sum(iou_scores) / len(iou_scores) if iou_scores else 0

    # Average precision ranks samples by a continuous score, so it is fed the
    # class probabilities (one-vs-rest), not the argmax labels. Classes that
    # never occur in the targets have no positives and are left out.
    targets = np.asarray(y_true, dtype=int)
    probabilities = torch.cat(prob_batches).numpy()
    ap_per_class = [
        average_precision_score((targets == class_idx).astype(int), probabilities[:, class_idx])
        for class_idx in np.unique(targets)
    ]
    map_score = float(np.mean(ap_per_class))

    return {
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "mean_iou": mean_iou,
        "mAP": map_score
    }

# Score the custom detector on the held-out test split
custom_model_metrics = evaluate_model(model, test_loader, class_names)

# Report the custom model's metrics, one per line, to four decimal places
print(
    "Custom Model Evaluation:",
    f"Precision: {custom_model_metrics['precision']:.4f}",
    f"Recall: {custom_model_metrics['recall']:.4f}",
    f"F1-Score: {custom_model_metrics['f1_score']:.4f}",
    f"Mean IoU: {custom_model_metrics['mean_iou']:.4f}",
    f"mAP: {custom_model_metrics['mAP']:.4f}",
    sep="\n",
)

# Evaluate the YOLO model
yolo_predictions = yolo_model.predict(source=os.path.join(dataset_dir, "test", "images"), imgsz=640, conf=0.25, device=device)

# val() takes a dataset YAML, not an image directory. Leaving `data` out makes
# Ultralytics reuse the dataset configuration the model was trained with;
# split="test" then selects its test images.
# NOTE(review): requires that dataset YAML to define a `test` entry - confirm.
yolo_metrics = yolo_model.val(split="test", imgsz=640, batch=16, device=device)

# Print evaluation metrics for the YOLO model; the returned metrics object
# exposes the box-level means as attributes rather than dictionary entries
print("\nYOLO Model Evaluation:")
print(f"Precision: {yolo_metrics.box.mp:.4f}")
print(f"Recall: {yolo_metrics.box.mr:.4f}")
print(f"mAP@0.5: {yolo_metrics.box.map50:.4f}")
print(f"mAP@0.5:0.95: {yolo_metrics.box.map:.4f}")

# Visualize predictions for both models
def visualize_predictions(model, dataloader, class_names, num_images=4, device=device):
    """
    Visualize predictions from the model on test images.

    Takes the first batch of ``dataloader``, runs the model on up to
    ``num_images`` of its images and shows them side by side, each titled
    with the predicted class name and overlaid with the predicted box in red.

    Args:
        model (nn.Module): Trained object detection model returning
            ``(class_logits, bbox_preds)``.
        dataloader (DataLoader): DataLoader for the test dataset.
        class_names (list): List of class names.
        num_images (int): Maximum number of images to visualize; fewer are
            shown when the first batch is smaller.
        device (torch.device): Device to run inference on.
    """
    model.eval()
    images, _, _ = next(iter(dataloader))
    images = images[:num_images].to(device)

    # The batch may hold fewer samples than requested; indexing up to
    # num_images regardless would run past the end of the tensors.
    num_shown = images.size(0)
    if num_shown == 0:
        return

    with torch.no_grad():
        class_logits, bbox_preds = model(images)
        class_probs = torch.softmax(class_logits, dim=1)

    # Undo the input normalisation for display.
    # NOTE(review): these are the ImageNet mean/std - assumes the dataset
    # transform normalised with the same values; confirm.
    channel_std = np.array([0.229, 0.224, 0.225])
    channel_mean = np.array([0.485, 0.456, 0.406])

    plt.figure(figsize=(15, 5))
    for i, (image, box, probs) in enumerate(zip(images, bbox_preds, class_probs)):
        # (C, H, W) tensor -> (H, W, C) array with values clipped to [0, 1]
        img = image.cpu().permute(1, 2, 0).numpy()
        img = np.clip(img * channel_std + channel_mean, 0, 1)

        pred_bbox = box.cpu().numpy()
        pred_class = probs.argmax().item()

        plt.subplot(1, num_shown, i + 1)
        plt.imshow(img)
        plt.title(f"Pred: {class_names[pred_class]}")
        plt.axis("off")

        # The box is decoded as centre/size fractions of the image and scaled
        # to pixels: img.shape[1] is the width, img.shape[0] the height.
        x_center, y_center, width, height = pred_bbox
        x_min = (x_center - width / 2) * img.shape[1]
        y_min = (y_center - height / 2) * img.shape[0]
        x_max = (x_center + width / 2) * img.shape[1]
        y_max = (y_center + height / 2) * img.shape[0]

        rect = plt.Rectangle((x_min, y_min), x_max - x_min, y_max - y_min, edgecolor="red", fill=False, linewidth=2)
        plt.gca().add_patch(rect)

    plt.tight_layout()
    plt.show()

# Visualize predictions for the custom model
print("\nCustom Model Predictions:")
visualize_predictions(model, test_loader, class_names)

# Visualize predictions for the YOLO model
print("\nYOLO Model Predictions:")
yolo_predictions = yolo_model.predict(source=os.path.join(dataset_dir, "test", "images"), imgsz=640, conf=0.25, device=device)
for pred in yolo_predictions[:4]:
    # plot() only returns the annotated image (BGR channel order); draw it
    # explicitly, flipping the channels to RGB for matplotlib.
    annotated_bgr = pred.plot()
    plt.figure(figsize=(8, 8))
    plt.imshow(annotated_bgr[..., ::-1])
    plt.axis("off")
    plt.show()