This Jupyter notebook was created with the help of the following resources:
- https://rumn.medium.com/custom-pytorch-image-classifier-from-scratch-d7b3c50f9fbe
- https://github.com/lettuceDestroyer/image_classifier
- https://medium.com/@RobuRishabh/understanding-and-implementing-faster-r-cnn-248f7b25ff96

# Imports

In [None]:
import glob
import os
from tqdm import tqdm
import xml.etree.ElementTree as ET
import numpy as np
import pandas as pd
import matplotlib.image
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import PIL

from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2, FasterRCNN_ResNet50_FPN_V2_Weights
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.utils import draw_bounding_boxes

import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torchvision.io import decode_image, ImageReadMode

# Variables

In [None]:
# Folder on the local machine that holds the "train" and "test" dataset splits.
ROOT_FOLDER_PATH = "C:\\Users\\tobil\\Downloads\\archive"

# Number of labels the detection head predicts: the labels of your dataset
# plus one extra entry for the background.
NUMBER_OF_LABELS = 2

# Size (in pixels) every sample is resized to by the dataset.
IMAGE_WIDTH, IMAGE_HEIGHT = 800, 640

In [None]:
def resize_image_and_bbox(img, bboxes, new_height, new_width):
    """Resize a PIL image and rescale its bounding boxes to match.

    Args:
        img: PIL image; ``img.size`` is read as ``(width, height)``.
        bboxes: 2-D tensor with one box per row, columns indexed as
            ``[xmin, ymin, xmax, ymax]``. The tensor is left untouched.
        new_height: Target image height in pixels.
        new_width: Target image width in pixels.

    Returns:
        A tuple ``(img_as_tensor, scaled_bboxes)``: the resized image after
        ``transforms.ToTensor`` and a new tensor holding the boxes expressed
        in the coordinates of the resized image.
    """
    original_width, original_height = img.size

    # How many original pixels map onto one resized pixel along each axis.
    height_factor = original_height / new_height
    width_factor = original_width / new_width

    transform = transforms.Compose([
        transforms.Resize((new_height, new_width)),
        transforms.ToTensor(),
    ])
    img_as_tensor = transform(img)

    # Work on a copy: scaling in place would silently change the caller's
    # tensor, so reusing the same boxes for a second call would scale twice.
    scaled_bboxes = bboxes.clone()
    # Scale x coordinates (xmin, xmax)
    scaled_bboxes[:, [0, 2]] = scaled_bboxes[:, [0, 2]] / width_factor
    # Scale y coordinates (ymin, ymax)
    scaled_bboxes[:, [1, 3]] = scaled_bboxes[:, [1, 3]] / height_factor

    return img_as_tensor, scaled_bboxes

# Datasets and Dataloaders

In [None]:
# Image pipeline: resize to 128x128, then mirror horizontally with a
# probability of 0.5.
# NOTE(review): none of the visible cells applies this pipeline (the dataset
# resizes through resize_image_and_bbox instead) -- confirm it is still needed.
transform = transforms.Compose([
    transforms.Resize(size=(128, 128)),
    transforms.RandomHorizontalFlip(0.5),
])

# Model Definition

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Build Faster R-CNN (ResNet-50 FPN v2) from the default pretrained weights.
# box_score_thresh is forwarded to the model constructor; per the torchvision
# docs it is the minimum score a detection needs to be returned at inference.
weights = FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT
model = fasterrcnn_resnet50_fpn_v2(weights=weights, box_score_thresh=0.9)

# Get the number of input features for the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features

# Replace the head of the model with a new one (for the number of labels in your dataset)
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, NUMBER_OF_LABELS)

# The training loop moves images and targets to `device`; the parameters have
# to live there too, otherwise the forward pass fails on a CUDA machine.
model = model.to(device)

In [None]:
import PIL.Image


class CustomDataset(Dataset):
    """Object-detection dataset read from XML label files.

    Every ``*.xml`` file under ``<root_dir>/labels/VOC`` is one sample. The
    matching image is looked up in ``<root_dir>/images`` using the file name
    stored in the label's ``<path>`` element.
    """

    def __init__(self, root_dir):
        """Collect the label files found below ``root_dir``.

        Args:
            root_dir: Folder containing the ``labels/VOC`` and ``images``
                sub-folders.
        """
        self.root_dir = root_dir
        self.class_lbl = "hand"
        # glob.glob returns matches in arbitrary order; sorting makes an
        # index refer to the same sample on every run and machine.
        self.label_paths = sorted(
            glob.glob(os.path.join(root_dir, "labels", "VOC", "*.xml"))
        )

    def __len__(self):
        """Return the number of label files, i.e. the number of samples."""
        return len(self.label_paths)

    def __getitem__(self, index):
        """Load one sample.

        Args:
            index: Position of the label file in ``self.label_paths``.

        Returns:
            A tuple ``(img_as_tensor, target)`` where ``target`` is a dict
            with the keys ``"boxes"`` (float32 tensor, one row
            ``[xmin, ymin, xmax, ymax]``) and ``"labels"`` (tensor ``[1]``).
        """
        root = ET.parse(self.label_paths[index]).getroot()

        # <path> is split on backslashes; only the last component (the file
        # name) is used to locate the image.
        img_name = root.find("path").text.split("\\")[-1]
        img_path = os.path.join(self.root_dir, "images", img_name)

        # Force three channels so grayscale / RGBA files do not reach the
        # model with a different channel count, and close the file handle as
        # soon as the pixel data has been copied by convert().
        with PIL.Image.open(img_path) as img_file:
            img = img_file.convert("RGB")

        # NOTE(review): only the first matching bounding box is read; label
        # files with several <object> entries lose the remaining ones.
        xmin, ymin, xmax, ymax = (
            float(root.find(f"object/bndbox/{tag}").text)
            for tag in ("xmin", "ymin", "xmax", "ymax")
        )

        # Every sample carries label 1 (presumably 0 is kept for background).
        labels = torch.tensor([1])
        bboxes = torch.tensor([[xmin, ymin, xmax, ymax]], dtype=torch.float32)

        img_as_tensor, bboxes = resize_image_and_bbox(img, bboxes, IMAGE_HEIGHT, IMAGE_WIDTH)

        target = {"boxes": bboxes, "labels": labels}

        return img_as_tensor, target

In [None]:
# One dataset per split; each split lives in its own sub-folder of
# ROOT_FOLDER_PATH.
train_set, test_set = (
    CustomDataset(os.path.join(ROOT_FOLDER_PATH, split))
    for split in ("train", "test")
)

In [None]:
def _collate_detection_batch(batch):
    """Regroup ``(image, target)`` samples into ``(images, targets)`` tuples.

    The default collate function would stack the images into one tensor and
    merge the target dicts into a single dict of stacked tensors. A loop that
    iterates ``targets`` as a sequence of per-image dicts needs them kept
    separate, which is what this function does.

    Args:
        batch: List of ``(image, target)`` pairs produced by the dataset.

    Returns:
        A tuple ``(images, targets)`` of equally long tuples.
    """
    return tuple(zip(*batch))


# Loaders for both splits, eight samples per batch, reshuffled every epoch.
dataloaders = {
    "train": DataLoader(
        train_set,
        batch_size=8,
        shuffle=True,
        collate_fn=_collate_detection_batch,
    ),
    "test": DataLoader(
        test_set,
        batch_size=8,
        shuffle=True,
        collate_fn=_collate_detection_batch,
    ),
}

# Training

In [None]:
# EPOCHS: number of passes over the training data.
# NUM_CLASSES: NOTE(review): not referenced by the visible training code --
# confirm whether later cells still need it.
EPOCHS, NUM_CLASSES = 10, 1

In [None]:
# The batches are moved to `device` below, so the parameters have to live
# there as well (this is a no-op when the model is already on that device).
model.to(device)

# Set up the optimizer (only parameters that require gradients are updated)
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
# Learning rate scheduler: multiply the learning rate by 0.1 every 3 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

# Guard against an empty loader so the average below cannot divide by zero.
num_train_batches = max(len(dataloaders["train"]), 1)

# Train the model
for epoch in range(EPOCHS):
    model.train()
    train_loss = 0.0

    # Training loop
    # NOTE(review): expects each batch to provide `targets` as a sequence of
    # per-image dicts; DataLoader's default collate merges them into a single
    # dict instead -- confirm the loader uses a detection-style collate_fn.
    for images, targets in dataloaders["train"]:
        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass: in training mode the model returns a dict of losses,
        # which are added up into one scalar to back-propagate.
        loss_dict = model(images, targets)
        losses = sum(loss_dict.values())

        # Backward pass
        losses.backward()
        optimizer.step()
        train_loss += losses.item()

    # Update the learning rate
    lr_scheduler.step()
    print(f'Epoch: {epoch + 1}, Loss: {train_loss / num_train_batches}')
print("Training complete!")

# Testing