In [None]:
!pip install -q clearml torchmetrics pillow

In [None]:
import os

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
os.environ['CLEARML_API_ACCESS_KEY'] = user_secrets.get_secret("CLEARML_API_ACCESS_KEY")
os.environ['CLEARML_API_SECRET_KEY'] = user_secrets.get_secret("CLEARML_API_SECRET_KEY")
%env CLEARML_WEB_HOST=https://app.clear.ml/
%env CLEARML_API_HOST=https://api.clear.ml
%env CLEARML_FILES_HOST=https://files.clear.ml

In [None]:
import os
import random
import time
import xml.etree.ElementTree as ET
from types import SimpleNamespace

import torch
from clearml import Task
from PIL import Image, ImageDraw
from torch import nn
from torch.optim import SGD
from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from torchmetrics.detection.mean_ap import MeanAveragePrecision
from torchvision.models import ResNet50_Weights
from torchvision.models.detection import retinanet_resnet50_fpn
from torchvision.transforms import ColorJitter, InterpolationMode, Normalize, ToTensor
from torchvision.transforms import functional as F
from tqdm import tqdm

In [None]:
# Central run configuration. Every tunable used further down the notebook is
# read from this namespace, and it is also what gets connected to the ClearML
# task so the values are recorded with the experiment.
args = SimpleNamespace(
    # Paths
    # voc_root must contain the ImageSets/Main, Annotations and JPEGImages folders.
    voc_root="/kaggle/input/pascal-voc-2012/VOC2012",
    # Parent directory; each run gets its own numbered sub-directory inside it.
    base_output_dir="/kaggle/working/runs",
    # Names of the split files (without ".txt") under ImageSets/Main.
    train_set="train",
    val_set="val",
    # Path of a checkpoint to resume from, or None to start from scratch.
    resume_checkpoint=None,

    # Training hyperparameters
    epochs=50,
    batch_size=4,
    # Used both as the optimizer's lr and as the peak (max_lr) of the OneCycle schedule.
    lr=1e-3,
    weight_decay=1e-4,
    momentum=0.9,

    # Image resize / augmentation
    # Training: the short side is drawn uniformly from [min_short_size, max_short_size]
    # and the long side is capped at max_long_size.
    min_short_size=400,
    max_short_size=800,
    max_long_size=1333,
    # Validation: fixed short side, long side capped at val_long_size.
    val_short_size=600,
    val_long_size=1000,

    # Data loader
    # Number of DataLoader worker processes.
    workers=2,

    # Model
    # Forwarded to retinanet_resnet50_fpn(trainable_backbone_layers=...).
    trainable_backbone_layers=3,
)

# Make sure the parent directory that holds all runs exists before scanning it.
os.makedirs(args.base_output_dir, exist_ok=True)

# Earlier runs live in sub-directories whose names are purely numeric.
existing = [
    name for name in os.listdir(args.base_output_dir)
    if name.isdigit() and os.path.isdir(os.path.join(args.base_output_dir, name))
]

# The new run takes the next free number (1 when there are no earlier runs).
nums = sorted(map(int, existing))
if nums:
    next_num = nums[-1] + 1
else:
    next_num = 1

# exist_ok=False: fail loudly rather than write into an existing run directory.
new_output = os.path.join(args.base_output_dir, str(next_num))
os.makedirs(new_output, exist_ok=False)
args.output_dir = new_output

print(f"Writing run outputs to: {args.output_dir}")

In [None]:
# Tags attached to the ClearML task. Most follow a "key:value" convention;
# the bare "training" tag marks the kind of run.
experiment_tags = [
    "model_name:retinanet",
    "dataset:voc2012",
    "platform:kaggle",
    "author:hussain",
    "account:hussainsyed.dev@gmail.com",
    "training"
]

# Register this run with ClearML.
# NOTE(review): reuse_last_task_id=False presumably requests a brand-new task
# on every execution instead of overwriting the previous one; confirm against
# the ClearML Task.init documentation.
task = Task.init(
    project_name="CMT318-Object-Detection",
    task_name="RetinaNet_Training",
    tags=experiment_tags,
    reuse_last_task_id=False
)
# Record the run configuration (the attribute dict of `args`) with the task.
task.connect(vars(args))
# Module-level logger used by the training / evaluation helpers below.
logger = task.get_logger()

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

In [None]:
# The 20 object categories, in the order that defines their label indices.
CLASS_NAMES = [
    "aeroplane", "bicycle", "bird", "boat",
    "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog",
    "horse", "motorbike", "person", "pottedplant",
    "sheep", "sofa", "train", "tvmonitor",
]
# Labels are 1-based; index 0 is left for the background slot.
CLASS_NAME_TO_IDX = {name: idx for idx, name in enumerate(CLASS_NAMES, start=1)}
# One extra slot on top of the foreground classes (the background index 0).
NUM_CLASSES = 1 + len(CLASS_NAMES)

In [None]:
# Per-channel statistics handed to Normalize by the transforms below.
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
# Photometric augmentation applied to training images only.
color_aug = ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.1)

def get_train_transform(min_s, max_short, max_long, mean, std):
    """Build the training transform: random-scale resize, colour jitter, random flip.

    Args:
        min_s: Smallest length the short image side may be resized to.
        max_short: Largest length the short image side may be resized to.
        max_long: Upper bound for the long image side after resizing.
        mean: Per-channel means passed to ``Normalize``.
        std: Per-channel standard deviations passed to ``Normalize``.

    Returns:
        A callable ``transform(img, target)``. ``img`` is a PIL image and
        ``target`` a dict with an ``N x 4`` ``'boxes'`` tensor (x1, y1, x2, y2)
        and a ``'labels'`` tensor. It returns the normalised image tensor and
        the same ``target`` dict with boxes rescaled, flipped, clamped to the
        image and degenerate boxes (and their labels) removed.
    """
    # Built once per factory call instead of once per sample.
    to_tensor = ToTensor()
    normalize = Normalize(mean, std)

    def transform(img, target):
        w, h = img.size

        # Multi-scale training: pick a random length for the short side and
        # keep the aspect ratio.
        short = random.randint(min_s, max_short)
        if w < h:
            new_w, new_h = short, int(short * h / w)
        else:
            new_h, new_w = short, int(short * w / h)

        # Shrink again if the long side would exceed the allowed maximum.
        long_dim = max(new_w, new_h)
        if long_dim > max_long:
            scale = max_long / long_dim
            new_w, new_h = int(new_w * scale), int(new_h * scale)

        img = F.resize(img, [new_h, new_w], interpolation=InterpolationMode.BILINEAR)
        sx, sy = new_w / w, new_h / h

        # Work on a copy so the caller's tensor is never modified.
        boxes = target['boxes'].clone()
        boxes[:, [0,2]] *= sx
        boxes[:, [1,3]] *= sy

        img = color_aug(img)
        if random.random() < 0.5:
            img = F.hflip(img)
            # Mirror the x coordinates; x1/x2 swap roles so that x1 <= x2 holds.
            x1, x2 = boxes[:,0].clone(), boxes[:,2].clone()
            boxes[:,0] = new_w - x2
            boxes[:,2] = new_w - x1

        # Clamp through strided slices, which are views of `boxes`. Indexing
        # with a list (`boxes[:, [0, 2]]`) returns a copy, so an in-place
        # clamp on it would silently leave `boxes` unchanged.
        boxes[:, 0::2].clamp_(min=0, max=new_w)
        boxes[:, 1::2].clamp_(min=0, max=new_h)

        # Drop boxes that have collapsed to zero width or height.
        keep = (boxes[:,2] - boxes[:,0] > 0) & (boxes[:,3] - boxes[:,1] > 0)
        boxes = boxes[keep]

        target['boxes'] = boxes
        target['labels'] = target['labels'][keep]

        img = to_tensor(img)
        img = normalize(img)
        return img, target

    return transform

def get_val_transform(short_size, max_long, mean, std):
    """Build the deterministic validation transform: fixed-scale resize and normalise.

    Args:
        short_size: Length the short image side is resized to.
        max_long: Upper bound for the long image side after resizing.
        mean: Per-channel means passed to ``Normalize``.
        std: Per-channel standard deviations passed to ``Normalize``.

    Returns:
        A callable ``transform(img, target)``. ``img`` is a PIL image and
        ``target`` a dict with an ``N x 4`` ``'boxes'`` tensor (x1, y1, x2, y2)
        and a ``'labels'`` tensor. It returns the normalised image tensor and
        the same ``target`` dict with boxes rescaled, clamped to the image and
        degenerate boxes (and their labels) removed.
    """
    # Built once per factory call instead of once per sample.
    to_tensor = ToTensor()
    normalize = Normalize(mean, std)

    def transform(img, target):
        w, h = img.size

        # Resize the short side to `short_size`, keeping the aspect ratio.
        if w < h:
            new_w, new_h = short_size, int(short_size * h / w)
        else:
            new_h, new_w = short_size, int(short_size * w / h)

        # Shrink again if the long side would exceed the allowed maximum.
        long_dim = max(new_w, new_h)
        if long_dim > max_long:
            scale = max_long / long_dim
            new_w, new_h = int(new_w * scale), int(new_h * scale)

        img = F.resize(img, [new_h, new_w], interpolation=InterpolationMode.BILINEAR)
        sx, sy = new_w / w, new_h / h

        # Work on a copy so the caller's tensor is never modified.
        boxes = target['boxes'].clone()
        boxes[:, [0,2]] *= sx
        boxes[:, [1,3]] *= sy

        # Clamp through strided slices, which are views of `boxes`. Indexing
        # with a list (`boxes[:, [0, 2]]`) returns a copy, so an in-place
        # clamp on it would silently leave `boxes` unchanged.
        boxes[:, 0::2].clamp_(min=0, max=new_w)
        boxes[:, 1::2].clamp_(min=0, max=new_h)

        # Drop boxes that have collapsed to zero width or height.
        keep = (boxes[:,2] - boxes[:,0] > 0) & (boxes[:,3] - boxes[:,1] > 0)
        boxes = boxes[keep]

        target['boxes'] = boxes
        target['labels'] = target['labels'][keep]

        img = to_tensor(img)
        img = normalize(img)
        return img, target

    return transform

In [None]:
class VOCDataset(Dataset):
    """Detection dataset read from a VOC-style directory layout.

    All XML annotations of the requested split are parsed once in
    ``__init__``; images are opened lazily in ``__getitem__``. Images that end
    up without any usable object are left out of the dataset.

    Args:
        root: Directory containing ``ImageSets/Main``, ``Annotations`` and
            ``JPEGImages``.
        image_set: Name of the split file (without ``.txt``) in ``ImageSets/Main``.
        transforms: Optional callable ``(img, target) -> (img, target)``.
    """

    def __init__(self, root, image_set, transforms=None):
        self.root = root

        # Read the non-empty lines of the split file: one image id per line.
        split_path = os.path.join(root, "ImageSets", "Main", f"{image_set}.txt")
        with open(split_path) as handle:
            stripped_lines = (line.strip() for line in handle)
            img_ids = [image_id for image_id in stripped_lines if image_id]

        self.annotations = []
        skipped_images = 0
        for img_id in img_ids:
            boxes, labels = self._read_objects(
                os.path.join(root, "Annotations", f"{img_id}.xml")
            )
            if labels:
                self.annotations.append({
                    "id": img_id,
                    "boxes": torch.tensor(boxes, dtype=torch.float32),
                    "labels": torch.tensor(labels, dtype=torch.int64),
                })
            else:
                # Nothing usable in this image: report it and leave it out.
                print(f"No labels found for image id: {img_id}")
                skipped_images += 1

        print(f"{image_set}.txt: Skipped {skipped_images} images")
        assert self.annotations, f"No annotations for split {image_set}"
        self.transforms = transforms

    @staticmethod
    def _read_objects(xml_path):
        """Return ``(boxes, labels)`` for the usable objects in one annotation file.

        Objects whose class name is not in ``CLASS_NAME_TO_IDX`` and boxes with
        non-positive width or height are ignored.
        """
        boxes, labels = [], []
        for obj in ET.parse(xml_path).getroot().findall("object"):
            label = CLASS_NAME_TO_IDX.get(obj.find("name").text)
            if label is None:
                continue
            bndbox = obj.find("bndbox")
            xmin, ymin, xmax, ymax = (
                float(bndbox.find(tag).text)
                for tag in ("xmin", "ymin", "xmax", "ymax")
            )
            if not (xmax <= xmin or ymax <= ymin):
                boxes.append([xmin, ymin, xmax, ymax])
                labels.append(label)
        return boxes, labels

    def __len__(self):
        """Number of images that have at least one usable annotation."""
        return len(self.annotations)

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB and return ``(img, target)``.

        ``target`` holds clones of the stored ``boxes`` and ``labels`` tensors,
        so transforms can modify them without touching the cached annotations.
        """
        record = self.annotations[idx]
        img_path = os.path.join(self.root, "JPEGImages", f"{record['id']}.jpg")
        img = Image.open(img_path).convert("RGB")
        target = {key: record[key].clone() for key in ("boxes", "labels")}
        if self.transforms:
            img, target = self.transforms(img, target)
        return img, target

def collate_fn(batch):
    """Regroup a list of per-sample tuples into one tuple per field.

    A batch ``[(img0, tgt0), (img1, tgt1)]`` becomes
    ``((img0, img1), (tgt0, tgt1))``; nothing is stacked, so samples may have
    different sizes.
    """
    return tuple(tuple(field) for field in zip(*batch))


In [None]:
# Training split: random-scale resize, colour jitter and random flip.
train_transform = get_train_transform(
    args.min_short_size,
    args.max_short_size,
    args.max_long_size,
    mean,
    std,
)
train_ds = VOCDataset(args.voc_root, args.train_set, transforms=train_transform)

# Validation split: fixed-scale resize only.
val_transform = get_val_transform(
    args.val_short_size,
    args.val_long_size,
    mean,
    std,
)
val_ds = VOCDataset(args.voc_root, args.val_set, transforms=val_transform)

print(f"Train examples: {len(train_ds)}, Val examples: {len(val_ds)}")

In [None]:
# Count how many boxes of every foreground class occur in the training split.
counts = {class_idx: 0 for class_idx in range(1, NUM_CLASSES)}
for ann in train_ds.annotations:
    for lbl in ann["labels"].tolist():
        counts[lbl] += 1

# Floor each count at 1 so the reciprocal below is defined for every class.
counts = {k: max(1, v) for k, v in counts.items()}

# Each image is weighted by the mean inverse class frequency of its boxes, so
# images containing rarer classes are drawn more often.
weights = []
for ann in train_ds.annotations:
    image_labels = ann["labels"].tolist()
    inverse_freq_sum = sum(1.0 / counts[lbl] for lbl in image_labels)
    weights.append(inverse_freq_sum / len(image_labels))

# One epoch still draws len(weights) samples, but with replacement.
sampler = WeightedRandomSampler(weights, num_samples=len(weights), replacement=True)

In [None]:
# Settings shared by the training and validation loaders.
loader_kwargs = dict(
    batch_size=args.batch_size,
    num_workers=args.workers,
    pin_memory=torch.cuda.is_available(),
    collate_fn=collate_fn,
)

# Training batches are drawn through the class-balancing sampler.
train_loader = DataLoader(train_ds, sampler=sampler, **loader_kwargs)
# Validation keeps the dataset order.
val_loader = DataLoader(val_ds, shuffle=False, **loader_kwargs)

print(f"Train batches: {len(train_loader)}, Val batches: {len(val_loader)}")

In [None]:
# No pretrained detector weights are loaded (weights=None); only the ResNet-50
# backbone starts from ImageNet weights.
model = retinanet_resnet50_fpn(
    weights=None,
    weights_backbone=ResNet50_Weights.IMAGENET1K_V1,
    num_classes=NUM_CLASSES,
    trainable_backbone_layers=args.trainable_backbone_layers,
)
model = model.to(device)

# Plain SGD with momentum over all model parameters.
optimizer = SGD(
    model.parameters(),
    lr=args.lr,
    momentum=args.momentum,
    weight_decay=args.weight_decay,
)

# One-cycle schedule stepped once per batch: it is sized for
# epochs * len(train_loader) steps and peaks at args.lr.
scheduler = OneCycleLR(
    optimizer,
    max_lr=args.lr,
    epochs=args.epochs,
    steps_per_epoch=len(train_loader),
    pct_start=0.3,
    div_factor=25,
)

# Loss scaler used by the mixed-precision training step.
scaler = torch.amp.GradScaler(device)

print(f"Starting LR: {scheduler.get_last_lr()[0]:.2e}")

In [None]:
# Epochs are numbered from 1; a successful resume moves this past the epoch
# stored in the checkpoint.
start_epoch = 1
if args.resume_checkpoint:
    if os.path.exists(args.resume_checkpoint):
        ck = torch.load(args.resume_checkpoint, map_location=device)
        model.load_state_dict(ck["model_state_dict"])
        optimizer.load_state_dict(ck["optimizer_state_dict"])
        scheduler.load_state_dict(ck["scheduler_state_dict"])
        # GradScaler.load_state_dict raises on an empty state dict, so the
        # scaler is only restored when the checkpoint actually carries one;
        # otherwise the freshly initialised scaler is kept.
        scaler_state = ck.get("scaler_state_dict")
        if scaler_state:
            scaler.load_state_dict(scaler_state)
        start_epoch = ck.get("epoch", 1) + 1
        print(f"Resumed from checkpoint '{args.resume_checkpoint}', starting at epoch {start_epoch}")
    else:
        # Do not silently fall back to a fresh run when a resume was requested.
        print(f"WARNING: resume checkpoint '{args.resume_checkpoint}' not found; training from scratch")


def train_one_epoch(model, loader, optimizer, scheduler, epoch, scaler, device):
    """Run one mixed-precision training epoch and log the averaged losses.

    Args:
        model: Detection model that returns a loss dict with the keys
            ``'classification'`` and ``'bbox_regression'`` in training mode.
        loader: Iterable yielding ``(images, targets)`` batches.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Learning-rate scheduler; stepped once per batch.
        epoch: Epoch number, used for the progress bar and as the logging iteration.
        scaler: ``GradScaler`` used for loss scaling.
        device: Device string the batch is moved to; also passed to ``autocast``.

    Returns:
        float: Mean total loss (classification + box regression) per batch.
    """
    model.train()
    total_loss = 0.0
    total_cls_loss = 0.0
    total_box_loss = 0.0

    for imgs, targets in tqdm(loader, desc=f"Train Epoch {epoch}", leave=False):
        imgs = [img.to(device) for img in imgs]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        optimizer.zero_grad()
        with torch.amp.autocast(device):
            loss_dict = model(imgs, targets)
            # Keep the two components separate so they can be logged individually.
            cls_loss = loss_dict['classification']
            box_loss = loss_dict['bbox_regression']
            loss = cls_loss + box_loss

        scaler.scale(loss).backward()
        # Unscale before clipping so the threshold applies to the true
        # gradient magnitudes rather than the loss-scaled ones.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        total_loss     += loss.item()
        total_cls_loss += cls_loss.item()
        total_box_loss += box_loss.item()

    # Guard against an empty loader so the averages never divide by zero.
    n = max(len(loader), 1)
    avg_loss     = total_loss     / n
    avg_cls_loss = total_cls_loss / n
    avg_box_loss = total_box_loss / n
    lr = scheduler.get_last_lr()[0]

    print(f"Epoch {epoch} | Avg Train Loss: {avg_loss:.4f} | Avg Cls Loss: {avg_cls_loss:.4f} | Avg Box Loss: {avg_box_loss:.4f} | LR: {lr:.2e}")

    logger.report_scalar("Train/Loss/Total",          "epoch", iteration=epoch, value=avg_loss)
    logger.report_scalar("Train/Loss/Classification", "epoch", iteration=epoch, value=avg_cls_loss)
    logger.report_scalar("Train/Loss/BoxRegression",  "epoch", iteration=epoch, value=avg_box_loss)
    logger.report_scalar("Train/LearningRate",        "epoch", iteration=epoch, value=lr)

    # The caller stores this as `train_loss`; previously the function returned None.
    return avg_loss


@torch.no_grad()
def evaluate(model, loader, epoch):
    """Compute validation losses and mAP for one epoch and log them.

    Every batch is run through the model twice: once in training mode (with
    BatchNorm2d layers held in eval mode) to obtain the loss dict, and once in
    eval mode to obtain detections for the mAP metric.

    Args:
        model: Detection model; left in eval mode when the function returns.
        loader: Iterable yielding ``(images, targets)`` batches.
        epoch: Epoch number, used for the progress bar and as the logging iteration.

    Returns:
        tuple[float, float]: ``(average validation loss per batch, overall mAP)``.
    """
    model.eval()
    metric = MeanAveragePrecision(box_format='xyxy', class_metrics=True)

    total_val_loss     = 0.0
    total_val_cls_loss = 0.0
    total_val_box_loss = 0.0

    for imgs, targets in tqdm(loader, desc=f"Val Epoch {epoch}", leave=False):
        imgs_cuda    = [img.to(device) for img in imgs]
        targets_cuda = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # Loss pass: training mode makes the model return its loss dict.
        # BatchNorm2d layers are put back into eval mode so validation data
        # cannot update their running statistics.
        model.train()
        for m in model.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()

        loss_dict = model(imgs_cuda, targets_cuda)
        cls_loss = loss_dict['classification']
        box_loss = loss_dict['bbox_regression']
        batch_loss = (cls_loss + box_loss).item()

        total_val_loss     += batch_loss
        total_val_cls_loss += cls_loss.item()
        total_val_box_loss += box_loss.item()

        # Detection pass: eval mode makes the model return predictions.
        model.eval()
        outputs = model(imgs_cuda)
        preds = [
            {k: v.cpu() for k, v in out.items() if k in ('boxes','scores','labels')}
            for out in outputs
        ]
        metric.update(preds, targets)

    n = len(loader)
    avg_val_loss     = total_val_loss     / n
    avg_val_cls_loss = total_val_cls_loss / n
    avg_val_box_loss = total_val_box_loss / n

    results      = metric.compute()
    overall_map  = results["map"].item()
    # reshape(-1) also covers the case of a 0-dim tensor (single class).
    per_class_map = results["map_per_class"].reshape(-1)

    print(f"Epoch {epoch} | Avg Val Loss: {avg_val_loss:.4f} | Avg Cls Loss: {avg_val_cls_loss:.4f} | Avg Box Loss: {avg_val_box_loss:.4f} | mAP: {overall_map:.4f}")

    logger.report_scalar("Val/Loss/Total",          "epoch", iteration=epoch, value=avg_val_loss)
    logger.report_scalar("Val/Loss/Classification", "epoch", iteration=epoch, value=avg_val_cls_loss)
    logger.report_scalar("Val/Loss/BoxRegression",  "epoch", iteration=epoch, value=avg_val_box_loss)
    logger.report_scalar("Val/Detection/mAP",       "epoch", iteration=epoch, value=overall_map)

    # `map_per_class` has one entry per class id that occurred in the
    # predictions or ground truth, in the order given by `classes`. Look the
    # name up by id rather than by position, so a missing class or a stray
    # label-0 prediction cannot shift the names or index past CLASS_NAMES.
    class_ids = results.get("classes")
    if class_ids is None:
        # No class ids reported: fall back to assuming labels 1..N in order.
        class_ids = list(range(1, per_class_map.numel() + 1))
    else:
        class_ids = class_ids.reshape(-1).tolist()

    for class_id, ap in zip(class_ids, per_class_map.tolist()):
        if not 1 <= class_id <= len(CLASS_NAMES):
            # Label 0 (background slot) or an unexpected id: nothing to name.
            continue
        cls = CLASS_NAMES[class_id - 1]
        logger.report_scalar("Val/Detection/AP", cls, iteration=epoch, value=ap)

    return avg_val_loss, overall_map


In [None]:
def log_head_histograms(epoch, model):
    """Report weight (and, when present, gradient) histograms of the detection heads.

    Only parameters whose name contains ``head.classification_head`` or
    ``head.regression_head`` are reported. Each parameter becomes one series
    under the "Gradients" and "Weights" titles, with ``epoch`` as iteration.
    """
    head_markers = ("head.classification_head", "head.regression_head")

    def report(title, series, tensor):
        # Flatten to a 1-D NumPy array; 50 bins (Plotly's nbinsx parameter).
        logger.report_histogram(
            title=title,
            series=series,
            iteration=epoch,
            values=tensor.detach().cpu().numpy().ravel(),
            data_args={"nbinsx": 50},
        )

    for name, param in model.named_parameters():
        if not any(marker in name for marker in head_markers):
            continue
        # Parameters that have not received a gradient have nothing to plot.
        if param.grad is not None:
            report("Gradients", name, param.grad)
        report("Weights", name, param.data)


def log_sanity_images(epoch, model, val_loader, args, mean, std):
    """Save and report the first validation batch with boxes drawn on it.

    Ground-truth boxes are outlined in green and every predicted box in red.
    The PNGs are written to ``<args.output_dir>/sanity_images`` and reported
    to the logger. The model is switched back to training mode on exit.

    Args:
        epoch: Epoch number, used in file names and as the logging iteration.
        model: Detection model used to produce the predictions.
        val_loader: Loader whose first batch is visualised.
        args: Namespace providing ``output_dir``.
        mean: Per-channel means used to undo the input normalisation.
        std: Per-channel standard deviations used to undo the input normalisation.
    """
    def outline_boxes(canvas, boxes, colour):
        # Draw every (x1, y1, x2, y2) box as a 2-pixel outline.
        for box in boxes:
            x1, y1, x2, y2 = box.tolist()
            canvas.rectangle([x1, y1, x2, y2], outline=colour, width=2)

    model.eval()
    imgs, targets = next(iter(val_loader))
    with torch.no_grad():
        outputs = model([img.to(device) for img in imgs])

    save_dir = os.path.join(args.output_dir, "sanity_images")
    os.makedirs(save_dir, exist_ok=True)

    for i, (img_tensor, tgt, pred) in enumerate(zip(imgs, targets, outputs)):
        # Undo the normalisation channel by channel on a copy of the tensor.
        restored = img_tensor.clone()
        for channel, channel_mean, channel_std in zip(restored, mean, std):
            channel.mul_(channel_std).add_(channel_mean)

        # CHW float image -> HWC uint8 array -> PIL image.
        pixels = (restored.permute(1,2,0).cpu().numpy() * 255).astype("uint8")
        pil_img = Image.fromarray(pixels)

        canvas = ImageDraw.Draw(pil_img)
        outline_boxes(canvas, tgt["boxes"], "green")
        outline_boxes(canvas, pred["boxes"].cpu(), "red")

        path = os.path.join(save_dir, f"epoch{epoch}_sample{i}.png")
        pil_img.save(path)
        logger.report_image(
            title="Val Sample Detections",
            series=f"epoch_{epoch}",
            local_path=path,
            iteration=epoch,
        )

    model.train()

In [None]:
# Highest validation mAP seen so far; a checkpoint is written whenever it improves.
best_map = 0.0
for epoch in range(start_epoch, args.epochs + 1):
    epoch_start = time.time()

    train_loss = train_one_epoch(
        model, train_loader, optimizer, scheduler, epoch, scaler, device
    )
    val_loss, val_map = evaluate(model, val_loader, epoch)

    # Periodic diagnostics: head histograms on even epochs, annotated sample
    # images on every fifth epoch.
    if not epoch % 2:
        log_head_histograms(epoch, model)
    if not epoch % 5:
        log_sanity_images(epoch, model, val_loader, args, mean, std)

    # Wall-clock time of the epoch, including evaluation and diagnostics.
    epoch_time = time.time() - epoch_start
    logger.report_scalar(
        title="Time/Epoch",
        series="duration_seconds",
        iteration=epoch,
        value=epoch_time,
    )

    # Nothing to save unless this epoch beat the best mAP so far.
    if not val_map > best_map:
        continue

    best_map = val_map
    best_ckpt = os.path.join(args.output_dir, "best_model.pth")
    torch.save(
        {
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "scheduler_state_dict": scheduler.state_dict(),
            "scaler_state_dict": scaler.state_dict(),
            "epoch": epoch,
        },
        best_ckpt,
    )
    print(f"Epoch {epoch}: new best mAP = {best_map:.4f}, saved to {best_ckpt}")
    task.upload_artifact(name="best_model.pth", artifact_object=best_ckpt)

In [None]:
# Save the state reached after the last epoch, independent of its validation score.
final_path = os.path.join(args.output_dir, "final_model.pth")
final_state = {
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    "scheduler_state_dict": scheduler.state_dict(),
    "scaler_state_dict": scaler.state_dict(),
    "epoch": args.epochs,
}
torch.save(final_state, final_path)
print(f"Final model weights saved to {final_path}")

# Keep the best validation mAP next to the checkpoints as a small text file.
best_map_path = os.path.join(args.output_dir, "best_map.txt")
with open(best_map_path, "w") as f:
    print(f"best_map: {best_map:.4f}", file=f)

# Attach both files to the ClearML task, then finish the task.
for artifact_name, artifact_path in (
    ("final_model.pth", final_path),
    ("best_map.txt", best_map_path),
):
    task.upload_artifact(name=artifact_name, artifact_object=artifact_path)

task.close()
print("Training complete.")