In [1]:
import torch
import torch.nn as nn
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torch.utils.data import DataLoader
from torchvision.transforms import Compose, ToTensor
from sklearn.metrics import average_precision_score

import torch.utils.data as data_utils

from src.nn.AMIADataset import AMIADataset
from tqdm import tqdm

In [2]:
def create_model(num_classes):
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")

    in_features = model.roi_heads.box_predictor.cls_score.in_features

    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    return model

In [3]:
def compute_iou(pred_boxes, true_box):
    if not isinstance(pred_boxes, torch.Tensor):
        pred_boxes = torch.tensor(pred_boxes, dtype=torch.float32)
    if not isinstance(true_box, torch.Tensor):
        true_box = torch.tensor(true_box, dtype=torch.float32)
    xA = torch.max(pred_boxes[:, 0], true_box[0])
    yA = torch.max(pred_boxes[:, 1], true_box[1])
    xB = torch.min(pred_boxes[:, 2], true_box[2])
    yB = torch.min(pred_boxes[:, 3], true_box[3])

    inter_area = torch.clamp(xB - xA, min=0) * torch.clamp(yB - yA, min=0)
    boxA_area = (pred_boxes[:, 2] - pred_boxes[:, 0]) * (pred_boxes[:, 3] - pred_boxes[:, 1])
    boxB_area = (true_box[2] - true_box[0]) * (true_box[3] - true_box[1])

    iou = inter_area / (boxA_area + boxB_area - inter_area)
    return iou

In [4]:
def train_model(model, train_loader, optimizer, device):
    model.train()
    for images, targets in tqdm(train_loader):
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

    print(f"Training loss: {losses.item():.4f}")


In [5]:
def evaluate_model(model, test_loader, device, iou_threshold=0.4):
    model.eval()
    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for images, targets in tqdm(test_loader):
            images = [img.to(device) for img in images]
            outputs = model(images)

            for output, target in zip(outputs, targets):
                pred_boxes = output['boxes'].cpu().numpy()
                pred_scores = output['scores'].cpu().numpy()
                true_boxes = target['boxes'].cpu().numpy()

                matched_scores = []
                for _, true_box in enumerate(true_boxes):
                    ious = compute_iou(pred_boxes, true_box)
                    if len(ious) > 0 and max(ious) >= iou_threshold:
                        matched_scores.append(pred_scores[ious.argmax()])
                    else:
                        matched_scores.append(0)

                all_predictions.extend(matched_scores)
                all_targets.extend([1] * len(true_boxes))  # True positives

            all_predictions.extend(pred_scores)  # False positives
            all_targets.extend([0] * len(pred_scores))  # Negatives

    # Compute mAP
    mAP = average_precision_score(all_targets, all_predictions)
    print(f"Mean Average Precision (mAP) at IoU > {iou_threshold}: {mAP:.4f}")
    return mAP

In [6]:
num_classes = len(AMIADataset().label_map) + 1  # 14 classes + 1 background class

model = create_model(num_classes=num_classes)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [7]:
data_transform = Compose([ToTensor()])

In [7]:
train_dataset = AMIADataset(data_folder="data", transform=data_transform)
test_dataset = AMIADataset(data_folder="data", transform=data_transform)

In [8]:
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=lambda x: tuple(zip(*x)))

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

num_epochs = 10
best_map = 0
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_model(model, train_loader, optimizer, device)
    map = evaluate_model(model, test_loader, device)
    if best_map < map:
        best_map = map
        torch.save(model.state_dict(), "best_model.pth")

Epoch 1/10


100%|██████████| 2144/2144 [11:24<00:00,  3.13it/s]


Training loss: 0.1450


100%|██████████| 2144/2144 [07:05<00:00,  5.04it/s]


Mean Average Precision (mAP) at IoU > 0.4: 0.9046
Epoch 2/10


100%|██████████| 2144/2144 [10:53<00:00,  3.28it/s]


Training loss: 0.1087


100%|██████████| 2144/2144 [06:59<00:00,  5.11it/s]


Mean Average Precision (mAP) at IoU > 0.4: 0.9038
Epoch 3/10


100%|██████████| 2144/2144 [10:52<00:00,  3.28it/s]


Training loss: 8.8073


100%|██████████| 2144/2144 [06:58<00:00,  5.12it/s]


Mean Average Precision (mAP) at IoU > 0.4: 0.8808
Epoch 4/10


100%|██████████| 2144/2144 [10:52<00:00,  3.29it/s]


Training loss: 0.1869


100%|██████████| 2144/2144 [07:09<00:00,  4.99it/s]


Mean Average Precision (mAP) at IoU > 0.4: 0.8799
Epoch 5/10


100%|██████████| 2144/2144 [11:00<00:00,  3.24it/s]


Training loss: 0.6075


100%|██████████| 2144/2144 [07:14<00:00,  4.94it/s]


Mean Average Precision (mAP) at IoU > 0.4: 0.7322
Epoch 6/10


  5%|▍         | 105/2144 [00:33<10:46,  3.15it/s]


KeyboardInterrupt: 

In [8]:
model.load_state_dict(torch.load("best_model.pth", weights_only=True))
model.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [9]:
val_dataset = AMIADataset(data_folder="data", transform=data_transform, train=False)

In [10]:
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

In [40]:
model.eval()
all_predictions = []
all_targets = []

result_dict = dict()

with torch.no_grad():
    for images, images_uuid in tqdm(val_loader):
        images = [img.to(device) for img in images]
        outputs = model(images)
        for k, i in enumerate(outputs):
            if i["scores"].shape != torch.Size([0]):
                label = str(i["labels"][torch.argmax(i["scores"])].cpu().item())
                score = str(i["scores"][torch.argmax(i["scores"])].cpu().item())
                bbox = " ".join(map(str, i["boxes"][torch.argmax(i["scores"])].cpu().numpy().tolist()))
                predicted_string = f"{label} {score} {bbox}"
            else:
                predicted_string = "14 1.0 0 0 1 1"
            result_dict[images_uuid[k]] = predicted_string
            # print(images_uuid[k], predicted_string)
        # break

100%|██████████| 1607/1607 [05:50<00:00,  4.59it/s]


In [41]:
import pandas as pd

In [42]:
submission = pd.read_csv("data/sample_submission.csv")

In [44]:
submission["PredictionString"] = submission["image_id"].apply(lambda x: result_dict[x])

In [46]:
submission.to_csv("submission.csv", index=False)