In [1]:
import torch
import torchvision
from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights
import os
from torch.utils.data import Dataset
from torchvision import transforms
from PIL import Image
from torch import optim
import numpy as np
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection import fasterrcnn_mobilenet_v3_large_fpn


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import shutil

src = '/content/drive/MyDrive/tmp'
dst = '/content/tmp'

subdirs_to_copy = ['dataset/filtered_labels', 'dataset/sobel_images']

for subdir in subdirs_to_copy:
    full_src_path = os.path.join(src, subdir)
    full_dst_path = os.path.join(dst, subdir)
    if os.path.exists(full_src_path):
        shutil.copytree(full_src_path, full_dst_path)
    else:
        print(f"Warning: {full_src_path} does not exist")


In [4]:
class YoloDataset(Dataset):
    def __init__(self, images_dir, labels_dir, transforms=None):
        self.images_dir = images_dir
        self.labels_dir = labels_dir
        self.transforms = transforms
        self.images = []
        self.labels = []
        for f in os.listdir(labels_dir):
            image_file = f.replace('.txt', '.jpg')
            if not os.path.exists(os.path.join(images_dir, image_file)):
                continue
            if f.endswith('.txt'):
                with open(os.path.join(labels_dir, f), 'r') as file:
                    lines = file.readlines()
                    if lines:
                        self.labels.append(f)
                        self.images.append(image_file)


    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):

        img_name = self.images[idx]
        img_path = os.path.join(self.images_dir, img_name)
        image = Image.open(img_path).convert("RGB")
        w, h = image.size

        label_path = os.path.join(self.labels_dir, img_name.replace('.jpg', '.txt'))
        boxes = []
        labels = []

        if os.path.exists(label_path):
            with open(label_path, "r") as f:
                for line in f.readlines():
                    parts = line.strip().split()
                    class_id = int(parts[0])
                    x_center = float(parts[1]) * w
                    y_center = float(parts[2]) * h
                    width = float(parts[3]) * w
                    height = float(parts[4]) * h

                    x_min = x_center - width / 2
                    y_min = y_center - height / 2
                    x_max = x_center + width / 2
                    y_max = y_center + height / 2

                    boxes.append([x_min, y_min, x_max, y_max])
                    labels.append(class_id)

        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.int64)
        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        iscrowd = torch.zeros((len(boxes),), dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd
        }

        if self.transforms:
            image = self.transforms(image)

        return image, target

In [9]:
from torchvision.transforms import ToTensor
dataset_train = YoloDataset(
    images_dir='tmp/dataset/sobel_images/train',
    labels_dir='tmp/dataset/filtered_labels/train',
    transforms=ToTensor()
)

dataest_val = YoloDataset(
    images_dir='tmp/dataset/sobel_images/val',
    labels_dir='tmp/dataset/filtered_labels/val',
    transforms=ToTensor()
)

dataset_test = YoloDataset(
    images_dir='tmp/dataset/sobel_images/test',
    labels_dir='tmp/dataset/filtered_labels/test',
    transforms=ToTensor()
)


from torch.utils.data import DataLoader

data_loader_train = DataLoader(dataset_train, batch_size=4, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))
data_loader_val = DataLoader(dataest_val, batch_size=4, shuffle=False, collate_fn=lambda x: tuple(zip(*x)))
data_loader_test = DataLoader(dataset_test, batch_size=4, shuffle=False, collate_fn=lambda x: tuple(zip(*x)))

In [10]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("Device:", device)


model = fasterrcnn_mobilenet_v3_large_fpn(weights=None)

num_classes = 3 + 1

in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
model.to(device)

params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

Device: cuda


Downloading: "https://download.pytorch.org/models/mobilenet_v3_large-8738ca79.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v3_large-8738ca79.pth
100%|██████████| 21.1M/21.1M [00:00<00:00, 189MB/s]


In [11]:
num_epochs = 100
patiance = 10

no_imporvement = 0
best_loss = float('inf')
for epoch in range(num_epochs):
    running_loss = 0.0

    model.train()
    for images, targets in data_loader_train:
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        running_loss += losses.item()


    validation_loss = 0.0
    for images, targets in data_loader_val:
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        with torch.no_grad():
          loss_dict = model(images, targets)
          losses = sum(loss for loss in loss_dict.values())

          validation_loss += losses.item()

    if validation_loss < best_loss:
        best_loss = validation_loss
        no_imporvement = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        no_imporvement += 1

    if no_imporvement >= patiance:
        print("Early stopping")
        break



    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Training Loss: {running_loss/len(data_loader_train)}")
    print(f"Validation Loss: {validation_loss/len(data_loader_val)}")

Epoch 1/100
Training Loss: 0.44264281092584135
Validation Loss: 0.36992970957524246
Epoch 2/100
Training Loss: 0.36896514397114516
Validation Loss: 0.3447601114296251
Epoch 3/100
Training Loss: 0.3426196135878563
Validation Loss: 0.34567442039648694
Epoch 4/100
Training Loss: 0.3291363442353904
Validation Loss: 0.3240262103370494
Epoch 5/100
Training Loss: 0.32135506501048805
Validation Loss: 0.3420920108134548
Epoch 6/100
Training Loss: 0.31490916415676473
Validation Loss: 0.32315970077696776
Epoch 7/100
Training Loss: 0.3086947965733707
Validation Loss: 0.35287069405118626
Epoch 8/100
Training Loss: 0.3017829825207591
Validation Loss: 0.3601566381855971
Epoch 9/100
Training Loss: 0.3010285359248519
Validation Loss: 0.3388980811772247
Epoch 10/100
Training Loss: 0.29525313794612884
Validation Loss: 0.3264602371491492
Epoch 11/100
Training Loss: 0.2998281901963055
Validation Loss: 0.41495481402509743


KeyboardInterrupt: 

In [12]:
torch.cuda.empty_cache()
model.load_state_dict(torch.load('best_model.pth'))
model.to(device)
model.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (0): Conv2dNormActivation(
        (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): FrozenBatchNorm2d(16, eps=1e-05)
        (2): Hardswish()
      )
      (1): InvertedResidual(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=16, bias=False)
            (1): FrozenBatchNorm2d(16, eps=1e-05)
            (2): ReLU(inplace=True)
          )
          (1): Conv2dNormActivation(
            (0): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d(16, eps=1e-05)
          )
        )
      )
      (2): InvertedResidual(
        (block):

In [14]:
!pip install torchmetrics

Collecting torchmetrics
  Using cached torchmetrics-1.7.1-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Using cached lightning_utilities-0.14.3-py3-none-any.whl.metadata (5.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->torchmetrics)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->torchmetrics)
  Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=2.0.0->torchmetrics)
  Using cached nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch>=2.0.0->torchmetrics)
  Using cached nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Using cached torchmetrics-1.7.1-py3-none-any.whl (961 kB)
Using cached lightning_utilit

In [15]:
from torchmetrics.detection.mean_ap import MeanAveragePrecision

mAP = MeanAveragePrecision()
with torch.no_grad():
    for images, targets in data_loader_test:
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        predictions = model(images)

        mAP.update(predictions, targets)

mAP.compute()


{'map': tensor(0.2327),
 'map_50': tensor(0.4937),
 'map_75': tensor(0.1885),
 'map_small': tensor(0.1065),
 'map_medium': tensor(0.2985),
 'map_large': tensor(0.5979),
 'mar_1': tensor(0.0582),
 'mar_10': tensor(0.2341),
 'mar_100': tensor(0.3058),
 'mar_small': tensor(0.2004),
 'mar_medium': tensor(0.3644),
 'mar_large': tensor(0.7000),
 'map_per_class': tensor(-1.),
 'mar_100_per_class': tensor(-1.),
 'classes': tensor([0, 1, 2], dtype=torch.int32)}