In [1]:
import torch
import torchvision
import torchvision.transforms as T
import matplotlib.pyplot as plt
import numpy as np

# Load the dataset
dataset = torchvision.datasets.VOCDetection(
    root='./data',
    year='2012',
    image_set='train',
    download=True,
    transform=T.ToTensor()
)


Downloading http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar to ./data/VOCtrainval_11-May-2012.tar


100%|██████████| 1999639040/1999639040 [00:46<00:00, 43170905.08it/s]


Extracting ./data/VOCtrainval_11-May-2012.tar to ./data


In [4]:
import os

In [5]:
# Set to store unique class labels
class_set = set()

# Iterate through the dataset
for _, target in dataset:
    for obj in target['annotation']['object']:
        class_set.add(obj['name'])

# Get the number of unique classes
num_classes = len(class_set)
print(f"Number of different classes in the dataset: {num_classes}")
print("Classes:", class_set)


Number of different classes in the dataset: 20
Classes: {'diningtable', 'chair', 'tvmonitor', 'cow', 'sofa', 'bicycle', 'bird', 'motorbike', 'sheep', 'aeroplane', 'person', 'train', 'car', 'pottedplant', 'horse', 'bus', 'cat', 'bottle', 'dog', 'boat'}


In [6]:
class_counts = {}
for _, target in dataset:
    for obj in target['annotation']['object']:
        class_name = obj['name']
        if class_name not in class_counts:
            class_counts[class_name] = 0
        class_counts[class_name] += 1

# Sort the classes by count in descending order and get the top 5
top_5_classes = sorted(class_counts.items(), key=lambda item: item[1], reverse=True)[:5]

print("Top 5 classes with highest image count:")
for class_name, count in top_5_classes:
    print(f"{class_name}: {count}")

Top 5 classes with highest image count:
person: 5019
chair: 1457
car: 1191
dog: 768
bottle: 749


In [7]:
# List of classes to keep
selected_classes = ["person", "chair", "car", "dog", "bottle"]

# Create a new list to store the filtered dataset
filtered_dataset = []

# Iterate through the original dataset
for image, target in dataset:
    keep_image = False
    for obj in target['annotation']['object']:
        if obj['name'] in selected_classes:
            keep_image = True
            break
    if keep_image:
        filtered_dataset.append((image, target))

# Print the length of the filtered dataset
print(f"Length of filtered dataset: {len(filtered_dataset)}")

Length of filtered dataset: 3442


In [8]:
filtered_dataset

[(tensor([[[1.0000, 1.0000, 1.0000,  ..., 0.7922, 0.7961, 0.8000],
           [1.0000, 1.0000, 1.0000,  ..., 0.7961, 0.7961, 0.7922],
           [1.0000, 1.0000, 1.0000,  ..., 0.8078, 0.8039, 0.8039],
           ...,
           [0.8039, 0.7608, 0.8078,  ..., 0.5373, 0.4706, 0.4667],
           [0.7529, 0.7569, 0.7569,  ..., 0.4353, 0.4235, 0.4784],
           [0.6980, 0.7059, 0.7255,  ..., 0.3216, 0.2863, 0.3137]],
  
          [[1.0000, 1.0000, 1.0000,  ..., 0.8039, 0.8078, 0.8118],
           [1.0000, 1.0000, 1.0000,  ..., 0.8078, 0.8078, 0.8039],
           [1.0000, 1.0000, 1.0000,  ..., 0.8196, 0.8157, 0.8157],
           ...,
           [0.7490, 0.7059, 0.7529,  ..., 0.5020, 0.4392, 0.4431],
           [0.6980, 0.7020, 0.7020,  ..., 0.4157, 0.4039, 0.4667],
           [0.6431, 0.6510, 0.6706,  ..., 0.3020, 0.2667, 0.3020]],
  
          [[1.0000, 1.0000, 1.0000,  ..., 0.9725, 0.9765, 0.9804],
           [1.0000, 1.0000, 1.0000,  ..., 0.9765, 0.9765, 0.9725],
           [1.0000, 1.

In [9]:
for i, img in enumerate(filtered_dataset) :
  print(img[0])
  if i > 2:
    break

tensor([[[1.0000, 1.0000, 1.0000,  ..., 0.7922, 0.7961, 0.8000],
         [1.0000, 1.0000, 1.0000,  ..., 0.7961, 0.7961, 0.7922],
         [1.0000, 1.0000, 1.0000,  ..., 0.8078, 0.8039, 0.8039],
         ...,
         [0.8039, 0.7608, 0.8078,  ..., 0.5373, 0.4706, 0.4667],
         [0.7529, 0.7569, 0.7569,  ..., 0.4353, 0.4235, 0.4784],
         [0.6980, 0.7059, 0.7255,  ..., 0.3216, 0.2863, 0.3137]],

        [[1.0000, 1.0000, 1.0000,  ..., 0.8039, 0.8078, 0.8118],
         [1.0000, 1.0000, 1.0000,  ..., 0.8078, 0.8078, 0.8039],
         [1.0000, 1.0000, 1.0000,  ..., 0.8196, 0.8157, 0.8157],
         ...,
         [0.7490, 0.7059, 0.7529,  ..., 0.5020, 0.4392, 0.4431],
         [0.6980, 0.7020, 0.7020,  ..., 0.4157, 0.4039, 0.4667],
         [0.6431, 0.6510, 0.6706,  ..., 0.3020, 0.2667, 0.3020]],

        [[1.0000, 1.0000, 1.0000,  ..., 0.9725, 0.9765, 0.9804],
         [1.0000, 1.0000, 1.0000,  ..., 0.9765, 0.9765, 0.9725],
         [1.0000, 1.0000, 1.0000,  ..., 0.9882, 0.9843, 0.

In [10]:
a = [x[1] for x in filtered_dataset[:12]]

In [11]:
a[0]

{'annotation': {'folder': 'VOC2012',
  'filename': '2008_000008.jpg',
  'source': {'database': 'The VOC2008 Database',
   'annotation': 'PASCAL VOC2008',
   'image': 'flickr'},
  'size': {'width': '500', 'height': '442', 'depth': '3'},
  'segmented': '0',
  'object': [{'name': 'horse',
    'pose': 'Left',
    'truncated': '0',
    'occluded': '1',
    'bndbox': {'xmin': '53', 'ymin': '87', 'xmax': '471', 'ymax': '420'},
    'difficult': '0'},
   {'name': 'person',
    'pose': 'Unspecified',
    'truncated': '1',
    'occluded': '0',
    'bndbox': {'xmin': '158', 'ymin': '44', 'xmax': '289', 'ymax': '167'},
    'difficult': '0'}]}}

In [12]:
imgs = list(x[0] for x in filtered_dataset)
targets = list(x[1] for x in filtered_dataset)
objects = [x['annotation']['object'] for x in targets]

In [13]:
objects[0]

[{'name': 'horse',
  'pose': 'Left',
  'truncated': '0',
  'occluded': '1',
  'bndbox': {'xmin': '53', 'ymin': '87', 'xmax': '471', 'ymax': '420'},
  'difficult': '0'},
 {'name': 'person',
  'pose': 'Unspecified',
  'truncated': '1',
  'occluded': '0',
  'bndbox': {'xmin': '158', 'ymin': '44', 'xmax': '289', 'ymax': '167'},
  'difficult': '0'}]

In [14]:
bboxes = [[x[i]['bndbox'] for i in range(len(x))] for x in objects]

In [15]:
names = [[x[i]['name'] for i in range(len(x))] for x in objects]

In [16]:
names[15]

['dog', 'sheep']

In [17]:
bboxes[15]

[{'xmin': '122', 'ymin': '11', 'xmax': '336', 'ymax': '223'},
 {'xmin': '98', 'ymin': '176', 'xmax': '425', 'ymax': '357'}]

In [20]:
import os
import torch

from torchvision.io import read_image
from torchvision.ops.boxes import masks_to_boxes
from torchvision import tv_tensors
from torchvision.transforms.v2 import functional as F


class PennFudanDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, transforms):
        self.dataset = dataset
        self.transforms = transforms
        # load all image files, sorting them to
        # ensure that they are aligned
        self.imgs = list(x[0] for x in self.dataset)
        self.targets = list(x[1] for x in self.dataset)
        self.objects = [x['annotation']['object'] for x in self.targets]
        self.annotations = [[x[i]['bndbox'] for i in range(len(x))] for x in self.objects]
        self.classes = [[x[i]['name'] for i in range(len(x))] for x in self.objects]

    def __getitem__(self, idx):
        # load images and masks
        img = self.imgs[idx]
        # first id is the background, so remove it
        num_objs = len(self.objects[idx])



        # get bounding box coordinates for each mask
        boxes_list =  boxes_list = [[int(box['xmin']), int(box['ymin']), int(box['xmax']), int(box['ymax'])] for box in self.annotations[idx]]
        boxes = torch.as_tensor(boxes_list, dtype=torch.float32) # Now convert the list of lists to a tensor




        image_id = idx
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)
        class_to_idx = {'person': 0, 'chair': 1, 'car': 2, 'dog': 3, 'bottle': 4,'tvmonitor' :5, 'aeroplane':6, 'cat':7, 'sheep':8, 'motorbike':9, 'sofa':10, 'pottedplant':11, 'horse':12, 'car':13, 'bicycle':14, 'cow':15, 'train':16, 'bus':17, 'boat':18, 'diningtable':19, 'bird':20}
        labels =self.classes[idx]
        labels = [class_to_idx[label] for label in labels]
        labels = torch.as_tensor(labels, dtype=torch.int64)

        target = {}
        target["boxes"] = tv_tensors.BoundingBoxes(boxes, format="XYXY", canvas_size=F.get_size(img))
        target["labels"] = labels
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        return len(self.imgs)

In [21]:
from torchvision.transforms import v2 as T


def get_transform(train):
    transforms = []
    if train:
        transforms.append(T.RandomHorizontalFlip(0.5))
    transforms.append(T.ToDtype(torch.float, scale=True))
    transforms.append(T.ToPureTensor())
    return T.Compose(transforms)

In [22]:
os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/engine.py")
os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/utils.py")
os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_utils.py")
os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_eval.py")
os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/transforms.py")

--2024-08-22 14:52:54--  https://raw.githubusercontent.com/pytorch/vision/main/references/detection/engine.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4063 (4.0K) [text/plain]
Saving to: 'engine.py'

     0K ...                                                   100% 27.3M=0s

2024-08-22 14:52:54 (27.3 MB/s) - 'engine.py' saved [4063/4063]

--2024-08-22 14:52:54--  https://raw.githubusercontent.com/pytorch/vision/main/references/detection/utils.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8388 (8.2K) [text/plain]
Savin

0

In [23]:
import utils

model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
dataset = PennFudanDataset(filtered_dataset, get_transform(train=True))
data_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=utils.collate_fn
)

# For Training
images, targets = next(iter(data_loader))
images = list(image for image in images)
targets = [{k: v for k, v in t.items()} for t in targets]
output = model(images, targets)  # Returns losses and detections
print(output)

# For inference
model.eval()
x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
predictions = model(x)  # Returns predictions
print(predictions[0])

Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 160M/160M [00:00<00:00, 180MB/s]  


{'loss_classifier': tensor(0.3232, grad_fn=<NllLossBackward0>), 'loss_box_reg': tensor(0.0396, grad_fn=<DivBackward0>), 'loss_objectness': tensor(0.0007, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), 'loss_rpn_box_reg': tensor(0.0286, grad_fn=<DivBackward0>)}
{'boxes': tensor([], size=(0, 4), grad_fn=<StackBackward0>), 'labels': tensor([], dtype=torch.int64), 'scores': tensor([], grad_fn=<IndexBackward0>)}


In [24]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# load a model pre-trained on COCO
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")

# replace the classifier with a new one, that has
# num_classes which is user-defined
num_classes = 21  # 1 class (person) + background
# get number of input features for the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features
# replace the pre-trained head with a new one
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

In [25]:
pip install pycocotools


Collecting pycocotools
  Downloading pycocotools-2.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Downloading pycocotools-2.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (427 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m427.8/427.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pycocotools
Successfully installed pycocotools-2.0.8
Note: you may need to restart the kernel to use updated packages.


In [27]:
from engine import train_one_epoch, evaluate

# train on the GPU or on the CPU, if a GPU is not available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


# use our dataset and defined transformations
dataset = PennFudanDataset(filtered_dataset, get_transform(train=True))
dataset_test = PennFudanDataset(filtered_dataset , get_transform(train=False))

# split the dataset in train and test set
indices = torch.randperm(len(dataset)).tolist()
dataset = torch.utils.data.Subset(dataset, indices[:-50])
dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

# define training and validation data loaders
data_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=utils.collate_fn
)

data_loader_test = torch.utils.data.DataLoader(
    dataset_test,
    batch_size=1,
    shuffle=False,
    collate_fn=utils.collate_fn
)



# move model to the right device
model.to(device)

# construct an optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(
    params,
    lr=0.005,
    momentum=0.9,
    weight_decay=0.0005
)

# and a learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=3,
    gamma=0.1
)

# let's train it just for 2 epochs
num_epochs = 50

for epoch in range(num_epochs):
    # train for one epoch, printing every 10 iterations
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    evaluate(model, data_loader_test, device=device)

print("That's it!")

Epoch: [0]  [   0/1696]  eta: 0:10:17  lr: 0.000010  loss: 0.1733 (0.1733)  loss_classifier: 0.0482 (0.0482)  loss_box_reg: 0.0618 (0.0618)  loss_objectness: 0.0164 (0.0164)  loss_rpn_box_reg: 0.0468 (0.0468)  time: 0.3643  data: 0.0030  max mem: 4303
Epoch: [0]  [  10/1696]  eta: 0:09:08  lr: 0.000060  loss: 0.1386 (0.2086)  loss_classifier: 0.0599 (0.0836)  loss_box_reg: 0.0506 (0.0920)  loss_objectness: 0.0060 (0.0109)  loss_rpn_box_reg: 0.0194 (0.0222)  time: 0.3253  data: 0.0019  max mem: 4303
Epoch: [0]  [  20/1696]  eta: 0:08:52  lr: 0.000110  loss: 0.1541 (0.2590)  loss_classifier: 0.0666 (0.1074)  loss_box_reg: 0.0699 (0.1188)  loss_objectness: 0.0054 (0.0111)  loss_rpn_box_reg: 0.0149 (0.0217)  time: 0.3154  data: 0.0021  max mem: 4303
Epoch: [0]  [  30/1696]  eta: 0:08:45  lr: 0.000160  loss: 0.2089 (0.2632)  loss_classifier: 0.0767 (0.1061)  loss_box_reg: 0.0905 (0.1191)  loss_objectness: 0.0082 (0.0141)  loss_rpn_box_reg: 0.0131 (0.0239)  time: 0.3104  data: 0.0023  max me

In [31]:
import torch

# Save the entire model
torch.save(model, '/kaggle/working/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth')


In [32]:
import torch

# Save the model's state dictionary
torch.save(model.state_dict(), '/kaggle/working/frcnn_model_state_dict.pth')
