In [None]:
"""""""""""""""""""""""""""""""""

Description:

Running the following scripts will collect our VisDrone data and load it into
the structure outlined below:

VisDrone/
├── VisDrone2019-DET-test-dev/
│   ├── annotations/
│   └── images/
├── VisDrone2019-DET-train/
│   ├── annotations/
│   └── images/
├── VisDrone2019-DET-val/
│   ├── annotations/
│   └── images/
├── VisDrone2019-DET-test-dev.zip
├── VisDrone2019-DET-train.zip
└── VisDrone2019-DET-val.zip


"""""""""""""""""""""""""""""""""

'\n\nDescription:\n\nRunning the following scripts will collect our VisDrone data and load it into\nthe structure outlined below, primarily utilizing gdown:\n\nVisDrone/\n├── VisDrone2019-DET-test-dev/\n│   ├── annotations/\n│   └── images/\n├── VisDrone2019-DET-train/\n│   ├── annotations/\n│   └── images/\n├── VisDrone2019-DET-val/\n│   ├── annotations/\n│   └── images/\n├── VisDrone2019-DET-test-dev.zip\n├── VisDrone2019-DET-train.zip\n└── VisDrone2019-DET-val.zip\n\nWe then perform several high-level checks of paths and contents to confirm the\nexpected structure above is present in the environment.\n\n\n'

In [1]:
!pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.7.1-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.14.3-py3-none-any.whl.metadata (5.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->torchmetrics)
  D

In [2]:
# Imports (for version, see requirements.txt)
import os
import torch
import numpy as np
import cv2
from torch.utils.data import Dataset, DataLoader, Subset
import torchvision.transforms.functional as F
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
import torchvision.transforms as T
from tqdm import tqdm
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from sklearn.metrics import precision_score, recall_score, f1_score
from torchmetrics.detection.mean_ap import MeanAveragePrecision
from torchvision.ops import box_iou, MultiScaleRoIAlign
import torch.nn as nn
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone
import torch.optim as optim
import time

In [3]:
# Create the dataset root directory.
# The path is relative to the notebook's working directory so that it names the
# same folder as the `VisDrone/...` paths used by the download and path-discovery
# cells below (an absolute path only matches them when the working directory
# happens to be its parent).
dataset_dir = "VisDrone"
os.makedirs(dataset_dir, exist_ok=True)

In [10]:
# TESTING DATASET
# Download the file from Dropbox
# (`-O` writes the archive to the given relative path, so this cell assumes the
#  working directory is the parent of VisDrone/ — TODO confirm for non-Colab runs.)
!wget -O VisDrone/VisDrone2019-DET-test-dev.zip "https://www.dropbox.com/scl/fi/yuim21nvv96pdmetf43mu/VisDrone2019-DET-test-dev.zip?rlkey=m4htjb3wjdvjukshn3inh9s7d&st=ajdims0w&dl=1"

# Unzip the file
# NOTE(review): unlike the train/val cells, which extract into VisDrone/, this
# archive is extracted into its own VisDrone2019-DET-test-dev/ sub-folder —
# presumably because this zip has no top-level directory; confirm.
!unzip -q VisDrone/VisDrone2019-DET-test-dev.zip -d VisDrone/VisDrone2019-DET-test-dev/

--2025-04-27 20:52:47--  https://www.dropbox.com/scl/fi/yuim21nvv96pdmetf43mu/VisDrone2019-DET-test-dev.zip?rlkey=m4htjb3wjdvjukshn3inh9s7d&st=ajdims0w&dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.66.18, 2620:100:6022:18::a27d:4212
Connecting to www.dropbox.com (www.dropbox.com)|162.125.66.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://uc59f939528ff4eddad4fa9901c3.dl.dropboxusercontent.com/cd/0/inline/CoqdrknQ8GNeoXAjOO0WHErov8yB921c8f6MS9sa_s_WixgskijNfSL0-HtNd3s8k9BAb7ekYY8-GqkOTIESoGOw6a7QvSoEtLWzDHadPt2EqNfdS8jm6Ej2USMFKHtRr7xFRQXcAvZ_9PCXXWNNyo_V/file?dl=1# [following]
--2025-04-27 20:52:48--  https://uc59f939528ff4eddad4fa9901c3.dl.dropboxusercontent.com/cd/0/inline/CoqdrknQ8GNeoXAjOO0WHErov8yB921c8f6MS9sa_s_WixgskijNfSL0-HtNd3s8k9BAb7ekYY8-GqkOTIESoGOw6a7QvSoEtLWzDHadPt2EqNfdS8jm6Ej2USMFKHtRr7xFRQXcAvZ_9PCXXWNNyo_V/file?dl=1
Resolving uc59f939528ff4eddad4fa9901c3.dl.dropboxusercontent.com (uc59f939528ff4eddad4fa9901c3.dl.

In [11]:
# TRAINING DATASET
# Download the file from Dropbox
# (`-O` writes the archive to the given relative path, so this cell assumes the
#  working directory is the parent of VisDrone/ — TODO confirm for non-Colab runs.)
!wget -O VisDrone/VisDrone2019-DET-train.zip "https://www.dropbox.com/scl/fi/xyjppciooyq0juffv0g6y/VisDrone2019-DET-train.zip?rlkey=y5hnuo2imr1we88xzvndmkzct&st=6exf6hz8&dl=1"

# Unzip the file
# Extracted directly into VisDrone/; the later path-discovery cell expects the
# result to be VisDrone/VisDrone2019-DET-train/{images,annotations}.
!unzip -q VisDrone/VisDrone2019-DET-train.zip -d VisDrone/

--2025-04-27 20:53:09--  https://www.dropbox.com/scl/fi/xyjppciooyq0juffv0g6y/VisDrone2019-DET-train.zip?rlkey=y5hnuo2imr1we88xzvndmkzct&st=6exf6hz8&dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.66.18, 2620:100:6022:18::a27d:4212
Connecting to www.dropbox.com (www.dropbox.com)|162.125.66.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://uc3925d32f4559b8f685a5bed7b6.dl.dropboxusercontent.com/cd/0/inline/CopqdumQfGld-VN4ua3r_uTJkm9kHzKmJTxHOWX7QnfjIwRyU_1GV7FrfFukXqk0z5hbnvoOUuex2qTmbGqJbRTpiZLkzcwf_cIJxGKYLq1MgaZXLFt_NCVf8u8YDIQvnJl65F9QZEwGea6kI_zpzV9t/file?dl=1# [following]
--2025-04-27 20:53:10--  https://uc3925d32f4559b8f685a5bed7b6.dl.dropboxusercontent.com/cd/0/inline/CopqdumQfGld-VN4ua3r_uTJkm9kHzKmJTxHOWX7QnfjIwRyU_1GV7FrfFukXqk0z5hbnvoOUuex2qTmbGqJbRTpiZLkzcwf_cIJxGKYLq1MgaZXLFt_NCVf8u8YDIQvnJl65F9QZEwGea6kI_zpzV9t/file?dl=1
Resolving uc3925d32f4559b8f685a5bed7b6.dl.dropboxusercontent.com (uc3925d32f4559b8f685a5bed7b6.dl.dro

In [12]:
# VALIDATION DATASET
# Download the file from Dropbox
# (`-O` writes the archive to the given relative path, so this cell assumes the
#  working directory is the parent of VisDrone/ — TODO confirm for non-Colab runs.)
!wget -O VisDrone/VisDrone2019-DET-val.zip "https://www.dropbox.com/scl/fi/op5lfc9g1eqjx0hmz5k66/VisDrone2019-DET-val.zip?rlkey=06rpa2gcfdzw1dc8vud39bypr&st=v1ndh3zz&dl=1"

# Unzip the file
# Extracted directly into VisDrone/; the later path-discovery cell expects the
# result to be VisDrone/VisDrone2019-DET-val/{images,annotations}.
!unzip -q VisDrone/VisDrone2019-DET-val.zip -d VisDrone/

--2025-04-27 20:59:34--  https://www.dropbox.com/scl/fi/op5lfc9g1eqjx0hmz5k66/VisDrone2019-DET-val.zip?rlkey=06rpa2gcfdzw1dc8vud39bypr&st=v1ndh3zz&dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.66.18, 2620:100:6016:18::a27d:112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.66.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://uc45a8c7738108004466bab07a95.dl.dropboxusercontent.com/cd/0/inline/Cor9CKfnFhVlIkoeQmq7M1VrOyf13vYbklWTvRpJuqkQOLRQo08EgVlGcqfBqnreVMV4e5qT9SajQ6sC1IbEBlu_3_Axg07HLbe_w_MiczS77n3IgJ8N3f1wMLXsF6T6PlzoS1i5yAz5PhfJjJGXREOm/file?dl=1# [following]
--2025-04-27 20:59:35--  https://uc45a8c7738108004466bab07a95.dl.dropboxusercontent.com/cd/0/inline/Cor9CKfnFhVlIkoeQmq7M1VrOyf13vYbklWTvRpJuqkQOLRQo08EgVlGcqfBqnreVMV4e5qT9SajQ6sC1IbEBlu_3_Axg07HLbe_w_MiczS77n3IgJ8N3f1wMLXsF6T6PlzoS1i5yAz5PhfJjJGXREOm/file?dl=1
Resolving uc45a8c7738108004466bab07a95.dl.dropboxusercontent.com (uc45a8c7738108004466bab07a95.dl.dropbo

In [13]:
import glob

def get_image_and_annotation_paths(image_dir, annotation_dir):
    """Pair every ``*.jpg`` image in ``image_dir`` with its annotation file path.

    Args:
        image_dir: Directory that is searched (non-recursively) for ``*.jpg`` files.
        annotation_dir: Directory in which the matching ``.txt`` files are expected.

    Returns:
        ``(image_paths, annotation_paths)``: two lists of equal length. Image
        paths are sorted; ``annotation_paths[i]`` is ``annotation_dir`` joined
        with the base name of ``image_paths[i]`` with its extension swapped for
        ``.txt``. The annotation files are not checked for existence here.
    """
    image_paths = sorted(glob.glob(os.path.join(image_dir, '*.jpg')))

    annotation_paths = []
    for image_path in image_paths:
        # Swap only the final extension. A plain str.replace('.jpg', '.txt')
        # would also rewrite any '.jpg' that appears earlier in the file name.
        stem, _ext = os.path.splitext(os.path.basename(image_path))
        annotation_paths.append(os.path.join(annotation_dir, stem + '.txt'))

    return image_paths, annotation_paths

  # Update paths for your train/val/test sets
# Resolve (image, annotation) path lists for each split. The directories are
# relative to the working directory and follow the layout produced by the
# download cells above.

# Training split
train_image_paths, train_annotation_paths = get_image_and_annotation_paths(
    image_dir='VisDrone/VisDrone2019-DET-train/images',
    annotation_dir='VisDrone/VisDrone2019-DET-train/annotations',
)

# Validation split
val_image_paths, val_annotation_paths = get_image_and_annotation_paths(
    image_dir='VisDrone/VisDrone2019-DET-val/images',
    annotation_dir='VisDrone/VisDrone2019-DET-val/annotations',
)

# Test split
test_image_paths, test_annotation_paths = get_image_and_annotation_paths(
    image_dir='VisDrone/VisDrone2019-DET-test-dev/images',
    annotation_dir='VisDrone/VisDrone2019-DET-test-dev/annotations',
)

In [None]:
"""""""""""""""""""""""""""""""""

Description:

The next two code cells define a Dataset class and then use that class to
create three instances, one each for the training, validation, and testing splits.

We then wrap those objects in DataLoaders so that our data is ready for use in
training our model.

"""""""""""""""""""""""""""""""""

In [15]:
class VisDroneDataset(Dataset):
    """Detection dataset over paired VisDrone image / annotation files.

    Each item is ``(image, target)``:

    * ``image``: float32 RGB tensor of shape ``(3, H, W)`` with values in
      ``[0, 1]``, where ``(W, H) == resize_to``.
    * ``target``: dict with ``'boxes'`` (float32, ``(N, 4)``, ``[x1, y1, x2, y2]``
      in pixel coordinates of the *resized* image) and ``'labels'`` (int64, ``(N,)``).

    Labels are the category ids exactly as written in the annotation file
    (ids <= 0 are dropped). They are deliberately NOT shifted down by one:
    torchvision detection models reserve label 0 for background, so a
    zero-based shift would turn the first real class into background.

    Samples whose image cannot be read, whose annotation file is missing, or
    that contain no valid box are skipped: the next loadable sample (wrapping
    around the end of the dataset) is returned in their place.

    Args:
        image_paths: Sequence of image file paths.
        annotation_paths: Sequence of annotation file paths, aligned with
            ``image_paths``.
        resize_to: ``(width, height)`` passed to ``cv2.resize``.
        transforms: Stored on the instance; not applied by this class.
        device: Stored on the instance; tensors are returned on the CPU.
    """

    def __init__(self, image_paths, annotation_paths, resize_to=(640, 640), transforms=None, device='cpu'):
        self.image_paths = image_paths
        self.annotation_paths = annotation_paths

        self.resize_to = resize_to
        self.transforms = transforms
        self.device = device

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return the sample at ``idx``, or the next loadable one if it is unusable.

        Raises:
            IndexError: ``idx`` is outside ``[-len(self), len(self))``. This is
                kept explicit so plain iteration over the dataset terminates.
            RuntimeError: no sample in the whole dataset could be loaded.
        """
        n = len(self)
        if not -n <= idx < n:
            raise IndexError(f"index {idx} is out of range for dataset of size {n}")

        # Bounded scan instead of unbounded recursion: try each sample at most
        # once, starting at idx and wrapping around.
        for offset in range(n):
            sample = self._load_sample((idx + offset) % n)
            if sample is not None:
                return sample

        raise RuntimeError("No loadable sample with valid boxes found in the dataset.")

    def _load_sample(self, index):
        """Load one ``(image, target)`` pair, or return ``None`` if it must be skipped."""
        img_path = self.image_paths[index]
        img = cv2.imread(img_path)
        # cv2.imread signals an unreadable file by returning None.
        if img is None:
            print(f"Failed to load image: {img_path}, skipping.")
            return None

        # Remember the original size so boxes can follow the image resize.
        orig_h, orig_w = img.shape[:2]
        new_w, new_h = self.resize_to

        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, self.resize_to)
        img = torch.tensor(img / 255.0, dtype=torch.float32).permute(2, 0, 1)

        ann_path = self.annotation_paths[index]
        try:
            boxes, labels = self._read_annotations(
                ann_path, scale_x=new_w / orig_w, scale_y=new_h / orig_h
            )
        except FileNotFoundError:
            print(f"Missing annotation file: {ann_path}, skipping.")
            return None

        # Skip samples with no valid annotations
        if len(boxes) == 0:
            print(f"No valid boxes in {img_path}, skipping.")
            return None

        target = {
            'boxes': torch.tensor(boxes, dtype=torch.float32),
            'labels': torch.tensor(labels, dtype=torch.int64),
        }
        return img, target

    @staticmethod
    def _read_annotations(ann_path, scale_x, scale_y):
        """Parse one annotation file into scaled ``[x1, y1, x2, y2]`` boxes and labels.

        Each non-empty line is comma separated; the first six fields are read as
        ``x, y, w, h, score, category``. Any further fields (and a trailing
        comma) are ignored. Lines that cannot be parsed are reported and skipped.
        """
        boxes = []
        labels = []

        with open(ann_path, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue

                try:
                    # Too few fields also raises ValueError (unpacking).
                    x, y, w, h, _score, cls_id = (int(v) for v in line.split(',')[:6])
                except ValueError:
                    print(f"Skipping annotation in: {ann_path}")
                    continue

                # Skip degenerate boxes and non-positive category ids.
                if w <= 0 or h <= 0 or cls_id <= 0:
                    continue

                # Convert xywh -> xyxy and map into resized-image coordinates.
                boxes.append([
                    x * scale_x,
                    y * scale_y,
                    (x + w) * scale_x,
                    (y + h) * scale_y,
                ])
                labels.append(cls_id)

        return boxes, labels




In [16]:
# --- Configuration for this (exploratory) run ---------------------------------
RESIZE_TO = (512, 512)        # (width, height) every image is resized to
MAX_SAMPLES_PER_SPLIT = 200   # set to None to train / evaluate on the full splits
BATCH_SIZE = 4


def collate_detection_batch(batch):
    """Turn a list of ``(image, target)`` pairs into ``(images, targets)`` tuples.

    Detection targets hold a different number of boxes per image, so they cannot
    be stacked into one tensor by the default collate function.
    """
    return tuple(zip(*batch))


def first_valid_indices(dataset, limit=None):
    """Return indices of the first ``limit`` samples that yield at least one box.

    Every candidate sample is loaded once to check it. Samples that raise are
    reported and skipped rather than silently ignored.
    """
    indices = []
    for i in range(len(dataset)):
        if limit is not None and len(indices) >= limit:
            break
        try:
            _, target = dataset[i]
        except Exception as exc:
            print(f"Skipping sample {i}: {exc}")
            continue
        if len(target['boxes']) > 0:
            indices.append(i)
    return indices


def limit_split(dataset, limit):
    """Return ``dataset`` unchanged when ``limit`` is None, else a Subset of valid samples."""
    if limit is None:
        return dataset
    return Subset(dataset, first_valid_indices(dataset, limit))


# Full datasets, one per split
full_train_dataset = VisDroneDataset(train_image_paths, train_annotation_paths, resize_to=RESIZE_TO, device='cpu')
full_val_dataset = VisDroneDataset(val_image_paths, val_annotation_paths, resize_to=RESIZE_TO, device='cpu')
full_test_dataset = VisDroneDataset(test_image_paths, test_annotation_paths, resize_to=RESIZE_TO, device='cpu')

# Optionally restrict each split to its first MAX_SAMPLES_PER_SPLIT valid samples
train_dataset = limit_split(full_train_dataset, MAX_SAMPLES_PER_SPLIT)
val_dataset = limit_split(full_val_dataset, MAX_SAMPLES_PER_SPLIT)
test_dataset = limit_split(full_test_dataset, MAX_SAMPLES_PER_SPLIT)

# Create dataloaders for each split (only the training loader is shuffled)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                          collate_fn=collate_detection_batch)
valid_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False,
                          collate_fn=collate_detection_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False,
                         collate_fn=collate_detection_batch)

In [17]:
# Label map: VisDrone-DET <object_category> id -> display name.
# Ids follow the dataset's annotation format (0 = ignored regions, 1..10 = the
# ten object classes, 11 = others). Display names keep this notebook's wording.
label_map = {
    0: 'Ignored',
    1: 'Pedestrian',
    2: 'Person',
    3: 'Bicycle',
    4: 'Car',
    5: 'Van',
    6: 'Truck',
    7: 'Tricycle',
    8: 'Awning-tricycle',
    9: 'Bus',
    10: 'Motor',
    11: 'Other'
}

In [18]:
import torchvision
import torch.nn as nn
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor, TwoMLPHead
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.ops import MultiScaleRoIAlign

class TwoMLPHeadWithDropout(TwoMLPHead):
    """``TwoMLPHead`` variant that applies dropout after each FC + ReLU stage.

    Args:
        in_channels: Forwarded to ``TwoMLPHead``.
        representation_size: Forwarded to ``TwoMLPHead``.
        dropout_prob: Drop probability used by both dropout layers.
    """

    def __init__(self, in_channels, representation_size, dropout_prob=0.5):
        super().__init__(in_channels, representation_size)

        # One dropout layer per fully connected layer inherited from the parent.
        self.dropout1 = nn.Dropout(p=dropout_prob)
        self.dropout2 = nn.Dropout(p=dropout_prob)

    def forward(self, x):
        """Flatten ``x`` from dim 1 onward, then run fc6 and fc7, each followed by ReLU and dropout."""
        features = x.flatten(start_dim=1)
        stages = ((self.fc6, self.dropout1), (self.fc7, self.dropout2))
        for fc_layer, dropout_layer in stages:
            features = dropout_layer(nn.functional.relu(fc_layer(features)))
        return features

def get_custom_fasterrcnn_model(num_classes, dropout_prob=0.5):
    """Build a ResNet-50 FPN Faster R-CNN whose box head uses dropout.

    Args:
        num_classes: Number of output classes of the box predictor (torchvision
            counts the background class in this number).
        dropout_prob: Drop probability for the two dropout layers in the box head.

    Returns:
        A ``FasterRCNN`` model with an ImageNet-pretrained backbone, custom
        anchors, a ``TwoMLPHeadWithDropout`` box head and a fresh
        ``FastRCNNPredictor``.
    """
    # Local import so this function does not depend on the notebook's import cell.
    from torchvision.models import ResNet50_Weights

    # Keyword `backbone_name` / `weights` replace the deprecated positional name
    # and `pretrained=True`; IMAGENET1K_V1 is the checkpoint `pretrained=True`
    # used to select, so the initial weights are unchanged.
    backbone = resnet_fpn_backbone(
        backbone_name='resnet50',
        weights=ResNet50_Weights.IMAGENET1K_V1,
    )

    # One anchor size per feature level (five levels), three aspect ratios each.
    # Sizes start at 16 px, i.e. smaller than torchvision's default of 32 px.
    anchor_generator = AnchorGenerator(
        sizes=((16,), (32,), (64,), (128,), (256,)),
        aspect_ratios=((0.5, 1.0, 2.0),) * 5
    )

    roi_pooler = MultiScaleRoIAlign(
        featmap_names=['0', '1', '2', '3'],
        output_size=7,
        sampling_ratio=2
    )

    model = FasterRCNN(
        backbone,
        num_classes=num_classes,
        rpn_anchor_generator=anchor_generator,
        box_roi_pool=roi_pooler
    )

    # Swap the default box head for the dropout variant, keeping its input width.
    representation_size = 1024
    head_in_features = model.roi_heads.box_head.fc6.in_features
    model.roi_heads.box_head = TwoMLPHeadWithDropout(
        in_channels=head_in_features,
        representation_size=representation_size,
        dropout_prob=dropout_prob
    )

    # Re-create the predictor so its input width is read from the model rather
    # than assumed.
    predictor_in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(predictor_in_features, num_classes)

    return model


In [22]:
import torch
import torch.optim as optim
import torch.nn as nn

def train_model(model, train_loader, valid_loader, num_epochs=50, lr=0.001, device=None):
    """Train a detection model with Adam and record per-epoch train / validation loss.

    The model is expected to follow the torchvision detection convention: when
    called as ``model(images, targets)`` in training mode it returns a dict of
    loss tensors.

    Args:
        model: The detection model to train.
        train_loader: Iterable (with ``len``) of ``(images, targets)`` batches.
        valid_loader: Iterable (with ``len``) of ``(images, targets)`` batches
            used only for measuring loss; it never updates the weights.
        num_epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.
        device: Device to train on; defaults to CUDA when available, else CPU.

    Returns:
        ``(model, train_loss_history, val_loss_history)`` where both histories
        hold one mean loss per epoch.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(params, lr=lr)

    # To store loss history
    train_loss_history = []
    val_loss_history = []

    def _batch_loss(batch):
        """Move one batch to ``device`` and return the summed loss tensor."""
        images, targets = batch
        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        loss_dict = model(images, targets)
        return sum(loss for loss in loss_dict.values())

    # Layers whose behaviour differs between train and eval mode; they are put
    # in eval mode while the validation loss is measured.
    stochastic_layers = (nn.Dropout, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)

    print(f"Training started on device: {device}")

    for epoch in range(num_epochs):
        # ---- Training pass ----
        model.train()
        running_train_loss = 0.0

        for batch in train_loader:
            optimizer.zero_grad()
            losses = _batch_loss(batch)
            losses.backward()
            optimizer.step()
            running_train_loss += losses.item()

        # max(..., 1) avoids a ZeroDivisionError on an empty loader.
        avg_train_loss = running_train_loss / max(len(train_loader), 1)
        train_loss_history.append(avg_train_loss)

        # ---- Validation pass ----
        # torchvision detection models return losses only in training mode (in
        # eval mode they return detections), so the model stays in train mode
        # here. Gradients are disabled and the optimizer is never stepped, so
        # validation data cannot influence the weights.
        model.train()
        for module in model.modules():
            if isinstance(module, stochastic_layers):
                module.eval()

        running_val_loss = 0.0
        with torch.no_grad():
            for batch in valid_loader:
                running_val_loss += _batch_loss(batch).item()

        avg_val_loss = running_val_loss / max(len(valid_loader), 1)
        val_loss_history.append(avg_val_loss)

        # --- Epoch End ---
        print(
            f"Epoch [{epoch+1}/{num_epochs}], "
            f"Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}"
        )

    print("Training finished.")
    return model, train_loss_history, val_loss_history

In [23]:
# Initialize and train the model
def initialize_and_train_model():
    """Build the dropout Faster R-CNN and train it on the notebook's loaders.

    Reads the notebook-level ``label_map``, ``train_loader`` and ``valid_loader``.

    Returns:
        The ``(trained_model, train_losses, val_losses)`` tuple produced by
        ``train_model``.
    """
    # Prefer the GPU when one is visible to torch.
    run_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # One predictor output per label_map entry, plus one more.
    # NOTE(review): label_map already has an entry for id 0, so this evaluates
    # to 13 — confirm the extra slot is intended.
    n_outputs = 1 + len(label_map)

    detector = get_custom_fasterrcnn_model(num_classes=n_outputs, dropout_prob=0.5)

    # Hand the model to the training loop and pass its result straight through.
    return train_model(
        model=detector,
        train_loader=train_loader,
        valid_loader=valid_loader,
        num_epochs=10,
        lr=0.001,
        device=run_device,
    )

In [None]:
# Run the training
custom_model, train_loss_history, val_loss_history = initialize_and_train_model()

# Plot training and validation loss (one point per epoch)
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(train_loss_history, label='Training Loss')
ax.plot(val_loss_history, label='Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss')
ax.legend()
ax.grid(True)
plt.show()

# Evaluate the model on test set
# `device` is only ever defined inside functions above, so derive it here from
# the trained model; this keeps inputs on the same device as the weights.
device = next(custom_model.parameters()).device

custom_model.eval()
map_metric = MeanAveragePrecision()

with torch.no_grad():
    for images, targets in tqdm(test_loader, desc="Evaluating custom model"):
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # In eval mode the model returns one prediction dict per image.
        predictions = custom_model(images)
        map_metric.update(predictions, targets)

# Compute and print mAP metrics
map_results = map_metric.compute()
print("Custom Model Results:")
print(f"mAP (0.50:0.95): {map_results['map']:.8f}")
print(f"mAP@0.50:        {map_results['map_50']:.8f}")
print(f"mAP@0.75:        {map_results['map_75']:.8f}")
print(f"mAP (small):     {map_results['map_small']:.8f}")
print(f"mAP (medium):    {map_results['map_medium']:.8f}")
print(f"mAP (large):     {map_results['map_large']:.8f}")