# Drone Detection - Mask R-CNN (ResNet-based)

In [1]:
!mkdir ~/.kaggle

In [2]:
!cp kaggle.json ~/.kaggle/

In [3]:
!kaggle datasets download -d cybersimar08/drone-detection

Dataset URL: https://www.kaggle.com/datasets/cybersimar08/drone-detection
License(s): CC0-1.0
Downloading drone-detection.zip to /content
 99% 494M/499M [00:06<00:00, 38.8MB/s]
100% 499M/499M [00:06<00:00, 84.9MB/s]


In [4]:
# Unzip dataset file
import zipfile
with zipfile.ZipFile('drone-detection.zip', 'r') as zip_ref:
    zip_ref.extractall()

In [5]:
!pwd

/content


In [22]:
import os
import re
import time
import numpy as np
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset


####################################################
# Raw Data Helpers #################################
####################################################

# NOTE: exclude BIRD class
CLASS_MAPPING = {
    0: "AIRPLANE",
    1: "DRONE",
    2: "HELICOPTER"
}

def view_df_summary(df):
    print("Dataset Summary:")
    print(f"Total number of objects: {len(df)}")
    print(f"Total number of unique images: {df['image_file'].nunique()}")

    print("\nClass distribution:")
    print(df['class'].value_counts())

    print("\nImage dimensions summary:")
    print(df[['image_width', 'image_height']].describe())

def get_matching_files(images_path, labels_path):
    image_files = [f for f in os.listdir(images_path) if f.endswith(('.jpg', '.jpeg', '.png'))]
    label_files = [f for f in os.listdir(labels_path) if f.endswith('.txt')]

    image_files = [f for f in image_files if 'BIRD' not in f]
    label_files = [f for f in label_files if 'BIRD' not in f]

    image_map = {os.path.splitext(f)[0]: f for f in image_files}
    label_map = {os.path.splitext(f)[0]: f for f in label_files}

    common_keys = set(image_map.keys()) & set(label_map.keys())

    matched_files = [(image_map[k], label_map[k]) for k in common_keys]
    return matched_files

def extract_info_from_filename(filename):
    match = re.match(r'V_([A-Z]+)_(\d+_\d+)_', filename)
    if match:
        class_name, image_id = match.groups()
        return class_name, image_id
    return None, None

def read_label_file(label_path):
    try:
        with open(label_path, 'r') as f:
            lines = f.readlines()

        boxes = []
        for line in lines:
            class_id, x_center, y_center, width, height = map(float, line.strip().split())
            boxes.append({
                'class_id': int(class_id),
                'class_name': CLASS_MAPPING[int(class_id)],
                'x_center': x_center,
                'y_center': y_center,
                'width': width,
                'height': height
            })
        return boxes
    except Exception as e:
        print(f"Error reading {label_path}: {e}")
        return []

def create_dataset(images_path, labels_path):
    matched_files = get_matching_files(images_path, labels_path)

    data = []
    for img_file, label_file in matched_files:
        class_name, image_id = extract_info_from_filename(img_file)

        img_path = os.path.join(images_path, img_file)
        with Image.open(img_path) as img:
            img_width, img_height = img.size

        label_path = os.path.join(labels_path, label_file)
        boxes = read_label_file(label_path)

        for box in boxes:
            if box["class_name"] != class_name:
                print(f"Found mismatch between filename class and detected class: {image_id}")

            data.append({
                'image_file': img_file,
                'label_file': label_file,
                'image_id': image_id,
                'class': box['class_name'],
                'x_center': box['x_center'],
                'y_center': box['y_center'],
                'width': box['width'],
                'height': box['height'],
                'image_width': img_width,
                'image_height': img_height,
                'image_path': img_path,
                'label_path': label_path
            })

    return pd.DataFrame(data)


####################################################
# Metrics ##########################################
####################################################

def calculate_iou(pred_box, target_box):
    """Calculate IoU between single predicted and target box"""
    # Convert center format to corners
    pred_x1 = pred_box[0] - pred_box[2] / 2
    pred_y1 = pred_box[1] - pred_box[3] / 2
    pred_x2 = pred_box[0] + pred_box[2] / 2
    pred_y2 = pred_box[1] + pred_box[3] / 2

    target_x1 = target_box[0] - target_box[2] / 2
    target_y1 = target_box[1] - target_box[3] / 2
    target_x2 = target_box[0] + target_box[2] / 2
    target_y2 = target_box[1] + target_box[3] / 2

    # Calculate intersection
    x1 = max(pred_x1, target_x1)
    y1 = max(pred_y1, target_y1)
    x2 = min(pred_x2, target_x2)
    y2 = min(pred_y2, target_y2)

    intersection = max(0, x2 - x1) * max(0, y2 - y1)

    # Calculate union
    pred_area = (pred_x2 - pred_x1) * (pred_y2 - pred_y1)
    target_area = (target_x2 - target_x1) * (target_y2 - target_y1)
    union = pred_area + target_area - intersection

    return intersection / (union + 1e-6)

def calculate_map(pred_boxes, pred_scores, target_boxes, iou_threshold=0.5):
    """Calculate mAP for the predictions"""
    aps = []

    for class_id in range(3):  # For each class (AIRPLANE, DRONE, HELICOPTER)
        precisions = []
        recalls = []

        # Get predictions for this class
        class_preds = [i for i, score in enumerate(pred_scores) if torch.argmax(score) == class_id]
        class_targets = [i for i, box in enumerate(target_boxes) if box[0] == class_id]

        if len(class_targets) == 0:
            continue

        # Calculate precision and recall at different confidence thresholds
        for threshold in np.arange(0, 1, 0.1):
            tp = fp = fn = 0

            for pred_idx in class_preds:
                pred_box = pred_boxes[pred_idx]
                max_iou = 0
                best_target_idx = -1

                # Find best matching target box
                for target_idx in class_targets:
                    target_box = target_boxes[target_idx][1:]  # Remove class_id
                    iou = calculate_iou(pred_box, target_box)
                    if iou > max_iou:
                        max_iou = iou
                        best_target_idx = target_idx

                if max_iou >= iou_threshold and pred_scores[pred_idx][class_id] >= threshold:
                    tp += 1
                else:
                    fp += 1

            fn = len(class_targets) - tp

            precision = tp / (tp + fp + 1e-6)
            recall = tp / (tp + fn + 1e-6)

            precisions.append(precision)
            recalls.append(recall)

        if len(precisions) > 0:
            ap = np.trapz(precisions, recalls)
            aps.append(ap)

    return np.mean(aps) if len(aps) > 0 else 0

def evaluate_model(model, dataset, num_samples=10):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    total_params = sum(p.numel() for p in model.parameters())
    model_size_mb = total_params * 4 / (1024 * 1024)  # Assuming float32 (4 bytes)

    ious = []
    f1_scores = []
    inference_times = []
    pred_boxes_all = []
    pred_scores_all = []
    target_boxes_all = []

    print("\nEvaluating model on", num_samples, "samples...")

    with torch.no_grad():
        for i in range(num_samples):
            image, target_boxes = dataset[i]
            image = image.unsqueeze(0).to(device)
            target_boxes = target_boxes.to(device)

            # Measure inference time
            start_time = time.time()
            pred_scores, pred_boxes = model(image)
            inference_time = time.time() - start_time
            inference_times.append(inference_time)

            # Store predictions and targets
            pred_boxes_all.append(pred_boxes[0])
            pred_scores_all.append(pred_scores[0])
            target_boxes_all.append(target_boxes[0])

            # Calculate IoU for valid boxes
            if target_boxes[0, 0] != -1:  # If there is a valid target box
                iou = calculate_iou(pred_boxes[0, 0].cpu(), target_boxes[0, 1:].cpu())
                ious.append(iou)

            # Calculate F1 Score
            pred_class = torch.argmax(pred_scores[0, 0]).cpu()
            true_class = target_boxes[0, 0].cpu()
            if true_class != -1:
                f1_scores.append(pred_class == true_class)

    # Calculate mAP
    map_score = calculate_map(pred_boxes_all, pred_scores_all, target_boxes_all)

    # Print results
    print("\nModel Evaluation Metrics:")
    print("-" * 50)
    print(f"Model Size: {model_size_mb:.2f} MB ({total_params:,} parameters)")
    print(f"Average Inference Latency: {np.mean(inference_times)*1000:.2f} ms")
    print(f"Mean IoU: {np.mean(ious):.4f}")
    print(f"mAP: {map_score:.4f}")
    print(f"F1 Score: {np.mean(f1_scores):.4f}")

    return {
        'model_size': model_size_mb,
        'inference_latency': np.mean(inference_times),
        'mean_iou': np.mean(ious),
        'map': map_score,
        'f1_score': np.mean(f1_scores)
    }


In [7]:
import os
import re
import shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from PIL import Image
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
import torchvision
from torchvision import transforms
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.transforms import functional as F

sns.set_theme()

In [8]:
device = "cpu"

if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"

## Load Dataset

In [10]:
base_path = "drone-detection-new.v5-new-train.yolov8/train"
images_path = os.path.join(base_path, "images")
labels_path = os.path.join(base_path, "labels")

df = create_dataset(images_path, labels_path)

In [11]:
view_df_summary(df)

Dataset Summary:
Total number of objects: 8997
Total number of unique images: 8818

Class distribution:
class
DRONE         4349
HELICOPTER    2374
AIRPLANE      2274
Name: count, dtype: int64

Image dimensions summary:
       image_width  image_height
count       8997.0        8997.0
mean         640.0         640.0
std            0.0           0.0
min          640.0         640.0
25%          640.0         640.0
50%          640.0         640.0
75%          640.0         640.0
max          640.0         640.0


## Model Training

In [12]:
# Define class mappings
CLASS_MAPPING = {"AIRPLANE": 0, "DRONE": 1, "HELICOPTER": 2}

# Custom Dataset
class DroneDetectionDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.df = dataframe
        self.transform = transform
        self.image_paths = dataframe["image_path"].unique()

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        image = Image.open(image_path).convert("RGB")

        # Get all annotations for this image
        records = self.df[self.df["image_path"] == image_path]

        # Convert normalized YOLO box format to absolute pixel values
        boxes = []
        labels = []
        img_width, img_height = image.size

        for _, row in records.iterrows():
            x_center = row["x_center"] * img_width
            y_center = row["y_center"] * img_height
            width = row["width"] * img_width
            height = row["height"] * img_height

            # Convert YOLO format (x_center, y_center, width, height) to (x_min, y_min, x_max, y_max)
            x_min = x_center - width / 2
            y_min = y_center - height / 2
            x_max = x_center + width / 2
            y_max = y_center + height / 2

            boxes.append([x_min, y_min, x_max, y_max])
            labels.append(CLASS_MAPPING[row["class"]])

        # Convert to tensors
        boxes = torch.tensor(boxes, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.int64)

        # Apply transformations
        if self.transform:
            image = self.transform(image)

        # Prepare target dictionary
        target = {
            "boxes": boxes,
            "labels": labels
        }

        return image, target

# Define transformations
transform = transforms.Compose([
    transforms.ToTensor()
])

In [13]:
dataset = DroneDetectionDataset(df, transform=transform)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))

In [14]:
# Load pre-trained Faster R-CNN model
model = fasterrcnn_resnet50_fpn(pretrained=True)
num_classes = 4  # Background + 3 object classes

# Modify classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 160M/160M [00:00<00:00, 182MB/s]


FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [17]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
num_epochs = 1

def train_one_epoch(model, optimizer, dataloader, device):
    model.train()
    total_loss = 0

    for images, targets in tqdm(dataloader):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        optimizer.zero_grad()
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        losses.backward()
        optimizer.step()

        total_loss += losses.item()

    return total_loss / len(dataloader)

for epoch in range(num_epochs):
    loss = train_one_epoch(model, optimizer, dataloader, device)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss:.4f}")

100%|██████████| 2205/2205 [27:54<00:00,  1.32it/s]

Epoch [1/1], Loss: 0.0949





In [25]:
summary(model, (1, 640, 640))

AttributeError: 'ImageList' object has no attribute 'size'

In [21]:
def predict(model, image_path, device):
    model.eval()
    image = Image.open(image_path).convert("RGB")
    transform = transforms.Compose([transforms.ToTensor()])
    image = transform(image).unsqueeze(0).to(device)

    with torch.no_grad():
        predictions = model(image)

    return predictions

image_path = "/content/drone-detection-new.v5-new-train.yolov8/test/images/V_AIRPLANE_0011_025_png.rf.1224f43106fb9462c60db7add1b26996.jpg"
predictions = predict(model, image_path, device)

print(predictions)

for i, (box, label, score) in enumerate(zip(predictions[0]["boxes"], predictions[0]["labels"], predictions[0]["scores"])):
    if score > 0.0:  # Filter out low-confidence detections
        print(f"Object {i}: Class {label.item()}, Score: {score.item()}, Box: {box.tolist()}")


[{'boxes': tensor([], device='cuda:0', size=(0, 4)), 'labels': tensor([], device='cuda:0', dtype=torch.int64), 'scores': tensor([], device='cuda:0')}]
