# PyTorch method for car detection

# In [3]:
# Import libraries
import os
import shutil
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import yaml
from skimage.io import imread
import imagesize

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import SGD, AdamW
from torch.optim.lr_scheduler import MultiStepLR, CosineAnnealingLR, OneCycleLR
from torch.cuda.amp import autocast, GradScaler
import torchvision
import torchvision.models.detection as detection
from torchvision import transforms
import albumentations as A
from albumentations.pytorch import ToTensorV2

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from PIL import Image
from IPython.display import Video

# In [None]:
class YoloDataset(Dataset):
    """Detection dataset pairing images with YOLO-format label files.

    Every ``.jpg``/``.png`` file in ``img_dir`` is one sample. Its labels are
    read from the file with the same stem and a ``.txt`` extension in
    ``label_dir``; a missing label file yields a sample with no boxes.

    Each label line is ``class x_center y_center width height`` with the four
    geometry values expressed as fractions of the image size. Boxes are
    converted to absolute ``[x1, y1, x2, y2]`` pixels and class ids are
    shifted by +1 (so class 0 in the file becomes label 1).

    Args:
        img_dir: Directory containing the images.
        label_dir: Directory containing the YOLO ``.txt`` label files.
        transform: Optional callable invoked as
            ``transform(image=..., bboxes=..., labels=...)`` that returns a
            mapping with the keys ``'image'``, ``'bboxes'`` and ``'labels'``.
        mosaic_prob: Stored on the instance; no method of this class reads it.
        mixup_prob: Stored on the instance; no method of this class reads it.
    """

    # Lower-case file extensions that are treated as images.
    IMG_EXTENSIONS = ('.jpg', '.png')

    def __init__(self, img_dir, label_dir, transform=None, mosaic_prob=0.5, mixup_prob=0.2):
        self.img_dir = img_dir
        self.label_dir = label_dir
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # index -> image mapping (and therefore image_id) reproducible.
        self.img_files = sorted(
            f for f in os.listdir(img_dir) if f.endswith(self.IMG_EXTENSIONS)
        )
        self.transform = transform
        self.mosaic_prob = mosaic_prob
        self.mixup_prob = mixup_prob

    def __len__(self):
        """Return the number of image files found in ``img_dir``."""
        return len(self.img_files)

    def _label_path(self, img_file):
        """Return the label-file path that belongs to image name ``img_file``."""
        # splitext swaps only the final extension; chained str.replace would
        # also rewrite '.jpg' / '.png' occurring in the middle of the name.
        stem, _ = os.path.splitext(img_file)
        return os.path.join(self.label_dir, stem + '.txt')

    @staticmethod
    def _parse_yolo_labels(label_path, img_w, img_h):
        """Read a YOLO label file into absolute boxes and 1-based labels.

        Args:
            label_path: Path of the label file; it may not exist.
            img_w: Image width in pixels.
            img_h: Image height in pixels.

        Returns:
            ``(boxes, labels)`` where ``boxes`` is a list of
            ``[x1, y1, x2, y2]`` floats clipped to the image and ``labels``
            is the matching list of ``class + 1`` ints. Both are empty when
            the file is missing or holds no usable boxes.

        Raises:
            ValueError: If a non-blank line does not consist of exactly five
                numeric fields.
        """
        boxes = []
        labels = []
        if not os.path.exists(label_path):
            return boxes, labels

        max_x = float(img_w)
        max_y = float(img_h)
        with open(label_path, 'r') as f:
            for line_no, line in enumerate(f, start=1):
                fields = line.split()
                if not fields:
                    continue  # blank line
                if len(fields) != 5:
                    raise ValueError(
                        f"{label_path}:{line_no}: expected 5 fields, got {len(fields)}"
                    )
                cls, x_c, y_c, width, height = map(float, fields)
                # Convert to absolute coordinates, clipped to the image so a
                # slightly out-of-range annotation cannot produce coordinates
                # outside the picture.
                x1 = min(max((x_c - width / 2) * img_w, 0.0), max_x)
                y1 = min(max((y_c - height / 2) * img_h, 0.0), max_y)
                x2 = min(max((x_c + width / 2) * img_w, 0.0), max_x)
                y2 = min(max((y_c + height / 2) * img_h, 0.0), max_y)
                if x2 <= x1 or y2 <= y1:
                    # Zero-area box (degenerate, or entirely outside the
                    # image after clipping): nothing left to detect.
                    continue
                boxes.append([x1, y1, x2, y2])
                labels.append(int(cls) + 1)

        return boxes, labels

    def load_image_and_labels(self, idx):
        """Load sample ``idx`` as an RGB image plus its boxes and labels.

        Returns:
            ``(img, boxes, labels)``: the image read with OpenCV and converted
            from BGR to RGB, a list of absolute ``[x1, y1, x2, y2]`` boxes and
            a list of integer labels.

        Raises:
            OSError: If OpenCV cannot read the image file.
            ValueError: If the label file contains a malformed line.
        """
        img_file = self.img_files[idx]
        img_path = os.path.join(self.img_dir, img_file)

        # Load image
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising;
            # fail here with the path rather than deep inside cvtColor.
            raise OSError(f"Could not read image: {img_path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        h, w = img.shape[:2]

        # Load labels
        boxes, labels = self._parse_yolo_labels(self._label_path(img_file), w, h)

        return img, boxes, labels

    def __getitem__(self, idx):
        """Return ``(img, target)`` for sample ``idx``.

        ``target`` is a dict with ``"boxes"`` (float32 tensor, shape ``(0, 4)``
        when the sample has no boxes), ``"labels"`` (int64 tensor) and
        ``"image_id"`` (one-element tensor holding ``idx``). ``img`` is
        whatever the transform returns under ``'image'``, or the raw RGB
        array from :meth:`load_image_and_labels` when no transform is set.
        """
        img, boxes, labels = self.load_image_and_labels(idx)
        # Apply augmentations
        if self.transform is not None:
            transformed = self.transform(image=img, bboxes=boxes, labels=labels)
            img = transformed['image']
            boxes = transformed['bboxes']
            labels = transformed['labels']

        # as_tensor accepts lists, tuples and arrays alike, so no truthiness
        # test on the container is needed (ambiguous for numpy arrays).
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        if boxes.numel() == 0:
            boxes = boxes.reshape(0, 4)
        labels = torch.as_tensor(labels, dtype=torch.int64).reshape(-1)

        target = {
            "boxes": boxes,
            "labels": labels,
            "image_id": torch.tensor([idx])
        }

        return img, target

    @staticmethod
    def get_augmentation_transforms(training=True, img_size=800):
        """Build the albumentations pipeline for training or evaluation.

        Args:
            training: When true, return the randomized augmentation pipeline;
                otherwise return the resize-only pipeline.
            img_size: Side length in pixels of the square output image.

        Returns:
            An ``A.Compose`` configured for ``pascal_voc`` boxes whose class
            ids travel in the ``labels`` field.
        """
        # NOTE(review): the keyword names below (height=/width=, var_limit=,
        # A.Cutout) are kept as in the original code; confirm they match the
        # installed albumentations version.
        if training:
            return A.Compose([
                A.RandomResizedCrop(height=img_size, width=img_size, scale=(0.8, 1.0), ratio=(0.75, 1.33)),
                A.HorizontalFlip(p=0.5),
                A.VerticalFlip(p=0.1),
                A.Rotate(limit=15, p=0.3),
                A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5),
                A.HueSaturationValue(hue_shift_limit=20, sat_shift_limit=30, val_shift_limit=20, p=0.3),
                A.GaussNoise(var_limit=(10.0, 50.0), p=0.2),
                A.GaussianBlur(blur_limit=3, p=0.1),
                A.Cutout(num_holes=8, max_h_size=32, max_w_size=32, fill_value=0, p=0.2),
                A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
                ToTensorV2()
            ], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['labels'], min_visibility=0.3))

        return A.Compose([
            A.Resize(height=img_size, width=img_size),
            A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ToTensorV2()
        ], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['labels']))

    @staticmethod
    def build_optimized_model(num_classes, backbone='resnet50', pretrained=True):
        """Build a Faster R-CNN detector with the requested backbone.

        Args:
            num_classes: Number of output classes of the box predictor.
            backbone: One of ``'resnet50'``, ``'resnet101'`` or
                ``'mobilenet'``.
            pretrained: Forwarded to the torchvision builder. For
                ``'resnet101'`` it is forwarded to the backbone builder only,
                so the detection heads of that model start untrained.

        Returns:
            The detection model with a box predictor sized for
            ``num_classes``.

        Raises:
            ValueError: If ``backbone`` is not one of the supported names.
        """
        supported = ('resnet50', 'resnet101', 'mobilenet')
        if backbone not in supported:
            raise ValueError(
                f"Unsupported backbone {backbone!r}; expected one of {supported}"
            )

        if backbone == 'resnet101':
            # torchvision provides no fasterrcnn_resnet101_fpn factory, so the
            # detector is assembled from a ResNet-101 FPN backbone directly.
            fpn_backbone = detection.backbone_utils.resnet_fpn_backbone(
                backbone_name='resnet101', pretrained=pretrained
            )
            return detection.FasterRCNN(fpn_backbone, num_classes=num_classes)

        if backbone == 'resnet50':
            model = detection.fasterrcnn_resnet50_fpn(pretrained=pretrained)
        else:  # 'mobilenet'
            model = detection.fasterrcnn_mobilenet_v3_large_fpn(pretrained=pretrained)

        # Replace the classification/regression head so that its output size
        # matches num_classes.
        in_features = model.roi_heads.box_predictor.cls_score.in_features
        model.roi_heads.box_predictor = detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes)

        return model