## Assumptions made:
#### 1.) Images and corresponding label files have the same filename.
#### 2.) Image file extension(s) are => .png, .jpg
#### 3.) Label file extension(s) are => .txt
#### 4.) Each record (one line) in a label file is whitespace-separated, in the order => class_id xmin ymin xmax ymax
#### 5.) class_id is an integer, NOT a vehicle-type string
#### 6.) The labels tensor is 1-D with dtype int64 (64-bit integers); the bounding-box tensor is 2-D with dtype float32 (32-bit floats). Precision can be reduced for speed.

## Imports

In [None]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torchvision.transforms as T

## Dataset and Dataloader

In [None]:
class VehicleDataset(Dataset):
    """Detection dataset pairing each image with a same-named ``.txt`` label file.

    Every ``.jpg``/``.png`` file found in ``image_dir`` is one sample. Its
    label file is looked up in ``label_dir`` under the same base name with a
    ``.txt`` extension and holds one record per line, whitespace-separated:
    ``class_id xmin ymin xmax ymax``.

    Args:
        image_dir: Directory containing the image files.
        label_dir: Directory containing the label files.
        transform: Optional callable applied to the RGB PIL image only.

    NOTE: ``transform`` never touches the boxes. A transform that changes the
    image geometry (e.g. a resize) returns the boxes exactly as they were read
    from the label file, so the caller must keep the two consistent.
    """

    def __init__(self, image_dir, label_dir, transform=None):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.transform = transform

        # os.listdir order is arbitrary; sorting makes idx -> file stable.
        self.image_filenames = sorted(
            fname for fname in os.listdir(image_dir) if fname.endswith(('.jpg', '.png'))
        )

    def __len__(self):
        """Return the number of images found in ``image_dir``."""
        return len(self.image_filenames)

    def _label_path(self, img_name):
        """Return the label-file path for ``img_name`` (same stem, ``.txt``)."""
        # splitext only swaps the final extension; chained str.replace would
        # also rewrite ".jpg"/".png" occurring in the middle of the name.
        stem, _ = os.path.splitext(img_name)
        return os.path.join(self.label_dir, stem + '.txt')

    @staticmethod
    def _read_labels(label_path):
        """Parse one label file into ``(labels, boxes)`` tensors.

        Returns:
            labels: int64 tensor of shape ``(n,)`` with the class ids.
            boxes: float32 tensor of shape ``(n, 4)`` holding
                ``[xmin, ymin, xmax, ymax]`` per record; ``(0, 4)`` when the
                file has no records.

        Raises:
            ValueError: If a non-blank line does not have exactly 5 fields or
                a field cannot be converted to a number.
        """
        labels = []
        boxes = []
        with open(label_path, 'r') as f:
            for line_no, line in enumerate(f, start=1):
                parts = line.split()
                if not parts:
                    # Tolerate blank lines (commonly a trailing newline).
                    continue
                if len(parts) != 5:
                    raise ValueError(
                        f"{label_path}:{line_no}: expected 5 fields "
                        f"(class_id xmin ymin xmax ymax), got {len(parts)}"
                    )
                labels.append(int(parts[0]))
                boxes.append([float(value) for value in parts[1:]])

        # Python list -> PyTorch tensor. The reshape keeps the box tensor 2-D
        # even when the image has no annotations.
        labels = torch.tensor(labels, dtype=torch.int64)
        boxes = torch.tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        return labels, boxes

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            ``(image, target)`` where ``image`` is the RGB image (after
            ``transform`` if one was given) and ``target`` is a dict with
            ``"labels"`` (int64, shape ``(n,)``) and ``"boxes"`` (float32,
            shape ``(n, 4)``, rows ``[xmin, ymin, xmax, ymax]``).
        """
        # Process image
        img_name = self.image_filenames[idx]
        img_path = os.path.join(self.image_dir, img_name)
        # convert() returns a fully loaded copy, so the file can be closed
        # as soon as the with-block exits.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")

        # Process label(s) - (1) vehicle class and (2) bounding box coordinates
        labels, boxes = self._read_labels(self._label_path(img_name))
        target = {
            "labels": labels,
            "boxes": boxes,
        }

        # Apply any transforms: resizing (if required) and converting to PyTorch tensor
        if self.transform:
            image = self.transform(image)

        return image, target

def detection_collate(batch):
    """Regroup a list of (image, target) samples into (images, targets) tuples.

    Samples are kept as tuples rather than stacked, so targets holding a
    different number of boxes per image can share a batch.
    """
    return tuple(zip(*batch))


# Transform: (mostly not required to resize)
# T.Resize takes (height, width), so a 1920x1080 (W x H) frame is (1080, 1920).
# NOTE: the transform is applied to the image only; if it actually changes the
# image size, the boxes returned by the dataset are NOT rescaled to match.
transform = T.Compose([
    T.Resize((1080, 1920)),
    T.ToTensor()
])

# Instantiate dataset and dataloader pair(s)
dataset = VehicleDataset(image_dir="images", label_dir="labels", transform=transform)
# A named collate function is used instead of a lambda: with num_workers > 0
# the collate_fn must be picklable on platforms that spawn worker processes.
dataloader = DataLoader(
    dataset,
    batch_size=16,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
    collate_fn=detection_collate,
)

## For understanding only

In [None]:
from torch.utils.data import DataLoader

# Toy samples standing in for (image, label) pairs: img1/label1 ... img4/label4.
data = [(f"img{i}", f"label{i}") for i in range(1, 5)]

# The collate function transposes each batch: a list of (img, label) pairs
# becomes a pair of tuples, (imgs, labels).
loader = DataLoader(
    data,
    batch_size=2,
    shuffle=True,
    collate_fn=lambda samples: tuple(zip(*samples)),
)

# Each printed batch has the form ((img, img), (label, label)).
for batch in loader:
    print(batch)

(('img3', 'img2'), ('label3', 'label2'))
(('img1', 'img4'), ('label1', 'label4'))
