<a href="https://colab.research.google.com/github/leeIITM/MY_WORKS/blob/main/vit_yolov4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import urllib.request
import tarfile

# Directory to save dataset
dataset_dir = 'VOCdevkit/'

# URL for Pascal VOC 2007
voc2007_url = 'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar'

# Create directory if not exists
if not os.path.exists(dataset_dir):
    os.makedirs(dataset_dir)


# Download and extract Pascal VOC 2007
voc_tar = os.path.join(dataset_dir, 'VOCtrainval_06-Nov-2007.tar')
if not os.path.exists(voc_tar):
    print("Downloading Pascal VOC 2007 dataset...")
    urllib.request.urlretrieve(voc2007_url, voc_tar)
    print("Download complete.")

# Extract the dataset
if not os.path.exists(os.path.join(dataset_dir, 'VOC2007')):
    print("Extracting dataset...")
    with tarfile.open(voc_tar) as tar:
        tar.extractall(path=dataset_dir)
    print("Extraction complete.")
else:
    print("Dataset already extracted.")


Downloading Pascal VOC 2007 dataset...
Download complete.
Extracting dataset...
Extraction complete.


In [2]:
print(dataset_dir)
dataset_dir = "/content/VOCdevkit/VOCdevkit/"

VOCdevkit/


In [3]:
# Class mapping for VOC dataset
class_map = {
    "aeroplane": 0,
    "bicycle": 1,
    "bird": 2,
    "boat": 3,
    "bottle": 4,
    "bus": 5,
    "car": 6,
    "cat": 7,
    "chair": 8,
    "cow": 9,
    "diningtable": 10,
    "dog": 11,
    "horse": 12,
    "motorbike": 13,
    "person": 14,
    "pottedplant": 15,
    "sheep": 16,
    "sofa": 17,
    "train": 18,
    "tvmonitor": 19,
}


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import os
import xml.etree.ElementTree as ET

# Define a custom collate function
def collate_fn(batch):
    images, boxes, labels = zip(*batch)
    images = torch.stack(images, dim=0)  # Stack images into a tensor
    return images, boxes, labels  # Return boxes and labels as lists

# VOCDataset with image and annotation processing
class VOCDataset(Dataset):
    def __init__(self, dataset_dir, image_set='trainval', transform=None):
        super(VOCDataset, self).__init__()
        self.dataset_dir = dataset_dir
        self.image_set = image_set
        self.transform = transform if transform else transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor()
        ])
        self.image_ids = self._load_image_set()

    def _load_image_set(self):
        split_file = os.path.join(self.dataset_dir, 'VOC2007', 'ImageSets', 'Main', f'{self.image_set}.txt')
        with open(split_file, 'r') as f:
            image_ids = [line.strip() for line in f]
        return image_ids

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, index):
        image_id = self.image_ids[index]

        # Loading the image
        image_path = os.path.join(self.dataset_dir, 'VOC2007', 'JPEGImages', f'{image_id}.jpg')
        image = Image.open(image_path).convert('RGB')

        # Loading annotations
        annotation_path = os.path.join(self.dataset_dir, 'VOC2007', 'Annotations', f'{image_id}.xml')
        boxes, labels = self._load_annotation(annotation_path)

        # Converting image to tensor and resize
        if self.transform is not None:
            image = self.transform(image)

        return image, boxes, labels

    def _load_annotation(self, annotation_path):
    # Parse the XML file to extract bounding box info and labels
      tree = ET.parse(annotation_path)
      root = tree.getroot()

      boxes = []
      labels = []
      for obj in root.findall('object'):
          bbox = obj.find('bndbox')
          xmin = int(bbox.find('xmin').text)
          ymin = int(bbox.find('ymin').text)
          xmax = int(bbox.find('xmax').text)
          ymax = int(bbox.find('ymax').text)
          boxes.append([xmin, ymin, xmax, ymax])

          label = obj.find('name').text
          if label in class_map:
              labels.append(class_map[label])
          else:
              print(f"Warning: Invalid class '{label}' in {annotation_path}")
              labels.append(-1)  # or handle it appropriately

      return boxes, labels

# Correct dataset path
dataset = VOCDataset(dataset_dir='/content/VOCdevkit/VOCdevkit')

# DataLoader to load data in batches using the custom collate function
data_loader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

# Test loading a batch of data
for images, boxes, labels in data_loader:

    print(f"Images shape: {images.shape}")
    print(f"Boxes: {boxes}")
    print(f"Labels: {labels}")
    break

Images shape: torch.Size([8, 3, 224, 224])
Boxes: ([[52, 55, 479, 371], [109, 39, 178, 64], [254, 43, 270, 56], [220, 43, 236, 54]], [[4, 164, 164, 303], [99, 169, 281, 303], [214, 173, 355, 283], [286, 175, 433, 285], [375, 180, 484, 272]], [[2, 29, 195, 500], [130, 181, 347, 500]], [[3, 1, 365, 500]], [[302, 129, 449, 227], [93, 156, 288, 312], [89, 204, 116, 262]], [[40, 124, 97, 255], [92, 214, 122, 252], [58, 95, 202, 299], [175, 113, 281, 293], [203, 95, 487, 375], [7, 143, 61, 273], [372, 180, 500, 375]], [[15, 3, 476, 344], [97, 115, 443, 344]], [[488, 276, 500, 320], [475, 251, 486, 273], [279, 225, 478, 340], [245, 230, 316, 280], [1, 241, 87, 375], [133, 244, 184, 367], [121, 273, 213, 367]])
Labels: ([6, 6, 6, 6], [5, 5, 5, 5, 5], [14, 14], [14], [17, 17, 15], [12, 12, 12, 12, 12, 14, 14], [14, 8], [6, 6, 6, 6, 6, 14, 13])


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from timm import create_model

class ViTBackbone(nn.Module):
    def __init__(self, model_name='vit_base_patch16_224', pretrained=True):
        super(ViTBackbone, self).__init__()
        self.vit = create_model(model_name, pretrained=pretrained, num_classes=0)  # No final classification layer
        self.patch_size = 16
        self.out_channels = 768

    def forward(self, x):

        _, _, H, W = x.shape
        assert H % self.patch_size == 0 and W % self.patch_size == 0, "Input size must be divisible by patch size"

        # Extracting features using Vision Transformer
        features = self.vit(x)  # Shape: [batch_size, num_patches, feature_dim]
        #print(f"ViT features shape: {features.shape}")

        if features.ndim == 2:  # Shape: [batch_size, feature_dim]
            B, C = features.shape
            grid_size = int(H / self.patch_size)
            # Reshape to [batch_size, channels, height, width]
            features = features.reshape(B, C, 1, 1).expand(B, C, grid_size, grid_size)
        else:
            B, N, C = features.shape  # B=batch_size, N=num_patches, C=feature_dim
            grid_size = int(H / self.patch_size)
            features = features.reshape(B, grid_size, grid_size, C).permute(0, 3, 1, 2)

        #print(f"ViT reshaped features shape: {features.shape}")
        return features

# YOLOv4 Neck (without SPP) bcoz i faced some errors with SPP
class YOLOv4Neck(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(YOLOv4Neck, self).__init__()

        # Simple PANet-like neck without SPP
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        # Apply simple PANet-style convolution layers
        x = self.conv1(x)
        #print(f"Neck output shape after conv1: {x.shape}")
        x = self.conv2(x)
        #print(f"Neck output shape after conv2: {x.shape}")

        return x

# YOLOv4 Head (Final Detection Layer)
class YOLOv4Head(nn.Module):
    def __init__(self, in_channels, num_classes, num_anchors):
        super(YOLOv4Head, self).__init__()

        # Final detection head with convolutional layers for bounding box prediction
        self.conv1 = nn.Conv2d(in_channels, in_channels * 2, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels * 2, num_anchors * (5 + num_classes), kernel_size=1)  # BBox coords, obj score, classes
        self.pool = nn.AdaptiveAvgPool2d((7, 7))  # Ensure the output is [batch_size, channels, 7, 7]

    def forward(self, x, num_anchors, num_classes):
        x = self.conv1(x)
        #print(f"Head output shape after conv1: {x.shape}")
        x = self.conv2(x)
        #print(f"Head output shape after conv2: {x.shape}")
        x = self.pool(x)
        #print(f"Head output shape after pool: {x.shape}")

        # Reshape the output to match the anchors and classes
        batch_size = x.size(0)
        output_reshaped = x.view(batch_size, num_anchors, 5 + num_classes, 7, 7).permute(0, 3, 4, 1, 2)
        #print(num_anchors)
        # Now output_reshaped has the shape [batch_size, 7, 7, num_anchors, 5 + num_classes]
        output_final = output_reshaped.contiguous().view(batch_size, 7, 7, (5 + num_classes) * num_anchors)
        return output_final

# Full YOLOv4 with ViT Backbone
class YOLOv4ViT(nn.Module):
    def __init__(self, num_classes, num_anchors=3):
        super(YOLOv4ViT, self).__init__()

        # Vision Transformer as the backbone
        self.backbone = ViTBackbone()

        # YOLOv4 Neck (No SPP here)
        self.neck = YOLOv4Neck(in_channels=self.backbone.out_channels, out_channels=256)

        # YOLOv4 Head (Detection head with anchors)
        self.head = YOLOv4Head(in_channels=256, num_classes=num_classes, num_anchors=num_anchors)

    def forward(self, x):
        #Extracting features from the image using ViT backbone
        features = self.backbone(x)

        #Passing features through the neck (PANet)
        neck_output = self.neck(features)

        #Passing neck output to the YOLOv4 head for final detection
        detection_output = self.head(neck_output, num_anchors=3, num_classes=20)

        return detection_output

# Define a simple loss function (e.g., combination of classification and bounding box loss)
class YOLOLoss(nn.Module):
    def __init__(self, num_classes):
        super(YOLOLoss, self).__init__()
        self.num_classes = num_classes

    def forward(self, predictions, targets):
        # Predictions and targets shape: [batch_size, num_anchors * (5 + num_classes), H, W]
        # The first 4 values in the last dimension correspond to bounding box coordinates
        # The 5th value is the objectness score, followed by class scores

        # Calculate loss (you can refine this for your specific loss function)
        loss = 0.0
        # For simplicity, use a placeholder loss (e.g., Mean Squared Error)
        loss += nn.MSELoss()(predictions, targets)  # Replace with your detailed loss calculations

        return loss

# Instantiate the model
num_classes = 20  # Number of classes in the Pascal VOC dataset
model = YOLOv4ViT(num_classes=num_classes)

# Define the loss function
criterion = YOLOLoss(num_classes)

# Set up the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adjust learning rate as necessary

# Move loss function and optimizer to the appropriate device (if you're using GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
criterion.to(device)

# Print a summary of the optimizer
print(optimizer)

In [23]:
print(model)

YOLOv4ViT(
  (backbone): ViTBackbone(
    (vit): VisionTransformer(
      (patch_embed): PatchEmbed(
        (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
        (norm): Identity()
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (patch_drop): Identity()
      (norm_pre): Identity()
      (blocks): Sequential(
        (0): Block(
          (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (attn): Attention(
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (q_norm): Identity()
            (k_norm): Identity()
            (attn_drop): Dropout(p=0.0, inplace=False)
            (proj): Linear(in_features=768, out_features=768, bias=True)
            (proj_drop): Dropout(p=0.0, inplace=False)
          )
          (ls1): Identity()
          (drop_path1): Identity()
          (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (mlp): Mlp(
            (fc1): Linear(in_features=768, o

In [13]:
def format_targets(boxes, labels, num_classes, img_size, grid_size):
    targets = torch.zeros((len(boxes), grid_size, grid_size, 3*(5 + num_classes)))

    for i, (box_list, label_list) in enumerate(zip(boxes, labels)):
        #print(f"Processing box list: {box_list}")
        if not isinstance(box_list, list) or len(box_list) == 0:
            print(f"Skipping invalid box: {box_list}")
            continue

        for box, label in zip(box_list, label_list):
            if len(box) != 4:
                print(f"Skipping invalid box: {box}")
                continue

            # Convert label to integer if it is a string
            if isinstance(label, str):
                try:
                    label = int(label)
                except ValueError:
                    print(f"Skipping invalid label: {label}")
                    continue

            # Calculate box center, width, and height in normalized coordinates
            center_x = (box[0] + box[2]) / 2  # x_min + x_max
            center_y = (box[1] + box[3]) / 2  # y_min + y_max
            box_width = box[2] - box[0]
            box_height = box[3] - box[1]

            # Calculate grid cell location and clamp values to avoid out-of-bounds indexing
            grid_x = min(int(center_x * grid_size / img_size[0]), grid_size - 1)
            grid_y = min(int(center_y * grid_size / img_size[1]), grid_size - 1)

            # Assign box parameters to the targets tensor
            targets[i, grid_y, grid_x, 0] = 1  # Objectness score
            targets[i, grid_y, grid_x, 1:5] = torch.tensor([center_x / img_size[0], center_y / img_size[1], box_width / img_size[0], box_height / img_size[1]])  # Box coordinates normalized
            targets[i, grid_y, grid_x, 5 + label] = 1  # One-hot encoding of the class label

    return targets




In [14]:
def train_model(model, data_loader, criterion, optimizer, num_epochs, device):
    model = model.to(device)

    for epoch in range(num_epochs):
        model.train()

        for images, boxes, labels in data_loader:
            images = images.to(device)  # Move images to the GPU
            optimizer.zero_grad()

            # Forward pass
            outputs = model(images)

            # Prepare targets
            img_size = (images.shape[2], images.shape[3])  # (height, width) of the input image
            grid_size = 7  # Adjust this based on the grid size in your YOLO model

            targets = format_targets(boxes, labels, num_classes, img_size, grid_size)
            #print("target tensor")
            #print(targets.shape)
            #print("output tensor")
            #print(outputs.shape)
            targets = targets.to(device)


            loss = criterion(outputs, targets)


            loss.backward()
            optimizer.step()

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")


In [15]:

num_anchors = 3
num_classes = 20
model = YOLOv4ViT(num_classes=num_classes).to(device)
criterion = YOLOLoss(num_classes=num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

train_model(model, data_loader, criterion, optimizer, num_epochs=10, device=device)

cuda
Epoch [1/10], Loss: 0.0032
Epoch [2/10], Loss: 0.0070
