In [1]:

import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image

from torch.utils.data import Dataset, DataLoader

import cv2
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from tqdm import tqdm


In [2]:
# import zipfile

# ZIP_PATH = "./face images.zip"

# with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
#     zip_ref.extractall("./")

# print("Dataset extracted successfully!")


In [3]:
# Dataset root. Set the FACE_DATA_DIR environment variable to point the
# notebook at another location; otherwise the original shared-drive path is used.
DATA_DIR = os.environ.get(
    "FACE_DATA_DIR",
    '/work/sharedrive/Lab/khanjan.damani/Car Detection Using Pytorch CNN',
)
IMG_DIR = os.path.join(DATA_DIR, "images")
# NOTE: despite its name, ANN_DIR is the path of the annotation CSV *file*.
ANN_DIR = os.path.join(DATA_DIR, "faces.csv")

# Only warn when an expected dataset path is missing (nothing is created here).
for path in [IMG_DIR, ANN_DIR]:
    if not os.path.exists(path):
        print(f"Warning: {path} not found. Check your dataset extraction.")

In [4]:
# Cell 2: Face Detection Dataset (FINAL – STABLE & DETECTION-SAFE)

import os
import pandas as pd
import torch
from torch.utils.data import Dataset
from PIL import Image, UnidentifiedImageError


class FaceDetectionDataset(Dataset):
    """Single-box face dataset backed by an image folder and an annotation CSV.

    The CSV is read with ``pandas.read_csv`` and must provide the columns
    ``image_name``, ``width``, ``height``, ``x0``, ``y0``, ``x1`` and ``y1``.
    Rows with any missing value are dropped. Every distinct ``image_name``
    yields exactly one sample; when an image has several annotation rows only
    the first one is used.

    Args:
        images_dir: Directory that contains the image files.
        csv_file: Path of the annotation CSV.
        transform: Optional callable applied to the loaded RGB PIL image.
    """

    def __init__(self, images_dir, csv_file, transform=None):
        self.images_dir = images_dir
        self.transform = transform

        # Remove invalid rows
        self.df = pd.read_csv(csv_file).dropna()

        # Unique image names, in order of first appearance
        self.image_names = self.df["image_name"].unique()

        # First annotation row of every image, indexed by image name, so that
        # __getitem__ is a direct lookup instead of a full scan of the frame
        # on every access.
        self._first_rows = (
            self.df.drop_duplicates(subset="image_name", keep="first")
            .set_index("image_name")
        )

    def __len__(self):
        """Return the number of distinct annotated images."""
        return len(self.image_names)

    @staticmethod
    def _clamp_unit(value):
        """Clamp a scalar to the closed interval [0, 1]."""
        return max(0.0, min(value, 1.0))

    def __getitem__(self, idx):
        """Return ``(image, bbox)`` for the ``idx``-th distinct image.

        ``bbox`` is a float32 tensor ``[x0, y0, x1, y1]`` divided by the
        ``width``/``height`` stored in the CSV and clamped to [0, 1].

        If the file is missing or cannot be identified as an image, a neutral
        placeholder is returned instead of raising: an all-zero ``(3, 224, 224)``
        tensor and the box ``[0.5, 0.5, 0.5, 0.5]``. The placeholder is a
        tensor even when ``transform`` is ``None``.
        """
        image_name = self.image_names[idx]
        image_path = os.path.join(self.images_dir, image_name)

        # -------------------------------
        # Safe image loading
        # -------------------------------
        try:
            # convert() returns an independent copy of the pixel data, so the
            # underlying file can be closed as soon as the block exits.
            with Image.open(image_path) as raw_image:
                image = raw_image.convert("RGB")
        except (UnidentifiedImageError, FileNotFoundError):
            dummy_image = torch.zeros(3, 224, 224)
            dummy_bbox = torch.tensor(
                [0.5, 0.5, 0.5, 0.5], dtype=torch.float32
            )
            return dummy_image, dummy_bbox

        # -------------------------------
        # Read annotation (first row recorded for this image)
        # -------------------------------
        row = self._first_rows.loc[image_name]

        img_w = row["width"]
        img_h = row["height"]

        # Normalize by the annotated image size, then clamp to [0, 1]
        bbox = torch.tensor(
            [
                self._clamp_unit(row["x0"] / img_w),
                self._clamp_unit(row["y0"] / img_h),
                self._clamp_unit(row["x1"] / img_w),
                self._clamp_unit(row["y1"] / img_h),
            ],
            dtype=torch.float32
        )

        # -------------------------------
        # Apply transforms
        # -------------------------------
        if self.transform:
            image = self.transform(image)

        return image, bbox


In [5]:
# Cell 3: Transforms and DataLoader
import torchvision.transforms as T

# Square input resolution fed to the CNN, and the mini-batch size
IMG_SIZE = 224
BATCH_SIZE = 64

# Pre-processing pipeline: resize -> tensor -> per-channel normalization
# (mean/std are presumably the usual ImageNet statistics - confirm if it matters)
train_transform = T.Compose(
    [
        T.Resize((IMG_SIZE, IMG_SIZE)),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Dataset built from the image folder and the annotation CSV
train_dataset = FaceDetectionDataset(
    IMG_DIR,
    ANN_DIR,
    transform=train_transform,
)

# Shuffled mini-batch loader with two worker processes and pinned host memory
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
)

print("Total training images:", len(train_dataset))


Total training images: 2204


In [6]:
# Cell 4: CNN Backbone for Face Detection (UPDATED)

import torch
import torch.nn as nn


class FaceCNN(nn.Module):
    """Small CNN that regresses one bounding box per input image.

    Four conv / batch-norm / ReLU / max-pool stages (3 -> 32 -> 64 -> 128 -> 256
    channels, each stage halving the spatial size) are followed by global
    average pooling and a three-layer MLP. The final sigmoid keeps the four
    outputs in [0, 1].
    """

    def __init__(self):
        super().__init__()

        # -------------------------------
        # CNN backbone: one stage per consecutive pair of channel widths
        # -------------------------------
        widths = (3, 32, 64, 128, 256)
        stages = []
        for in_ch, out_ch in zip(widths[:-1], widths[1:]):
            stages.extend(
                [
                    nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
                    nn.BatchNorm2d(out_ch),
                    nn.ReLU(inplace=True),
                    nn.MaxPool2d(2),  # halves height and width
                ]
            )
        self.features = nn.Sequential(*stages)

        # Collapse every feature map to a single value
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))

        # -------------------------------
        # Bounding-box regressor: 256 -> 256 -> 128 -> 4
        # -------------------------------
        self.bbox_head = nn.Sequential(
            nn.Linear(256, 256),
            nn.ReLU(inplace=True),
            nn.Linear(256, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, 4),
            nn.Sigmoid(),  # squashes the four outputs into [0, 1]
        )

    def forward(self, x):
        """Map an image batch ``[B, 3, H, W]`` to box predictions ``[B, 4]``."""
        pooled = self.global_pool(self.features(x))
        flat = pooled.flatten(1)  # [B, 256]
        return self.bbox_head(flat)


In [7]:
# Cell 5: Loss function and optimizer

import torch
import torch.nn as nn
import torch.optim as optim

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Build the network and move its parameters to the selected device
model = FaceCNN()
model = model.to(device)

# Smooth L1 regression loss on the four box coordinates
criterion = nn.SmoothL1Loss(beta=0.5)

# Adam over all model parameters with a small weight decay
optimizer = optim.Adam(model.parameters(), lr=2e-4, weight_decay=1e-5)

print("Model, loss function, and optimizer initialized")


Using device: cuda
Model, loss function, and optimizer initialized


In [None]:
# Cell 6: Training loop

# Total number of passes over the training data
EPOCHS = 50

# Put the network in training mode once, before the first epoch
model.train()

for epoch in range(EPOCHS):
    # Running sum of the per-batch losses seen during this epoch
    epoch_loss = 0.0

    for images, boxes in train_loader:
        # Move the mini-batch to the model's device
        images, boxes = images.to(device), boxes.to(device)

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass and regression loss against the target boxes
        preds = model(images)
        loss = criterion(preds, boxes)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Averaged over batches, not over individual samples
    avg_loss = epoch_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{EPOCHS}] - Loss: {avg_loss:.4f}")


In [None]:
# Detection Accuracy using IoU

def compute_iou(box1, box2):
    """Intersection-over-Union of axis-aligned boxes.

    Args:
        box1, box2: Tensors whose last dimension holds ``[x1, y1, x2, y2]``
            (normalized 0-1). Either a single box of shape ``(4,)`` or
            batches of shape ``(..., 4)`` whose leading dimensions broadcast.

    Returns:
        Tensor of IoU values with the broadcast leading shape (0-dim for a
        single pair), on the same device as the inputs. Pairs whose union
        area is not positive (e.g. degenerate boxes) yield 0.
    """
    # Corners of the intersection rectangle
    left = torch.max(box1[..., 0], box2[..., 0])
    top = torch.max(box1[..., 1], box2[..., 1])
    right = torch.min(box1[..., 2], box2[..., 2])
    bottom = torch.min(box1[..., 3], box2[..., 3])

    # Non-overlapping boxes give a negative extent; clamp so the area is 0
    inter = torch.clamp(right - left, min=0) * torch.clamp(bottom - top, min=0)

    area1 = (box1[..., 2] - box1[..., 0]) * (box1[..., 3] - box1[..., 1])
    area2 = (box2[..., 2] - box2[..., 0]) * (box2[..., 3] - box2[..., 1])

    union = area1 + area2 - inter

    # Divide only where the union is positive; elsewhere the IoU is defined as 0.
    # A dummy denominator of 1 keeps the masked-out entries free of inf/nan.
    valid = union > 0
    iou = inter / torch.where(valid, union, torch.ones_like(union))
    return torch.where(valid, iou, torch.zeros_like(iou))


def detection_accuracy(model, dataloader, device, iou_threshold=0.5):
    """Evaluate box predictions against ground truth using IoU.

    Args:
        model: Module mapping an image batch to one ``[x1, y1, x2, y2]`` box
            per image. It is switched to eval mode and left in that mode.
        dataloader: Iterable of ``(images, gt_boxes)`` batches.
        device: Device the batches are moved to before the forward pass.
        iou_threshold: Minimum IoU for a prediction to count as correct.

    Returns:
        ``(accuracy, average_iou)`` over all evaluated samples, where accuracy
        is the fraction of samples with IoU >= ``iou_threshold``. Returns
        ``(0.0, 0.0)`` when the dataloader yields no samples.
    """
    model.eval()
    correct = 0
    total = 0
    iou_sum = 0.0

    with torch.no_grad():
        for images, gt_boxes in dataloader:
            images = images.to(device)
            gt_boxes = gt_boxes.to(device)

            preds = model(images)

            # Score every sample of the batch individually
            for i in range(images.size(0)):
                iou = compute_iou(preds[i], gt_boxes[i])
                iou_sum += iou.item()

                if iou >= iou_threshold:
                    correct += 1
                total += 1

    if total == 0:
        # Nothing was evaluated: report zeros instead of dividing by zero
        return 0.0, 0.0

    avg_iou = iou_sum / total
    acc = correct / total

    return acc, avg_iou


In [None]:
# Score the model on the training loader: share of boxes reaching the IoU
# threshold, plus the mean IoU over all samples
acc, avg_iou = detection_accuracy(
    model=model,
    dataloader=train_loader,
    device=device,
    iou_threshold=0.5,
)

print(f"Training Detection Accuracy: {acc*100:.2f}%")
print(f"Average IoU: {avg_iou:.3f}")


In [None]:
# Cell 7: Visualize predictions

import matplotlib.pyplot as plt
import torchvision.transforms.functional as F

# Switch the model to evaluation mode before running inference for the plots
model.eval()

def visualize_predictions(dataset, model, device, num_samples=5):
    """Plot ground-truth (green) and predicted (red) boxes side by side.

    Args:
        dataset: Sized, indexable collection returning ``(image, bbox)`` where
            ``image`` is a ``(C, H, W)`` tensor and ``bbox`` holds normalized
            ``[x1, y1, x2, y2]``.
        model: Module mapping an image batch to normalized boxes. It is
            switched to eval mode and left in that mode.
        device: Device used for the forward pass.
        num_samples: Number of leading samples to draw; capped at
            ``len(dataset)``. Nothing is drawn when the result is 0.
    """
    # Never index past the end of the dataset
    num_samples = min(num_samples, len(dataset))
    if num_samples <= 0:
        return

    # Inference mode, independent of what the caller did beforehand
    model.eval()

    plt.figure(figsize=(15, 5))

    for i in range(num_samples):
        image, gt_box = dataset[i]

        input_img = image.unsqueeze(0).to(device)

        with torch.no_grad():
            pred_box = model(input_img).cpu().squeeze(0)

        # Min-max rescale to [0, 1] for display. The clamp keeps a constant
        # image (e.g. an all-zero placeholder) from producing 0/0 = NaN.
        img = image.detach().cpu()
        lo, hi = img.min(), img.max()
        img = (img - lo) / (hi - lo).clamp_min(1e-8)
        img_np = img.permute(1, 2, 0).numpy()

        H, W, _ = img_np.shape

        # Ground truth bbox in pixel coordinates
        gt = gt_box.numpy()
        gt_x1, gt_y1 = int(gt[0] * W), int(gt[1] * H)
        gt_x2, gt_y2 = int(gt[2] * W), int(gt[3] * H)

        # Predicted bbox in pixel coordinates
        pb = pred_box.numpy()
        pb_x1, pb_y1 = int(pb[0] * W), int(pb[1] * H)
        pb_x2, pb_y2 = int(pb[2] * W), int(pb[3] * H)

        ax = plt.subplot(1, num_samples, i + 1)
        ax.imshow(img_np)

        # Ground truth (GREEN)
        ax.add_patch(
            plt.Rectangle(
                (gt_x1, gt_y1),
                gt_x2 - gt_x1,
                gt_y2 - gt_y1,
                fill=False,
                edgecolor="green",
                linewidth=2,
                label="GT"
            )
        )

        # Prediction (RED)
        ax.add_patch(
            plt.Rectangle(
                (pb_x1, pb_y1),
                pb_x2 - pb_x1,
                pb_y2 - pb_y1,
                fill=False,
                edgecolor="red",
                linewidth=2,
                label="Pred"
            )
        )

        # Show the "GT" / "Pred" labels attached to the rectangles
        ax.legend(loc="lower right", fontsize=8)
        ax.axis("off")

    plt.show()


# Draw ground-truth and predicted boxes for the first few training samples
visualize_predictions(dataset=train_dataset, model=model, device=device)
