### Config

In [85]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import random

In [86]:
import yaml

class Config:
    """
    Loads configuration from the YAML file and provides access to constants.

    Attributes:
        config: Parsed YAML document. An empty file is stored as ``{}`` so that
            lookups fall back to their defaults instead of failing.
    """

    def __init__(self, config_path="CNN_Object_Detection/configs/config.yml"):
        """
        Read and parse the YAML file.

        Args:
            config_path (str): Path to the YAML file (resolved against the
                current working directory when relative).

        Raises:
            OSError: If the file cannot be opened.
            yaml.YAMLError: If the file is not valid YAML.
        """
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(config_path, "r", encoding="utf-8") as file:
            # safe_load returns None for an empty document; normalise to {}.
            self.config = yaml.safe_load(file) or {}

    def get(self, key, default=None):
        """
        Retrieve a value from the YAML config file.

        Args:
            key (str): Configuration key in the format "section.key". Any depth
                is accepted; a key that names a whole section returns that
                section as a dict.
            default: Default value if the key is not found.

        Returns:
            Value from the config file or the default value. The default is
            returned when any path component is missing, or when the path
            continues below a value that is not a mapping.
        """
        node = self.config
        for part in key.split("."):
            # Stop as soon as the path cannot be followed any further instead
            # of returning whatever intermediate value was reached.
            if not isinstance(node, dict) or part not in node:
                return default
            node = node[part]
        return node

# Notebook-wide configuration object; constructing it reads the default YAML path.
config = Config()

In [87]:
# Hyperparameters, all read from the "model" section of the YAML config.
# input_shape is converted to a tuple because CNN.forward compares it
# against x.shape[1:].
input_shape = tuple(config.get("model.input_shape"))
lr = config.get("model.learning_rate")
epochs = config.get("model.epochs")
batch_size = config.get("model.batch_size")
num_classes = config.get("model.num_classes")

# Run on the GPU when PyTorch reports one as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


### Model

In [88]:
# CNN model. The sizes quoted in the layer comments assume a 640x640 input, but
# the adaptive pooling at the end of `features` accepts any spatial size.
class CNN(nn.Module):
    """
    Small convolutional network with a classification head and a box head.

    Args:
        input_shape: Expected per-sample shape as (channels, height, width).
            Any sequence is accepted; it is stored as a tuple.
        num_classes (int): Number of outputs of the classification head.

    Raises:
        ValueError: If `input_shape` does not have exactly three entries.
    """

    def __init__(self, input_shape, num_classes):
        super().__init__()
        # (Channels, H, W). Stored as a tuple: `x.shape[1:]` is a tuple
        # subclass, so comparing it against a list would never be equal.
        self.input_shape = tuple(input_shape)
        if len(self.input_shape) != 3:
            raise ValueError(
                f"input_shape must be (channels, height, width), got {self.input_shape}"
            )
        # Take the channel count from the declared shape so that inputs that
        # pass the shape check in forward() also fit the first convolution.
        in_channels = self.input_shape[0]

        self.features = nn.Sequential(
            nn.Conv2d(in_channels, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),  # H/2 x W/2 (320x320 for a 640x640 input)

            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),  # H/4 x W/4 (160x160 for a 640x640 input)

            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),  # 1x1x64
        )

        # Fully connected heads
        self.classifier = nn.Linear(64, num_classes)
        self.bbox_regressor = nn.Linear(64, 4)  # [x_center, y_center, width, height]

    def forward(self, x):
        """
        Run a batch through the network.

        Args:
            x: Tensor of shape (batch, *input_shape).

        Returns:
            Tuple `(class_logits, bbox)`: unnormalised class scores of shape
            (batch, num_classes) and box values of shape (batch, 4) squashed
            into (0, 1) by a sigmoid.

        Raises:
            AssertionError: If the per-sample shape differs from `input_shape`.
        """
        assert x.shape[1:] == self.input_shape, \
            f"Expected input shape (B, {self.input_shape}), but got {x.shape}"
        x = self.features(x)
        x = torch.flatten(x, start_dim=1)  # (batch_size, 64)

        class_logits = self.classifier(x)
        bbox = torch.sigmoid(self.bbox_regressor(x))  # Normalization

        return class_logits, bbox

### Test

In [None]:
# Smoke test: push one random image through an untrained model.
model = CNN(input_shape=input_shape, num_classes=num_classes)
# Build the dummy batch from the configured shape so it always satisfies the
# model's input-shape check, whatever the config says.
image = torch.randn(1, *input_shape)  # [Batch, Channel, Height, Width]
class_logits, bbox = model(image)

TypeError: CNN.__init__() missing 1 required positional argument: 'input_shape'

In [None]:
# Display the smoke-test outputs: raw class logits and the sigmoid-squashed box.
class_logits, bbox

(tensor([[ 0.0935, -0.0707,  0.1901, -0.0494, -0.0031, -0.0744, -0.1042, -0.1014,
          -0.1330, -0.1371]], grad_fn=<AddmmBackward0>),
 tensor([[0.5608, 0.5134, 0.5479, 0.4619]], grad_fn=<SigmoidBackward0>))

### Dataset

In [None]:
class ImageDataset(Dataset):
    """
    Synthetic dataset of uniformly random images, class labels and boxes.

    Args:
        num_samples (int): Reported length of the dataset.
        input_shape: Shape of each generated image tensor.
        num_classes (int): Labels are drawn from ``[0, num_classes)``.
        seed (int, optional): When given, the sample at index ``i`` is
            generated from a private generator seeded with ``seed + i``, so the
            same index always yields the same sample. When ``None`` (default)
            every access draws fresh values from the global torch RNG.
    """

    def __init__(self, num_samples, input_shape, num_classes, seed=None):
        self.num_samples = num_samples
        self.input_shape = input_shape
        self.num_classes = num_classes
        self.seed = seed

    def __len__(self):
        """Return the number of samples the dataset reports."""
        return self.num_samples

    def __getitem__(self, idx):
        """
        Generate the sample at position `idx`.

        Args:
            idx (int): Sample index; negative values count from the end.

        Returns:
            Tuple `(image, label, bbox)`: a float tensor of shape
            `input_shape`, a Python int label, and a float tensor of 4 values.

        Raises:
            IndexError: If `idx` is outside ``[-len, len)``. Without this,
                plain iteration (``for sample in dataset``) would never stop,
                because it relies on IndexError to detect the end.
        """
        position = idx + self.num_samples if idx < 0 else idx
        if not 0 <= position < self.num_samples:
            raise IndexError(
                f"index {idx} is out of range for dataset of size {self.num_samples}"
            )

        generator = None
        if self.seed is not None:
            generator = torch.Generator().manual_seed(self.seed + int(position))

        image = torch.rand(self.input_shape, generator=generator)
        label = torch.randint(0, self.num_classes, (1,), generator=generator).item()
        bbox = torch.rand(4, generator=generator)  # [x_center, y_center, width, height]
        return image, label, bbox

### Train

In [None]:
def train(model, dataloader, optimizer, criterion_cls, criterion_bbox):
    """
    Train `model` for one pass over `dataloader`.

    Args:
        model: Module returning `(class_logits, bbox)` for a batch of images.
        dataloader: Iterable of `(images, labels, bboxes)` tensor batches.
        optimizer: Optimizer over the model's parameters.
        criterion_cls: Loss applied to `(class_logits, labels)`.
        criterion_bbox: Loss applied to `(predicted_bbox, bboxes)`.

    Returns:
        float: Mean of the summed loss over all samples seen, or 0.0 when the
        dataloader yields no batches. Each batch loss is weighted by its batch
        size, which assumes both criteria use mean reduction.
    """
    model.train()
    # Send batches to wherever the model's weights live rather than relying on
    # a notebook-level `device` variable. A model without parameters leaves the
    # batch tensors where they are.
    first_param = next(model.parameters(), None)

    loss_sum = 0.0
    samples_seen = 0
    for images, labels, bboxes in dataloader:
        if first_param is not None:
            images = images.to(first_param.device)
            labels = labels.to(first_param.device)
            bboxes = bboxes.to(first_param.device)

        optimizer.zero_grad()
        outputs, pred_bbox = model(images)

        loss_cls = criterion_cls(outputs, labels)
        loss_bbox = criterion_bbox(pred_bbox, bboxes)
        loss = loss_cls + loss_bbox
        loss.backward()
        optimizer.step()

        # Weight by batch size so a short final batch is not over-represented.
        batch_len = labels.size(0)
        loss_sum += loss.item() * batch_len
        samples_seen += batch_len

    return loss_sum / samples_seen if samples_seen else 0.0

In [None]:
def evaluate(model, dataloader, criterion_cls, criterion_bbox):
    """
    Evaluate `model` on `dataloader` without updating any weights.

    Args:
        model: Module returning `(class_logits, bbox)` for a batch of images.
        dataloader: Iterable of `(images, labels, bboxes)` tensor batches.
        criterion_cls: Loss applied to `(class_logits, labels)`.
        criterion_bbox: Loss applied to `(predicted_bbox, bboxes)`.

    Returns:
        Tuple `(loss, accuracy)`: the mean summed loss over all samples seen
        and the classification accuracy in percent. Both are 0.0 when the
        dataloader yields no samples. Each batch loss is weighted by its batch
        size, which assumes both criteria use mean reduction.
    """
    model.eval()
    # Send batches to wherever the model's weights live rather than relying on
    # a notebook-level `device` variable. A model without parameters leaves the
    # batch tensors where they are.
    first_param = next(model.parameters(), None)

    loss_sum = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels, bboxes in dataloader:
            if first_param is not None:
                images = images.to(first_param.device)
                labels = labels.to(first_param.device)
                bboxes = bboxes.to(first_param.device)
            outputs, pred_bbox = model(images)

            loss_cls = criterion_cls(outputs, labels)
            loss_bbox = criterion_bbox(pred_bbox, bboxes)
            loss = loss_cls + loss_bbox

            # Weight by batch size so a short final batch is not over-represented.
            batch_len = labels.size(0)
            loss_sum += loss.item() * batch_len

            predicted = outputs.argmax(dim=1)
            correct += (predicted == labels).sum().item()
            total += batch_len

    if total == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0, 0.0
    return loss_sum / total, 100. * correct / total

In [None]:
# Seed the RNGs this notebook uses so that a fresh Restart & Run All draws the
# same random data, initial weights and shuffle order each time. (Bit-exact
# results on GPU may additionally depend on backend determinism settings.)
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

model = CNN(input_shape=input_shape, num_classes=num_classes).to(device)

# Synthetic data: 100 training samples and 20 held back for validation.
train_data = ImageDataset(100, input_shape, num_classes)
test_data = ImageDataset(20, input_shape, num_classes)

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size)

optimizer = optim.Adam(model.parameters(), lr=lr)
criterion_cls = nn.CrossEntropyLoss()
criterion_bbox = nn.SmoothL1Loss()

# Train loop
for epoch in range(epochs):
    train_loss = train(model, train_loader, optimizer, criterion_cls, criterion_bbox)
    val_loss, acc = evaluate(model, test_loader, criterion_cls, criterion_bbox)
    print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Acc: {acc:.2f}%")

TypeError: CNN.__init__() got an unexpected keyword argument 'input_shape'

### Infer

In [None]:
def infer(model, image):
    """
    Run `model` on a single image.

    Args:
        model: Module returning `(class_logits, bbox)` for a batch of images.
        image: Tensor for one sample, without a batch dimension.

    Returns:
        Tuple `(pred_class, probs, bbox)`: the index of the most probable
        class as a Python int, the softmax class probabilities as a NumPy
        array, and the predicted box as a NumPy array.
    """
    model.eval()
    batch = image.unsqueeze(0)  # Add batch dim
    # Send the input to wherever the model's weights live rather than relying
    # on a notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        batch = batch.to(first_param.device)

    with torch.no_grad():
        logits, bbox = model(batch)
        probs = torch.softmax(logits, dim=1)
        # Reduce over the class dimension explicitly, then take the only row.
        pred_class = torch.argmax(probs, dim=1)[0]
        return pred_class.item(), probs[0].cpu().numpy(), bbox[0].cpu().numpy()

In [None]:
# Inference test: classify one randomly chosen sample from test_data.
sample_idx = random.randrange(len(test_data))
sample_image = test_data[sample_idx][0]  # only the image is needed here
pred_class, probs, bbox = infer(model, sample_image)

print("\n[Inference Result]")
print("Predicted class:", pred_class)
print("Class probabilities:", probs)
print("Bounding box:", bbox)