In [None]:
!pip install torch torchvision

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import torch
import torch.nn as nn

class YOLOv1(nn.Module):
    def __init__(self, S=7, B=2, C=20):
        super(YOLOv1, self).__init__()
        self.S = S
        self.B = B
        self.C = C

        self.conv_layers = nn.Sequential(
            nn.Conv2d(3, 64, 7, 2, 3), nn.LeakyReLU(0.1),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 192, 3, 1, 1), nn.LeakyReLU(0.1),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(192, 128, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(128, 256, 3, 1, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(256, 256, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(256, 512, 3, 1, 1), nn.LeakyReLU(0.1),
            nn.MaxPool2d(2, 2),
            *[nn.Sequential(
                nn.Conv2d(512, 256, 1), nn.LeakyReLU(0.1),
                nn.Conv2d(256, 512, 3, 1, 1), nn.LeakyReLU(0.1),
            ) for _ in range(4)],
            nn.Conv2d(512, 512, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(512, 1024, 3, 1, 1), nn.LeakyReLU(0.1),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(1024, 512, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(512, 1024, 3, 1, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(1024, 512, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(512, 1024, 3, 1, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(1024, 1024, 3, 2, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(1024, 1024, 3, 1, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(1024, 1024, 3, 1, 1), nn.LeakyReLU(0.1),
        )

        self.fc_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(1024 * 7 * 7, 4096),
            nn.Dropout(0.5),
            nn.LeakyReLU(0.1),
            nn.Linear(4096, S * S * (C + B * 5))
        )

    def forward(self, x):
        x = self.conv_layers(x)
        x = self.fc_layers(x)
        x = x.view(-1, self.S, self.S, self.C + self.B * 5)
        return x

class YoloLoss(nn.Module):
    """Sum-of-squared-errors detection loss over a grid of per-cell vectors.

    Layout read from the last dimension of both tensors:

    * channels 0-1: box centre terms (plain squared error),
    * channels 2-3: box size terms (squared error of signed square roots),
    * channel 4:    objectness / confidence,
    * channels 5+:  everything else, compared element-wise as the class term.

    ``S``, ``B`` and ``C`` are stored but not consulted by ``forward``; only the
    first five channels are treated as a box, so with ``B > 1`` any extra box
    slots fall into the ``5:`` slice of the class term.

    Args:
        S, B, C: grid size, boxes per cell, number of classes (stored only).
        lambda_coord: weight of the centre/size term.
        lambda_noobj: weight of the confidence term for cells without object.
    """

    def __init__(self, S=7, B=2, C=20, lambda_coord=5, lambda_noobj=0.5):
        super().__init__()
        self.mse = nn.MSELoss(reduction="sum")
        self.S = S
        self.B = B
        self.C = C
        self.lambda_coord = lambda_coord
        self.lambda_noobj = lambda_noobj

    @staticmethod
    def _signed_sqrt(values, eps=1e-6):
        """Return ``sign(v) * sqrt(|v| + eps)``.

        Keeping the sign means a negative predicted size is penalised against
        a positive target instead of being indistinguishable from it; ``eps``
        keeps the square-root gradient finite near zero.
        """
        return torch.sign(values) * torch.sqrt(torch.abs(values) + eps)

    def forward(self, predictions, target):
        """Compute the batch-averaged loss.

        Args:
            predictions: tensor whose last dimension follows the layout above.
            target: tensor of the same shape; channel 4 selects the cells
                (``> 0`` -> object, ``== 0`` -> no object; negative values
                match neither mask and are ignored).

        Returns:
            Scalar tensor: weighted sum of the four terms divided by the size
            of the first dimension of ``predictions``.
        """
        N = predictions.size(0)
        obj_mask = target[..., 4] > 0
        noobj_mask = target[..., 4] == 0

        # Boolean indexing flattens the selected cells to (num_cells, channels).
        obj_pred = predictions[obj_mask]
        obj_target = target[obj_mask]

        center_loss = self.mse(obj_pred[..., 0:2], obj_target[..., 0:2])
        # Square roots damp the influence of large boxes on the size error.
        size_loss = self.mse(
            self._signed_sqrt(obj_pred[..., 2:4]),
            self._signed_sqrt(obj_target[..., 2:4]),
        )
        coord_loss = center_loss + size_loss

        obj_loss = self.mse(obj_pred[..., 4], obj_target[..., 4])

        noobj_loss = self.mse(predictions[noobj_mask][..., 4], target[noobj_mask][..., 4])

        class_loss = self.mse(obj_pred[..., 5:], obj_target[..., 5:])

        total_loss = (
            self.lambda_coord * coord_loss
            + obj_loss
            + self.lambda_noobj * noobj_loss
            + class_loss
        )

        return total_loss / N


In [None]:
# Seed the global RNG before anything random happens so that the weight
# initialisation, the dummy tensors, dropout and therefore the printed loss
# are repeatable across Restart & Run All.
torch.manual_seed(0)

model = YOLOv1()
loss_fn = YoloLoss()

# Smoke test: push a batch of two random 3x448x448 "images" through the network.
x = torch.randn((2, 3, 448, 448))
output = model(x)  # model prediction

# Random stand-in for labels; it only exercises the loss code path. Cells whose
# channel-4 value is negative match neither mask in YoloLoss and are ignored.
target = torch.randn_like(output)
loss = loss_fn(output, target)  # compute loss

print(f"Output shape: {output.shape}")
print(f"Loss: {loss.item()}")


Output shape: torch.Size([2, 7, 7, 30])
Loss: 906.2088623046875
