**Q6**

The Oxford-IIIT Pet Dataset contains 3,686 images of 37 pet breeds (cats and dogs), each accompanied by a pixel-wise segmentation mask distinguishing pet foreground from background. Unlike simple image classification, semantic segmentation requires per-pixel predictions. Your task is to:

1. Download and preprocess the Oxford-IIIT Pet Dataset (images + annotations).
2. Implement a U-Net (or similarly complex encoder–decoder) architecture from scratch (no prebuilt segmentation libraries).
3. Train the model to segment “pet vs. background” (binary mask) using appropriate loss (e.g., Dice loss, BCE+Dice).
4. Evaluate performance on a held-out test set using Intersection over Union (IoU) and Dice coefficient.
5. Visualize qualitative results (input image + ground-truth mask + predicted mask) for at least five examples.

In [1]:
import torch
import torch.nn as nn
import torchvision
import matplotlib.pyplot as plt
import numpy as np
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [2]:
image_transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])

mask_transform = transforms.Compose([
    transforms.Resize((128, 128), interpolation=transforms.InterpolationMode.NEAREST),
    transforms.ToTensor(),
    lambda x: (x == (1.0 / 255.0)).float()
])

In [3]:
train_dataset=torchvision.datasets.OxfordIIITPet(root='./Oxford_data', split='trainval',transform=image_transform,
                                                 target_transform=mask_transform, target_types='segmentation', download=True)
test_dataset=torchvision.datasets.OxfordIIITPet(root='./Oxford_data', split='trainval', transform=image_transform,
                                                target_transform=mask_transform,target_types='segmentation', download=True)

100%|██████████| 792M/792M [03:46<00:00, 3.49MB/s]  
100%|██████████| 19.2M/19.2M [00:07<00:00, 2.72MB/s]


In [4]:
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, num_workers=2)

In [5]:
learning_rate=0.001
batch_size=4
num_epochs=1


In [6]:
class UNet(nn.Module):
  def __init__(self):
    super(UNet, self).__init__()

    self.conv_e_1=nn.Conv2d(3, 64, kernel_size=3, padding=1)
    self.conv_e_2=nn.Conv2d(64, 64, kernel_size=3, padding=1)
    self.MaxPool2D=nn.MaxPool2d(kernel_size=2, stride=2)

    self.conv_e_3=nn.Conv2d(64,128, kernel_size=3, padding=1)
    self.conv_e_4=nn.Conv2d(128,128, kernel_size=3, padding=1)

    self.conv_e_5=nn.Conv2d(128,256, kernel_size=3, padding=1)
    self.conv_e_6=nn.Conv2d(256,256, kernel_size=3, padding=1)

    self.conv_e_7=nn.Conv2d(256,512, kernel_size=3, padding=1)
    self.conv_e_8=nn.Conv2d(512,512, kernel_size=3, padding=1)

    self.conv_up1=nn.ConvTranspose2d(512, 256, kernel_size=2, stride=2)
    self.conv_d_1=nn.Conv2d(512, 256, kernel_size=3, padding=1)
    self.conv_d_2=nn.Conv2d(256, 256, kernel_size=3, padding=1)

    self.conv_up2=nn.ConvTranspose2d(256, 128, kernel_size=2, stride=2)
    self.conv_d_3=nn.Conv2d(256, 128, kernel_size=3, padding=1)
    self.conv_d_4=nn.Conv2d(128, 128, kernel_size=3, padding=1)

    self.conv_up3=nn.ConvTranspose2d(128, 64, kernel_size=2, stride=2)
    self.conv_d_5=nn.Conv2d(128, 64, kernel_size=3, padding=1)
    self.conv_d_6=nn.Conv2d(64, 64, kernel_size=3, padding=1)
    self.final=nn.Conv2d(64,1,kernel_size=1)

  def forward(self,x):
      #Encoder

      x=F.relu(self.conv_e_1(x))
      x=F.relu(self.conv_e_2(x))
      self.skip_1=x
      x=self.MaxPool2D(x)

      x=F.relu(self.conv_e_3(x))
      x=F.relu(self.conv_e_4(x))
      self.skip_2=x
      x=self.MaxPool2D(x)

      x=F.relu(self.conv_e_5(x))
      x=F.relu(self.conv_e_6(x))
      self.skip_3=x
      x=self.MaxPool2D(x)

      x=F.relu(self.conv_e_7(x))
      x=F.relu(self.conv_e_8(x))

      #Bottleneck
      #What is exactly happening here, couldnt understand, why not using max pooling here?


      #Decoder
      x=self.conv_up1(x)
      x=torch.cat([self.skip_3,x], dim=1)

      x=F.relu(self.conv_d_1(x))
      x=F.relu(self.conv_d_2(x))

      x=self.conv_up2(x)
      x=torch.cat([self.skip_2,x], dim=1)

      x=F.relu(self.conv_d_3(x))
      x=F.relu(self.conv_d_4(x))

      x=self.conv_up3(x)
      x=torch.cat([self.skip_1,x], dim=1)

      x=F.relu(self.conv_d_5(x))
      x=F.relu(self.conv_d_6(x))
      x=torch.sigmoid(self.final(x))

      return x


In [7]:
class DiceBCELoss(nn.Module):
    def __init__(self, smooth=1):
        super(DiceBCELoss, self).__init__()
        self.bce = nn.BCELoss()
        self.smooth = smooth

    def forward(self, inputs, targets):

        inputs = inputs.view(-1)
        targets = targets.view(-1)

        intersection = (inputs * targets).sum()
        dice = (2. * intersection + self.smooth) / (inputs.sum() + targets.sum() + self.smooth)
        dice_loss = 1 - dice


        bce_loss = self.bce(inputs, targets)

        return bce_loss + dice_loss

In [8]:
model = UNet()
criterion = DiceBCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    for inputs, targets in train_loader:
        outputs = model(inputs)
        loss = criterion(outputs, targets.float())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")


PicklingError: Can't pickle <function <lambda> at 0x000001DAFA483420>: attribute lookup <lambda> on __main__ failed

In [None]:
inputs, targets = next(iter(test_loader))
with torch.no_grad():
    outputs = model(inputs)