In [None]:
import os
import shutil
from collections import OrderedDict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms, datasets
from torchsummary import summary
from torch.utils.data import Dataset, DataLoader, random_split

In [None]:
# Mount Google Drive at /content/drive; the CIFAR-10 data used by dataloader_cifar() lives under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Show the GPU attached to this runtime (IPython shell escape).
!nvidia-smi

Sun Mar  2 20:46:16 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   40C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
class LambdaLayer(nn.Module):
    """
    Thin nn.Module wrapper around an arbitrary callable, so that a plain function
    can be placed wherever a layer is expected.

    Attributes:
        lambd: the callable that forward() applies to its input.
    """

    def __init__(self, lambd):
        """
        Store the callable that this layer will apply.

        Args:
            lambd (function): Callable that receives the input tensor and returns the result.
        """
        super().__init__()
        self.lambd = lambd

    def forward(self, x):
        """
        Run the wrapped callable on the input.

        Args:
            x (torch.Tensor): Input handed to the wrapped callable.

        Returns:
            torch.Tensor: Whatever the wrapped callable returns for x.
        """
        wrapped_fn = self.lambd
        return wrapped_fn(x)

class BasicConvBlock(nn.Module):

    ''' Basic residual block: conv3x3 -> BN -> ReLU -> conv3x3 -> BN, whose output is added
    to a shortcut of the block input and then passed through ReLU.

    When the block keeps both the channel count and the spatial size (stride 1), the shortcut
    is the identity. Otherwise the shortcut has to produce a tensor of the same shape as the
    convolutional path, using one of two options:
        - A) parameter-free: subsample spatially and zero-pad the channel dimension.
        - B) projection: 1x1 convolution (with the block's stride) followed by batch norm.
    '''

    def __init__(self, in_channels, out_channels, stride=1, option='A'):
        """
        Init method for the Basic Convolution Block.

        Args:
            in_channels (int): Number of channels in the input tensor.
            out_channels (int): Number of channels in the output tensor.
            stride (int, optional): Stride of the first convolution (and of the shortcut). Default is 1.
            option (str, optional): 'A' (zero-padding shortcut) or 'B' (1x1 projection shortcut).
                Only consulted when the shortcut cannot be the identity. Default is 'A'.

        Raises:
            ValueError: If a non-identity shortcut is needed and `option` is neither 'A' nor 'B',
                or if option 'A' is asked to reduce the number of channels.
        """
        super(BasicConvBlock, self).__init__()

        self.features = nn.Sequential(OrderedDict([
            ('conv1', nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)),
            ('bn1', nn.BatchNorm2d(out_channels)),
            ('act1', nn.ReLU()),
            ('conv2', nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)),
            ('bn2', nn.BatchNorm2d(out_channels))
        ]))

        # An empty Sequential returns its input unchanged, i.e. the identity shortcut.
        self.shortcut = nn.Sequential()

        if stride != 1 or in_channels != out_channels:
            if option == 'A':
                # Zero padding can only add channels, never remove them.
                extra_channels = out_channels - in_channels
                if extra_channels < 0:
                    raise ValueError(
                        "option 'A' cannot reduce channels: in_channels={} > out_channels={}".format(
                            in_channels, out_channels))
                # Split the missing channels between the front and the back of the channel axis.
                # For the usual "double the channels" case this is out_channels//4 on each side.
                pad_front = extra_channels // 2
                pad_back = extra_channels - pad_front
                ''' x[:, :, ::stride, ::stride] keeps every stride-th row/column, which yields the same
                spatial size as the 3x3 / padding-1 convolution with that stride.

                F.pad takes the padding lengths in reverse order of the dimensions, two values per dimension:
                [width_beginning, width_end, height_beginning, height_end,
                 channel_beginning, channel_end, batchLength_beginning, batchLength_end]
                so only the channel dimension is padded here.
                '''
                self.shortcut = LambdaLayer(lambda x:
                            F.pad(x[:, :, ::stride, ::stride], (0, 0, 0, 0, pad_front, pad_back, 0, 0)))
            elif option == 'B':
                # The projection must emit exactly out_channels so that it can be added to self.features(x).
                self.shortcut = nn.Sequential(OrderedDict([
                    ('s_conv1', nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, padding=0, bias=False)),
                    ('s_bn1', nn.BatchNorm2d(out_channels))
                ]))
            else:
                raise ValueError("option must be 'A' or 'B', got {!r}".format(option))

    def forward(self, x):
        """
        Forward pass of the Basic Convolution Block. It applies the sequence of layers and adds the shortcut connection.

        Args:
            x (torch.Tensor): Input tensor to the Basic Convolution Block.

        Returns:
            torch.Tensor: The output of the Basic Convolution Block.
        """
        out = self.features(x)
        # sum it up with shortcut layer
        out += self.shortcut(x)
        out = F.relu(out)
        return out



### Explanation of Options A and B in the code below

```py

if stride != 1 or in_channels != out_channels:
            if option == 'A':
                pad = out_channels//4
                # ::2 replace the stride 2 + F.pad apply padding to (W,H,C,N).
                self.shortcut = LambdaLayer(lambda x:
                            F.pad(x[:, :, ::2, ::2], (0,0, 0,0, pad,pad, 0,0)))
            if option == 'B':
                self.shortcut = nn.Sequential(OrderedDict([
                    ('s_conv1', nn.Conv2d(in_channels, 2*out_channels, kernel_size=1, stride=stride, padding=0, bias=False)),
                    ('s_bn1', nn.BatchNorm2d(2*out_channels))
                ]))

```

As per the original Paper

#### We use identity shortcuts when input and output channel dimensions are the same.

#### Otherwise, When input and output spatial dimensions don't match, we have 2 options, with stride:

    - A) Use identity shortcuts with zero padding to increase channel dimension.

    - B) Use 1x1 convolution to increase channel dimension (projection shortcut).

-----------------------

### Understanding `F.pad` on a 4-D Tensor and the following line

### `F.pad(x[:, :, ::2, ::2], (0,0, 0,0, pad,pad, 0,0)))`

https://stackoverflow.com/a/61945903/1902852

The padding lengths are specified in reverse order of the dimensions, where every dimension has two values, one for the padding at the beginning and one for the padding at the end.

For an image with the dimensions `[channels, height, width]` the padding is given as:

`[width_beginning, width_end, height_beginning, height_end, channels_beginning, channels_end]`,

which, when the channels are not padded, can be shortened to

`[left, right, top, bottom]`

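In the linked answer only the width and height are padded (to the right and bottom); the channels are left out because they are not being padded there, which also means that the same padding could be directly applied to the masks. Our shortcut does the opposite: the width and height padding are all zero and only the channel dimension is padded.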

So the below line means

`F.pad(x[:, :, ::2, ::2], (0,0, 0,0, pad,pad, 0,0))`


`[width_beginning, width_end, height_beginning, height_end, channel_beginning, channel_end, batchLength_beginning, batchLength_end ]`

In [None]:

class ResNet(nn.Module):
    """ Three-stage ResNet for 32*32*3 inputs such as the CIFAR-10 Dataset.

    The network is: conv3x3(3->32) + BN + ReLU, three stages of residual blocks with
    32, 64 and 128 channels (the 2nd and 3rd stage start with stride 2), global average
    pooling and a linear classifier. The depth is decided by `num_blocks`.

    Args:
        block_type (nn.Module): The type of residual block to use. It is called as
            block_type(in_channels, out_channels, stride).
        num_blocks (list): List containing the number of blocks for each of the 3 stages.
        num_classes (int, optional): Size of the classifier output. Default is 10.

    Attributes:
        in_channels (int): Number of input channels of the next block to be built.
        conv0 (nn.Conv2d): Initial convolutional layer.
        bn0 (nn.BatchNorm2d): Batch normalization layer.
        block1 (nn.Sequential): First block layer.
        block2 (nn.Sequential): Second block layer.
        block3 (nn.Sequential): Third block layer.
        avgpool (nn.AdaptiveAvgPool2d): Adaptive average pooling layer.
        linear (nn.Linear): Linear layer for classification. """
    def __init__(self, block_type, num_blocks, num_classes=10):
        super(ResNet, self).__init__()

        # Running channel count; __build_layer reads and updates it while stacking blocks.
        self.in_channels = 32

        self.conv0 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn0 = nn.BatchNorm2d(32)

        self.block1 = self.__build_layer(block_type, 32, num_blocks[0], starting_stride=1)

        self.block2 = self.__build_layer(block_type, 64, num_blocks[1], starting_stride=2)

        self.block3 = self.__build_layer(block_type, 128, num_blocks[2], starting_stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1,1))
        self.linear = nn.Linear(128, num_classes)

    def __build_layer(self, block_type, out_channels, num_blocks, starting_stride):
        """
        Build a layer consisting of multiple residual blocks.

        Args:
            block_type (nn.Module): The type of residual block to use.
            out_channels (int): Number of output channels.
            num_blocks (int): Number of blocks in the layer.
            starting_stride (int): Stride value for the first block.

        Returns:
            nn.Sequential: Sequential container of the residual blocks.
        """

        # Only the first block of the layer uses starting_stride; the remaining
        # (num_blocks-1) blocks use stride 1.
        strides_list_for_current_block = [starting_stride] + [1]*(num_blocks-1)

        layers = []

        for stride in strides_list_for_current_block:
            layers.append(block_type(self.in_channels, out_channels, stride))
            # Every block after the first one receives out_channels as its input.
            self.in_channels = out_channels

        return nn.Sequential(*layers)

    def forward(self, x):
        """
        Forward pass of the ResNet model.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Output tensor with one score per class for every sample.
        """
        out = F.relu(self.bn0(self.conv0(x)))
        out = self.block1(out)
        out = self.block2(out)
        out = self.block3(out)
        out = self.avgpool(out)
        # (N, 128, 1, 1) -> (N, 128) for the linear classifier.
        out = torch.flatten(out, 1)
        out = self.linear(out)
        return out

### `__build_layer()` method

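In ResNet, each layer (group of blocks) that reduces the feature-map size downsamples its input at the start, by using a stride of 2 in the first convolution of the layer's first block. In this notebook that applies to `block2` and `block3`; `block1` starts with a stride of 1.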

If we look at the first operation of each such layer, we see that its stride is 2, instead of 1 as for the rest of the operations in that layer.

This is because, in ResNet, the reduction between layers is achieved by increasing the stride from 1 to 2 at the first convolution of each layer, instead of by a pooling operation, which we are used to seeing as a downsampler.

Quoting from Paper

" For both options, when the shortcuts go across feature maps of two sizes, they are performed with a stride of 2."

In [None]:
def ResNet56():
    """Build the ResNet used in this notebook: BasicConvBlock blocks, 12 per stage.

    NOTE(review): with two convolutions per block, 12 blocks per stage gives
    3*12*2 + 2 = 74 weighted layers; the name suggests 56 (9 blocks per stage) -
    confirm which depth is intended before changing it, since saved weights depend on it.
    """
    blocks_per_stage = [12, 12, 12]
    return ResNet(block_type=BasicConvBlock, num_blocks=blocks_per_stage)

In [None]:
# Build the network and place it on the GPU when one is available, otherwise on the CPU.
model = ResNet56()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
# torchsummary builds its dummy input on "cuda" unless told otherwise, which fails on a
# CPU-only runtime; pass the device the model actually lives on.
summary(model, (3, 32, 32), device=device.type)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 32, 32, 32]             864
       BatchNorm2d-2           [-1, 32, 32, 32]              64
            Conv2d-3           [-1, 32, 32, 32]           9,216
       BatchNorm2d-4           [-1, 32, 32, 32]              64
              ReLU-5           [-1, 32, 32, 32]               0
            Conv2d-6           [-1, 32, 32, 32]           9,216
       BatchNorm2d-7           [-1, 32, 32, 32]              64
    BasicConvBlock-8           [-1, 32, 32, 32]               0
            Conv2d-9           [-1, 32, 32, 32]           9,216
      BatchNorm2d-10           [-1, 32, 32, 32]              64
             ReLU-11           [-1, 32, 32, 32]               0
           Conv2d-12           [-1, 32, 32, 32]           9,216
      BatchNorm2d-13           [-1, 32, 32, 32]              64
   BasicConvBlock-14           [-1, 32,

## Loading CIFAR-10 Dataset

In [None]:
def dataloader_cifar(batch_size=32, val_size=5000, test_batch_size=10000,
                     data_root='/content/drive/MyDrive/All_Datasets/CIFAR10', seed=None):
    """
    Create dataloaders for the CIFAR-10 dataset.

    The official training set is split into a training part and a validation part of
    `val_size` images; the official test set is returned untouched.

    Args:
        batch_size (int, optional): Batch size of the training and validation loaders. Default is 32.
        val_size (int, optional): Number of training images held out for validation. Default is 5000.
        test_batch_size (int, optional): Batch size of the test loader. Default is 10000.
        data_root (str, optional): Directory where CIFAR-10 is stored / downloaded to.
            Defaults to the Google Drive location; pass e.g. '../input_data' on a local machine.
        seed (int, optional): If given, the train/validation split is drawn from a generator
            seeded with this value so that it is reproducible. Default is None (unseeded split).

    Returns:
        train_loader (torch.utils.data.DataLoader): Dataloader for the training set.
        val_loader (torch.utils.data.DataLoader): Dataloader for the validation set.
        test_loader (torch.utils.data.DataLoader): Dataloader for the test set.
    """
    # Per-channel normalization, written out for all 3 channels so that it reads the same as
    # the transform applied to the unlabeled test images.
    transform = transforms.Compose([transforms.ToTensor(),
                                    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])])

    full_train_dataset = datasets.CIFAR10(data_root, train=True, download=True, transform=transform)
    test_dataset = datasets.CIFAR10(data_root, train=False, download=True, transform=transform)

    # Split dataset into training set and validation set.
    train_size = len(full_train_dataset) - val_size
    if seed is None:
        train_dataset, val_dataset = random_split(full_train_dataset, (train_size, val_size))
    else:
        split_rng = torch.Generator().manual_seed(seed)
        train_dataset, val_dataset = random_split(full_train_dataset, (train_size, val_size),
                                                  generator=split_rng)

    print("Image shape of a random sample image : {}".format(train_dataset[0][0].numpy().shape), end = '\n\n')

    print("Training Set:   {} images".format(len(train_dataset)))
    print("Validation Set:   {} images".format(len(val_dataset)))
    print("Test Set:       {} images".format(len(test_dataset)))

    # Generate dataloader. Only the training data is shuffled: evaluation metrics do not
    # depend on the sample order, so shuffling validation/test data only adds work.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=test_batch_size, shuffle=False)

    return train_loader, val_loader, test_loader

In [None]:
# Build the three CIFAR-10 loaders; train_model() reads train_loader and val_loader as globals.
train_loader, val_loader, test_loader = dataloader_cifar()

Files already downloaded and verified
Files already downloaded and verified
Image shape of a random sample image : (3, 32, 32)

Training Set:   45000 images
Validation Set:   5000 images
Test Set:       10000 images


## Start Actual Training

In [None]:
# Loss, optimizer and learning-rate schedule used by train_model().
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    model.parameters(),
    lr=0.001,
    momentum=0.9,
    weight_decay=5e-4,
)
# The schedule multiplies the learning rate by 0.1 once 30 and again once 60 scheduler steps have been taken.
scheduler = optim.lr_scheduler.MultiStepLR(
    optimizer,
    milestones=[30, 60],
    gamma=0.1,
)


In [None]:
def train_model(epochs=25):
    """
    Train the global `model` and evaluate it on the validation set after every epoch.

    Relies on these notebook-level names: model, device, train_loader, val_loader,
    criterion, optimizer and scheduler.

    Side effects:
        Prints one summary line per epoch, saves a checkpoint of the weights after every
        epoch to '/content/checkpoint_gpu_<epoch>' and the final weights to
        '/content/resnet-56_weights_gpu'.

    Args:
        epochs (int, optional): Number of passes over the training set. Default is 25.

    Returns:
        train_costs (list[float]): Mean training loss per sample, one entry per epoch.
        val_costs (list[float]): Mean validation loss per sample, one entry per epoch.
    """
    # Take the sample counts from the loaders so the averages stay right if the split changes.
    train_samples_num = len(train_loader.dataset)
    val_samples_num = len(val_loader.dataset)
    train_costs, val_costs = [], []

    # Works on both GPU and CPU runtimes (a hard-coded .cuda() fails without a GPU).
    model.to(device)

    for epoch in range(epochs):

        # ---- Training phase ----
        train_running_loss = 0.0
        correct_train = 0

        model.train()

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            # Gradients accumulate by default, so reset them before each mini-batch.
            optimizer.zero_grad()

            # Start the forward pass
            prediction = model(inputs)

            loss = criterion(prediction, labels)

            # do backpropagation and update weights with step()
            loss.backward()
            optimizer.step()

            # Predicted class = index of the largest score along dim=1.
            predicted_outputs = prediction.detach().argmax(dim=1)

            # Update the running corrects
            correct_train += (predicted_outputs == labels).sum().item()

            # criterion returns the batch mean; multiply by the batch length to get the batch
            # sum, so that dividing by the sample count below gives the per-sample mean.
            train_running_loss += loss.item() * inputs.shape[0]

        train_epoch_loss = train_running_loss / train_samples_num
        train_costs.append(train_epoch_loss)
        train_acc = correct_train / train_samples_num

        # ---- Validation phase: check trained weights on the validation set ----
        val_running_loss = 0.0
        correct_val = 0

        model.eval()

        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)

                # Forward pass.
                prediction = model(inputs)

                # Compute the loss.
                loss = criterion(prediction, labels)

                # Compute validation accuracy.
                predicted_outputs = prediction.argmax(dim=1)
                correct_val += (predicted_outputs == labels).sum().item()

                # Accumulate the loss of EVERY batch (must stay inside the loop, otherwise
                # only the last batch is counted and the reported loss is far too small).
                val_running_loss += loss.item() * inputs.shape[0]

        val_epoch_loss = val_running_loss / val_samples_num
        val_costs.append(val_epoch_loss)
        val_acc = correct_val / val_samples_num

        # Advance the learning-rate schedule once per epoch; without this call the
        # MultiStepLR milestones are never reached.
        scheduler.step()

        info = "[Epoch {}/{}]: train-loss = {:0.6f} | train-acc = {:0.3f} | val-loss = {:0.6f} | val-acc = {:0.3f}"

        print(info.format(epoch+1, epochs, train_epoch_loss, train_acc, val_epoch_loss, val_acc))

        torch.save(model.state_dict(), '/content/checkpoint_gpu_{}'.format(epoch + 1))

    torch.save(model.state_dict(), '/content/resnet-56_weights_gpu')

    return train_costs, val_costs



In [None]:
# Run the training loop; it returns the per-epoch training and validation losses.
train_costs, val_costs = train_model()

[Epoch 1/25]: train-loss = 0.470197 | train-acc = 0.837 | val-loss = 0.000912 | val-acc = 0.749
[Epoch 2/25]: train-loss = 0.386259 | train-acc = 0.867 | val-loss = 0.002649 | val-acc = 0.780
[Epoch 3/25]: train-loss = 0.317335 | train-acc = 0.888 | val-loss = 0.000544 | val-acc = 0.745
[Epoch 4/25]: train-loss = 0.244087 | train-acc = 0.914 | val-loss = 0.000928 | val-acc = 0.754
[Epoch 5/25]: train-loss = 0.199033 | train-acc = 0.929 | val-loss = 0.001656 | val-acc = 0.787
[Epoch 6/25]: train-loss = 0.161549 | train-acc = 0.943 | val-loss = 0.002023 | val-acc = 0.759
[Epoch 7/25]: train-loss = 0.135564 | train-acc = 0.953 | val-loss = 0.001688 | val-acc = 0.787
[Epoch 8/25]: train-loss = 0.110733 | train-acc = 0.961 | val-loss = 0.002326 | val-acc = 0.780
[Epoch 9/25]: train-loss = 0.098684 | train-acc = 0.965 | val-loss = 0.005426 | val-acc = 0.797
[Epoch 10/25]: train-loss = 0.072463 | train-acc = 0.975 | val-loss = 0.001094 | val-acc = 0.790
[Epoch 11/25]: train-loss = 0.059941 | 

In [None]:
# Restore the model: rebuild the architecture on the active device, then load the saved weights.
model = ResNet56().to(device)
# map_location lets weights saved during a GPU run be loaded on whatever `device` is available;
# weights_only=True limits unpickling to tensors and plain containers, which is all a state_dict needs.
model.load_state_dict(torch.load('/content/resnet-56_weights_gpu', map_location=device, weights_only=True))

  model.load_state_dict(torch.load('/content/resnet-56_weights_gpu'))


<All keys matched successfully>

## Test the trained model on Test dataset

In [None]:
import pickle

# Peek at the unlabeled test file to learn its layout before writing a Dataset for it.
# NOTE: unpickling can execute arbitrary code - only load .pkl files from a trusted source.
with open('/content/cifar_test_nolabel.pkl', 'rb') as pkl_file:
    data = pickle.load(pkl_file)

# The file is closed at this point; everything needed is already held in `data`.
print("Keys in .pkl file:", data.keys())  # For dictionaries
print("Type of data:", type(data))        # For non-dictionary structures


Keys in .pkl file: dict_keys([b'data', b'ids'])
Type of data: <class 'dict'>


In [None]:
import pickle
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

class CustomTestDataset(Dataset):
    """Dataset over an unlabeled CIFAR-style pickle file, yielding (image, id) pairs.

    The pickle must hold a dict with the byte-string keys b'data' (the images) and
    b'ids' (one identifier per image). Each image may be stored as a flat (3072,)
    array in channel-major order, as (3, 32, 32), or as (32, 32, 3); in every case
    the transform receives a height x width x channel array.
    """

    def __init__(self, pkl_path, transform=None):
        """
        Args:
            pkl_path (str): Path of the pickle file to read.
            transform (callable, optional): Applied to every (32, 32, 3) image before it is returned.
        """
        # NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
        with open(pkl_path, 'rb') as f:
            data = pickle.load(f)

        # Access keys as byte strings (b'...')
        self.ids = data[b'ids']             # IDs from the .pkl file
        self.images = data[b'data']         # Image data
        self.transform = transform

    def __len__(self):
        """Number of samples, taken from the number of ids."""
        return len(self.ids)

    def __getitem__(self, idx):
        """Return (image, id) for sample `idx`, with the image in (32, 32, 3) layout (transformed if a transform was given).

        Raises:
            ValueError: If the stored image has none of the supported layouts.
        """
        image = self.images[idx]
        if image.ndim == 1:
            # Flat (3072,) channel-major array: restore (C, H, W), then move channels last.
            image = image.reshape(3, 32, 32).transpose(1, 2, 0)
        elif image.shape == (3, 32, 32):
            image = image.transpose(1, 2, 0)
        elif image.shape != (32, 32, 3):
            raise ValueError("unsupported image shape {} at index {}".format(image.shape, idx))
        # An image that is already (32, 32, 3) is used as is; reshaping it to (3, 32, 32)
        # would silently scramble its pixels.
        if self.transform:
            image = self.transform(image)
        return image, self.ids[idx]

# Preprocessing for the unlabeled images: ToTensor, then normalization of each of the
# 3 channels with mean 0.5 and std 0.5 (the same statistics the training transform uses).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
])

# Wrap the pickle file in a Dataset and iterate over it in file order.
# Note that this rebinds `test_loader`, which earlier held the labelled CIFAR-10 test loader.
custom_dataset = CustomTestDataset(pkl_path='/content/cifar_test_nolabel.pkl', transform=transform)
test_loader = DataLoader(dataset=custom_dataset, batch_size=32, shuffle=False)

In [None]:
import pandas as pd

# Predict a label for every unlabeled image and write the result to submission.csv.
# Use the notebook's `device` rather than a hard-coded .cuda(), so this also runs without a GPU.
model.to(device).eval()
all_ids = []
all_preds = []

with torch.no_grad():
    for inputs, ids in test_loader:
        inputs = inputs.to(device)
        outputs = model(inputs)
        # Predicted class = index of the largest score along dim=1.
        preds = outputs.argmax(dim=1)
        # .tolist() yields plain Python ints; assumes the loader collates the ids into a tensor.
        all_ids.extend(ids.tolist())
        all_preds.extend(preds.tolist())

# Save to CSV
df = pd.DataFrame({'ID': all_ids, 'Labels': all_preds})
df.to_csv('submission.csv', index=False)