# Convolutional Blocks of ResNet18

## 0. Imports

In [None]:
from typing import Literal

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import wandb
from torch.utils.data import Dataset, DataLoader
from torchvision.models import resnet18

In [None]:
# Weights & Biases project under which every run of this notebook is logged
PROJECT_NAME = 'cv-s25-a2-resnet18'

# Mini-batch size shared by all data loaders
BATCH_SIZE = 64

# Run on the GPU when one is visible, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
class ClassificationMeasures:
    """ Computes common evaluation measures for classification based tasks.

    Per-class counts are stored as one 2x2 matrix per class, laid out as
    ``[[TP, FP], [FN, TN]]``. Precision is ``TP / (TP + FP)`` and recall is
    ``TP / (TP + FN)``.
    """

    def __init__(self, y_true, y_pred):
        """ Initializes the class to compute on given data.

        Args:
            y_true: Array-like containing true values.
            y_pred: Array-like containing predicted values, same shape as y_true.

        Raises:
            ValueError: If y_true and y_pred do not have the same shape.
        """

        # Coerce to arrays so that `==` is element-wise even for plain lists
        self.y_true = np.asarray(y_true)
        self.y_pred = np.asarray(y_pred)

        if self.y_true.shape != self.y_pred.shape:
            raise ValueError(
                f'y_true and y_pred must have the same shape, '
                f'got {self.y_true.shape} and {self.y_pred.shape}'
            )

        # Sorted set of labels that occur in either array
        self.classes = np.unique(np.concatenate((self.y_true, self.y_pred)))
        self.num_classes = self.classes.shape[0]

        # Per-class 2x2 matrices are computed lazily on first use
        self.confusion_matrices = None

    @staticmethod
    def _check_average(average):
        """ Raises ValueError unless average is 'micro' or 'macro'. """

        # An explicit raise (rather than assert) survives `python -O`
        if average not in ('micro', 'macro'):
            raise ValueError(f'Unrecognized argument for average {average}')

    def accuracy_score(self):
        """ Computes the accuracy (fraction of exactly matching predictions). """

        return np.mean(self.y_true == self.y_pred)

    def _compute_confusion_matrices(self):
        """ Computes the one-vs-rest confusion matrix of each class.

        Returns:
            Array of shape (num_classes, 2, 2), each entry being
            ``[[TP, FP], [FN, TN]]`` for the class at the same index in
            ``self.classes``.
        """

        confusion_matrices = np.empty((self.num_classes, 2, 2))

        for idx, clx in enumerate(self.classes):
            is_true = self.y_true == clx
            is_pred = self.y_pred == clx
            confusion_matrices[idx] = [
                [np.sum(is_true & is_pred), np.sum(~is_true & is_pred)],
                [np.sum(is_true & ~is_pred), np.sum(~is_true & ~is_pred)],
            ]

        return confusion_matrices

    def confusion_matrix(self):
        """ Computes the overall confusion matrix.

        Returns:
            Integer array of shape (num_classes, num_classes) whose entry
            [i, j] counts samples with true label ``self.classes[i]`` and
            predicted label ``self.classes[j]``.
        """

        # Map label values to their position in the sorted class list, so labels
        # need not be the contiguous integers 0..num_classes-1
        true_idx = np.searchsorted(self.classes, self.y_true)
        pred_idx = np.searchsorted(self.classes, self.y_pred)

        confusion_matrix = np.zeros((self.num_classes, self.num_classes), dtype=int)
        # Unbuffered add, so repeated (true, pred) pairs are all counted
        np.add.at(confusion_matrix, (true_idx, pred_idx), 1)

        return confusion_matrix

    def _averaged_score(self, average, miss_row, miss_col):
        """ Computes TP / (TP + miss), micro or macro averaged.

        Args:
            average: 'micro' pools the counts of all classes before dividing;
                'macro' averages the per-class ratios.
            miss_row, miss_col: Position, inside each 2x2 matrix, of the error
                count joining TP in the denominator: (0, 1) i.e. FP for
                precision, (1, 0) i.e. FN for recall.
        """

        self._check_average(average)

        if self.confusion_matrices is None:
            self.confusion_matrices = self._compute_confusion_matrices()

        true_pos = self.confusion_matrices[:, 0, 0]
        misses = self.confusion_matrices[:, miss_row, miss_col]

        if average == 'micro':
            return true_pos.sum() / (true_pos.sum() + misses.sum())

        # Macro: a class with an empty denominator contributes a score of 1
        denom = true_pos + misses
        per_class = np.ones(self.num_classes)
        np.divide(true_pos, denom, out=per_class, where=denom != 0)
        return per_class.mean()

    def f1_score(self, average: Literal['micro', 'macro']):
        """ Computes the f1 score.

        The score is the harmonic mean of the averaged precision and the
        averaged recall (for 'macro' this is not the mean of per-class F1
        scores). Returns 0.0 when precision and recall are both zero.

        Raises:
            ValueError: If average is neither 'micro' nor 'macro'.
        """

        self._check_average(average)

        # Compute recall and precision with same method
        recall = self.recall_score(average)
        precision = self.precision_score(average)

        total = recall + precision
        if total == 0:
            # Avoid 0 / 0 when nothing was predicted correctly
            return 0.0
        return 2 * recall * precision / total

    def recall_score(self, average: Literal['micro', 'macro']):
        """ Computes the recall, TP / (TP + FN).

        Raises:
            ValueError: If average is neither 'micro' nor 'macro'.
        """

        # False negatives live at [1, 0] of each per-class matrix
        return self._averaged_score(average, 1, 0)

    def precision_score(self, average: Literal['micro', 'macro']):
        """ Computes the precision, TP / (TP + FP).

        Raises:
            ValueError: If average is neither 'micro' nor 'macro'.
        """

        # False positives live at [0, 1] of each per-class matrix
        return self._averaged_score(average, 0, 1)

    def print_all_measures(self):
        """ Evaluates and prints all the measures. """

        print('Accuracy:', self.accuracy_score())
        print('Precision (Micro):', self.precision_score(average='micro'))
        print('Recall (Micro):', self.recall_score(average='micro'))
        print('F1 Score (Micro):', self.f1_score(average='micro'))
        print('Precision (Macro):', self.precision_score(average='macro'))
        print('Recall (Macro):', self.recall_score(average='macro'))
        print('F1 Score (Macro):', self.f1_score(average='macro'))

def _collect_predictions(model, loader):
    """ Runs the model over a loader and returns (y_true, y_pred) as NumPy arrays. """

    y_true, y_pred = [], []
    model.eval()
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            _, predicted = model(images).max(1)
            y_true.append(labels)
            y_pred.append(predicted)

    return torch.hstack(y_true).cpu().numpy(), torch.hstack(y_pred).cpu().numpy()


def train_model(model, train_loader, test_loader, model_name, epochs=10, lr=0.001):
    """ Trains a classifier with Adam and cross-entropy, logging to Weights & Biases.

    After every epoch the summed training loss, training accuracy and test
    accuracy are printed and logged. After the last epoch the macro F1 score
    and a confusion-matrix plot over the test loader are logged, the run is
    closed and the model is moved back to the CPU.

    Args:
        model: Module to train; it is moved to the module-level ``device``.
        train_loader: Iterable of (images, labels) batches used for optimisation.
        test_loader: Iterable of (images, labels) batches used for evaluation.
        model_name: Name given to the W&B run.
        epochs: Number of passes over train_loader.
        lr: Learning rate passed to Adam.

    Raises:
        Any exception raised during training is re-raised after the W&B run
        has been closed with a failing exit code.
    """

    wandb.init(project=PROJECT_NAME, name=model_name, reinit=True)

    try:
        model = model.to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=lr)

        for epoch in range(epochs):
            model.train()
            # Sum (not mean) of the per-batch losses seen during this epoch
            running_loss = 0.0
            correct, total = 0, 0

            for images, labels in train_loader:
                images, labels = images.to(device), labels.to(device)
                optimizer.zero_grad()
                outputs = model(images)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()
                _, predicted = outputs.max(1)
                total += labels.size(0)
                correct += predicted.eq(labels).sum().item()

            train_acc = 100 * correct / total
            # evaluate_model switches to eval mode; model.train() above restores it
            test_acc = evaluate_model(model, test_loader)

            print(f"Epoch {epoch+1}: Loss: {running_loss:.4f}, Train Acc: {train_acc:.2f}%, Test Acc: {test_acc:.2f}%")
            wandb.log({"Loss": running_loss, "Train Accuracy": train_acc, "Test Accuracy": test_acc})

        # Final pass over the test set for the summary metrics
        y_true, y_pred = _collect_predictions(model, test_loader)

        f1_score = ClassificationMeasures(y_true, y_pred).f1_score(average='macro')
        confusion_matrix = wandb.plot.confusion_matrix(y_true=y_true, preds=y_pred)
        wandb.log({"F1 Score": f1_score, "Confusion Matrix": confusion_matrix})
    except BaseException:
        # Close the run as failed instead of leaving it dangling (this also
        # covers a manual interrupt of the notebook cell), then propagate
        wandb.finish(exit_code=1)
        raise

    wandb.finish()
    # nn.Module.to() moves the parameters in place, so no rebinding is needed
    model.to('cpu')

def evaluate_model(model, test_loader):
    """ Returns the accuracy of the model over the loader, as a percentage.

    The model is switched to evaluation mode and left in that mode; gradients
    are disabled for the duration of the pass.
    """

    model.eval()
    hits = 0
    seen = 0

    with torch.no_grad():
        for batch, targets in test_loader:
            batch = batch.to(device)
            targets = targets.to(device)

            # Index of the largest score along dim 1 is the predicted class
            guesses = model(batch).max(1)[1]

            seen += targets.size(0)
            hits += guesses.eq(targets).sum().item()

    return 100 * hits / seen

In [None]:
class CustomDataset(Dataset):
    """ Dataset over index-aligned, pre-loaded samples and labels. """

    def __init__(self, data, labels, transform=None):
        """ Stores the samples, their labels and an optional per-sample transform.

        Args:
            data: Indexable collection whose items support ``.float()``.
            labels: Indexable collection aligned with data; defines the length.
            transform: Optional callable applied to each sample after the
                float conversion.
        """
        self.data, self.labels, self.transform = data, labels, transform

    def __len__(self):
        """ Number of samples, taken from the label collection. """
        return len(self.labels)

    def __getitem__(self, idx):
        """ Returns the (sample, label) pair at idx, with the sample cast to float. """
        sample, target = self.data[idx].float(), self.labels[idx]

        if not self.transform:
            return sample, target

        return self.transform(sample), target

# Training split: samples and their labels, read from disk
train_data, train_labels = (
    torch.load("../data/q1/train_data.pt"),
    torch.load("../data/q1/train_labels.pt"),
)

# Held-out split, stored in the same layout
test_data, test_labels = (
    torch.load("../data/q1/test_data.pt"),
    torch.load("../data/q1/test_labels.pt"),
)

## 1. Baseline - Training ResNet 

In [None]:
# Training pipeline: light geometric augmentation, then per-channel normalisation.
# NOTE(review): the mean/std presumably assume inputs scaled to [0, 1] — confirm
# against the value range of the stored tensors.
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(degrees=(-7, 7)),
    transforms.RandomAffine(degrees=0, shear=10, scale=(0.8, 1.2)),
    transforms.Normalize(mean=(0.491, 0.482, 0.446), std=(0.247, 0.243, 0.261)),
])

# Evaluation pipeline: the same normalisation, without augmentation
test_transform = transforms.Normalize(
    mean=(0.491, 0.482, 0.446),
    std=(0.247, 0.243, 0.261),
)

train_dataset = CustomDataset(data=train_data, labels=train_labels, transform=train_transform)
test_dataset = CustomDataset(data=test_data, labels=test_labels, transform=test_transform)

# Only the training batches are shuffled; evaluation order stays fixed
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Baseline: randomly initialised ResNet18 with a 10-class head, native-size inputs
model = resnet18(num_classes=10)
train_model(
    model=model,
    train_loader=train_loader,
    test_loader=test_loader,
    model_name='scratch',
)

In [None]:
# Baseline: pretrained ResNet18 whose classifier is swapped for a fresh 10-class layer
pretrained_model = resnet18(weights='DEFAULT')
pretrained_model.fc = nn.Linear(in_features=512, out_features=10)
train_model(
    model=pretrained_model,
    train_loader=train_loader,
    test_loader=test_loader,
    model_name='pretrained',
)

<img src="train_loss_1.png" alt="Train Loss" width="450"/>
<img src="train_acc_1.png" alt="Train Acc" width="450"/>
<img src="test_acc_1.png" alt="Test Acc" width="450"/>
<img src="f1_score_1.png" alt="F1 Score" width="450"/>
<img src="confusion_matrix_1.png" alt="Confusion Matrix" width="600"/>

**Qn. What are the spatial dimensions of image after each layer/block? What are these dimensions, in the layer just before average pooling?**

**Answer:**

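- We can use the ``torchinfo`` package to get this information. (The summary below was generated for the stock 1000-class ResNet18; the number of output classes does not affect the spatial dimensions.)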

- The input spatial dimension (36, 36) gets reduced to (18, 18) after the first convolution. Maxpool further reduces it to (9, 9).

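- The four stages of convolutional blocks then take this through (9, 9) → (5, 5) → (3, 3) → (2, 2): the first stage keeps the resolution, and each later stage halves it (rounding up). The layer just before average pooling therefore outputs 512 activation maps of size (2, 2).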

- The average pooling (followed by flattening) converts this to a vector of length 512.

```
===================================================================================================================
Layer (type:depth-idx)                   Input Shape               Output Shape              Param #
===================================================================================================================
ResNet                                   [1, 3, 36, 36]            [1, 1000]                 --
├─Conv2d: 1-1                            [1, 3, 36, 36]            [1, 64, 18, 18]           9,408
├─BatchNorm2d: 1-2                       [1, 64, 18, 18]           [1, 64, 18, 18]           128
├─ReLU: 1-3                              [1, 64, 18, 18]           [1, 64, 18, 18]           --
├─MaxPool2d: 1-4                         [1, 64, 18, 18]           [1, 64, 9, 9]             --
├─Sequential: 1-5                        [1, 64, 9, 9]             [1, 64, 9, 9]             --
│    └─BasicBlock: 2-1                   [1, 64, 9, 9]             [1, 64, 9, 9]             --
│    │    └─Conv2d: 3-1                  [1, 64, 9, 9]             [1, 64, 9, 9]             36,864
│    │    └─BatchNorm2d: 3-2             [1, 64, 9, 9]             [1, 64, 9, 9]             128
│    │    └─ReLU: 3-3                    [1, 64, 9, 9]             [1, 64, 9, 9]             --
│    │    └─Conv2d: 3-4                  [1, 64, 9, 9]             [1, 64, 9, 9]             36,864
│    │    └─BatchNorm2d: 3-5             [1, 64, 9, 9]             [1, 64, 9, 9]             128
│    │    └─ReLU: 3-6                    [1, 64, 9, 9]             [1, 64, 9, 9]             --
│    └─BasicBlock: 2-2                   [1, 64, 9, 9]             [1, 64, 9, 9]             --
│    │    └─Conv2d: 3-7                  [1, 64, 9, 9]             [1, 64, 9, 9]             36,864
│    │    └─BatchNorm2d: 3-8             [1, 64, 9, 9]             [1, 64, 9, 9]             128
│    │    └─ReLU: 3-9                    [1, 64, 9, 9]             [1, 64, 9, 9]             --
│    │    └─Conv2d: 3-10                 [1, 64, 9, 9]             [1, 64, 9, 9]             36,864
│    │    └─BatchNorm2d: 3-11            [1, 64, 9, 9]             [1, 64, 9, 9]             128
│    │    └─ReLU: 3-12                   [1, 64, 9, 9]             [1, 64, 9, 9]             --
├─Sequential: 1-6                        [1, 64, 9, 9]             [1, 128, 5, 5]            --
│    └─BasicBlock: 2-3                   [1, 64, 9, 9]             [1, 128, 5, 5]            --
│    │    └─Conv2d: 3-13                 [1, 64, 9, 9]             [1, 128, 5, 5]            73,728
│    │    └─BatchNorm2d: 3-14            [1, 128, 5, 5]            [1, 128, 5, 5]            256
│    │    └─ReLU: 3-15                   [1, 128, 5, 5]            [1, 128, 5, 5]            --
│    │    └─Conv2d: 3-16                 [1, 128, 5, 5]            [1, 128, 5, 5]            147,456
│    │    └─BatchNorm2d: 3-17            [1, 128, 5, 5]            [1, 128, 5, 5]            256
│    │    └─Sequential: 3-18             [1, 64, 9, 9]             [1, 128, 5, 5]            8,448
│    │    └─ReLU: 3-19                   [1, 128, 5, 5]            [1, 128, 5, 5]            --
│    └─BasicBlock: 2-4                   [1, 128, 5, 5]            [1, 128, 5, 5]            --
│    │    └─Conv2d: 3-20                 [1, 128, 5, 5]            [1, 128, 5, 5]            147,456
│    │    └─BatchNorm2d: 3-21            [1, 128, 5, 5]            [1, 128, 5, 5]            256
│    │    └─ReLU: 3-22                   [1, 128, 5, 5]            [1, 128, 5, 5]            --
│    │    └─Conv2d: 3-23                 [1, 128, 5, 5]            [1, 128, 5, 5]            147,456
│    │    └─BatchNorm2d: 3-24            [1, 128, 5, 5]            [1, 128, 5, 5]            256
│    │    └─ReLU: 3-25                   [1, 128, 5, 5]            [1, 128, 5, 5]            --
├─Sequential: 1-7                        [1, 128, 5, 5]            [1, 256, 3, 3]            --
│    └─BasicBlock: 2-5                   [1, 128, 5, 5]            [1, 256, 3, 3]            --
│    │    └─Conv2d: 3-26                 [1, 128, 5, 5]            [1, 256, 3, 3]            294,912
│    │    └─BatchNorm2d: 3-27            [1, 256, 3, 3]            [1, 256, 3, 3]            512
│    │    └─ReLU: 3-28                   [1, 256, 3, 3]            [1, 256, 3, 3]            --
│    │    └─Conv2d: 3-29                 [1, 256, 3, 3]            [1, 256, 3, 3]            589,824
│    │    └─BatchNorm2d: 3-30            [1, 256, 3, 3]            [1, 256, 3, 3]            512
│    │    └─Sequential: 3-31             [1, 128, 5, 5]            [1, 256, 3, 3]            33,280
│    │    └─ReLU: 3-32                   [1, 256, 3, 3]            [1, 256, 3, 3]            --
│    └─BasicBlock: 2-6                   [1, 256, 3, 3]            [1, 256, 3, 3]            --
│    │    └─Conv2d: 3-33                 [1, 256, 3, 3]            [1, 256, 3, 3]            589,824
│    │    └─BatchNorm2d: 3-34            [1, 256, 3, 3]            [1, 256, 3, 3]            512
│    │    └─ReLU: 3-35                   [1, 256, 3, 3]            [1, 256, 3, 3]            --
│    │    └─Conv2d: 3-36                 [1, 256, 3, 3]            [1, 256, 3, 3]            589,824
│    │    └─BatchNorm2d: 3-37            [1, 256, 3, 3]            [1, 256, 3, 3]            512
│    │    └─ReLU: 3-38                   [1, 256, 3, 3]            [1, 256, 3, 3]            --
├─Sequential: 1-8                        [1, 256, 3, 3]            [1, 512, 2, 2]            --
│    └─BasicBlock: 2-7                   [1, 256, 3, 3]            [1, 512, 2, 2]            --
│    │    └─Conv2d: 3-39                 [1, 256, 3, 3]            [1, 512, 2, 2]            1,179,648
│    │    └─BatchNorm2d: 3-40            [1, 512, 2, 2]            [1, 512, 2, 2]            1,024
│    │    └─ReLU: 3-41                   [1, 512, 2, 2]            [1, 512, 2, 2]            --
│    │    └─Conv2d: 3-42                 [1, 512, 2, 2]            [1, 512, 2, 2]            2,359,296
│    │    └─BatchNorm2d: 3-43            [1, 512, 2, 2]            [1, 512, 2, 2]            1,024
│    │    └─Sequential: 3-44             [1, 256, 3, 3]            [1, 512, 2, 2]            132,096
│    │    └─ReLU: 3-45                   [1, 512, 2, 2]            [1, 512, 2, 2]            --
│    └─BasicBlock: 2-8                   [1, 512, 2, 2]            [1, 512, 2, 2]            --
│    │    └─Conv2d: 3-46                 [1, 512, 2, 2]            [1, 512, 2, 2]            2,359,296
│    │    └─BatchNorm2d: 3-47            [1, 512, 2, 2]            [1, 512, 2, 2]            1,024
│    │    └─ReLU: 3-48                   [1, 512, 2, 2]            [1, 512, 2, 2]            --
│    │    └─Conv2d: 3-49                 [1, 512, 2, 2]            [1, 512, 2, 2]            2,359,296
│    │    └─BatchNorm2d: 3-50            [1, 512, 2, 2]            [1, 512, 2, 2]            1,024
│    │    └─ReLU: 3-51                   [1, 512, 2, 2]            [1, 512, 2, 2]            --
├─AdaptiveAvgPool2d: 1-9                 [1, 512, 2, 2]            [1, 512, 1, 1]            --
├─Linear: 1-10                           [1, 512]                  [1, 1000]                 513,000
===================================================================================================================
Total params: 11,689,512
Trainable params: 11,689,512
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 81.05
===================================================================================================================
Input size (MB): 0.02
Forward/backward pass size (MB): 1.28
Params size (MB): 46.76
Estimated Total Size (MB): 48.05
===================================================================================================================
```

## 2. Training ResNet on resized images

In [None]:
# Same pipelines as the baseline, with every image first resized to 224x224
train_transform_224 = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(degrees=(-7, 7)),
    transforms.RandomAffine(degrees=0, shear=10, scale=(0.8, 1.2)),
    transforms.Normalize(mean=(0.491, 0.482, 0.446), std=(0.247, 0.243, 0.261)),
])

# Evaluation: resize and normalise only, no augmentation
test_transform_224 = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.Normalize(mean=(0.491, 0.482, 0.446), std=(0.247, 0.243, 0.261)),
])

train_dataset_224 = CustomDataset(data=train_data, labels=train_labels, transform=train_transform_224)
test_dataset_224 = CustomDataset(data=test_data, labels=test_labels, transform=test_transform_224)

# Only the training batches are shuffled; evaluation order stays fixed
train_loader_224 = DataLoader(dataset=train_dataset_224, batch_size=BATCH_SIZE, shuffle=True)
test_loader_224 = DataLoader(dataset=test_dataset_224, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Randomly initialised ResNet18 trained on the 224x224 inputs
model_224 = resnet18(num_classes=10)
train_model(
    model=model_224,
    train_loader=train_loader_224,
    test_loader=test_loader_224,
    model_name='scratch_224',
)

In [None]:
# Pretrained ResNet18 with a fresh 10-class head, trained on the 224x224 inputs
pretrained_model_224 = resnet18(weights='DEFAULT')
pretrained_model_224.fc = nn.Linear(in_features=512, out_features=10)
train_model(
    model=pretrained_model_224,
    train_loader=train_loader_224,
    test_loader=test_loader_224,
    model_name='pretrained_224',
)

<img src="train_loss_2.png" alt="Train Loss" width="450"/>
<img src="train_acc_2.png" alt="Train Acc" width="450"/>
<img src="test_acc_2.png" alt="Test Acc" width="450"/>
<img src="f1_score_2.png" alt="F1 Score" width="450"/>
<img src="confusion_matrix_2.png" alt="Confusion Matrix" width="600"/>

**Qn. Better accuracy may come at cost. What changed/degraded from the previous set up?**

**Answer:**

- An input of large spatial dimension leads to larger activation maps across the network.

- This makes the forward and backward passes slower, since the cost of a convolution grows in proportion to the number of spatial positions (height × width) of its input.

- In our experimentation, we found the model training process to be ~5 times slower.

- We can expect similar (but not exactly the same) slowdown during inference.

## 3. Modifying the architecture of ResNet18 to suit the given dataset

### 3.1 Modifying ResNet18

In [None]:
# Variant 1 (scratch): keep the 7x7 stem convolution but use stride 1, so the
# first layer no longer downsamples the input
modified_model_1 = resnet18(num_classes=10)
modified_model_1.conv1 = nn.Conv2d(
    in_channels=3,
    out_channels=64,
    kernel_size=7,
    stride=1,
    padding=3,
    bias=False,
)
train_model(
    model=modified_model_1,
    train_loader=train_loader,
    test_loader=test_loader,
    model_name='modified_scratch_1',
)

In [None]:
# Variant 2 (scratch): stride-1 stem convolution with a smaller 5x5 kernel;
# padding 2 keeps the spatial size unchanged
modified_model_2 = resnet18(num_classes=10)
modified_model_2.conv1 = nn.Conv2d(
    in_channels=3,
    out_channels=64,
    kernel_size=5,
    stride=1,
    padding=2,
    bias=False,
)
train_model(
    model=modified_model_2,
    train_loader=train_loader,
    test_loader=test_loader,
    model_name='modified_scratch_2',
)

In [None]:
# Variant 3 (scratch): drop the stem max-pooling by replacing it with a no-op
modified_model_3 = resnet18(num_classes=10)
modified_model_3.maxpool = nn.Identity()
train_model(
    model=modified_model_3,
    train_loader=train_loader,
    test_loader=test_loader,
    model_name='modified_scratch_3',
)

<img src="train_loss_3a.png" alt="Train Loss" width="450"/>
<img src="train_acc_3a.png" alt="Train Acc" width="450"/>
<img src="test_acc_3a.png" alt="Test Acc" width="450"/>
<img src="f1_score_3a.png" alt="F1 Score" width="450"/>
<img src="confusion_matrix_3a.png" alt="Confusion Matrix" width="600"/>

### 3.2 Modifying Pretrained ResNet18

In [None]:
# Variant 1 (pretrained): fresh 10-class head plus a freshly initialised
# stride-1 7x7 stem convolution; every other layer keeps its pretrained weights
pretrained_model_1 = resnet18(weights='DEFAULT')
pretrained_model_1.fc = nn.Linear(in_features=512, out_features=10)
pretrained_model_1.conv1 = nn.Conv2d(
    in_channels=3,
    out_channels=64,
    kernel_size=7,
    stride=1,
    padding=3,
    bias=False,
)
train_model(
    model=pretrained_model_1,
    train_loader=train_loader,
    test_loader=test_loader,
    model_name='modified_pretrained_1',
)

In [None]:
# Variant 2 (pretrained): fresh 10-class head plus a freshly initialised
# stride-1 5x5 stem convolution; every other layer keeps its pretrained weights
pretrained_model_2 = resnet18(weights='DEFAULT')
pretrained_model_2.fc = nn.Linear(in_features=512, out_features=10)
pretrained_model_2.conv1 = nn.Conv2d(
    in_channels=3,
    out_channels=64,
    kernel_size=5,
    stride=1,
    padding=2,
    bias=False,
)
train_model(
    model=pretrained_model_2,
    train_loader=train_loader,
    test_loader=test_loader,
    model_name='modified_pretrained_2',
)

In [None]:
# Variant 3 (pretrained): fresh 10-class head, stem max-pooling replaced by a no-op
pretrained_model_3 = resnet18(weights='DEFAULT')
pretrained_model_3.fc = nn.Linear(in_features=512, out_features=10)
pretrained_model_3.maxpool = nn.Identity()
train_model(
    model=pretrained_model_3,
    train_loader=train_loader,
    test_loader=test_loader,
    model_name='modified_pretrained_3',
)

<img src="train_loss_3b.png" alt="Train Loss" width="450"/>
<img src="train_acc_3b.png" alt="Train Acc" width="450"/>
<img src="test_acc_3b.png" alt="Test Acc" width="450"/>
<img src="f1_score_3b.png" alt="F1 Score" width="450"/>
<img src="confusion_matrix_3b.png" alt="Confusion Matrix" width="600"/>

**Qn. In the case of the pretrained model, the first layer needs to be initialized from scratch, while the other layers have weights of the pretrained model. Would such an initialization (with different distributions in different layers) be a problem and make the model learn worse, or does it not affect the training significantly?**

**Answer:**

- We start by observing that the modified pretrained models give better results than the unmodified pretrained model.

- It has been noted that training on natural images tends to produce task-agnostic features (see https://arxiv.org/abs/1411.1792).

- Because such first-layer features are general-purpose, relearning them from a fresh initialization is not difficult.

- The presence of pretrained weights in the other layers only helps stabilize the learning process.

## 4. Comparison

**Qn. Compare all the different aspects of the trained models. Draw comparisons between pretrained versus non-pretrained, effects of the size of the image, the kernel size, etc.**

**Answer:**

- Pretraining: The pretrained model achieves better results. Exposure to a much more diverse ImageNet dataset must have improved its generalization capability.

- Image Size: The best metrics were obtained for models trained on $(224, 224)$ images. As pointed out in the question, the architecture is better suited to this resolution. With $(36, 36)$ inputs, on the other hand, the network downsamples to $(9, 9)$ within just two layers, which can lead to a significant loss of information: objects like trucks, ships, etc. are hard to represent on such a small grid after only a couple of transformations.

- Kernel Size: Conflicting results were obtained for the randomly initialized and pretrained models, about the use of $(5, 5)$ or $(7, 7)$ kernel.

- Kernel Stride: The default value of $stride = 2$ leads to downsampling from $(36, 36)$ to $(18, 18)$. An improvement in metrics was observed for both the models on changing the $stride$ to 1.

- Pooling: The maxpool layer in the early stage of the model leads to downsampling from $(18, 18)$ to $(9, 9)$. Since we would like to have a higher dimension representation, replacing the maxpool layer with an identity function would be suitable. This results in an improvement in metrics for both the models.

**Qn. Additionally, look at the F1 score and confusion matrices as accuracy is not always the perfect measure.**

**Answer:**

- See the respective sections or this [sheet](wandb_export.csv) for the metrics.

- No case was found in this experiment, where a model with higher accuracy had a lower F1 score.

**Qn. Explain why you think those differences arise.**

**Answer:**

- The difference between accuracy and F1 score commonly arises because of skew in distribution. Since each loss or error is given equal weightage regardless of its type, this can create large gaps in line with the skew in data.

- For example, when the negative samples outnumber the positives (sparsity), the baseline accuracy for always predicting negative increases. The model can thus learn to make skewed predictions to maximize its accuracy over skewed datasets.

- A counterexample would be a case where the model predicts the positive and negative classes with equal probability. Indeed, when $TN = TP$, the F1 score is equal to the accuracy, since $\frac{TP + TN}{TP + TN + FP + FN} = \frac{2TP}{2TP + FP + FN}$.