# Colab Setup

In [None]:
# Mount Google Drive at /content/drive (this module is only importable inside Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
"""
Change directory to where this file is located
"""
%cd "/content/drive/MyDrive"

[Errno 2] No such file or directory: '/content/drive/MyDriver'
/content


# Import Modules

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from torchvision import transforms, datasets

from tqdm.auto import tqdm

# Utils

In [None]:
def train(model, train_loader, optimizer, criterion, DEVICE, epoch=None):
    """
    Trains the model for one pass over the training data.

    Args:
        model: Network to optimize; it is switched to training mode.
        train_loader: Iterable yielding ``(image, label)`` batches.
        optimizer: Optimizer whose gradients are reset and stepped per batch.
        criterion: Loss, called as ``criterion(output, label)``.
        DEVICE: Device each batch is moved to before the forward pass.
        epoch: Label shown in the progress bar. When omitted, the
            module-level ``epoch`` variable is used if it exists (this keeps
            the notebook's training loop working unchanged); otherwise "?"
            is displayed.

    Returns:
        None. ``model`` and ``optimizer`` are updated in place.
    """
    if epoch is None:
        # Previously the progress-bar label read a global ``epoch`` directly,
        # which raised NameError whenever train() was called outside a loop
        # that happened to define it.
        epoch = globals().get("epoch", "?")

    model.train()
    tqdm_bar = tqdm(train_loader)
    for image, label in tqdm_bar:
        image = image.to(DEVICE)
        label = label.to(DEVICE)
        optimizer.zero_grad()
        output = model(image)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        tqdm_bar.set_description("Epoch {} - train loss: {:.6f}".format(epoch, loss.item()))


def evaluate(model, test_loader, criterion, DEVICE):
    """
    Evaluates the trained model with test data.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        test_loader: Iterable yielding ``(image, label)`` batches and exposing
            a ``dataset`` attribute with a length.
        criterion: Loss, called as ``criterion(output, label)`` and expected
            to return a scalar tensor.
        DEVICE: Device each batch is moved to before the forward pass.

    Returns:
        Tuple ``(test_loss, test_accuracy)``: the average loss per sample and
        the accuracy in percent, both normalized by
        ``len(test_loader.dataset)``.
    """
    model.eval()
    test_loss = 0.0
    correct = 0

    # A mean-reduced criterion returns the average loss of the batch. Summing
    # those averages and dividing by the number of samples (as was done
    # before) under-reports the loss by roughly the batch size, so each batch
    # loss is first scaled back to a per-batch sum.
    # Criteria without a ``reduction`` attribute are assumed to be mean-reduced.
    mean_reduced = getattr(criterion, "reduction", "mean") == "mean"

    with torch.no_grad():
        for image, label in tqdm(test_loader):
            image = image.to(DEVICE)
            label = label.to(DEVICE)
            output = model(image)
            batch_loss = criterion(output, label).item()
            if mean_reduced:
                batch_loss *= label.size(0)
            test_loss += batch_loss
            # Index of the highest logit per sample, kept as a column so the
            # labels can be viewed in the same shape for comparison.
            prediction = output.max(1, keepdim=True)[1]
            correct += prediction.eq(label.view_as(prediction)).sum().item()

    num_samples = len(test_loader.dataset)
    test_loss /= num_samples
    test_accuracy = 100. * correct / num_samples
    return test_loss, test_accuracy

# ViT Model

In [None]:
###
# Always check tensor shapes!
# Printing shapes can be the fastest way to track the error and fix it.
# Using dropout is optional. You don't have to use pre-declared dropouts.
###

In [None]:
class Patchification(nn.Module):
    """
    Splits a batch of images into non-overlapping patches and embeds each one.

    A single torch.nn.Conv2d whose kernel size and stride both equal the patch
    size performs the split and the linear projection in one step.

    - Input shape: [batch, channel, height, width]
    - Return: [batch, number_of_patches, embedding_dimension]
    """
    def __init__(self, in_channels, patch_size, embedding_dim):
        """
        Args:
            in_channels: Number of channels of the input images.
            patch_size: Patch size, used as both kernel size and stride of
                the convolution.
            embedding_dim: Size of each patch embedding (the convolution's
                out_channels).
        """
        super().__init__()
        self.patch_size = patch_size
        self.embedding_dim = embedding_dim
        self.in_channels = in_channels
        self.conv = nn.Conv2d(in_channels, embedding_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        # One output position per patch: [batch, embedding_dim, rows, cols]
        x = self.conv(x)
        # Merge the two spatial axes into one patch axis. flatten() copies
        # when the conv output is not contiguous (e.g. channels_last memory
        # format), a case in which view() raises.
        x = x.flatten(2)
        # [batch, embedding_dim, patches] -> [batch, patches, embedding_dim]
        return x.transpose(1, 2)

In [None]:
"""
Hint cell.
You can briefly check if the function is implemented correctly by printing the shape of the output before jumping into the training with GPU.
Example code is written in the below.
"""
# Dummy batch dimensions: 5 images, 3 channels, 32x32 pixels.
batch, channel, height, width = 5, 3, 32, 32
patch_size = (4, 4)
embedding_dim = 128

# Constant-valued input is enough here; only the output shape is inspected.
test_img = torch.ones(batch, channel, height, width)

patchification = Patchification(channel, patch_size, embedding_dim)
after_patchification = patchification(test_img)

# Expected: [5, 64, 128] -- (32 / 4) * (32 / 4) = 64 patches of dimension 128.
print(after_patchification.shape) # Is the shape [batch, number_of_patches, dim] ?

torch.Size([5, 64, 128])


In [None]:
class MLP(nn.Module):
    """
    Feed-forward layer: Linear -> GELU -> Dropout -> Linear -> Dropout.

    The first linear layer maps ``dim`` to ``hidden_dim`` and the second maps
    it back, so the last dimension of the input is preserved.

    - Input shape: [batch, number_of_patches, embedding_dimension]
    - Return: [batch, number_of_patches, embedding_dimension]
    """
    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        # Created in the same order as they run, so parameter initialization
        # consumes the random generator in a fixed, reproducible sequence.
        expand = nn.Linear(dim, hidden_dim)
        project = nn.Linear(hidden_dim, dim)
        stages = [expand, nn.GELU(), nn.Dropout(dropout), project, nn.Dropout(dropout)]
        self.layer = nn.Sequential(*stages)

    def forward(self, x):
        return self.layer(x)

In [None]:
class Attention(nn.Module):
    """
    Multi-head scaled dot-product self-attention.

    Follows section 3.2 of "Attention is All You Need": the input is projected
    to queries, keys and values, split into ``num_heads`` heads of size
    ``dim // num_heads``, attended per head, merged and projected back.

    - Input shape: [batch, number_of_patches, embedding_dimension]
    - Return: [batch, number_of_patches, embedding_dimension]
    """
    def __init__(self, dim, num_heads, dropout = 0.):
        """
        Args:
            dim: Embedding dimension of the input tokens.
            num_heads: Number of attention heads; must divide ``dim``.
            dropout: Dropout probability applied to the attention weights and
                to the output projection.

        Raises:
            ValueError: If ``num_heads`` is not positive or does not divide
                ``dim`` (the head split in forward() would otherwise fail
                with an opaque reshape error).
        """
        super().__init__()
        if num_heads <= 0 or dim % num_heads != 0:
            raise ValueError(
                "dim ({}) must be divisible by a positive num_heads ({}).".format(dim, num_heads)
            )

        self.head_dim = dim // num_heads
        self.dim = dim
        self.num_heads = num_heads

        # Scores are divided by sqrt(head_dim) before the softmax.
        self.scale = self.head_dim ** 0.5

        self.dropout = nn.Dropout(dropout)
        # Query, key and value are produced by one fused projection.
        self.qkv = nn.Linear(dim, dim * 3, bias=False)
        self.fc_out = nn.Linear(dim, dim)

    def forward(self, x):
        B, N, dim = x.size()
        # [B, N, 3 * dim] -> [3, B, num_heads, N, head_dim]
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # each [B, num_heads, N, head_dim]

        # Scaled dot-product attention per head: [B, num_heads, N, N]
        attn_score = (q @ k.transpose(-2, -1)) / self.scale
        attn_prob = nn.functional.softmax(attn_score, dim=-1)
        # The dropout argument used to be accepted but never applied; it is a
        # no-op for p=0 and in eval mode, so default behavior is unchanged.
        attn_prob = self.dropout(attn_prob)

        # Merge the heads back: [B, num_heads, N, head_dim] -> [B, N, dim]
        attn_out = (attn_prob @ v).transpose(1, 2).reshape(B, N, dim)
        x = self.dropout(self.fc_out(attn_out))
        return x

class Block(nn.Module):
    """
    Transformer encoder block with layer normalization applied before each
    sub-layer and a residual connection around each sub-layer.

    Computes ``x = x + attn(LN1(x))`` followed by ``x = x + mlp(LN2(x))``.

    - Input shape: [batch, number_of_patches, embedding_dimension]
    - Return: [batch, number_of_patches, embedding_dimension]
    """
    def __init__(self, dim, num_heads, mlp_dim, dropout=0.):
        super().__init__()

        self.LN1 = nn.LayerNorm(dim)
        self.LN2 = nn.LayerNorm(dim)
        self.attn = Attention(dim=dim, num_heads=num_heads, dropout=dropout)
        self.mlp = MLP(dim=dim, hidden_dim=mlp_dim, dropout=dropout)

    def forward(self, x):
        # Attention sub-layer with its residual connection.
        x = x + self.attn(self.LN1(x))
        # Feed-forward sub-layer with its residual connection.
        return x + self.mlp(self.LN2(x))

In [None]:
class ViT(nn.Module):
    """
    Vision Transformer classifier.

    Images are cut into patches and embedded, a learnable class token is
    prepended, learnable positional embeddings are added, the tokens pass
    through ``depth`` attention blocks, and the class token is mapped to
    ``num_classes`` logits.

    - Input shape: [batch, channel, height, width]
    - Return: [batch, num_classes]
    """
    def __init__(self, image_shape, patch_size, num_classes, dim, num_heads, depth, mlp_dim, dropout = 0.):
        """
        image_shape: [channel, height, width]
        patch_size: [height, width]
        num_classes: Number of output classes
        dim: Embedding dimension
        num_heads: Number of heads to be used in Multi-head Attention
        depth: Number of attention blocks to be used
        mlp_dim: Hidden dimension to be used in MLP layer (=feedforward layer)
        dropout: Dropout probability forwarded to every attention block
        """
        super().__init__()

        image_ch, image_h, image_w = image_shape
        patch_h, patch_w = patch_size

        assert image_h % patch_h == 0 and image_w % patch_w == 0, 'Image height & width must be divisible by those of patch respectively.'
        assert dim % num_heads == 0, 'Embedding dimension should be divisible by number of heads.'
        num_patches = (image_h // patch_h) * (image_w // patch_w) # e.g. [32 x 32] image & [8 x 8] patch size -> [4 x 4 = 16] patches

        # Patch embedding via convolution.
        self.patchify = Patchification(image_ch, patch_size, dim)

        # Learnable positional embedding; the extra slot is for the class token.
        self.pos_embedding = nn.Parameter(torch.randn(1, 1 + num_patches, dim))

        # Learnable class token, prepended to the patch tokens of every image.
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))

        # Initialize attention blocks
        self.attention_blocks = nn.ModuleList([
            Block(dim, num_heads, mlp_dim, dropout)
            for _ in range(depth)
        ])

        # Classification head, maps the final vector to class dimension.
        self.classification_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        """
        Runs the ViT forward pass and returns class logits.

        - After patchification, shape is [batch, number_of_patches, dim].
        - After prepending the class token, shape is [batch, 1 + number_of_patches, dim].
        - The positional embedding is added, the tokens go through the
          attention blocks, and the class token alone is classified.
        """
        x = self.patchify(img)  # (batch, number_of_patches, dim)

        # Broadcast the single learned token over the batch without copying.
        cls_tokens = self.cls_token.expand(x.size(0), -1, -1)  # (batch, 1, dim)
        x = torch.cat([cls_tokens, x], dim=1)  # (batch, 1 + number_of_patches, dim)

        # x already contains the class token here, so its length is the slice
        # bound. The previous bound of 1 + x.size(1) was one past the intended
        # end and only worked because slicing clamps to the table size.
        x = x + self.pos_embedding[:, :x.size(1)]

        for blk in self.attention_blocks:
            x = blk(x)

        # Classify from the class token (first position) only.
        x = self.classification_head(x[:, 0])
        return x

# ViT Image Classification

In [None]:
"""
Make sure your runtime type is GPU and you are using PyTorch version higher than 1.8!

Do NOT modify.
"""
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using PyTorch version: {}, Device: {}".format(torch.__version__, DEVICE))

Using PyTorch version: 2.3.0+cu121, Device: cuda


In [None]:
"""
You may change some settings including batch size & augmentations.
But if your implementation is correct, default setting is enough to achieve the target performance(i.e. 65%).
"""
BATCH_SIZE = 100

# Per-channel normalization statistics shared by the train and test pipelines.
CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR10_STD = (0.2023, 0.1994, 0.2010)

# Training images are additionally flipped horizontally at random.
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])

# Test images are only converted to tensors and normalized.
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])

# Prepare Dataset & DataLoader
trainset = datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transform)
testset = datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transform)

# Only the training batches are shuffled.
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
testloader = DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False)

Files already downloaded and verified
Files already downloaded and verified


In [None]:
"""
Question (e)
Train your ViT to achieve 65% accuracy.
Feel free to change the hyperparameters.
But again, if your implementation is correct, default setting is enough to achieve the target performance(i.e. 65%).
"""
# Hyperparameters
EPOCHS = 10
patch_size = (4, 4)   # patch (height, width)
dim = 128             # embedding dimension
depth = 8             # number of attention blocks
num_heads = 8         # heads per attention layer
mlp_dim = 256         # hidden dimension of the feed-forward layer
dropout = 0.
learning_rate = 0.0002  # slightly changed learning rate

# Model for 3x32x32 inputs and 10 output classes, moved to the selected device.
model = ViT(
    image_shape=(3, 32, 32),
    patch_size=patch_size,
    num_classes=10,
    dim=dim,
    num_heads=num_heads,
    depth=depth,
    mlp_dim=mlp_dim,
    dropout=dropout,
).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [None]:
"""
Do NOT modify.
It will take less than 8 minutes for training with default setting.
"""
# One training pass followed by one evaluation on the test set per epoch.
# NOTE: the loop variable must stay a module-level name called `epoch`;
# train() reads it for its progress-bar label.
for epoch in range(1, EPOCHS + 1):
    train(model, trainloader, optimizer, criterion,DEVICE)
    test_loss, test_accuracy = evaluate(model, testloader, criterion,DEVICE)
    # Report the metrics returned by evaluate() for this epoch.
    print("\n[EPOCH: {}], \tModel: ViT, \tTest Loss: {:.4f}, \tTest Accuracy: {:.2f} % \n".format(
        epoch, test_loss, test_accuracy))

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]


[EPOCH: 1], 	Model: ViT, 	Test Loss: 0.0140, 	Test Accuracy: 50.05 % 



  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]


[EPOCH: 2], 	Model: ViT, 	Test Loss: 0.0124, 	Test Accuracy: 55.44 % 



  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]


[EPOCH: 3], 	Model: ViT, 	Test Loss: 0.0115, 	Test Accuracy: 58.86 % 



  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]


[EPOCH: 4], 	Model: ViT, 	Test Loss: 0.0108, 	Test Accuracy: 61.20 % 



  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]


[EPOCH: 5], 	Model: ViT, 	Test Loss: 0.0105, 	Test Accuracy: 62.60 % 



  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]


[EPOCH: 6], 	Model: ViT, 	Test Loss: 0.0103, 	Test Accuracy: 63.19 % 



  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]


[EPOCH: 7], 	Model: ViT, 	Test Loss: 0.0099, 	Test Accuracy: 64.92 % 



  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]


[EPOCH: 8], 	Model: ViT, 	Test Loss: 0.0097, 	Test Accuracy: 65.87 % 



  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]


[EPOCH: 9], 	Model: ViT, 	Test Loss: 0.0096, 	Test Accuracy: 66.32 % 



  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]


[EPOCH: 10], 	Model: ViT, 	Test Loss: 0.0095, 	Test Accuracy: 66.94 % 

