In [2]:
# Dataset: CIFAR10

# Architecture: simplified Vision Transformer (ViT) to classify images
# Loss: CrossEntropyLoss
# Optimizer: Adam (lr=0.001)

# Hyperparameters shared by the cells below: the mini-batch size used by both
# data loaders, and the learning rate handed to the optimizer.
batch_size, learning_rate = 64, 1e-3

from tqdm import tqdm

from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Preprocessing applied to every image: convert the PIL image to a tensor,
# then normalize each channel with the fixed statistics below.
channel_mean = (0.4914, 0.4822, 0.4465)  # Mean for each channel
channel_std = (0.2470, 0.2435, 0.2616)   # Std for each channel
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])

# Both CIFAR-10 splits share the storage directory, the preprocessing
# pipeline and the download-if-missing policy; only the `train` flag differs.
cifar_kwargs = dict(root='./data', transform=transform, download=True)
train_dataset = datasets.CIFAR10(train=True, **cifar_kwargs)
test_dataset = datasets.CIFAR10(train=False, **cifar_kwargs)

# Training batches are reshuffled every epoch; test batches keep dataset order.
# Both loaders use the same batch size.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
class PatchEmbedding(nn.Module):
    """
    Turns a batch of images into a sequence of token embeddings.

    A strided convolution cuts each image into non-overlapping
    patch_size x patch_size tiles and projects every tile to emb_size
    features. A learnable class token is prepended to the patch sequence and
    a learnable positional embedding is added to every token.

    Args:
        in_channels: number of channels in the input images.
        patch_size: side length of a square patch (also the conv stride).
        emb_size: size of each token embedding.
        img_size: side length of the square input images; determines how
            many positions the positional embedding holds.
    """
    def __init__(self, in_channels=3, patch_size=4, emb_size=128, img_size=32):
        super().__init__()
        self.patch_size = patch_size
        # kernel_size == stride, so each conv output position covers exactly
        # one patch: patchify + linear embedding in a single layer.
        self.proj = nn.Conv2d(
            in_channels,
            emb_size,
            kernel_size=patch_size,
            stride=patch_size,
        )
        patches_per_side = img_size // patch_size
        seq_len = patches_per_side * patches_per_side + 1  # patches + class token
        # Class token and positional embedding are learnable and start at zero.
        self.cls_token = nn.Parameter(torch.zeros(1, 1, emb_size))
        self.pos_emb = nn.Parameter(torch.zeros(1, seq_len, emb_size))

    def forward(self, x):
        """
        x shape: (B, 3, 32, 32)
        returns: (B, N+1, emb_size)
        """
        batch = x.shape[0]
        # (B, emb_size, H', W') -> (B, emb_size, H'*W') -> (B, H'*W', emb_size)
        tokens = self.proj(x).flatten(2).transpose(1, 2)
        # One copy of the class token per sample, placed at sequence index 0.
        cls = self.cls_token.expand(batch, -1, -1)  # (B, 1, emb_size)
        tokens = torch.cat([cls, tokens], dim=1)    # (B, N+1, emb_size)
        # Slice the positional table to the actual sequence length before adding.
        positions = self.pos_emb[:, : tokens.size(1), :]
        return tokens + positions

class MultiHeadSelfAttention(nn.Module):
    """
    Multi-head scaled dot-product self-attention.

    A single linear layer produces queries, keys and values for all heads;
    attention is computed independently per head, the heads are concatenated
    and passed through an output projection.

    Args:
        emb_size: size of each input/output token embedding.
        num_heads: number of attention heads; must divide emb_size evenly.
        dropout: dropout probability applied to the attention weights.

    Raises:
        ValueError: if num_heads is not positive or does not divide emb_size.
    """
    def __init__(self, emb_size=128, num_heads=4, dropout=0.1):
        super().__init__()
        # Fail early with a clear message: a non-divisible emb_size would
        # otherwise floor head_dim and only blow up later inside forward()
        # when the qkv tensor cannot be reshaped.
        if num_heads <= 0 or emb_size % num_heads != 0:
            raise ValueError(
                f"emb_size ({emb_size}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )
        self.emb_size = emb_size
        self.num_heads = num_heads
        self.head_dim = emb_size // num_heads
        # One projection yields q, k and v for every head at once.
        self.qkv = nn.Linear(emb_size, 3 * emb_size)
        self.att_drop = nn.Dropout(dropout)
        self.projection = nn.Linear(emb_size, emb_size)

    def forward(self, x):
        """
        x shape: (B, N, emb_size)
        returns: (B, N, emb_size)
        """
        B, N, _ = x.shape
        qkv = self.qkv(x)  # (B, N, 3*emb_size)
        qkv = qkv.reshape(B, N, 3, self.num_heads, self.head_dim)
        qkv = qkv.permute(2, 0, 3, 1, 4)  # (3, B, num_heads, N, head_dim)
        q, k, v = qkv[0], qkv[1], qkv[2]  # each: (B, num_heads, N, head_dim)
        # Scaled dot-product attention; dividing by sqrt(head_dim) keeps the
        # score magnitude independent of the head size.
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        att = torch.softmax(scores, dim=-1)  # (B, num_heads, N, N)
        att = self.att_drop(att)
        out = torch.matmul(att, v)  # (B, num_heads, N, head_dim)
        # Combine heads: bring the head axis next to head_dim and merge them.
        out = out.transpose(1, 2)  # (B, N, num_heads, head_dim)
        out = out.flatten(2)       # (B, N, emb_size)
        return self.projection(out)

class TransformerEncoderBlock(nn.Module):
    """
    One Transformer encoder layer with normalization before each sub-layer.

    The layer applies self-attention and then a two-layer feed-forward
    network. Each sub-layer sees a LayerNorm-ed input, and its output goes
    through dropout before being added back to the un-normalized input.

    Args:
        emb_size: size of each token embedding.
        num_heads: number of attention heads.
        expansion: the feed-forward hidden width is expansion * emb_size.
        dropout: dropout probability used in attention, inside the
            feed-forward network, and on both residual branches.
    """
    def __init__(self, emb_size=128, num_heads=4, expansion=4, dropout=0.1):
        super().__init__()
        hidden_size = expansion * emb_size

        self.norm1 = nn.LayerNorm(emb_size)
        self.attn = MultiHeadSelfAttention(emb_size, num_heads, dropout)
        self.norm2 = nn.LayerNorm(emb_size)
        # Feed-forward network: expand, non-linearity, dropout, project back.
        self.ffn = nn.Sequential(
            nn.Linear(emb_size, hidden_size),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, emb_size),
        )
        # Shared dropout applied to the output of both sub-layers.
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        """
        x shape: (B, N, emb_size)
        returns: (B, N, emb_size)
        """
        # Attention sub-layer with residual connection.
        attended = self.attn(self.norm1(x))
        x = x + self.drop(attended)
        # Feed-forward sub-layer with residual connection.
        transformed = self.ffn(self.norm2(x))
        return x + self.drop(transformed)

class VisionTransformer(nn.Module):
    """
    Simplified Vision Transformer for image classification.

    Pipeline: patch embedding (with class token and positional embedding)
    -> stack of Transformer encoder blocks -> LayerNorm -> linear head
    applied to the class token.

    Args:
        in_channels: number of channels in the input images.
        patch_size: side length of a square patch.
        emb_size: size of each token embedding.
        img_size: side length of the square input images.
        num_heads: attention heads per encoder block.
        num_layers: number of encoder blocks.
        num_classes: number of output classes.
        dropout: dropout probability passed to every encoder block.
        expansion: feed-forward width multiplier passed to every encoder
            block (defaults to 4, the value that was previously fixed).
    """
    def __init__(self,
                 in_channels=3,
                 patch_size=4,
                 emb_size=128,
                 img_size=32,
                 num_heads=4,
                 num_layers=6,
                 num_classes=10,
                 dropout=0.1,
                 expansion=4):
        super().__init__()
        self.patch_embed = PatchEmbedding(in_channels, patch_size, emb_size,
                                          img_size)
        # num_layers identical (but independently parameterized) blocks.
        self.encoder = nn.Sequential(*[
            TransformerEncoderBlock(
                emb_size=emb_size,
                num_heads=num_heads,
                expansion=expansion,
                dropout=dropout,
            ) for _ in range(num_layers)
        ])
        self.norm = nn.LayerNorm(emb_size)
        self.cls_head = nn.Linear(emb_size, num_classes)

    def forward(self, x):
        """
        x shape: (B, 3, 32, 32)
        returns: (B, num_classes) unnormalized class scores
        """
        x = self.patch_embed(x)  # (B, N+1, emb_size)
        x = self.encoder(x)      # (B, N+1, emb_size)
        x = self.norm(x)         # (B, N+1, emb_size)
        # The first token is the class token; only it feeds the classifier.
        cls_token_final = x[:, 0]
        out = self.cls_head(cls_token_final)  # (B, num_classes)
        return out

In [None]:
# Training schedule: train the model for 20 epochs.
num_epochs = 20

# ViT built with its default configuration.
model = VisionTransformer()

# Adam updates every model parameter with the learning rate configured above;
# cross-entropy is the classification criterion.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss = nn.CrossEntropyLoss()

# plot training loss over epochs
# plot training and testing accuracy over epochs
# compare with CNN models and discuss observations:
    # which model performs better?
    # which model converges faster?


In [None]:
# 2.
    # train and evaluate with different hyperparameters:

# a) batch size = 64, learning rate = [0.01, 0.001, 0.0001]
    # plot training loss, training accuracy, and testing accuracy over epochs
    # discuss observations:
        # which learning rate performs better?
        # which learning rate converges faster?
# b) use the best learning rate found. Change optimizer to RMSProp.
    # plot training loss, training accuracy, and testing accuracy over epochs
    # discuss observations between Adam and RMSProp optimizers. 


In [None]:
# 3. 
    # Investigate the effect of different model designs. Change num_layers to 4 and 8. 
    # plot training loss, training accuracy, and testing accuracy over epochs
    # discuss how number of Transformer layers affects performance and convergence. 
