## **QUESTION1**

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torchsummary import summary


class SelfAttention(nn.Module):
    def __init__(self, in_dim):
        super(SelfAttention, self).__init__()
        # Assuming kernel_size=1, we manually initialize weights and biases
        self.query_weight = nn.Parameter(torch.randn(in_dim//8, in_dim))
        self.key_weight = nn.Parameter(torch.randn(in_dim//8, in_dim))
        self.value_weight = nn.Parameter(torch.randn(in_dim, in_dim))

        self.query_bias = nn.Parameter(torch.zeros(in_dim//8))
        self.key_bias = nn.Parameter(torch.zeros(in_dim//8))
        self.value_bias = nn.Parameter(torch.zeros(in_dim))

        self.gamma = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        batch_size, channels, height, width = x.size()

        # Reshape x for manual convolution (point-wise linear transformation)
        x_reshaped = x.view(batch_size, channels, -1) # BxCx(N)

        # Manual Convolution
        proj_query = (torch.matmul(self.query_weight, x_reshaped) + self.query_bias.unsqueeze(-1)).permute(0, 2, 1)
        proj_key = torch.matmul(self.key_weight, x_reshaped) + self.key_bias.unsqueeze(-1)
        proj_value = torch.matmul(self.value_weight, x_reshaped) + self.value_bias.unsqueeze(-1)

        # Attention mechanism
        energy = torch.bmm(proj_query, proj_key)
        attention = F.softmax(energy, dim=-1)

        out = torch.bmm(proj_value, attention.permute(0, 2, 1))
        out = out.view(batch_size, channels, height, width)

        # Apply residual connection
        out = self.gamma * out + x
        return out


class CNNWithSelfAttention(nn.Module):
    def __init__(self):
        super(CNNWithSelfAttention, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.sa1 = SelfAttention(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.sa2 = SelfAttention(64)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.sa3 = SelfAttention(128)
        self.conv4 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.sa4 = SelfAttention(256)
        self.conv5 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(512, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.sa1(x)
        x = F.relu(self.conv2(x))
        x = self.sa2(x)
        x = F.relu(self.conv3(x))
        x = self.sa3(x)
        x = F.relu(self.conv4(x))
        x = self.sa4(x)
        x = F.relu(self.conv5(x))
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

# Load CIFAR-10 dataset
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Initialize the model, loss function, and optimizer
model = CNNWithSelfAttention()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * images.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

# Testing loop
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:05<00:00, 28943406.14it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
Epoch [1/10], Loss: 1.6798
Epoch [2/10], Loss: 1.2673
Epoch [3/10], Loss: 1.0800
Epoch [4/10], Loss: 0.9500
Epoch [5/10], Loss: 0.8686
Epoch [6/10], Loss: 0.7875
Epoch [7/10], Loss: 0.7330
Epoch [8/10], Loss: 0.6966
Epoch [9/10], Loss: 0.6570
Epoch [10/10], Loss: 0.6372
Test Accuracy: 0.7317


In [None]:
summary(model, (3, 32, 32))

## **QUESTION2:**

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import math
from torchsummary import summary

class PatchEmbedding(nn.Module):
    def __init__(self, image_size, patch_size, in_channels, embed_dim):
        super(PatchEmbedding, self).__init__()
        self.image_size = image_size
        self.patch_size = patch_size
        self.n_patches = (image_size // patch_size) ** 2
        self.proj = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        B, C, H, W = x.shape
        assert H == W == self.image_size, "Input image size does not match the model."
        x = self.proj(x).flatten(2).transpose(1, 2) # B, embed_dim, n_patches
        return x

class MultiHeadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        assert self.head_dim * num_heads == embed_dim, "Embedding dimension must be divisible by number of heads"

        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)

        self.fc_out = nn.Linear(embed_dim, embed_dim)

    def forward(self, query, key, value, mask=None):
        B = query.shape[0]

        query = self.query(query).view(B, -1, self.num_heads, self.head_dim).transpose(1, 2) # B, num_heads, n_query, head_dim
        key = self.key(key).view(B, -1, self.num_heads, self.head_dim).transpose(1, 2) # B, num_heads, n_key, head_dim
        value = self.value(value).view(B, -1, self.num_heads, self.head_dim).transpose(1, 2) # B, num_heads, n_value, head_dim

        attention_scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_dim) # B, num_heads, n_query, n_key
        if mask is not None:
            attention_scores += mask.unsqueeze(1)

        attention_weights = F.softmax(attention_scores, dim=-1) # B, num_heads, n_query, n_key
        out = torch.matmul(attention_weights, value) # B, num_heads, n_query, head_dim

        out = out.transpose(1, 2).contiguous().view(B, -1, self.embed_dim) # B, n_query, embed_dim
        out = self.fc_out(out)
        return out

class TransformerEncoderBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, mlp_dim, dropout=0.1):
        super(TransformerEncoderBlock, self).__init__()
        self.attention = MultiHeadAttention(embed_dim, num_heads)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, mlp_dim),
            nn.ReLU(),
            nn.Linear(mlp_dim, embed_dim)
        )
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        attn_out = self.attention(x, x, x)
        x = x + self.dropout(attn_out)
        x = self.norm1(x)
        mlp_out = self.mlp(x)
        x = x + self.dropout(mlp_out)
        x = self.norm2(x)
        return x

class VisionTransformer(nn.Module):
    def __init__(self, image_size, patch_size, in_channels, num_classes, embed_dim=768, num_heads=12, mlp_dim=3072, num_layers=12, dropout=0.1):
        super(VisionTransformer, self).__init__()
        self.patch_embedding = PatchEmbedding(image_size, patch_size, in_channels, embed_dim)
        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
        self.pos_embedding = nn.Parameter(torch.randn(1, 1 + self.patch_embedding.n_patches, embed_dim))
        self.transformer_encoder_blocks = nn.ModuleList([
            TransformerEncoderBlock(embed_dim, num_heads, mlp_dim, dropout) for _ in range(num_layers)
        ])
        self.norm = nn.LayerNorm(embed_dim)
        self.fc = nn.Linear(embed_dim, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B = x.shape[0]
        x = self.patch_embedding(x)
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x = x + self.pos_embedding
        x = self.dropout(x)

        for transformer_encoder_block in self.transformer_encoder_blocks:
            x = transformer_encoder_block(x)

        x = self.norm(x[:, 0]) # take only the class token
        x = self.fc(x)
        return x

# Load CIFAR-10 dataset
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Initialize the model, loss function, and optimizer
model = VisionTransformer(image_size=32, patch_size=4, in_channels=3, num_classes=10)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * images.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

model.eval()  # Set the model to evaluation mode
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Calculate test accuracy
accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")

Files already downloaded and verified
Files already downloaded and verified
Epoch [1/10], Loss: 2.3650
Epoch [2/10], Loss: 2.3126
Epoch [3/10], Loss: 2.3066
Epoch [4/10], Loss: 2.3049
Epoch [5/10], Loss: 2.3045
Epoch [6/10], Loss: 2.3039
Epoch [7/10], Loss: 2.3039
Epoch [8/10], Loss: 2.3038
Epoch [9/10], Loss: 2.3038
Epoch [10/10], Loss: 2.3040
Test Accuracy: 0.1000


In [None]:
summary(model, (3, 32, 32))