<a href="https://colab.research.google.com/github/kekubhai/VIT-01/blob/main/VIt_model_onCFIR_10_Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data import DataLoader
from torchvision import datasets,transforms
import numpy as np
import random
import matplotlib.pyplot as plt


In [3]:
# Device-agnostic setup: use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device='cuda'
else:
    device="cpu"
print(device)

cpu


In [4]:
# Show the installed torchvision version so the run environment is recorded with the outputs.
torchvision.__version__

'0.21.0+cu124'

In [5]:
# Direct CUDA availability check (the same condition the device selection cell uses).
torch.cuda.is_available()

False

In [6]:
#Set the seed
# Seed every RNG the notebook imports (torch, Python's `random`, NumPy) from one
# constant so that a fresh "Restart & Run All" reproduces the same random draws.
SEED=42
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

In [16]:
# Hyperparameters, grouped by the part of the pipeline they configure.

# --- data / input geometry ---
IMAGE_SIZE = 32
CHANNELS = 3
NUM_CLASSES = 10
BATCH_SIZE = 128      # passed as `batch_size` to both DataLoaders

# --- model architecture ---
PATCH_SIZE = 4
EMBED_DIM = 256
NUM_HEADS = 8
DEPTH = 6
MLP_DIM = 512
DROP_RATE = 0.1

# --- optimisation ---
EPOCHS = 10
LEARNING_RATE = 3e-4


Defining the image transforms


In [8]:
# Preprocessing applied to every image: convert to a tensor, then normalise per channel.
transform=transforms.Compose([
    transforms.ToTensor(),
    # One mean/std entry per RGB channel. The previous `(0.5)` / `(0.6)` were plain
    # floats (parentheses alone do not make a tuple) and relied on scalar
    # broadcasting; the numeric values are unchanged here.
    # NOTE(review): std=0.6 is unusual next to mean=0.5 — confirm it is intentional
    # and not a typo for 0.5.
    transforms.Normalize(mean=(0.5,0.5,0.5),std=(0.6,0.6,0.6)),
])

In [9]:
# Build the CIFAR-10 train and test splits (train first, then test). Both are stored
# under ./data, downloaded when missing, and share the `transform` pipeline.
train_dataset,test_dataset=(
    datasets.CIFAR10(root='./data',train=is_train,download=True,transform=transform)
    for is_train in (True,False)
)


100%|██████████| 170M/170M [00:02<00:00, 76.0MB/s]


In [10]:
# Display the training dataset's summary repr.
train_dataset

Dataset CIFAR10
    Number of datapoints: 50000
    Root location: ./data
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=0.5, std=0.6)
           )

In [11]:
# Number of samples in the training split.
len(train_dataset)

50000

In [12]:
## converting datasets into data loaders
# The training loader reshuffles every epoch so batches are not presented in the
# same fixed order each time; the test loader keeps a fixed order for evaluation.
train_loader=DataLoader(dataset=train_dataset,
                        batch_size=BATCH_SIZE,
                        shuffle=True)
test_loader=DataLoader(dataset=test_dataset,
                       batch_size=BATCH_SIZE,
                       shuffle=False)


In [13]:
# Sanity check: show both loader objects and how many batches the training loader yields.
print(
    f"DataLoader: {train_loader, test_loader}",
    f"Length of train loader : {len(train_loader) } batch",
    sep="\n",
)

DataLoader: (<torch.utils.data.dataloader.DataLoader object at 0x793c00a62a90>, <torch.utils.data.dataloader.DataLoader object at 0x793c00a63a10>)
Length of train loader : 391 batch


In [18]:
#building the vision transformer from scratch
# Echo the patch size set in the hyperparameter cell before it is used below.
PATCH_SIZE


4

In [23]:
class PatchEmbedding(nn.Module):
    """Turn a batch of images into a sequence of patch tokens.

    A convolution whose kernel size and stride both equal ``patch_size`` projects
    each non-overlapping patch to an ``embed_dim``-sized vector. A learnable class
    token is prepended to the sequence and a learnable position embedding is added.

    Args:
        img_size: Side length of the (square) input images.
        patch_size: Side length of each square patch.
        in_channels: Number of channels in the input images.
        embed_dim: Size of every token embedding.
    """

    def __init__(self,
                 img_size,
                 patch_size,
                 in_channels,
                 embed_dim):
        super().__init__()
        self.patch_size=patch_size
        # kernel_size == stride, so each output position covers exactly one patch.
        self.proj=nn.Conv2d(in_channels=in_channels,
                            out_channels=embed_dim,
                            kernel_size=patch_size,
                            stride=patch_size)
        num_patches=(img_size//patch_size)**2
        self.cls_token=nn.Parameter(torch.randn(1,1,embed_dim))
        # One position vector per patch plus one for the class token.
        self.pos_embedding=nn.Parameter(torch.randn(1,1+num_patches,embed_dim))

    def forward(self,x:torch.Tensor):
        """Embed ``x`` of shape (B, in_channels, img_size, img_size).

        Returns:
            Tensor of shape (B, 1 + num_patches, embed_dim); index 0 along the
            sequence dimension is the class token.
        """
        B=x.size(0)
        x=self.proj(x)        # (B, embed_dim, H/patch, W/patch)
        x=x.flatten(2)        # (B, embed_dim, num_patches)
        x=x.transpose(1,2)    # (B, num_patches, embed_dim)
        # Broadcast the single learned class token across the batch (a view, no copy).
        cls_token=self.cls_token.expand(B,-1,-1)
        x=torch.cat((cls_token,x),dim=1)
        x=x+self.pos_embedding
        return x

In [25]:
class TransformerEncoderLayer(nn.Module):
    """Pre-norm encoder block: self-attention followed by an MLP, each with a residual add.

    Args:
        embed_dim: Size of each token embedding (LayerNorm and attention width).
        num_heads: Number of attention heads.
        mlp_dim: Passed to ``MLP`` as its second argument.
        drop_rate: Dropout probability for the attention module; also passed to ``MLP``.
    """

    def __init__(self,embed_dim, num_heads,mlp_dim,drop_rate):
        super().__init__()
        self.norm1=nn.LayerNorm(embed_dim)
        self.attn=nn.MultiheadAttention(embed_dim,num_heads,dropout=drop_rate,batch_first=True)
        self.norm2=nn.LayerNorm(embed_dim)
        # NOTE: `MLP` is not defined in this part of the notebook; it has to exist
        # in the kernel before this layer is instantiated.
        self.mlp=MLP(embed_dim,mlp_dim,drop_rate)

    def forward(self,x):
        """Apply the block to ``x`` and return a tensor of the same shape.

        ``x`` is presumably (batch, tokens, embed_dim), since the attention module
        is built with ``batch_first=True`` — confirm against the caller.
        """
        # Normalise once and reuse the result as query, key and value.
        normed=self.norm1(x)
        # Only the attended values are used, so skip computing the attention weights.
        attn_out,_=self.attn(normed,normed,normed,need_weights=False)
        x=x+attn_out
        x=x+self.mlp(self.norm2(x))
        return x
