In [None]:
! pip install torch torchvision
! pip install opencv-contrib-python
! pip install scikit-learn
! pip install matplotlib
! pip install vit-pytorch
! pip install timm



# Reimplementation of Vision Transformer


In [None]:
import torch
import torch.nn as nn

## Image splitter using a convolutional layer

In [None]:

"""First step in vision transformer - image splitter"""
class PatchEmbed(nn.Module):
    """Cut an image into square patches and embed every patch as a vector.

    One ``nn.Conv2d`` whose kernel size and stride both equal ``patch_size``
    performs the splitting and the projection in a single step.

    Args:
        in_channels: number of channels of the input image.
        patch_size: side length of one square patch (also the conv stride).
        image_size: side length of the input image; only used to compute
            ``num_patches``.
        embed: length of the embedding produced for each patch.
    """

    def __init__(self, in_channels, patch_size, image_size, embed = 69):
        super().__init__()
        self.channels, self.patch_size, self.image_size = in_channels, patch_size, image_size
        patches_per_side = image_size // patch_size
        self.num_patches = patches_per_side * patches_per_side
        self.proj = nn.Conv2d(
            in_channels,
            embed,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        """Return the patch embeddings of ``x``.

        Dimension walk-through:
          * after the projection: (n_samples, embed, sqrt(num_patches), sqrt(num_patches))
          * after ``flatten(2)``:  (n_samples, embed, num_patches)
          * after ``transpose``:   (n_samples, num_patches, embed)
        """
        feature_map = self.proj(x)
        return feature_map.flatten(2).transpose(1, 2)

## Attention mechanism

In [None]:
class Attention(nn.Module):
    """Multi-head self-attention over a sequence of token embeddings.

    Args:
        emb_dim: length of every token embedding (input and output).
        n_heads: number of attention heads; must divide ``emb_dim`` evenly.
        qkv_bias: whether the query/key/value projections have a bias term.
        attn_p: dropout probability applied to the attention weights.
        proj_p: dropout probability applied after the output projection.

    Raises:
        ValueError: if ``n_heads`` is not positive or does not divide
            ``emb_dim``. Without this check the mismatch only surfaces as a
            reshape error inside ``forward``.
    """

    def __init__(self, emb_dim, n_heads=12, qkv_bias=True, attn_p=0, proj_p=0):
        super().__init__()
        if n_heads <= 0 or emb_dim % n_heads != 0:
            raise ValueError(
                f"emb_dim ({emb_dim}) must be divisible by n_heads ({n_heads})"
            )
        self.n_heads = n_heads
        self.emb_dim = emb_dim
        self.head_dim = emb_dim // n_heads
        # Scaling by 1/sqrt(head_dim) (as in the original transformer paper)
        # keeps the dot products from growing with the head size, which would
        # otherwise push the softmax into regions with very small gradients.
        self.scale = self.head_dim ** -0.5

        self.key = nn.Linear(emb_dim, emb_dim, bias=qkv_bias)
        self.query = nn.Linear(emb_dim, emb_dim, bias=qkv_bias)
        self.value = nn.Linear(emb_dim, emb_dim, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_p)
        self.proj = nn.Linear(emb_dim, emb_dim)
        self.proj_drop = nn.Dropout(proj_p)

        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Apply self-attention.

        Args:
            x: tensor of shape (n_samples, n_tokens, emb_dim).

        Returns:
            Tensor with the same shape as ``x``.

        Raises:
            ValueError: if the last dimension of ``x`` is not ``emb_dim``.
        """
        batch_size, n_tokens, emb_dim = x.shape
        if emb_dim != self.emb_dim:
            raise ValueError(f"emb_dim must be {self.emb_dim}, but is {emb_dim}")

        def split_heads(t):
            # (n_samples, n_tokens, emb_dim) -> (n_samples, n_heads, n_tokens, head_dim)
            return t.reshape(batch_size, n_tokens, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

        query = split_heads(self.query(x))
        key = split_heads(self.key(x))
        value = split_heads(self.value(x))

        # (n_samples, n_heads, n_tokens, n_tokens) similarity scores.
        scores = (query @ key.transpose(-2, -1)) * self.scale
        attention = self.attn_drop(self.softmax(scores))

        # Merge the heads back into one embedding per token.
        merged = (attention @ value).transpose(1, 2).reshape(batch_size, n_tokens, self.emb_dim)
        return self.proj_drop(self.proj(merged))

# MLP (feed-forward) module used inside each transformer block
class mlp(nn.Module):
    """Two-layer feed-forward network: Linear -> GELU -> Dropout -> Linear -> Dropout.

    Args:
        input: size of the last dimension of the incoming tensor.
        hidden_dim: width of the hidden layer.
        out: size of the last dimension of the result.
        p: dropout probability, shared by both dropout applications.
    """

    def __init__(self, input, hidden_dim, out, p=0):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input, out_features=hidden_dim)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=out)
        self.drop = nn.Dropout(p=p)

    def forward(self, x):
        """Run ``x`` through both layers; only the last dimension changes size."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))

class transformer_block(nn.Module):
    """One transformer encoder block: self-attention followed by an MLP.

    Each sub-layer is preceded by a LayerNorm and wrapped in a residual
    connection, so the output has the same shape as the input.

    Args:
        emb_dim: length of every token embedding.
        n_heads: number of attention heads.
        mlp_ratio: hidden width of the MLP as a multiple of ``emb_dim``.
        qkv_bias: whether the query/key/value projections have a bias term.
        p: dropout probability used after the attention output projection and
            inside the MLP.
        attn_p: dropout probability applied to the attention weights.
    """

    def __init__(self, emb_dim, n_heads, mlp_ratio=3.0, qkv_bias= True, p=0., attn_p=0):
        super().__init__()
        self.norm1 = nn.LayerNorm(emb_dim, eps=1e-6)
        self.attn = Attention(emb_dim, n_heads=n_heads, qkv_bias=qkv_bias, attn_p=attn_p, proj_p=p)
        hidden_dim = int(emb_dim * mlp_ratio)
        # Forward `p` to the MLP: previously it was dropped here, so the MLP's
        # dropout layer always ran with probability 0 regardless of `p`.
        self.mlp = mlp(emb_dim, hidden_dim, emb_dim, p=p)
        self.norm2 = nn.LayerNorm(emb_dim, eps=1e-6)

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual sub-layers."""
        # Attention sub-layer with residual connection.
        x = x + self.attn(self.norm1(x))
        # MLP sub-layer with residual connection.
        x = x + self.mlp(self.norm2(x))
        return x



class Vit_transformer(nn.Module):
    """Vision Transformer classifier.

    The image is split into patch embeddings, a learnable class token is
    prepended, learnable position embeddings are added, the sequence is run
    through ``depth`` transformer blocks, and the final class-token embedding
    is mapped to ``n_classes`` scores.

    Args:
        img_size: side length of the input image.
        patch_size: side length of one square patch.
        in_channels: number of channels of the input image.
        n_classes: number of output scores.
        embeded_dim: length of every token embedding.
        depth: number of transformer blocks.
        n_heads: number of attention heads per block.
        mlp_ratio: hidden width of each block's MLP as a multiple of ``embeded_dim``.
        qkv_bias: whether the query/key/value projections have a bias term.
        p: dropout probability passed to ``pos_drop`` and to every block.
        attn_p: attention-weight dropout probability passed to every block.
    """

    def __init__(
        self,
        img_size= 32,
        patch_size=16,
        in_channels = 3,
        n_classes = 10,
        embeded_dim = 768,
        depth=12,
        n_heads=12,
        mlp_ratio= 4.0,
        qkv_bias=True,
        p=0.,
        attn_p=0.,
    ):
        super().__init__()
        self.patch_embed = PatchEmbed(in_channels, patch_size, img_size, embeded_dim)

        # One extra sequence position is reserved for the class token.
        seq_len = 1 + self.patch_embed.num_patches
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embeded_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, seq_len, embeded_dim))
        self.pos_drop = nn.Dropout(p=p)

        block_kwargs = dict(mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, p=p, attn_p=attn_p)
        layers = []
        for _ in range(depth):
            layers.append(transformer_block(embeded_dim, n_heads, **block_kwargs))
        self.blocks = nn.ModuleList(layers)

        self.norm = nn.LayerNorm(embeded_dim, eps=1e-6)
        self.head = nn.Linear(embeded_dim, n_classes)

    def forward(self, x):
        """Return class scores of shape (n_samples, n_classes) for a batch of images."""
        tokens = self.patch_embed(x)
        # Broadcast the single learnable class token across the batch.
        cls = self.cls_token.expand(x.shape[0], -1, -1)
        tokens = torch.cat((cls, tokens), dim=1) + self.pos_embed
        tokens = self.pos_drop(tokens)
        for layer in self.blocks:
            tokens = layer(tokens)
        # Only the class token (position 0) feeds the classification head.
        return self.head(self.norm(tokens)[:, 0])

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
#from torchvision.datasets import CIFAR10
from torchvision.transforms import Compose, ToTensor, Resize
from torchvision import transforms
from torchvision import datasets
import os
import tarfile
from shutil import copyfile


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Side length (in pixels) of the square that every image is resized to by the
# dataset transforms below.
img_size = 32

In [None]:
# Formatting the initial data
def creating_dataset(
    images_path="/home/lananh/Desktop/vision_transformer/data/Stanford40/JPEGImages",
    labels_path="/home/lananh/Desktop/vision_transformer/data/Stanford40/ImageSplits",
    new_dataset_path="/home/lananh/Desktop/vision_transformer/StanfordActionDataset",
):
    """Copy images into a ``<split>/<class_name>/<image>`` folder layout.

    Every file in ``labels_path`` named ``<class_name>_train.txt`` or
    ``<class_name>_test.txt`` is read as a list of image file names (one per
    line). Each listed image is copied from ``images_path`` to
    ``new_dataset_path/<split>/<class_name>/``. Files that do not follow that
    naming pattern (for example ``actions.txt``, ``train.txt``, ``test.txt``)
    are ignored.

    Args:
        images_path: directory holding all source images.
        labels_path: directory holding the per-class split files.
        new_dataset_path: root of the folder layout to create.
    """
    splits = ('train', 'test')
    # exist_ok makes the function safe to re-run and also repairs a root that
    # exists without its train/test sub-folders.
    for split in splits:
        os.makedirs(os.path.join(new_dataset_path, split), exist_ok=True)

    for txt in sorted(os.listdir(labels_path)):
        stem, ext = os.path.splitext(txt)
        class_name, sep, train_or_test = stem.rpartition('_')
        if ext != '.txt' or not sep or not class_name or train_or_test not in splits:
            continue

        # Read inside a context manager so the handle is always closed, and
        # skip blank lines so the last entry is kept whether or not the file
        # ends with a newline.
        with open(os.path.join(labels_path, txt)) as split_file:
            image_names = [line.strip() for line in split_file if line.strip()]
        if not image_names:
            continue

        class_dir = os.path.join(new_dataset_path, train_or_test, class_name)
        os.makedirs(class_dir, exist_ok=True)
        for image_name in image_names:
            copyfile(os.path.join(images_path, image_name),
                     os.path.join(class_dir, image_name))

In [None]:
# Build the train/test class-folder layout on disk (this copies image files).
creating_dataset()

In [None]:
# Create the training and testing datasets.
# Both splits use the same preprocessing: resize to an img_size x img_size
# square, convert to a tensor, then normalise each of the three channels with
# mean 0.5 and standard deviation 0.5.
stan40_transform = transforms.Compose([
    transforms.Resize((img_size, img_size)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

stan40_train = datasets.ImageFolder(
    root='/home/lananh/Desktop/vision_transformer/StanfordActionDataset/train',
    transform=stan40_transform,
)

stan40_test = datasets.ImageFolder(
    root='/home/lananh/Desktop/vision_transformer/StanfordActionDataset/test',
    transform=stan40_transform,
)

# The number of classes found in the training split sizes the model's output.
num_classes_train = len(stan40_train.classes)
print(num_classes_train)




In [None]:
import os
import time
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# Set up the dataset and dataloader
train_loader = DataLoader(stan40_train, batch_size=64, shuffle=True, num_workers=4)
test_loader = DataLoader(stan40_test, batch_size=64, shuffle=False, num_workers=4)

# Initialize the model.
# img_size is taken from the notebook-level constant that the dataset
# transforms resize to, so the model's position embeddings always match the
# images it is fed (it used to be a separate hard-coded 32).
model = Vit_transformer(
    img_size=img_size, patch_size=16, in_channels=3, embeded_dim=480,
    depth=6, n_heads=12, mlp_ratio=4, qkv_bias=True, p=0.3, attn_p=0.3, n_classes=num_classes_train
)
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.02, momentum=0.95)

num_epochs = 100

# Training loop: one pass over the training set, then one evaluation pass over
# the test set, per epoch.
#
# Losses are accumulated weighted by batch size and divided by the number of
# samples. `criterion` returns the mean loss of a batch, so averaging those
# per-batch means directly would over-weight the (smaller) final batch.
for epoch in range(num_epochs):
    model.train()
    train_loss_sum = 0.0
    train_correct = 0
    total_train = 0

    for data, targets in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        data, targets = data.to(device), targets.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()

        n_batch = targets.size(0)
        train_loss_sum += loss.item() * n_batch
        predicted = output.argmax(dim=1)
        total_train += n_batch
        train_correct += predicted.eq(targets).sum().item()

    train_acc = 100. * train_correct / total_train
    train_loss = train_loss_sum / total_train

    # Evaluation on the test set (dropout disabled, no gradients tracked).
    model.eval()
    test_loss_sum = 0.0
    test_correct = 0
    total_test = 0

    with torch.no_grad():
        for data, targets in test_loader:
            data, targets = data.to(device), targets.to(device)
            output = model(data)
            loss = criterion(output, targets)

            n_batch = targets.size(0)
            test_loss_sum += loss.item() * n_batch
            predicted = output.argmax(dim=1)
            total_test += n_batch
            test_correct += predicted.eq(targets).sum().item()

    test_acc = 100. * test_correct / total_test
    test_loss = test_loss_sum / total_test

    print(f"Epoch: {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%")