# Exploration and Comparison of Transformers for Image Classification

## ViT (Vision Transformer)


### Prerequisites

In [2]:
import os
# Work from the repository root, presumably so the `src` and `utils` packages
# imported below resolve. The guard keeps this cell idempotent: an unconditional
# `os.chdir('..')` climbs one more directory on every re-run of the cell, after
# which the project imports no longer resolve.
if not os.path.isdir('src'):
    os.chdir('..')

import torch
import torch.nn as nn

from transformers import ViTImageProcessor
from datasets import load_dataset, concatenate_datasets

from src.dataset_builder import ImageDataset
from src.models import Backbone
from src.train import train_model, evaluate_model

from utils.config import Config
from utils.train_utils import *
from utils.models_utils import *

### GPU

In [3]:
# List every CUDA device visible to this process together with its total
# memory, or say so when no CUDA device can be used.
if not torch.cuda.is_available():
    print("CUDA is not available.")
else:
    bytes_per_gib = 1024 ** 3
    for device_index in range(torch.cuda.device_count()):
        device_name = torch.cuda.get_device_name(device_index)
        print(f"GPU {device_index}: {device_name}")
        total_gib = torch.cuda.get_device_properties(device_index).total_memory / bytes_per_gib
        print(f"  VRAM: {total_gib:.2f} GB")

GPU 0: NVIDIA A100 80GB PCIe MIG 1g.10gb
  VRAM: 9.50 GB


### Data preparation

In [5]:
# Load the train / validation / test splits of 'timm/resisc45' in one call.
# The three results are unpacked in the same order as the names in `split`.
train, val, test = load_dataset(
    'timm/resisc45',
    split=['train', 'validation', 'test'],
)

In [6]:
# Image processor loaded from the checkpoint registered under the 'ViT' key.
# NOTE(review): in the recorded run the same key resolved to
# facebook/deit-small-patch16-224 (see the checkpoint warning in the model
# cell's output), i.e. a DeiT checkpoint — confirm this is the intended model.
processor = ViTImageProcessor.from_pretrained(model_names['ViT'])

In [7]:
# Wrap each raw split in the project's ImageDataset. All three wrappers are
# given the same processor object; construction order is train, val, test.
train_split, val_split, test_split = (
    ImageDataset(dataset=raw_split, processor=processor)
    for raw_split in (train, val, test)
)

In [11]:
# Number of target classes, taken from the training split; passed to the model
# constructor as the size of its classification head (45 in the recorded run).
num_classes = train_split.get_num_classes()

### Model

In [15]:
# Seed torch's global RNG (CPU and CUDA generators) before anything stochastic
# runs. The classifier head is newly initialised (see the checkpoint warning
# printed by this cell), so without a seed the metrics recorded below cannot be
# reproduced. The value 42 is arbitrary. Seeding comes first so that any seeding
# done inside project code still takes precedence.
# NOTE(review): this covers only draws from torch's global RNG; some GPU
# kernels may remain nondeterministic.
torch.manual_seed(42)

# Run settings handed to train_model / evaluate_model in the cells below.
config = Config()
# Project wrapper around the checkpoint registered under 'ViT', instantiated
# with a `num_classes`-way classification head.
model = Backbone(model_name=model_names['ViT'], num_classes=num_classes)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at facebook/deit-small-patch16-224 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 384]) in the checkpoint and torch.Size([45, 384]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([45]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Print the model's parameter count. `get_model_params` is not imported by
# name, so it comes from one of the `utils` star imports at the top.
get_model_params(model)

Parameters: 21.68M


### Linear probing

In [17]:
# Train on the training split and validate on the validation split, logging
# loss and accuracy per epoch.
# NOTE(review): `fine_tune=False` presumably freezes the backbone so that only
# the classification head is trained (this is the "Linear probing" section) —
# confirm against src.train.train_model.
# NOTE(review): the return value is not captured and the same `model` object is
# evaluated afterwards, so training appears to update `model` in place;
# re-running this cell would then continue training rather than start over.
train_model(
    model,
    train_split,
    val_split,
    config,
    architecture='vit',
    fine_tune=False,
)

Train: 100%|██████████| 2363/2363 [02:36<00:00, 15.06it/s]
Val: 100%|██████████| 788/788 [00:52<00:00, 15.04it/s]


Epochs: 1/2 | train_loss: 2.9163 | train_acc: 0.4157 | val_loss: 2.1953 | val_acc: 0.6373


Train: 100%|██████████| 2363/2363 [02:34<00:00, 15.28it/s]
Val: 100%|██████████| 788/788 [00:52<00:00, 15.04it/s]

Epochs: 2/2 | train_loss: 1.8213 | train_acc: 0.6792 | val_loss: 1.5356 | val_acc: 0.7044





### Evaluation

In [18]:
# Report loss and accuracy on the test split, using `model` in whatever state
# the training cell left it.
evaluate_model(
    model,
    test_split,
    config,
)

Test: 100%|██████████| 788/788 [00:52<00:00, 15.07it/s]

test_loss: 1.5717 | test_acc: 0.6930



