# Exploration and Comparison of Transformers for Image Classification

## Installs

In [2]:
!pip install -r requirements.txt

Collecting torch (from -r requirements.txt (line 1))
  Downloading torch-2.4.1-cp311-cp311-manylinux1_x86_64.whl.metadata (26 kB)
Collecting torchvision (from -r requirements.txt (line 2))
  Downloading torchvision-0.19.1-cp311-cp311-manylinux1_x86_64.whl.metadata (6.0 kB)
Collecting transformers (from -r requirements.txt (line 3))
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 43.7/43.7 kB 21.8 MB/s eta 0:00:00
Collecting datasets (from -r requirements.txt (line 4))
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting filelock (from torch->-r requirements.txt (line 1))
  Downloading filelock-3.15.4-py3-none-any.whl.metadata (2.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->-r requirements.txt (line 1))
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1

## Imports

In [17]:
import torch
import torch.nn as nn

from transformers import ViTImageProcessorFast
from datasets import load_dataset

from src.custom_dataset import ImageClassificationDataset
from src.models import ViT
from src.train import linear_probe, evaluate


from utils.preprocessing import get_data
from utils.config import Config

## GPU

In [5]:
# Summarise the accelerators visible to this kernel: for every CUDA device,
# print its name followed by its total memory. The byte count is divided by
# 1024 ** 3, so the figure labelled "GB" is in binary gigabytes.
BYTES_PER_GIB = 1024 ** 3

if not torch.cuda.is_available():
    print("CUDA is not available.")
else:
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        device_props = torch.cuda.get_device_properties(i)
        vram = device_props.total_memory / BYTES_PER_GIB
        print(f"  VRAM: {vram:.2f} GB")

GPU 0: NVIDIA A100 80GB PCIe MIG 1g.10gb
  VRAM: 9.50 GB


## Experiments

In [6]:
# The experiments begin below.

In [9]:
# Fetch the three RESISC45 splits from the Hugging Face Hub in a single call;
# the result is unpacked in the same order the splits are requested.
RESISC45_SPLITS = ['train', 'validation', 'test']
train, val, test = load_dataset('timm/resisc45', split=RESISC45_SPLITS)

# Image preprocessing settings come from the DeiT-small (patch 16, 224 px)
# checkpoint on the Hub.
vit_processor = ViTImageProcessorFast.from_pretrained(
    'facebook/deit-small-patch16-224'
)

# `get_data` (utils.preprocessing) is given the dataset wrapper class, the
# processor and the three splits, and returns one loader per split.
# NOTE(review): batch size, shuffling and worker settings are decided inside
# `get_data` and are not visible here.
train_loader, val_loader, test_loader = get_data(
    ImageClassificationDataset,
    vit_processor,
    train,
    val,
    test,
)

Downloading readme:   0%|          | 0.00/3.11k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/85.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/85.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18900 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6300 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6300 [00:00<?, ? examples/s]

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

In [12]:
# Number of target classes for RESISC45. The load log below confirms that the
# checkpoint's 1000-way classifier is replaced by a newly initialised 45-way
# head, which is why the library warns that the model still has to be trained.
NUM_CLASSES = 45

model = ViT(num_classes=NUM_CLASSES)

config.json:   0%|          | 0.00/69.6k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/88.3M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at facebook/deit-small-patch16-224 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 384]) in the checkpoint and torch.Size([45, 384]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([45]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Build the project's run configuration (utils.config.Config) without any
# overrides; the same object is passed to linear_probe() and evaluate() below.
# NOTE(review): which settings Config carries (learning rate, epochs, device,
# seed, ...) is not visible from this notebook; see utils/config.py.
config = Config()

In [None]:
# Run the linear-probe stage on the model using the training and validation
# loaders and the shared config.
# NOTE(review): presumably only the newly initialised classifier head is
# trained while the pretrained backbone stays frozen; confirm in src/train.py.
# The call's return value is not stored; as the last expression of the cell it
# is only displayed.
linear_probe(model, train_loader, val_loader, config)

In [None]:
# Evaluate the model on the held-out test loader using the shared config.
# NOTE(review): the reported metrics are defined in src/train.py and are not
# visible here. The result is not stored, only displayed as the cell output.
evaluate(model, test_loader, config)

In [None]:
# First experiment