# Exploration and Comparison of Transformers for Image Classification

## CLIP (Contrastive Language-Image Pre-training)


### Prerequisites

In [13]:
import os

# Step up one level from the notebook's directory (presumably to the repository
# root, so that the `src` and `utils` packages can be imported — confirm against
# the repo layout).
# A bare `os.chdir('..')` climbs one more directory every time this cell is
# re-executed in the same kernel. Recording the root on the first run and
# returning to it on later runs makes the cell idempotent.
if 'PROJECT_ROOT' not in globals():
    os.chdir('..')
    PROJECT_ROOT = os.getcwd()
else:
    os.chdir(PROJECT_ROOT)

import torch
import torch.nn as nn

from transformers import CLIPImageProcessor, CLIPTokenizer
from datasets import load_dataset, concatenate_datasets

from src.dataset_builder import ImageDataset
from src.models import CLIP
from src.train import train_model, evaluate_model

from utils.config import Config
from utils.train_utils import *
from utils.models_utils import *

### GPU

In [5]:
# Print every CUDA device PyTorch can see, together with its total memory
# (bytes divided by 1024**3).
if not torch.cuda.is_available():
    print("CUDA is not available.")
else:
    bytes_per_gib = 1024 ** 3
    for i in range(torch.cuda.device_count()):
        gpu_name = torch.cuda.get_device_name(i)
        print(f"GPU {i}: {gpu_name}")
        gpu_props = torch.cuda.get_device_properties(i)
        vram = gpu_props.total_memory / bytes_per_gib
        print(f"  VRAM: {vram:.2f} GB")

GPU 0: NVIDIA A100 80GB PCIe MIG 2g.20gb
  VRAM: 19.50 GB


### Data preparation

In [6]:
# Fetch the RESISC45 dataset from the Hugging Face Hub; the result is a
# mapping from split name to dataset.
resisc45 = load_dataset('timm/resisc45')

# Merge every available split into a single dataset, in the mapping's own
# split order.
all_splits = list(resisc45.values())
resisc45_combined = concatenate_datasets(all_splits)

README.md:   0%|          | 0.00/3.11k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/255M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/85.1M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/85.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18900 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6300 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6300 [00:00<?, ? examples/s]

In [7]:
# The image processor and the tokenizer are both loaded from the same
# pretrained checkpoint, so the identifier is written once.
clip_checkpoint = "openai/clip-vit-base-patch16"
processor = CLIPImageProcessor.from_pretrained(clip_checkpoint)
tokenizer = CLIPTokenizer.from_pretrained(clip_checkpoint)

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]



In [8]:
# Caption template passed to ImageDataset. Judging by the captions the
# dataset reports, `{}` is filled with each class name — confirm in
# src/dataset_builder.
caption_prompt = "satellite imagery of {}."

# Wrap the combined RESISC45 data together with the CLIP image processor and
# tokenizer, asking the wrapper to create captions from the template.
dataset = ImageDataset(
    dataset=resisc45_combined,
    processor=processor, tokenizer=tokenizer,
    create_captions=True, prompt=caption_prompt,
)

In [9]:
# Display the captions held by the dataset as the cell output, as a quick
# visual check of the prompt template.
dataset.get_captions()

['satellite imagery of airplane.',
 'satellite imagery of airport.',
 'satellite imagery of baseball_diamond.',
 'satellite imagery of basketball_court.',
 'satellite imagery of beach.',
 'satellite imagery of bridge.',
 'satellite imagery of chaparral.',
 'satellite imagery of church.',
 'satellite imagery of circular_farmland.',
 'satellite imagery of cloud.',
 'satellite imagery of commercial_area.',
 'satellite imagery of dense_residential.',
 'satellite imagery of desert.',
 'satellite imagery of forest.',
 'satellite imagery of freeway.',
 'satellite imagery of golf_course.',
 'satellite imagery of ground_track_field.',
 'satellite imagery of harbor.',
 'satellite imagery of industrial_area.',
 'satellite imagery of intersection.',
 'satellite imagery of island.',
 'satellite imagery of lake.',
 'satellite imagery of meadow.',
 'satellite imagery of medium_residential.',
 'satellite imagery of mobile_home_park.',
 'satellite imagery of mountain.',
 'satellite imagery of overpass.

### Model

In [10]:
# Build the project's Config and CLIP objects with their default arguments.
# NOTE(review): the recorded cell output shows config.json and
# pytorch_model.bin being downloaded, so one of these constructors presumably
# fetches pretrained weights — confirm in src/models.
config = Config()
model = CLIP()

config.json:   0%|          | 0.00/4.10k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/599M [00:00<?, ?B/s]

In [11]:
# Total number of elements across all model parameters, reported in millions.
n_params = sum(p.numel() for p in model.parameters())
print(f'Parameters: {n_params / 1e6:.2f}M')

Parameters: 149.62M


### Zero-shot transfer

In [12]:
# Evaluate the model on the full dataset with the zero-shot flag enabled.
# This was a long-running cell in the recorded run (about 7 minutes).
evaluate_model(model, dataset, config, zero_shot=True)

Zero-shot: 100%|██████████| 3938/3938 [07:08<00:00,  9.20it/s]

Zero-shot evaluation completed: loss: 1.2927 | acc: 0.6049



