<a href="https://colab.research.google.com/github/lee-hanhee/Machine-Learning/blob/main/VisionTransformers_Finetunning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install necessary libraries


In [None]:
!pip install torch torchvision timm xgboost scikit-learn pandas tqdm torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collectin

# Basic imports

In [None]:
import numpy as np
import xgboost as xgb
import sklearn.model_selection
import sklearn.metrics
import pandas as pd
import tqdm

import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import torchinfo
import timm

# Seed the NumPy and PyTorch global RNGs so that data shuffling and the random
# initialisation of new layers repeat from one "Restart & Run All" to the next.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 1. Data Loading and Preprocessing
# Fashion-MNIST dataset


In [None]:
# Preprocessing shared by the train and test splits: images are resized to
# 224x224 and expanded to three channels so they match the (1, 3, 224, 224)
# input the models below are summarised with, then converted to tensors and
# normalised per channel with mean 0.5 and std 0.5.
_preprocess_steps = [
    transforms.Resize((224, 224)),
    transforms.Grayscale(num_output_channels=3),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]
transform = transforms.Compose(_preprocess_steps)

# Settings common to both loaders.
_loader_options = dict(batch_size=64, num_workers=2)

# Training split: downloaded to ./data on first use and reshuffled on every pass.
trainset = torchvision.datasets.FashionMNIST(
    root='./data', train=True, download=True, transform=transform
)
trainloader = torch.utils.data.DataLoader(trainset, shuffle=True, **_loader_options)

# Test split: kept in dataset order (shuffle=False), which matters when
# embeddings and labels from separate passes have to line up.
testset = torchvision.datasets.FashionMNIST(
    root='./data', train=False, download=True, transform=transform
)
testloader = torch.utils.data.DataLoader(testset, shuffle=False, **_loader_options)

# Human-readable class names, listed in label-index order.
classes = [
    'T-shirt/top',
    'Trouser',
    'Pullover',
    'Dress',
    'Coat',
    'Sandal',
    'Shirt',
    'Sneaker',
    'Bag',
    'Ankle boot',
]

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 26.4M/26.4M [00:02<00:00, 12.7MB/s]


Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 29.5k/29.5k [00:00<00:00, 203kB/s]


Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 4.42M/4.42M [00:01<00:00, 3.80MB/s]


Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 5.15k/5.15k [00:00<00:00, 16.5MB/s]

Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw






# 2. Model Definitions
# ResNet18 (pretrained) - Finetuning and Embedding


In [None]:
# Fine-tuning copy: pretrained ResNet18 created with a 10-class output layer.
resnet18_ft = timm.create_model('resnet18', pretrained=True, num_classes=10).to(device)

# Only the final classifier parameters ('fc.weight' and 'fc.bias') stay
# trainable; every other parameter is frozen.
_resnet_trainable = ('fc.weight', 'fc.bias')
for param_name, parameter in resnet18_ft.named_parameters():
    parameter.requires_grad = param_name in _resnet_trainable

# Embedding copy: created with num_classes=0 and kept in evaluation mode.
resnet18_emb = timm.create_model('resnet18', pretrained=True, num_classes=0).to(device)
resnet18_emb.eval()
torchinfo.summary(resnet18_ft, input_size=(1, 3, 224, 224))
# Trainable parameters: 5130 = 512 * 10 weights + 10 biases (one per class),
# i.e. (5130 - 10) / 10 = 512 input features to the classifier.

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/46.8M [00:00<?, ?B/s]

Layer (type:depth-idx)                   Output Shape              Param #
ResNet                                   [1, 10]                   --
├─Conv2d: 1-1                            [1, 64, 112, 112]         (9,408)
├─BatchNorm2d: 1-2                       [1, 64, 112, 112]         (128)
├─ReLU: 1-3                              [1, 64, 112, 112]         --
├─MaxPool2d: 1-4                         [1, 64, 56, 56]           --
├─Sequential: 1-5                        [1, 64, 56, 56]           --
│    └─BasicBlock: 2-1                   [1, 64, 56, 56]           --
│    │    └─Conv2d: 3-1                  [1, 64, 56, 56]           (36,864)
│    │    └─BatchNorm2d: 3-2             [1, 64, 56, 56]           (128)
│    │    └─Identity: 3-3                [1, 64, 56, 56]           --
│    │    └─ReLU: 3-4                    [1, 64, 56, 56]           --
│    │    └─Identity: 3-5                [1, 64, 56, 56]           --
│    │    └─Conv2d: 3-6                  [1, 64, 56, 56]           (

# ViT-Tiny (pretrained) - Finetuning and Embedding

In [None]:
# Fine-tuning copy: pretrained ViT-Tiny (patch 16, 224x224 input) created with
# a 10-class output layer.
vit_ft = timm.create_model('vit_tiny_patch16_224', pretrained=True, num_classes=10)  # Finetuning
vit_ft = vit_ft.to(device)
# Freeze every parameter except those whose name contains 'head' (the classifier).
for name, param in vit_ft.named_parameters():
    param.requires_grad = 'head' in name

# Embedding copy: created with num_classes=0 and kept in evaluation mode.
vit_emb = timm.create_model('vit_tiny_patch16_224', pretrained=True, num_classes=0)  # Embedding
vit_emb = vit_emb.to(device)
vit_emb.eval()
torchinfo.summary(vit_ft, input_size=(1, 3, 224, 224))
# Trainable parameters: 1930 = 192 * 10 weights + 10 biases (one per class),
# i.e. (1930 - 10) / 10 = 192 input features to the classifier.

Layer (type:depth-idx)                   Output Shape              Param #
VisionTransformer                        [1, 10]                   38,016
├─PatchEmbed: 1-1                        [1, 196, 192]             --
│    └─Conv2d: 2-1                       [1, 192, 14, 14]          (147,648)
│    └─Identity: 2-2                     [1, 196, 192]             --
├─Dropout: 1-2                           [1, 197, 192]             --
├─Identity: 1-3                          [1, 197, 192]             --
├─Identity: 1-4                          [1, 197, 192]             --
├─Sequential: 1-5                        [1, 197, 192]             --
│    └─Block: 2-3                        [1, 197, 192]             --
│    │    └─LayerNorm: 3-1               [1, 197, 192]             (384)
│    │    └─Attention: 3-2               [1, 197, 192]             (148,224)
│    │    └─Identity: 3-3                [1, 197, 192]             --
│    │    └─Identity: 3-4                [1, 197, 192]          

# 3. Loss Function and Optimizers

In [None]:
# Cross-entropy loss, shared by both fine-tuning runs.
criterion = nn.CrossEntropyLoss()

# Each Adam optimizer (lr=0.001) receives only the parameters that still
# require gradients, i.e. the ones left unfrozen above.
optimizer_resnet_ft = optim.Adam(
    [p for p in resnet18_ft.parameters() if p.requires_grad],
    lr=0.001,
)
optimizer_vit_ft = optim.Adam(
    [p for p in vit_ft.parameters() if p.requires_grad],
    lr=0.001,
)

# 4. Training Loop (Finetuning)

In [None]:
def train(model, trainloader, optimizer, criterion, device, epochs=2, log_every=200):
    """Fine-tune ``model`` on ``trainloader`` and print the running mean loss.

    Args:
        model: Module to optimise. It is switched to training mode before the
            first epoch and left in that mode.
        trainloader: Iterable of batches where ``batch[0]`` holds the inputs
            and ``batch[1]`` the target labels.
        optimizer: Optimizer that is zeroed and stepped once per batch.
        criterion: Callable ``criterion(outputs, labels)`` returning the loss
            that is backpropagated.
        device: Device the inputs and labels are moved to.
        epochs: Number of full passes over ``trainloader``.
        log_every: Print the mean loss once every ``log_every`` batches.

    Raises:
        ValueError: If ``log_every`` is not positive.
    """
    if log_every <= 0:
        raise ValueError("log_every must be a positive integer")

    # evaluate() leaves a model in eval mode; without this call, re-running
    # training afterwards would silently train with eval-mode layers.
    model.train()

    for epoch in range(epochs):
        running_loss = 0.0
        for i, data in enumerate(tqdm.tqdm(trainloader, desc=f"Epoch {epoch+1}"), 0):
            inputs, labels = data[0].to(device), data[1].to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Accumulate the loss and report its mean over each full window.
            running_loss += loss.item()
            if i % log_every == log_every - 1:
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / log_every:.3f}')
                running_loss = 0.0
    print('Finished Training')

# Fine-tune each model for two epochs with its own optimizer, announcing each
# run before it starts.
for start_message, net, net_optimizer in (
    ("Starting ResNet18 Finetuning...", resnet18_ft, optimizer_resnet_ft),
    ("Starting ViT-Tiny Finetuning...", vit_ft, optimizer_vit_ft),
):
    print(start_message)
    train(net, trainloader, net_optimizer, criterion, device, epochs=2)

Starting ResNet18 Finetuning...


Epoch 1:  21%|██▏       | 201/938 [00:24<01:20,  9.20it/s]

[1,   200] loss: 0.522


Epoch 1:  43%|████▎     | 401/938 [00:50<00:58,  9.13it/s]

[1,   400] loss: 0.525


Epoch 1:  64%|██████▍   | 601/938 [01:14<00:36,  9.33it/s]

[1,   600] loss: 0.517


Epoch 1:  85%|████████▌ | 801/938 [01:40<00:15,  8.61it/s]

[1,   800] loss: 0.499


Epoch 1: 100%|██████████| 938/938 [01:57<00:00,  8.00it/s]
Epoch 2:  21%|██▏       | 201/938 [00:25<01:27,  8.47it/s]

[2,   200] loss: 0.476


Epoch 2:  43%|████▎     | 401/938 [00:51<01:07,  7.99it/s]

[2,   400] loss: 0.493


Epoch 2:  64%|██████▍   | 601/938 [01:16<00:37,  8.98it/s]

[2,   600] loss: 0.480


Epoch 2:  85%|████████▌ | 800/938 [01:41<00:17,  7.75it/s]

[2,   800] loss: 0.472


Epoch 2: 100%|██████████| 938/938 [01:59<00:00,  7.82it/s]


Finished Training
Starting ViT-Tiny Finetuning...


Epoch 1:  21%|██▏       | 201/938 [00:26<01:29,  8.22it/s]

[1,   200] loss: 0.600


Epoch 1:  43%|████▎     | 401/938 [00:52<01:10,  7.59it/s]

[1,   400] loss: 0.380


Epoch 1:  64%|██████▍   | 601/938 [01:18<00:40,  8.35it/s]

[1,   600] loss: 0.344


Epoch 1:  85%|████████▌ | 801/938 [01:44<00:15,  9.09it/s]

[1,   800] loss: 0.339


Epoch 1: 100%|██████████| 938/938 [02:02<00:00,  7.66it/s]
Epoch 2:  21%|██▏       | 200/938 [00:26<01:34,  7.82it/s]

[2,   200] loss: 0.319


Epoch 2:  43%|████▎     | 400/938 [00:53<01:33,  5.73it/s]

[2,   400] loss: 0.343


Epoch 2:  64%|██████▍   | 601/938 [01:20<00:49,  6.87it/s]

[2,   600] loss: 0.310


Epoch 2:  85%|████████▌ | 801/938 [01:46<00:18,  7.50it/s]

[2,   800] loss: 0.323


Epoch 2: 100%|██████████| 938/938 [02:04<00:00,  7.56it/s]


Finished Training


# 5. Evaluation (Finetuning)

In [None]:
def evaluate(model, testloader, device):
    """Return the top-1 accuracy of ``model`` on ``testloader`` as a percentage.

    The model is switched to evaluation mode (and left in it), and gradients
    are disabled for the whole pass. Each batch is indexed as ``batch[0]``
    (inputs) and ``batch[1]`` (labels).
    """
    model.eval()
    n_correct, n_seen = 0, 0
    with torch.no_grad():
        for batch in testloader:
            images = batch[0].to(device)
            targets = batch[1].to(device)
            # Index of the largest score per row is the predicted class.
            predicted = torch.max(model(images), dim=1).indices
            n_seen += targets.size(0)
            n_correct += (predicted == targets).sum().item()

    return 100 * n_correct / n_seen

# Start the results table with the test accuracy of each fine-tuned model.
# `results` stays a plain list so later cells can append more rows.
results = []
for backbone_name, finetuned_model in (('ResNet18', resnet18_ft), ('ViT-Tiny', vit_ft)):
    test_accuracy = evaluate(finetuned_model, testloader, device)
    results.append({
        'base model': backbone_name,
        'prediction': 'Finetuned - Last Layer Only',
        'accuracy': test_accuracy,
    })

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,base model,prediction,accuracy
0,ResNet18,Finetuned - Last Layer Only,82.51
1,ViT-Tiny,Finetuned - Last Layer Only,87.72


# 6. Embedding Extraction

In [None]:
def extract_embeddings(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and collect outputs and labels as NumPy arrays.

    The model is switched to evaluation mode first (as ``evaluate`` does), so
    the result does not depend on the mode the caller left it in. Gradients
    are disabled for the whole pass.

    Args:
        model: Module whose output for each batch is stored as the embedding.
        dataloader: Iterable yielding ``(images, labels)`` batches.
        device: Device the images are moved to before the forward pass.

    Returns:
        Tuple ``(embeddings, labels)`` with the per-batch arrays concatenated
        along the first axis, in the order the loader yielded them.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()
    embeddings = []
    labels = []
    with torch.no_grad():
        for images, label_batch in tqdm.tqdm(dataloader, desc="Extracting Embeddings"):
            images = images.to(device)
            output = model(images)  # Get embeddings
            # Move results back to host memory before converting to NumPy.
            embeddings.append(output.cpu().numpy())
            labels.append(label_batch.cpu().numpy())

    if not embeddings:
        raise ValueError("dataloader yielded no batches; nothing to concatenate")

    return np.concatenate(embeddings), np.concatenate(labels)

# trainloader reshuffles on every pass, so two separate extractions would
# return the training samples (and their labels) in two different orders.
# Iterate the same training data through an unshuffled loader instead, so the
# single train_labels array lines up with both the ResNet18 and the ViT-Tiny
# embedding matrices.
embed_trainloader = torch.utils.data.DataLoader(
    trainloader.dataset,
    batch_size=trainloader.batch_size,
    shuffle=False,
    num_workers=trainloader.num_workers,
)

print("Extracting ResNet18 embeddings...")
resnet_embeddings_train, train_labels = extract_embeddings(resnet18_emb, embed_trainloader, device)
print(resnet_embeddings_train.shape)
resnet_embeddings_test, test_labels = extract_embeddings(resnet18_emb, testloader, device)

print("Extracting ViT-Tiny embeddings...")
vit_embeddings_train, vit_train_labels = extract_embeddings(vit_emb, embed_trainloader, device)
print(vit_embeddings_train.shape)
vit_embeddings_test, vit_test_labels = extract_embeddings(vit_emb, testloader, device)

# Both models must have seen the samples in the same order; otherwise the
# shared label arrays would not match one of the embedding matrices.
assert np.array_equal(train_labels, vit_train_labels), "train label order differs between extractions"
assert np.array_equal(test_labels, vit_test_labels), "test label order differs between extractions"

Extracting ResNet18 embeddings...


Extracting Embeddings: 100%|██████████| 938/938 [01:59<00:00,  7.83it/s]
Extracting Embeddings: 100%|██████████| 157/157 [00:18<00:00,  8.47it/s]


Extracting ViT-Tiny embeddings...


Extracting Embeddings: 100%|██████████| 938/938 [01:55<00:00,  8.15it/s]
Extracting Embeddings: 100%|██████████| 157/157 [00:21<00:00,  7.45it/s]


# 7. Train XGBoost on Embeddings

In [None]:
def train_xgboost(embeddings_train, train_labels, embeddings_test, test_labels):
    """Fit an XGBoost classifier on the training embeddings and return its test accuracy in percent.

    The classifier is configured for 10 classes with 50 estimators and the
    'hist' tree method, and uses all available cores (``n_jobs=-1``).
    """
    classifier_options = {
        'objective': 'multi:softmax',
        'n_estimators': 50,
        'tree_method': 'hist',
        'n_jobs': -1,
        'num_class': 10,
        'eval_metric': 'mlogloss',
    }
    classifier = xgb.XGBClassifier(**classifier_options)
    classifier.fit(embeddings_train, train_labels)

    test_predictions = classifier.predict(embeddings_test)
    # accuracy_score returns a fraction; scale it to a percentage.
    return sklearn.metrics.accuracy_score(test_labels, test_predictions) * 100

# Add one XGBoost row per backbone to the results list, then rebuild and
# display the table.
for backbone_name, train_features, test_features in (
    ('ResNet18', resnet_embeddings_train, resnet_embeddings_test),
    ('ViT-Tiny', vit_embeddings_train, vit_embeddings_test),
):
    xgb_accuracy = train_xgboost(train_features, train_labels, test_features, test_labels)
    results.append({
        'base model': backbone_name,
        'prediction': 'XGBoost',
        'accuracy': xgb_accuracy,
    })

results_df = pd.DataFrame(results)
results_df