In [None]:
import os
import glob
import random
import numpy as np
import cv2
import math
from PIL import Image

import timm

import torch
import torch.nn as nn
import torch.optim as optim

import torchvision.models as models
from torchvision import transforms
from torchvision.transforms import RandAugment, Compose
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm # for progress bar stuff
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive at /content/drive; the dataset folders referenced by
# Config live under /content/drive/MyDrive.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
class LabeledDataset(Dataset):
    """Image dataset whose class label is encoded in each file name.

    Every JPEG found under ``folder_path`` (searched recursively) becomes one
    sample. A sample's label is the index of the first entry of ``classes``
    that occurs as a substring of the image's base file name.
    """

    # Lower-case suffixes accepted as images; matching is case-insensitive so
    # ".JPG", ".JPEG", ".Jpg", ... are all picked up.
    IMAGE_EXTENSIONS = (".jpg", ".jpeg")

    def __init__(self, folder_path, classes, transform=None):
        """
        Args:
            folder_path (str): Path to the folder containing labeled images
            and subdirectories.

            classes (list): List of class keywords. The position of a keyword
            in this list is the integer label of the images that contain it.

            transform (callable, optional):
            Optional transform to apply to images.
        """
        self.folder_path = folder_path
        self.classes = classes
        self.transform = transform

        # Recursively collect all image paths from subdirectories. The result
        # is sorted because os.walk order depends on the filesystem; sorting
        # keeps the index -> sample mapping stable between runs.
        self.image_paths = sorted(
            os.path.join(root, name)
            for root, _, files in os.walk(folder_path)
            for name in files
            if name.lower().endswith(self.IMAGE_EXTENSIONS)
        )

    def __len__(self):
        """Return the number of images that were discovered."""
        return len(self.image_paths)

    def _label_for_path(self, img_path):
        """Return the index of the first class keyword found in the file name.

        Only the base name is inspected, so directory names never contribute
        to the label.

        Raises:
            ValueError: If no keyword from ``self.classes`` occurs in the name.
        """
        file_name = os.path.basename(img_path)
        for index, cls in enumerate(self.classes):
            if cls in file_name:
                return index
        raise ValueError(f"Class label not found in file name: {img_path}")

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, label)``.

        The image is converted to RGB and passed through ``self.transform``
        when one was supplied; the label is a scalar ``torch.long`` tensor.

        Raises:
            ValueError: If the file name contains none of the class keywords.
        """
        img_path = self.image_paths[idx]

        # Resolve the label first so an unlabeled file fails before any
        # decoding work is done.
        label = self._label_for_path(img_path)

        # The context manager closes the underlying file handle promptly;
        # convert() returns an independent, fully loaded image.
        with Image.open(img_path) as img:
            image = img.convert("RGB")

        # Apply transforms
        if self.transform is not None:
            image = self.transform(image)

        return image, torch.tensor(label, dtype=torch.long)


In [None]:
# Configuration
class Config:
    """Training hyper-parameters, dataset locations and class keywords."""

    # Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    BATCH_SIZE = 64
    EPOCHS = 7
    LR = 1e-4
    LABELED_TRAIN_FOLDER = '/content/drive/MyDrive/CMSC_472_Final_Project/extra_photos/extra_train'
    LABELED_TEST_FOLDER = '/content/drive/MyDrive/CMSC_472_Final_Project/extra_photos/extra_test'

    # Alternative class list, currently disabled:
    # CLASSES = [
    #     "art_sociology", "atlantic", "brendan_iribe", "esj", "farm",
    #     "mckeldinlib", "physics", "prince_frederick", "reckord_armory",
    #     "regents_drive", "yahentamitsi_dinning"
    # ]

    # Keywords searched for in image file names; a keyword's position in this
    # list is the integer label of the matching images.
    CLASSES = [
        "denton", "elkton", "ellicott", "hagerstown", "james_clark", "laplata",
        "manufacture", "oakland", "recreation"
    ]

    # Derived from CLASSES so the classifier head can never disagree with the
    # label range when the class list is edited.
    NUM_CLASSES = len(CLASSES)

# Reproducibility: seed the Python, NumPy and PyTorch RNGs before any randomly
# initialised weights are created and before the training loader shuffles.
# (This fixes the seeds only; it does not force deterministic GPU kernels.)
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Pretrained Model
vit_model = timm.create_model(
    'vit_base_patch16_224',  # Vision Transformer model
    pretrained=True,        # Load pretrained weights
    num_classes=Config.NUM_CLASSES  # Match your dataset's number of classes
).to(Config.DEVICE)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(vit_model.parameters(), lr=Config.LR)

# Preprocessing shared by the train and test splits: resize to the 224x224
# input size named in the model identifier, convert to a tensor, normalise.
# NOTE(review): these are the usual ImageNet mean/std values; timm ViT
# checkpoints may have been trained with different statistics -- confirm
# against `vit_model.pretrained_cfg`. Any change here must be mirrored wherever
# images are preprocessed for inference with the saved checkpoint.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

train_dataset = LabeledDataset(Config.LABELED_TRAIN_FOLDER, Config.CLASSES, transform=transform)
test_dataset = LabeledDataset(Config.LABELED_TEST_FOLDER, Config.CLASSES, transform=transform)

# Only the training split is shuffled; evaluation order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=Config.BATCH_SIZE, shuffle=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

In [None]:
# for name, module in vit_model.named_modules():
#     print(name)

In [None]:
# Training Loop with Progress Bar and Checkpoints
from tqdm import tqdm

def train_and_evaluate(model, train_loader, test_loader, criterion, optimizer, epochs, device, checkpoint_dir="./checkpoints"):
    """Train ``model`` and keep the checkpoint with the best test accuracy.

    For each epoch the model is trained on every batch of ``train_loader`` and
    then scored on ``test_loader`` with ``calculate_accuracy``. Whenever that
    score beats the best one seen so far, ``model.state_dict()`` is written to
    ``<checkpoint_dir>/best_vit.pth``, overwriting the previous best.

    NOTE(review): the checkpoint is selected on ``test_loader``, so the
    reported best accuracy is not an unbiased estimate -- consider a separate
    validation split.

    Args:
        model: Network to train; called as ``model(images)``.
        train_loader: Iterable of ``(images, labels)`` batches; must support
            ``len()``.
        test_loader: Iterable of ``(images, labels)`` batches used for scoring.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per training batch.
        epochs (int): Number of passes over ``train_loader``.
        device: Device that each batch is moved to.
        checkpoint_dir (str): Directory for the checkpoint; created if missing.

    Returns:
        None. Progress is reported through tqdm and ``print``.
    """
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_path = os.path.join(checkpoint_dir, "best_vit.pth")
    best_accuracy = 0.0

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        correct_train = 0
        total_train = 0

        loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
        for batches_seen, (images, labels) in enumerate(loop, start=1):
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()

            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            preds = outputs.argmax(dim=1)
            correct_train += (preds == labels).sum().item()
            total_train += labels.size(0)

            # Show the running mean over the batches processed so far.
            # Dividing by the total batch count would under-report the loss
            # until the very last batch of the epoch.
            loop.set_postfix(loss=train_loss / batches_seen)

        train_accuracy = correct_train / total_train
        print(f"Epoch {epoch+1}/{epochs}, Loss: {train_loss/len(train_loader):.4f}, Train Acc: {train_accuracy:.4f}")

        # Save checkpoint if performance improves
        test_accuracy = calculate_accuracy(model, test_loader, device)
        if test_accuracy > best_accuracy:
            best_accuracy = test_accuracy
            torch.save(model.state_dict(), checkpoint_path)
            print(f"Saved Best Model with Test Acc: {best_accuracy:.4f}")

    # This is the best score over all epochs (the one that was checkpointed),
    # which is not necessarily the score of the last epoch.
    print(f"Best Test Accuracy: {best_accuracy:.4f}")

# Function to Calculate Accuracy
def calculate_accuracy(model, data_loader, device):
    """Return the fraction of samples that ``model`` classifies correctly.

    The model is switched to evaluation mode (and left in that mode) and run
    under ``torch.no_grad()``. The predicted class of a sample is the index of
    the largest value along dimension 1 of the model output.

    Args:
        model: Network called as ``model(images)``.
        data_loader: Iterable of ``(images, labels)`` batches.
        device: Device that each batch is moved to before the forward pass.

    Returns:
        float: ``correct / total`` over all batches, or ``0.0`` when
        ``data_loader`` yields no samples (instead of dividing by zero).
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in data_loader:
            images, labels = images.to(device), labels.to(device)
            predicted = model(images).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        # Nothing was evaluated; report zero rather than crash.
        return 0.0
    return correct / total

# Train the Pretrained Model
# Run the training loop with the objects built in the setup cell; every
# argument is passed by name so the call reads without the signature at hand.
train_and_evaluate(
    model=vit_model,
    train_loader=train_loader,
    test_loader=test_loader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=Config.EPOCHS,
    device=Config.DEVICE,
)


Epoch 1/7: 100%|██████████| 7/7 [05:16<00:00, 45.23s/it, loss=2.31]


Epoch 1/7, Loss: 2.3071, Train Acc: 0.1949
Saved Best Model with Test Acc: 0.4035


Epoch 2/7: 100%|██████████| 7/7 [02:06<00:00, 18.00s/it, loss=1.24]


Epoch 2/7, Loss: 1.2423, Train Acc: 0.5777
Saved Best Model with Test Acc: 0.8596


Epoch 3/7: 100%|██████████| 7/7 [02:01<00:00, 17.41s/it, loss=0.333]


Epoch 3/7, Loss: 0.3325, Train Acc: 0.8933


Epoch 4/7: 100%|██████████| 7/7 [02:04<00:00, 17.74s/it, loss=0.135]


Epoch 4/7, Loss: 0.1352, Train Acc: 0.9374


Epoch 5/7: 100%|██████████| 7/7 [02:04<00:00, 17.82s/it, loss=0.0997]


Epoch 5/7, Loss: 0.0997, Train Acc: 0.9675
Saved Best Model with Test Acc: 0.8947


Epoch 6/7: 100%|██████████| 7/7 [02:05<00:00, 17.98s/it, loss=0.0424]


Epoch 6/7, Loss: 0.0424, Train Acc: 0.9861


Epoch 7/7: 100%|██████████| 7/7 [02:06<00:00, 18.05s/it, loss=0.0547]


Epoch 7/7, Loss: 0.0547, Train Acc: 0.9884
Final Test Accuracy: 0.8947


In [None]:
# print(vit_model)

In [None]:
!pip install grad-cam

Collecting grad-cam
  Downloading grad-cam-1.5.4.tar.gz (7.8 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/7.8 MB[0m [31m15.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m4.0/7.8 MB[0m [31m58.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.8/7.8 MB[0m [31m78.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting ttach (from grad-cam)
  Downloading ttach-0.0.3-py3-none-any.whl.metadata (5.2 kB)
Downloading ttach-0.0.3-py3-no

In [None]:
import cv2
import numpy as np
import torch
import timm
from pytorch_grad_cam import GradCAM, GradCAMPlusPlus, ScoreCAM, XGradCAM, EigenCAM
from pytorch_grad_cam.utils.image import show_cam_on_image, preprocess_image

# Custom Config class for settings
class Config:
    """Settings for the Grad-CAM visualisation cell.

    NOTE(review): this class reuses the name of the training ``Config``
    defined in an earlier cell and replaces it once this cell has run, so the
    training attributes (EPOCHS, CLASSES, ...) are no longer reachable through
    ``Config`` afterwards. Consider giving one of the two a distinct name.
    """

    # --- CAM options ---
    METHOD = 'gradcam'  # Options: gradcam, gradcam++, scorecam, xgradcam, eigencam
    EIGEN_SMOOTH = False
    AUG_SMOOTH = False

    # --- Files ---
    MODEL_PATH = './checkpoints/best_vit.pth'
    IMAGE_PATH = '/content/drive/MyDrive/CMSC_472_Final_Project/extra_photos/extra_test/recreation_test/recreation_building_jerry_2024-11-27_01-44-51_0_train_1.jpg'

    # --- Hardware: GPU when PyTorch can see one, CPU otherwise ---
    DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def reshape_transform(tensor, height=14, width=14):
    """Rearrange a token sequence into a channels-first 2-D feature map.

    The first token along dimension 1 is dropped (presumably the class token
    of the Vision Transformer -- confirm against the model), and the remaining
    ``height * width`` tokens are laid out on a ``height x width`` grid.

    Args:
        tensor: Activations indexed as ``(batch, tokens, channels)``.
        height (int): Number of grid rows.
        width (int): Number of grid columns.

    Returns:
        A view of shape ``(batch, channels, height, width)``.
    """
    batch_size = tensor.size(0)
    channels = tensor.size(2)

    # Discard token 0, then fold the token axis into (height, width).
    grid_tokens = tensor[:, 1:, :]
    feature_map = grid_tokens.reshape(batch_size, height, width, channels)

    # (batch, height, width, channels) -> (batch, channels, height, width)
    return feature_map.permute(0, 3, 1, 2)


# Rebuild the architecture and load the fine-tuned weights saved by training.
# num_classes must match the classifier head stored in the checkpoint.
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict needs, and avoids torch.load's FutureWarning.
vit_model = timm.create_model('vit_base_patch16_224', pretrained=False, num_classes=9)
vit_model.load_state_dict(
    torch.load(Config.MODEL_PATH, map_location=Config.DEVICE, weights_only=True)
)
vit_model = vit_model.to(Config.DEVICE).eval()

# Specify the target layer for Grad-CAM
target_layers = [vit_model.blocks[-1].norm1]

# Select Grad-CAM method
methods = {
    "gradcam": GradCAM,
    "gradcam++": GradCAMPlusPlus,
    "scorecam": ScoreCAM,
    "xgradcam": XGradCAM,
    "eigencam": EigenCAM,
}
cam_method = methods[Config.METHOD]

# Initialize Grad-CAM
cam = cam_method(
    model=vit_model,
    target_layers=target_layers,
    reshape_transform=reshape_transform,
)

# Read and preprocess the input image.
# cv2.imread returns None (it does not raise) when the file is missing or
# unreadable, so fail here with a clear message instead of a TypeError below.
bgr_img = cv2.imread(Config.IMAGE_PATH, cv2.IMREAD_COLOR)
if bgr_img is None:
    raise FileNotFoundError(f"Could not read image: {Config.IMAGE_PATH}")
rgb_img = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
rgb_img = cv2.resize(rgb_img, (224, 224))
rgb_img = np.float32(rgb_img) / 255
# Same normalisation statistics as the training transform.
input_tensor = preprocess_image(rgb_img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]).to(Config.DEVICE)

# Generate Grad-CAM visualization (targets=None lets the library pick the
# highest-scoring class for the image).
grayscale_cam = cam(input_tensor=input_tensor, targets=None, eigen_smooth=Config.EIGEN_SMOOTH, aug_smooth=Config.AUG_SMOOTH)
grayscale_cam = grayscale_cam[0, :]  # Process the first image in the batch

# Overlay the Grad-CAM heatmap on the input image.
# use_rgb=True makes the heat-map RGB like the photo it is blended with; the
# blend is then converted to BGR because cv2.imwrite expects BGR. Without
# this the saved photo has its red and blue channels swapped.
cam_image = show_cam_on_image(rgb_img, grayscale_cam, use_rgb=True)
output_path = f"{Config.METHOD}_cam.jpg"
if not cv2.imwrite(output_path, cv2.cvtColor(cam_image, cv2.COLOR_RGB2BGR)):
    raise IOError(f"Could not write Grad-CAM image to {output_path}")

print(f"Grad-CAM visualization saved to {output_path}")


  vit_model.load_state_dict(torch.load(Config.MODEL_PATH, map_location=Config.DEVICE))


Grad-CAM visualization saved to gradcam_cam.jpg
