In [2]:
# Cell 1: Loading the Kaggle Dataset (1000 Folders) as Validation (10% Subset)

import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Subset
import numpy as np

# Define transforms (matching typical ImageNet pre-processing)
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

# Set your dataset path (each subfolder is one of 1000 classes)
dataset_path = 'imgnet'  # <-- update this to your dataset folder

# Load the dataset using ImageFolder
dataset = datasets.ImageFolder(root=dataset_path, transform=transform)

# Check the dataset structure
print("Total samples:", len(dataset))
print("Number of classes:", len(dataset.classes))  # Should be 1000

# Create a 10% subset of the dataset for inference
subset_size = int(0.1 * len(dataset))
np.random.seed(42)  # For reproducibility
indices = np.random.permutation(len(dataset))[:subset_size]
subset_dataset = Subset(dataset, indices)
print("Subset samples (10%):", len(subset_dataset))

# Create a DataLoader for inference (adjust batch_size as needed)
val_loader = DataLoader(subset_dataset, batch_size=64, shuffle=False, num_workers=4)


Total samples: 50000
Number of classes: 1000
Subset samples (10%): 5000


In [12]:
# Cell 2: Fetching the Models Trained on ImageNet-1K

import timm
import torch

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load ViT‑Base model (pre‑trained on ImageNet‑1K)
vit_base = timm.create_model('vit_base_patch16_224', pretrained=True)
vit_base = vit_base.to(device)
vit_base.eval()

# Load ResNet‑50 model (pre‑trained on ImageNet‑1K)
resnet50 = timm.create_model('resnet50', pretrained=True)
resnet50 = resnet50.to(device)
resnet50.eval()

# Load BEiT‑Base model (pre‑trained on ImageNet‑1K)
beit_base = timm.create_model('beitv2_large_patch16_224', pretrained=True)
beit_base = beit_base.to(device)
beit_base.eval()




print("Loaded models: ViT‑Base, ResNet‑50, and BEiT‑V2 (all trained on ImageNet‑1K)")


Using device: cuda
Loaded models: ViT‑Base, ResNet‑50, and BEiT‑V2 (all trained on ImageNet‑1K)


In [6]:
timm.list_models()

['aimv2_1b_patch14_224',
 'aimv2_1b_patch14_336',
 'aimv2_1b_patch14_448',
 'aimv2_3b_patch14_224',
 'aimv2_3b_patch14_336',
 'aimv2_3b_patch14_448',
 'aimv2_huge_patch14_224',
 'aimv2_huge_patch14_336',
 'aimv2_huge_patch14_448',
 'aimv2_large_patch14_224',
 'aimv2_large_patch14_336',
 'aimv2_large_patch14_448',
 'bat_resnext26ts',
 'beit_base_patch16_224',
 'beit_base_patch16_384',
 'beit_large_patch16_224',
 'beit_large_patch16_384',
 'beit_large_patch16_512',
 'beitv2_base_patch16_224',
 'beitv2_large_patch16_224',
 'botnet26t_256',
 'botnet50ts_256',
 'caformer_b36',
 'caformer_m36',
 'caformer_s18',
 'caformer_s36',
 'cait_m36_384',
 'cait_m48_448',
 'cait_s24_224',
 'cait_s24_384',
 'cait_s36_384',
 'cait_xs24_384',
 'cait_xxs24_224',
 'cait_xxs24_384',
 'cait_xxs36_224',
 'cait_xxs36_384',
 'coat_lite_medium',
 'coat_lite_medium_384',
 'coat_lite_mini',
 'coat_lite_small',
 'coat_lite_tiny',
 'coat_mini',
 'coat_small',
 'coat_tiny',
 'coatnet_0_224',
 'coatnet_0_rw_224',
 'coa

In [13]:
# Cell 3: Inference and Evaluation for All Models

import time
import torch

def evaluate_model(model, dataloader, device):
    top1_correct = 0
    top5_correct = 0
    total = 0
    
    start_time = time.time()
    with torch.no_grad():
        for images, labels in dataloader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            # Get top-5 predictions
            _, pred = outputs.topk(5, dim=1, largest=True, sorted=True)
            pred = pred.t()  # shape: [5, batch_size]
            # Compare predictions with ground truth
            correct = pred.eq(labels.view(1, -1).expand_as(pred))
            top1_correct += correct[:1].reshape(-1).float().sum().item()
            top5_correct += correct[:5].reshape(-1).float().sum().item()
            total += labels.size(0)
    end_time = time.time()
    
    inference_time = end_time - start_time
    top1_acc = 100 * top1_correct / total
    top5_acc = 100 * top5_correct / total
    
    return top1_acc, top5_acc, inference_time

# Evaluate each model on the validation DataLoader (val_loader)
vit_top1, vit_top5, vit_time = evaluate_model(vit_base, val_loader, device)
resnet_top1, resnet_top5, resnet_time = evaluate_model(resnet50, val_loader, device)
beit_top1, beit_top5, beit_time = evaluate_model(beit_base, val_loader, device)

print("ViT‑Base (ImageNet‑1K): Top-1: {:.2f}%, Top-5: {:.2f}%, Inference time: {:.2f} sec".format(vit_top1, vit_top5, vit_time))
print("ResNet‑50 (ImageNet‑1K): Top-1: {:.2f}%, Top-5: {:.2f}%, Inference time: {:.2f} sec".format(resnet_top1, resnet_top5, resnet_time))
print("BEiT‑Base (ImageNet‑1K): Top-1: {:.2f}%, Top-5: {:.2f}%, Inference time: {:.2f} sec".format(beit_top1, beit_top5, beit_time))


ViT‑Base (ImageNet‑1K): Top-1: 80.30%, Top-5: 95.60%, Inference time: 48.76 sec
ResNet‑50 (ImageNet‑1K): Top-1: 78.76%, Top-5: 94.18%, Inference time: 24.29 sec
BEiT‑Base (ImageNet‑1K): Top-1: 87.70%, Top-5: 98.42%, Inference time: 121.97 sec


In [None]:
from models.t2t_vit import *
from utils import load_for_transfer_learning 

# create model
model = t2t_vit_14()

# load the pretrained weights
load_for_transfer_learning(model,"", use_ema=True, strict=False, num_classes=1000)  # change num_classes based on dataset, can work for different image size as we interpolate the position embeding for different image size.