# Three-Stage Progressive Training for Bone Fracture Classification

## Overview
This notebook implements a comprehensive three-stage training approach:

### Stage 1: Linear Probing
- Extract features from frozen DINOv2 backbone
- Train classification heads of varying depths
- Select best performing architecture

### Stage 2: Partial Fine-Tuning
- Freeze trained classifier
- Selectively train backbone layers (LoRA, top/bottom/middle layers)
- Apply aggressive data augmentation to reduce overfitting

### Stage 3: Full Fine-Tuning
- Unfreeze all layers
- Train entire model end-to-end with very low learning rate
- Final optimization for best performance

**Dataset**: Bone Break Classification (X-ray Images)  
**Approach**: Progressive unfreezing with balanced class distribution

In [3]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset
import torchvision.transforms as transforms
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Device: {torch.cuda.get_device_name(0)}")

PyTorch version: 2.6.0+cu124
CUDA available: True
CUDA version: 12.4
Device: Tesla T4


## 1. Data Loading and Preprocessing

Load all images from both Train and Test directories, merge them, and create a stratified 60/20/20 split.

In [4]:
# Dataset paths
DATASET_ROOT = "/kaggle/input/bone-break-classification-image-dataset/Bone Break Classification/Bone Break Classification"

# Collect all image paths and labels
all_image_paths = []
all_labels = []
class_names = sorted(os.listdir(DATASET_ROOT))

print("Loading dataset from all directories...")
for class_name in class_names:
    class_path = os.path.join(DATASET_ROOT, class_name)
    
    # Collect from Train directory
    train_path = os.path.join(class_path, "Train")
    if os.path.exists(train_path):
        for img_name in os.listdir(train_path):
            img_path = os.path.join(train_path, img_name)
            if img_path.lower().endswith(('.png', '.jpg', '.jpeg')):
                all_image_paths.append(img_path)
                all_labels.append(class_name)
    
    # Collect from Test directory
    test_path = os.path.join(class_path, "Test")
    if os.path.exists(test_path):
        for img_name in os.listdir(test_path):
            img_path = os.path.join(test_path, img_name)
            if img_path.lower().endswith(('.png', '.jpg', '.jpeg')):
                all_image_paths.append(img_path)
                all_labels.append(class_name)

print(f"\nTotal images collected: {len(all_image_paths)}")
print(f"Number of classes: {len(class_names)}")
print(f"\nClass names: {class_names}")

# Show class distribution
class_counts = Counter(all_labels)
print("\nClass distribution:")
for class_name in class_names:
    print(f"  {class_name}: {class_counts[class_name]} images")

Loading dataset from all directories...

Total images collected: 1129
Number of classes: 10

Class names: ['Avulsion fracture', 'Comminuted fracture', 'Fracture Dislocation', 'Greenstick fracture', 'Hairline Fracture', 'Impacted fracture', 'Longitudinal fracture', 'Oblique fracture', 'Pathological fracture', 'Spiral Fracture']

Class distribution:
  Avulsion fracture: 123 images
  Comminuted fracture: 148 images
  Fracture Dislocation: 156 images
  Greenstick fracture: 122 images
  Hairline Fracture: 111 images
  Impacted fracture: 84 images
  Longitudinal fracture: 80 images
  Oblique fracture: 85 images
  Pathological fracture: 134 images
  Spiral Fracture: 86 images


In [7]:
# Create label encoding
label_to_idx = {label: idx for idx, label in enumerate(class_names)}
idx_to_label = {idx: label for label, idx in label_to_idx.items()}
numeric_labels = [label_to_idx[label] for label in all_labels]

# Perform stratified 60/20/20 split
# First split: 60% train, 40% temp
train_paths, temp_paths, train_labels, temp_labels = train_test_split(
    all_image_paths, numeric_labels, 
    test_size=0.4, 
    stratify=numeric_labels, 
    random_state=42
)

# Second split: 40% temp -> 20% val, 20% test
val_paths, test_paths, val_labels, test_labels = train_test_split(
    temp_paths, temp_labels,
    test_size=0.5,
    stratify=temp_labels,
    random_state=42
)

print("\n" + "="*60)
print("STRATIFIED DATA SPLIT (60/20/20)")
print("="*60)
print(f"\nTrain set: {len(train_paths)} images ({len(train_paths)/len(all_image_paths)*100:.1f}%)")
print(f"Val set:   {len(val_paths)} images ({len(val_paths)/len(all_image_paths)*100:.1f}%)")
print(f"Test set:  {len(test_paths)} images ({len(test_paths)/len(all_image_paths)*100:.1f}%)")

# Verify class balance
print("\nClass distribution per split:")
for split_name, split_labels in [("Train", train_labels), ("Val", val_labels), ("Test", test_labels)]:
    counts = Counter(split_labels)
    print(f"\n{split_name}:")
    for idx in sorted(counts.keys()):
        class_name = idx_to_label[idx]
        print(f"  {class_name}: {counts[idx]} images")


STRATIFIED DATA SPLIT (60/20/20)

Train set: 677 images (60.0%)
Val set:   226 images (20.0%)
Test set:  226 images (20.0%)

Class distribution per split:

Train:
  Avulsion fracture: 74 images
  Comminuted fracture: 89 images
  Fracture Dislocation: 93 images
  Greenstick fracture: 73 images
  Hairline Fracture: 67 images
  Impacted fracture: 50 images
  Longitudinal fracture: 48 images
  Oblique fracture: 51 images
  Pathological fracture: 80 images
  Spiral Fracture: 52 images

Val:
  Avulsion fracture: 25 images
  Comminuted fracture: 30 images
  Fracture Dislocation: 31 images
  Greenstick fracture: 24 images
  Hairline Fracture: 22 images
  Impacted fracture: 17 images
  Longitudinal fracture: 16 images
  Oblique fracture: 17 images
  Pathological fracture: 27 images
  Spiral Fracture: 17 images

Test:
  Avulsion fracture: 24 images
  Comminuted fracture: 29 images
  Fracture Dislocation: 32 images
  Greenstick fracture: 25 images
  Hairline Fracture: 22 images
  Impacted fractu

## 2. Dataset and DataLoader Setup

Create custom dataset class with configurable augmentation for different training stages.

In [5]:
class BoneFractureDataset(Dataset):
    """Custom dataset for bone fracture classification."""
    
    def __init__(self, image_paths, labels, transform=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform
    
    def __len__(self):
        return len(self.image_paths)
    
    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        label = self.labels[idx]
        
        # Load image
        image = Image.open(img_path).convert('RGB')
        
        # Apply transforms
        if self.transform:
            image = self.transform(image)
        
        return image, label


# Define transforms
# Stage 1 & 3: Basic augmentation
basic_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(p=0.3),
    transforms.RandomRotation(degrees=5),
    transforms.ColorJitter(brightness=0.1, contrast=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Stage 2: Heavy augmentation
heavy_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=10),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1), scale=(0.8, 1.2)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.1),
    transforms.RandomApply([transforms.GaussianBlur(kernel_size=3)], p=0.3),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Validation/Test: No augmentation
eval_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

print("Transforms defined:")
print("  - basic_transform: For Stage 1 & 3 (light augmentation)")
print("  - heavy_transform: For Stage 2 (aggressive augmentation)")
print("  - eval_transform: For validation/test (no augmentation)")

Transforms defined:
  - basic_transform: For Stage 1 & 3 (light augmentation)
  - heavy_transform: For Stage 2 (aggressive augmentation)
  - eval_transform: For validation/test (no augmentation)


In [8]:
# Create datasets (will create dataloaders later per stage)
train_dataset = BoneFractureDataset(train_paths, train_labels, transform=basic_transform)
val_dataset = BoneFractureDataset(val_paths, val_labels, transform=eval_transform)
test_dataset = BoneFractureDataset(test_paths, test_labels, transform=eval_transform)

print(f"\nDatasets created:")
print(f"  Train: {len(train_dataset)} samples")
print(f"  Val:   {len(val_dataset)} samples")
print(f"  Test:  {len(test_dataset)} samples")

# Create dataloaders (without num_workers to avoid multiprocessing issues)
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)

print(f"\nDataLoaders created with batch_size={batch_size}")
print(f"  Train batches: {len(train_loader)}")
print(f"  Val batches:   {len(val_loader)}")
print(f"  Test batches:  {len(test_loader)}")


Datasets created:
  Train: 677 samples
  Val:   226 samples
  Test:  226 samples

DataLoaders created with batch_size=64
  Train batches: 11
  Val batches:   4
  Test batches:  4


## 3. Load Pretrained Backbones

Load CLIP-B (Base) and DINOv2-Small models as feature extractors.

In [9]:
# Load DINOv2 Small model
print("Loading DINOv2 Small model...")
dinov2_small = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14')
dinov2_small.eval()
dinov2_small = dinov2_small.cuda()

# Get feature dimension
with torch.no_grad():
    dummy_input = torch.randn(1, 3, 224, 224).cuda()
    dinov2_features = dinov2_small(dummy_input)
    dinov2_dim = dinov2_features.shape[1]

print(f"✓ DINOv2 Small loaded - Feature dimension: {dinov2_dim}")
print(f"  Parameters: {sum(p.numel() for p in dinov2_small.parameters()):,}")

Loading DINOv2 Small model...


Downloading: "https://github.com/facebookresearch/dinov2/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_pretrain.pth" to /root/.cache/torch/hub/checkpoints/dinov2_vits14_pretrain.pth
Downloading: "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_pretrain.pth" to /root/.cache/torch/hub/checkpoints/dinov2_vits14_pretrain.pth
100%|██████████| 84.2M/84.2M [00:00<00:00, 218MB/s] 
100%|██████████| 84.2M/84.2M [00:00<00:00, 218MB/s]


✓ DINOv2 Small loaded - Feature dimension: 384
  Parameters: 22,056,576


In [None]:
# Install open_clip_torch for CLIP model
%pip install -q open_clip_torch
print("✓ open_clip_torch installed")

In [16]:
# Load CLIP Base model using open_clip
print("Loading CLIP Base model...")
import open_clip

clip_model, _, clip_preprocess = open_clip.create_model_and_transforms('ViT-B-16', pretrained='openai')
clip_model = clip_model.cuda()
clip_model.eval()

# Get CLIP visual encoder and feature dimension
clip_visual = clip_model.visual
with torch.no_grad():
    dummy_input = torch.randn(1, 3, 224, 224).cuda()
    clip_features = clip_visual(dummy_input)
    clip_dim = clip_features.shape[1]

print(f"✓ CLIP Base loaded - Feature dimension: {clip_dim}")
print(f"  Parameters: {sum(p.numel() for p in clip_visual.parameters()):,}")

Loading CLIP Base model...


open_clip_model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

✓ CLIP Base loaded - Feature dimension: 512
  Parameters: 86,192,640


## 4. Define Multiple Classifier Architectures

Create various classifier heads with different depths and configurations to test during Stage 1.

In [11]:
class Classifier1Layer(nn.Module):
    """Simple 1-layer linear classifier"""
    def __init__(self, input_dim, num_classes=10):
        super().__init__()
        self.fc = nn.Linear(input_dim, num_classes)
    
    def forward(self, x):
        return self.fc(x)


class Classifier2Layer(nn.Module):
    """2-layer MLP with BatchNorm and Dropout"""
    def __init__(self, input_dim, num_classes=10, hidden_dim=512, dropout=0.3):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_dim, num_classes)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x


class Classifier3Layer(nn.Module):
    """3-layer deep MLP with BatchNorm and Dropout"""
    def __init__(self, input_dim, num_classes=10, hidden_dim1=512, hidden_dim2=256, dropout=0.3):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.bn1 = nn.BatchNorm1d(hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.bn2 = nn.BatchNorm1d(hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc3(x)
        return x


class Classifier4Layer(nn.Module):
    """4-layer very deep MLP with BatchNorm and Dropout"""
    def __init__(self, input_dim, num_classes=10, hidden_dims=[512, 256, 128], dropout=0.3):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dims[0])
        self.bn1 = nn.BatchNorm1d(hidden_dims[0])
        self.fc2 = nn.Linear(hidden_dims[0], hidden_dims[1])
        self.bn2 = nn.BatchNorm1d(hidden_dims[1])
        self.fc3 = nn.Linear(hidden_dims[1], hidden_dims[2])
        self.bn3 = nn.BatchNorm1d(hidden_dims[2])
        self.fc4 = nn.Linear(hidden_dims[2], num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc3(x)
        x = self.bn3(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc4(x)
        return x


print("Classifier architectures defined:")
print("  1-Layer: input -> output")
print("  2-Layer: input -> 512 -> output")
print("  3-Layer: input -> 512 -> 256 -> output")
print("  4-Layer: input -> 512 -> 256 -> 128 -> output")

Classifier architectures defined:
  1-Layer: input -> output
  2-Layer: input -> 512 -> output
  3-Layer: input -> 512 -> 256 -> output
  4-Layer: input -> 512 -> 256 -> 128 -> output


## 5. Precompute Features

Extract features once from both backbones to speed up classifier training.

In [12]:
def extract_features(backbone, dataloader, backbone_name="backbone"):
    """Extract features from a frozen backbone"""
    device = torch.device('cuda')
    backbone.eval()
    
    all_features = []
    all_labels = []
    
    print(f"Extracting features from {backbone_name}...")
    with torch.no_grad():
        for images, labels in tqdm(dataloader, desc=f"Extracting {backbone_name} features"):
            images = images.to(device)
            features = backbone(images)
            all_features.append(features.cpu())
            all_labels.append(labels)
    
    all_features = torch.cat(all_features, dim=0)
    all_labels = torch.cat(all_labels, dim=0)
    
    print(f"✓ Extracted {all_features.shape[0]} feature vectors of dim {all_features.shape[1]}")
    return all_features, all_labels


def train_classifier_on_features(classifier, train_features, train_labels, val_features, val_labels, 
                                 epochs=20, lr=1e-3, batch_size=64):
    """Train a classifier on precomputed features"""
    device = torch.device('cuda')
    classifier = classifier.cuda()
    
    # Create tensor datasets
    train_dataset = torch.utils.data.TensorDataset(train_features, train_labels)
    val_dataset = torch.utils.data.TensorDataset(val_features, val_labels)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    # Optimizer and scheduler
    optimizer = torch.optim.AdamW(classifier.parameters(), lr=lr, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    criterion = nn.CrossEntropyLoss()
    
    best_val_acc = 0.0
    best_epoch = 0
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}
    
    for epoch in range(epochs):
        # Training
        classifier.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        
        for features, labels in train_loader:
            features, labels = features.to(device), labels.to(device)
            
            # Forward through classifier
            outputs = classifier(features)
            loss = criterion(outputs, labels)
            
            # Backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # Stats
            train_loss += loss.item()
            _, predicted = outputs.max(1)
            train_total += labels.size(0)
            train_correct += predicted.eq(labels).sum().item()
        
        train_loss /= len(train_loader)
        train_acc = 100. * train_correct / train_total
        
        # Validation
        classifier.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            for features, labels in val_loader:
                features, labels = features.to(device), labels.to(device)
                outputs = classifier(features)
                loss = criterion(outputs, labels)
                
                val_loss += loss.item()
                _, predicted = outputs.max(1)
                val_total += labels.size(0)
                val_correct += predicted.eq(labels).sum().item()
        
        val_loss /= len(val_loader)
        val_acc = 100. * val_correct / val_total
        
        # Update history
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)
        
        # Save best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_epoch = epoch + 1
        
        scheduler.step()
        
        if (epoch + 1) % 5 == 0 or epoch == 0:
            print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, "
                  f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}% (Best: {best_val_acc:.2f}% @ Epoch {best_epoch})")
    
    print(f"Final - Best Val Acc: {best_val_acc:.2f}% @ Epoch {best_epoch}")
    return classifier, best_val_acc, history


def evaluate_classifier_on_features(classifier, test_features, test_labels, batch_size=64):
    """Evaluate classifier on precomputed features"""
    device = torch.device('cuda')
    classifier.eval()
    
    test_dataset = torch.utils.data.TensorDataset(test_features, test_labels)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    correct = 0
    total = 0
    
    with torch.no_grad():
        for features, labels in test_loader:
            features, labels = features.to(device), labels.to(device)
            outputs = classifier(features)
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
    
    test_acc = 100. * correct / total
    return test_acc

print("Feature extraction and training functions defined.")

Feature extraction and training functions defined.


In [17]:
# Extract features from both backbones
print("="*80)
print("EXTRACTING FEATURES FROM BOTH BACKBONES")
print("="*80)

# DINOv2 features
dinov2_train_features, dinov2_train_labels = extract_features(dinov2_small, train_loader, "DINOv2-Small")
dinov2_val_features, dinov2_val_labels = extract_features(dinov2_small, val_loader, "DINOv2-Small")
dinov2_test_features, dinov2_test_labels = extract_features(dinov2_small, test_loader, "DINOv2-Small")

print()

# CLIP features
clip_train_features, clip_train_labels = extract_features(clip_visual, train_loader, "CLIP-Base")
clip_val_features, clip_val_labels = extract_features(clip_visual, val_loader, "CLIP-Base")
clip_test_features, clip_test_labels = extract_features(clip_visual, test_loader, "CLIP-Base")

print("\n" + "="*80)
print("FEATURE EXTRACTION COMPLETE")
print("="*80)
print(f"DINOv2-Small features: {dinov2_train_features.shape}")
print(f"CLIP-Base features: {clip_train_features.shape}")

EXTRACTING FEATURES FROM BOTH BACKBONES
Extracting features from DINOv2-Small...


Extracting DINOv2-Small features:   0%|          | 0/11 [00:00<?, ?it/s]

✓ Extracted 677 feature vectors of dim 384
Extracting features from DINOv2-Small...


Extracting DINOv2-Small features:   0%|          | 0/4 [00:00<?, ?it/s]

✓ Extracted 226 feature vectors of dim 384
Extracting features from DINOv2-Small...


Extracting DINOv2-Small features:   0%|          | 0/4 [00:00<?, ?it/s]

✓ Extracted 226 feature vectors of dim 384

Extracting features from CLIP-Base...


Extracting CLIP-Base features:   0%|          | 0/11 [00:00<?, ?it/s]

✓ Extracted 677 feature vectors of dim 512
Extracting features from CLIP-Base...


Extracting CLIP-Base features:   0%|          | 0/4 [00:00<?, ?it/s]

✓ Extracted 226 feature vectors of dim 512
Extracting features from CLIP-Base...


Extracting CLIP-Base features:   0%|          | 0/4 [00:00<?, ?it/s]

✓ Extracted 226 feature vectors of dim 512

FEATURE EXTRACTION COMPLETE
DINOv2-Small features: torch.Size([677, 384])
CLIP-Base features: torch.Size([677, 512])


## 6. Stage 1: Linear Probing

Train multiple classifier heads on precomputed features. Test all 8 combinations:
- **Backbones**: DINOv2-Small, CLIP-Base  
- **Classifiers**: 1-layer, 2-layer, 3-layer, 4-layer

In [13]:
# Store results for comparison
stage1_results = []

# Define configurations to test
classifier_configs = [
    ("1-Layer", Classifier1Layer),
    ("2-Layer", Classifier2Layer),
    ("3-Layer", Classifier3Layer),
    ("4-Layer", Classifier4Layer)
]

backbone_configs = [
    ("DINOv2-Small", dinov2_small, dinov2_dim),
    ("CLIP-Base", clip_visual, clip_dim)
]

print("="*80)
print("STAGE 1: LINEAR PROBING - Training all classifier configurations")
print("="*80)
print(f"\nTotal configurations to test: {len(classifier_configs) * len(backbone_configs)}")
print(f"Training epochs per configuration: 20\n")

NameError: name 'clip_visual' is not defined

In [21]:
# Store results for comparison
stage1_results = []

# Define feature sets
feature_sets = [
    ("DINOv2-Small", dinov2_train_features, dinov2_train_labels, dinov2_val_features, dinov2_val_labels, 
     dinov2_test_features, dinov2_test_labels, dinov2_dim),
    ("CLIP-Base", clip_train_features, clip_train_labels, clip_val_features, clip_val_labels,
     clip_test_features, clip_test_labels, clip_dim)
]

print("="*80)
print("STAGE 1: LINEAR PROBING - Training all classifier configurations")
print("="*80)
print(f"\nTotal configurations to test: {len(classifier_configs) * len(feature_sets)}")
print(f"Training epochs per configuration: 20\n")

# Train all configurations
for backbone_name, train_feat, train_lbl, val_feat, val_lbl, test_feat, test_lbl, feature_dim in feature_sets:
    print(f"\n{'='*80}")
    print(f"BACKBONE: {backbone_name} (Feature dim: {feature_dim})")
    print(f"{'='*80}\n")
    
    for classifier_name, ClassifierClass in classifier_configs:
        print(f"\n--- Testing {classifier_name} Classifier ---")
        
        # Create classifier
        if classifier_name == "1-Layer":
            classifier = ClassifierClass(feature_dim, num_classes=10)
        elif classifier_name == "2-Layer":
            classifier = ClassifierClass(feature_dim, num_classes=10, hidden_dim=512, dropout=0.3)
        elif classifier_name == "3-Layer":
            classifier = ClassifierClass(feature_dim, num_classes=10, hidden_dim1=512, hidden_dim2=256, dropout=0.3)
        else:  # 4-Layer
            classifier = ClassifierClass(feature_dim, num_classes=10, hidden_dims=[512, 256, 128], dropout=0.3)
        
        # Train on precomputed features
        trained_classifier, best_val_acc, history = train_classifier_on_features(
            classifier=classifier,
            train_features=train_feat,
            train_labels=train_lbl,
            val_features=val_feat,
            val_labels=val_lbl,
            epochs=20,
            lr=1e-3,
            batch_size=64
        )
        
        # Evaluate on test set
        test_acc = evaluate_classifier_on_features(trained_classifier, test_feat, test_lbl, batch_size=64)
        
        # Store results
        result = {
            'backbone': backbone_name,
            'classifier': classifier_name,
            'best_val_acc': best_val_acc,
            'test_acc': test_acc,
            'history': history,
            'model': trained_classifier
        }
        stage1_results.append(result)
        
        print(f"\n✓ {backbone_name} + {classifier_name}:")
        print(f"  Best Val Acc: {best_val_acc:.2f}%")
        print(f"  Test Acc: {test_acc:.2f}%")
        print("-" * 80)

print(f"\n{'='*80}")
print("STAGE 1 COMPLETE - All configurations trained")
print(f"{'='*80}")

STAGE 1: LINEAR PROBING - Training all classifier configurations

Total configurations to test: 8
Training epochs per configuration: 20


BACKBONE: DINOv2-Small (Feature dim: 384)


--- Testing 1-Layer Classifier ---
Epoch 1/20 - Train Loss: 2.5283, Train Acc: 14.48%, Val Loss: 2.3125, Val Acc: 19.91% (Best: 19.91% @ Epoch 1)
Epoch 5/20 - Train Loss: 1.8312, Train Acc: 38.11%, Val Loss: 2.0671, Val Acc: 30.09% (Best: 30.09% @ Epoch 5)
Epoch 10/20 - Train Loss: 1.5662, Train Acc: 49.34%, Val Loss: 1.9818, Val Acc: 33.19% (Best: 33.19% @ Epoch 10)
Epoch 15/20 - Train Loss: 1.4532, Train Acc: 54.51%, Val Loss: 1.9617, Val Acc: 36.28% (Best: 36.28% @ Epoch 15)
Epoch 20/20 - Train Loss: 1.4285, Train Acc: 55.54%, Val Loss: 1.9619, Val Acc: 33.63% (Best: 36.28% @ Epoch 15)
Final - Best Val Acc: 36.28% @ Epoch 15

✓ DINOv2-Small + 1-Layer:
  Best Val Acc: 36.28%
  Test Acc: 35.40%
--------------------------------------------------------------------------------

--- Testing 2-Layer Classifier 

In [22]:
# Display results summary
print("\n" + "="*80)
print("STAGE 1 RESULTS SUMMARY")
print("="*80)

results_df = pd.DataFrame([
    {
        'Backbone': r['backbone'],
        'Classifier': r['classifier'],
        'Best Val Acc (%)': f"{r['best_val_acc']:.2f}",
        'Test Acc (%)': f"{r['test_acc']:.2f}"
    }
    for r in stage1_results
])

print("\n", results_df.to_string(index=False))

# Find best configuration
best_result = max(stage1_results, key=lambda x: x['test_acc'])
print(f"\n{'='*80}")
print(f"BEST CONFIGURATION:")
print(f"  {best_result['backbone']} + {best_result['classifier']}")
print(f"  Val Acc: {best_result['best_val_acc']:.2f}%")
print(f"  Test Acc: {best_result['test_acc']:.2f}%")
print(f"{'='*80}")


STAGE 1 RESULTS SUMMARY

     Backbone Classifier Best Val Acc (%) Test Acc (%)
DINOv2-Small    1-Layer            36.28        35.40
DINOv2-Small    2-Layer            41.59        44.69
DINOv2-Small    3-Layer            42.04        43.81
DINOv2-Small    4-Layer            44.25        42.92
   CLIP-Base    1-Layer            30.53        27.88
   CLIP-Base    2-Layer            34.96        34.51
   CLIP-Base    3-Layer            35.40        34.07
   CLIP-Base    4-Layer            37.61        30.97

BEST CONFIGURATION:
  DINOv2-Small + 2-Layer
  Val Acc: 41.59%
  Test Acc: 44.69%


## 6.1 Retrain Best Configurations with Higher Epochs

Retrain top performers with 50 epochs for better convergence.

In [23]:
# Retrain best configurations with 50 epochs
print("="*80)
print("RETRAINING BEST CONFIGURATIONS WITH 50 EPOCHS")
print("="*80)

stage1_extended_results = []
epochs_extended = 50

# Select top 4 configurations from stage1_results
top_configs = sorted(stage1_results, key=lambda x: x['test_acc'], reverse=True)[:4]

for config in top_configs:
    backbone_name = config['backbone']
    classifier_name = config['classifier']
    
    print(f"\n{'='*80}")
    print(f"Training: {backbone_name} + {classifier_name} (50 epochs)")
    print(f"{'='*80}")
    
    # Get appropriate features
    if backbone_name == "DINOv2-Small":
        train_feat, train_lbl = dinov2_train_features, dinov2_train_labels
        val_feat, val_lbl = dinov2_val_features, dinov2_val_labels
        test_feat, test_lbl = dinov2_test_features, dinov2_test_labels
        feature_dim = dinov2_dim
    else:
        train_feat, train_lbl = clip_train_features, clip_train_labels
        val_feat, val_lbl = clip_val_features, clip_val_labels
        test_feat, test_lbl = clip_test_features, clip_test_labels
        feature_dim = clip_dim
    
    # Create fresh classifier
    ClassifierClass = dict(classifier_configs)[classifier_name]
    if classifier_name == "1-Layer":
        classifier = ClassifierClass(feature_dim, num_classes=10)
    elif classifier_name == "2-Layer":
        classifier = ClassifierClass(feature_dim, num_classes=10, hidden_dim=512, dropout=0.3)
    elif classifier_name == "3-Layer":
        classifier = ClassifierClass(feature_dim, num_classes=10, hidden_dim1=512, hidden_dim2=256, dropout=0.3)
    else:  # 4-Layer
        classifier = ClassifierClass(feature_dim, num_classes=10, hidden_dims=[512, 256, 128], dropout=0.3)
    
    # Train
    trained_classifier, best_val_acc, history = train_classifier_on_features(
        classifier=classifier,
        train_features=train_feat,
        train_labels=train_lbl,
        val_features=val_feat,
        val_labels=val_lbl,
        epochs=epochs_extended,
        lr=1e-3,
        batch_size=64
    )
    
    # Evaluate
    test_acc = evaluate_classifier_on_features(trained_classifier, test_feat, test_lbl, batch_size=64)
    
    result = {
        'backbone': backbone_name,
        'classifier': classifier_name,
        'best_val_acc': best_val_acc,
        'test_acc': test_acc,
        'history': history,
        'model': trained_classifier
    }
    stage1_extended_results.append(result)
    
    print(f"\n✓ {backbone_name} + {classifier_name} (50 epochs):")
    print(f"  Best Val Acc: {best_val_acc:.2f}%")
    print(f"  Test Acc: {test_acc:.2f}%")
    print("-" * 80)

print(f"\n{'='*80}")
print("EXTENDED TRAINING COMPLETE")
print(f"{'='*80}")

RETRAINING BEST CONFIGURATIONS WITH 50 EPOCHS

Training: DINOv2-Small + 2-Layer (50 epochs)
Epoch 1/50 - Train Loss: 2.2280, Train Acc: 17.28%, Val Loss: 2.1240, Val Acc: 28.76% (Best: 28.76% @ Epoch 1)
Epoch 5/50 - Train Loss: 0.9185, Train Acc: 77.99%, Val Loss: 1.8368, Val Acc: 37.61% (Best: 38.05% @ Epoch 2)
Epoch 10/50 - Train Loss: 0.3731, Train Acc: 96.16%, Val Loss: 1.8562, Val Acc: 39.82% (Best: 40.71% @ Epoch 7)
Epoch 15/50 - Train Loss: 0.1661, Train Acc: 99.70%, Val Loss: 1.9572, Val Acc: 41.59% (Best: 42.92% @ Epoch 12)
Epoch 20/50 - Train Loss: 0.0752, Train Acc: 100.00%, Val Loss: 2.0497, Val Acc: 41.15% (Best: 42.92% @ Epoch 12)
Epoch 25/50 - Train Loss: 0.0562, Train Acc: 99.85%, Val Loss: 2.1071, Val Acc: 39.82% (Best: 42.92% @ Epoch 12)
Epoch 30/50 - Train Loss: 0.0410, Train Acc: 100.00%, Val Loss: 2.1572, Val Acc: 42.48% (Best: 42.92% @ Epoch 12)
Epoch 35/50 - Train Loss: 0.0315, Train Acc: 100.00%, Val Loss: 2.1751, Val Acc: 42.04% (Best: 42.92% @ Epoch 12)
Epoch 

In [24]:
# Compare 20 vs 50 epochs
print("\n" + "="*80)
print("COMPARISON: 20 EPOCHS vs 50 EPOCHS")
print("="*80)

comparison_data = []
for orig in top_configs:
    extended = [r for r in stage1_extended_results if r['backbone'] == orig['backbone'] and r['classifier'] == orig['classifier']][0]
    comparison_data.append({
        'Configuration': f"{orig['backbone']} + {orig['classifier']}",
        '20 Epochs Val': f"{orig['best_val_acc']:.2f}%",
        '20 Epochs Test': f"{orig['test_acc']:.2f}%",
        '50 Epochs Val': f"{extended['best_val_acc']:.2f}%",
        '50 Epochs Test': f"{extended['test_acc']:.2f}%",
        'Improvement': f"{extended['test_acc'] - orig['test_acc']:+.2f}%"
    })

comparison_df = pd.DataFrame(comparison_data)
print("\n", comparison_df.to_string(index=False))

# Find overall best
best_extended = max(stage1_extended_results, key=lambda x: x['test_acc'])
print(f"\n{'='*80}")
print(f"BEST MODEL (50 epochs):")
print(f"  {best_extended['backbone']} + {best_extended['classifier']}")
print(f"  Val Acc: {best_extended['best_val_acc']:.2f}%")
print(f"  Test Acc: {best_extended['test_acc']:.2f}%")
print(f"{'='*80}")


COMPARISON: 20 EPOCHS vs 50 EPOCHS

          Configuration 20 Epochs Val 20 Epochs Test 50 Epochs Val 50 Epochs Test Improvement
DINOv2-Small + 2-Layer        41.59%         44.69%        42.92%         44.69%      +0.00%
DINOv2-Small + 3-Layer        42.04%         43.81%        45.58%         45.58%      +1.77%
DINOv2-Small + 4-Layer        44.25%         42.92%        41.59%         42.92%      +0.00%
DINOv2-Small + 1-Layer        36.28%         35.40%        35.84%         41.59%      +6.19%

BEST MODEL (50 epochs):
  DINOv2-Small + 3-Layer
  Val Acc: 45.58%
  Test Acc: 45.58%


## 7. Stage 2: Partial Backbone Training with Frozen Classifier

Take the best Stage 1 model, freeze its classifier, and train the backbone partially using:
- **Strategy A**: Train last N transformer blocks
- **Strategy B**: LoRA (Low-Rank Adaptation)
- **Strategy C**: Train top + bottom blocks (middle frozen)

In [32]:
# Stage 2 training functions
def get_trainable_params(model):
    """Count trainable parameters"""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def freeze_all(model):
    """Freeze all parameters"""
    for param in model.parameters():
        param.requires_grad = False

def unfreeze_last_n_blocks(backbone, n_blocks=4):
    """Unfreeze last N transformer blocks (supports both DINOv2 and CLIP)"""
    freeze_all(backbone)
    
    # Check model type
    if hasattr(backbone, 'blocks'):
        # DINOv2 structure: backbone.blocks
        total_blocks = len(backbone.blocks)
        for i in range(total_blocks - n_blocks, total_blocks):
            for param in backbone.blocks[i].parameters():
                param.requires_grad = True
        print(f"[DINOv2] Unfroze last {n_blocks} blocks out of {total_blocks}")
    elif hasattr(backbone, 'transformer') and hasattr(backbone.transformer, 'resblocks'):
        # CLIP structure: backbone.transformer.resblocks
        total_blocks = len(backbone.transformer.resblocks)
        for i in range(total_blocks - n_blocks, total_blocks):
            for param in backbone.transformer.resblocks[i].parameters():
                param.requires_grad = True
        print(f"[CLIP] Unfroze last {n_blocks} blocks out of {total_blocks}")
    else:
        raise ValueError("Unsupported backbone architecture")
    
    return backbone

def unfreeze_top_bottom_blocks(backbone, top_n=2, bottom_n=2):
    """Unfreeze first N and last N blocks, keep middle frozen (supports both DINOv2 and CLIP)"""
    freeze_all(backbone)
    
    # Check model type
    if hasattr(backbone, 'blocks'):
        # DINOv2 structure
        total_blocks = len(backbone.blocks)
        # Unfreeze first bottom_n blocks
        for i in range(bottom_n):
            for param in backbone.blocks[i].parameters():
                param.requires_grad = True
        # Unfreeze last top_n blocks
        for i in range(total_blocks - top_n, total_blocks):
            for param in backbone.blocks[i].parameters():
                param.requires_grad = True
        print(f"[DINOv2] Unfroze first {bottom_n} and last {top_n} blocks (middle {total_blocks - top_n - bottom_n} blocks frozen)")
    elif hasattr(backbone, 'transformer') and hasattr(backbone.transformer, 'resblocks'):
        # CLIP structure
        total_blocks = len(backbone.transformer.resblocks)
        # Unfreeze first bottom_n blocks
        for i in range(bottom_n):
            for param in backbone.transformer.resblocks[i].parameters():
                param.requires_grad = True
        # Unfreeze last top_n blocks
        for i in range(total_blocks - top_n, total_blocks):
            for param in backbone.transformer.resblocks[i].parameters():
                param.requires_grad = True
        print(f"[CLIP] Unfroze first {bottom_n} and last {top_n} blocks (middle {total_blocks - top_n - bottom_n} blocks frozen)")
    else:
        raise ValueError("Unsupported backbone architecture")
    
    return backbone

print("Stage 2 helper functions defined.")

Stage 2 helper functions defined.


In [26]:
def train_stage2(backbone, classifier, train_loader, val_loader, test_loader, 
                 epochs=30, lr=1e-5, strategy_name="Stage2"):
    """Train backbone with frozen classifier"""
    device = torch.device('cuda')
    
    # Freeze classifier
    classifier.eval()
    for param in classifier.parameters():
        param.requires_grad = False
    
    # Move to GPU
    backbone = backbone.cuda()
    classifier = classifier.cuda()
    
    # Only optimize backbone parameters
    trainable_params = [p for p in backbone.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(trainable_params, lr=lr, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    criterion = nn.CrossEntropyLoss()
    
    best_val_acc = 0.0
    best_epoch = 0
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}
    
    print(f"Trainable params: {get_trainable_params(backbone):,}")
    print(f"Training for {epochs} epochs with lr={lr}\n")
    
    for epoch in range(epochs):
        # Training
        backbone.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            
            # Forward through backbone + classifier
            features = backbone(images)
            with torch.no_grad():
                outputs = classifier(features)
            
            # Actually need to compute gradients for classifier output
            features = backbone(images)
            outputs = classifier(features)
            loss = criterion(outputs, labels)
            
            # Backward (only updates backbone)
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(trainable_params, max_norm=1.0)
            optimizer.step()
            
            # Stats
            train_loss += loss.item()
            _, predicted = outputs.max(1)
            train_total += labels.size(0)
            train_correct += predicted.eq(labels).sum().item()
        
        train_loss /= len(train_loader)
        train_acc = 100. * train_correct / train_total
        
        # Validation
        backbone.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                features = backbone(images)
                outputs = classifier(features)
                loss = criterion(outputs, labels)
                
                val_loss += loss.item()
                _, predicted = outputs.max(1)
                val_total += labels.size(0)
                val_correct += predicted.eq(labels).sum().item()
        
        val_loss /= len(val_loader)
        val_acc = 100. * val_correct / val_total
        
        # Update history
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)
        
        # Save best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_epoch = epoch + 1
        
        scheduler.step()
        
        if (epoch + 1) % 10 == 0 or epoch == 0:
            print(f"Epoch {epoch+1}/{epochs} - Train: {train_acc:.2f}%, Val: {val_acc:.2f}% "
                  f"(Best: {best_val_acc:.2f}% @ Epoch {best_epoch})")
    
    # Test evaluation
    backbone.eval()
    test_correct = 0
    test_total = 0
    
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            features = backbone(images)
            outputs = classifier(features)
            _, predicted = outputs.max(1)
            test_total += labels.size(0)
            test_correct += predicted.eq(labels).sum().item()
    
    test_acc = 100. * test_correct / test_total
    
    print(f"\nFinal - Best Val: {best_val_acc:.2f}%, Test: {test_acc:.2f}%")
    
    return backbone, best_val_acc, test_acc, history

print("Stage 2 training function defined.")

Stage 2 training function defined.


In [27]:
# Prepare for Stage 2: Get best model and recreate dataloaders with heavy augmentation
print("="*80)
print("PREPARING FOR STAGE 2")
print("="*80)

# Get best extended model
best_model = max(stage1_extended_results, key=lambda x: x['test_acc'])
print(f"\nBest Stage 1 model: {best_model['backbone']} + {best_model['classifier']}")
print(f"  Val Acc: {best_model['best_val_acc']:.2f}%")
print(f"  Test Acc: {best_model['test_acc']:.2f}%")

# We'll use DINOv2 for Stage 2 (better performance)
best_backbone = dinov2_small
best_classifier = best_model['model']

# Create dataloaders with heavy augmentation for Stage 2
train_dataset_heavy = BoneFractureDataset(train_paths, train_labels, transform=heavy_transform)
train_loader_heavy = DataLoader(train_dataset_heavy, batch_size=32, shuffle=True, pin_memory=True)

print(f"\n✓ Prepared for Stage 2:")
print(f"  Backbone: DINOv2-Small")
print(f"  Classifier: {best_model['classifier']} (frozen)")
print(f"  Training data: Heavy augmentation enabled")
print(f"  Batch size: 32")

PREPARING FOR STAGE 2

Best Stage 1 model: DINOv2-Small + 3-Layer
  Val Acc: 45.58%
  Test Acc: 45.58%

✓ Prepared for Stage 2:
  Backbone: DINOv2-Small
  Classifier: 3-Layer (frozen)
  Training data: Heavy augmentation enabled
  Batch size: 32


In [28]:
# Stage 2: Train with different partial training strategies
print("\n" + "="*80)
print("STAGE 2: PARTIAL BACKBONE TRAINING")
print("="*80)

stage2_results = []

# Strategy A: Train last 4 blocks
print(f"\n{'='*80}")
print("STRATEGY A: Train Last 4 Transformer Blocks")
print(f"{'='*80}")

import copy
backbone_strategy_a = copy.deepcopy(best_backbone)
classifier_strategy_a = copy.deepcopy(best_classifier)

backbone_strategy_a = unfreeze_last_n_blocks(backbone_strategy_a, n_blocks=4)

trained_backbone_a, val_acc_a, test_acc_a, history_a = train_stage2(
    backbone=backbone_strategy_a,
    classifier=classifier_strategy_a,
    train_loader=train_loader_heavy,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=30,
    lr=5e-6,
    strategy_name="Strategy A"
)

stage2_results.append({
    'strategy': 'Last 4 Blocks',
    'val_acc': val_acc_a,
    'test_acc': test_acc_a,
    'history': history_a,
    'backbone': trained_backbone_a,
    'classifier': classifier_strategy_a
})

print(f"\n✓ Strategy A Complete - Test Acc: {test_acc_a:.2f}%")


# Strategy B: Train last 6 blocks (more aggressive)
print(f"\n{'='*80}")
print("STRATEGY B: Train Last 6 Transformer Blocks")
print(f"{'='*80}")

backbone_strategy_b = copy.deepcopy(best_backbone)
classifier_strategy_b = copy.deepcopy(best_classifier)

backbone_strategy_b = unfreeze_last_n_blocks(backbone_strategy_b, n_blocks=6)

trained_backbone_b, val_acc_b, test_acc_b, history_b = train_stage2(
    backbone=backbone_strategy_b,
    classifier=classifier_strategy_b,
    train_loader=train_loader_heavy,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=30,
    lr=5e-6,
    strategy_name="Strategy B"
)

stage2_results.append({
    'strategy': 'Last 6 Blocks',
    'val_acc': val_acc_b,
    'test_acc': test_acc_b,
    'history': history_b,
    'backbone': trained_backbone_b,
    'classifier': classifier_strategy_b
})

print(f"\n✓ Strategy B Complete - Test Acc: {test_acc_b:.2f}%")


# Strategy C: Train top 2 + bottom 2 blocks
print(f"\n{'='*80}")
print("STRATEGY C: Train Top 2 + Bottom 2 Blocks (Middle Frozen)")
print(f"{'='*80}")

backbone_strategy_c = copy.deepcopy(best_backbone)
classifier_strategy_c = copy.deepcopy(best_classifier)

backbone_strategy_c = unfreeze_top_bottom_blocks(backbone_strategy_c, top_n=2, bottom_n=2)

trained_backbone_c, val_acc_c, test_acc_c, history_c = train_stage2(
    backbone=backbone_strategy_c,
    classifier=classifier_strategy_c,
    train_loader=train_loader_heavy,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=30,
    lr=5e-6,
    strategy_name="Strategy C"
)

stage2_results.append({
    'strategy': 'Top 2 + Bottom 2 Blocks',
    'val_acc': val_acc_c,
    'test_acc': test_acc_c,
    'history': history_c,
    'backbone': trained_backbone_c,
    'classifier': classifier_strategy_c
})

print(f"\n✓ Strategy C Complete - Test Acc: {test_acc_c:.2f}%")

print(f"\n{'='*80}")
print("STAGE 2 COMPLETE")
print(f"{'='*80}")


STAGE 2: PARTIAL BACKBONE TRAINING

STRATEGY A: Train Last 4 Transformer Blocks
Unfroze last 4 blocks out of 12
Trainable params: 7,100,928
Training for 30 epochs with lr=5e-06

Epoch 1/30 - Train: 76.51%, Val: 40.27% (Best: 40.27% @ Epoch 1)
Epoch 10/30 - Train: 92.02%, Val: 47.79% (Best: 47.79% @ Epoch 10)
Epoch 20/30 - Train: 96.75%, Val: 49.56% (Best: 51.33% @ Epoch 19)
Epoch 30/30 - Train: 97.64%, Val: 50.88% (Best: 51.77% @ Epoch 25)

Final - Best Val: 51.77%, Test: 51.77%

✓ Strategy A Complete - Test Acc: 51.77%

STRATEGY B: Train Last 6 Transformer Blocks
Unfroze last 6 blocks out of 12
Trainable params: 10,651,392
Training for 30 epochs with lr=5e-06

Epoch 1/30 - Train: 72.82%, Val: 45.58% (Best: 45.58% @ Epoch 1)
Epoch 10/30 - Train: 95.72%, Val: 50.44% (Best: 50.44% @ Epoch 10)
Epoch 20/30 - Train: 98.23%, Val: 47.79% (Best: 50.44% @ Epoch 10)
Epoch 30/30 - Train: 98.82%, Val: 49.56% (Best: 50.88% @ Epoch 26)

Final - Best Val: 50.88%, Test: 54.42%

✓ Strategy B Complete 

In [None]:
# Stage 2 Results Summary
print("\n" + "="*80)
print("STAGE 2 RESULTS SUMMARY")
print("="*80)

stage2_df = pd.DataFrame([
    {
        'Strategy': r['strategy'],
        'Best Val Acc (%)': f"{r['val_acc']:.2f}",
        'Test Acc (%)': f"{r['test_acc']:.2f}"
    }
    for r in stage2_results
])

print("\n", stage2_df.to_string(index=False))

# Compare with Stage 1 best
print(f"\n{'='*80}")
print("COMPARISON WITH STAGE 1")
print(f"{'='*80}")
print(f"Stage 1 Best (50 epochs): {best_model['test_acc']:.2f}%")
print(f"\nStage 2 Results:")
for r in stage2_results:
    improvement = r['test_acc'] - best_model['test_acc']
    print(f"  {r['strategy']}: {r['test_acc']:.2f}% ({improvement:+.2f}%)")

best_stage2 = max(stage2_results, key=lambda x: x['test_acc'])
print(f"\n{'='*80}")
print(f"BEST STAGE 2 MODEL:")
print(f"  Strategy: {best_stage2['strategy']}")
print(f"  Val Acc: {best_stage2['val_acc']:.2f}%")
print(f"  Test Acc: {best_stage2['test_acc']:.2f}%")
print(f"{'='*80}")

## Stage 2B: CLIP Partial Backbone Training

Apply the same partial training strategies to the best CLIP model.

In [30]:
# Get best CLIP model from extended training (or original if not extended)
clip_extended = [r for r in stage1_extended_results if r['backbone'] == 'CLIP-Base']
if clip_extended:
    best_clip_model = max(clip_extended, key=lambda x: x['test_acc'])
else:
    # Use original stage1 results
    best_clip_model = max([r for r in stage1_results if r['backbone'] == 'CLIP-Base'], 
                          key=lambda x: x['test_acc'])

print("="*80)
print("BEST CLIP MODEL FROM STAGE 1")
print("="*80)
print(f"Configuration: CLIP-Base + {best_clip_model['classifier']}")
print(f"Val Acc: {best_clip_model['best_val_acc']:.2f}%")
print(f"Test Acc: {best_clip_model['test_acc']:.2f}%")
print("="*80)

# Prepare CLIP backbone and classifier
best_clip_backbone = clip_visual
best_clip_classifier = best_clip_model['model']

print("\n✓ CLIP model prepared for Stage 2 training")

BEST CLIP MODEL FROM STAGE 1
Configuration: CLIP-Base + 2-Layer
Val Acc: 34.96%
Test Acc: 34.51%

✓ CLIP model prepared for Stage 2 training


In [None]:
# Stage 2B: Train CLIP with different partial training strategies
print("\n" + "="*80)
print("STAGE 2B: CLIP PARTIAL BACKBONE TRAINING")
print("="*80)

stage2_clip_results = []

# CLIP Strategy A: Train last 4 blocks
print(f"\n{'='*80}")
print("CLIP STRATEGY A: Train Last 4 Transformer Blocks")
print(f"{'='*80}")

import copy
clip_backbone_strategy_a = copy.deepcopy(best_clip_backbone)
clip_classifier_strategy_a = copy.deepcopy(best_clip_classifier)

clip_backbone_strategy_a = unfreeze_last_n_blocks(clip_backbone_strategy_a, n_blocks=4)

trained_clip_backbone_a, clip_val_acc_a, clip_test_acc_a, clip_history_a = train_stage2(
    backbone=clip_backbone_strategy_a,
    classifier=clip_classifier_strategy_a,
    train_loader=train_loader_heavy,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=30,
    lr=5e-6,
    strategy_name="CLIP Strategy A"
)

stage2_clip_results.append({
    'strategy': 'Last 4 Blocks',
    'val_acc': clip_val_acc_a,
    'test_acc': clip_test_acc_a,
    'history': clip_history_a,
    'backbone': trained_clip_backbone_a,
    'classifier': clip_classifier_strategy_a
})

print(f"\n✓ CLIP Strategy A Complete - Test Acc: {clip_test_acc_a:.2f}%")


# CLIP Strategy B: Train last 6 blocks (more aggressive)
print(f"\n{'='*80}")
print("CLIP STRATEGY B: Train Last 6 Transformer Blocks")
print(f"{'='*80}")

clip_backbone_strategy_b = copy.deepcopy(best_clip_backbone)
clip_classifier_strategy_b = copy.deepcopy(best_clip_classifier)

clip_backbone_strategy_b = unfreeze_last_n_blocks(clip_backbone_strategy_b, n_blocks=6)

trained_clip_backbone_b, clip_val_acc_b, clip_test_acc_b, clip_history_b = train_stage2(
    backbone=clip_backbone_strategy_b,
    classifier=clip_classifier_strategy_b,
    train_loader=train_loader_heavy,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=30,
    lr=5e-6,
    strategy_name="CLIP Strategy B"
)

stage2_clip_results.append({
    'strategy': 'Last 6 Blocks',
    'val_acc': clip_val_acc_b,
    'test_acc': clip_test_acc_b,
    'history': clip_history_b,
    'backbone': trained_clip_backbone_b,
    'classifier': clip_classifier_strategy_b
})

print(f"\n✓ CLIP Strategy B Complete - Test Acc: {clip_test_acc_b:.2f}%")


# CLIP Strategy C: Train top 2 + bottom 2 blocks
print(f"\n{'='*80}")
print("CLIP STRATEGY C: Train Top 2 + Bottom 2 Blocks (Middle Frozen)")
print(f"{'='*80}")

clip_backbone_strategy_c = copy.deepcopy(best_clip_backbone)
clip_classifier_strategy_c = copy.deepcopy(best_clip_classifier)

clip_backbone_strategy_c = unfreeze_top_bottom_blocks(clip_backbone_strategy_c, top_n=2, bottom_n=2)

trained_clip_backbone_c, clip_val_acc_c, clip_test_acc_c, clip_history_c = train_stage2(
    backbone=clip_backbone_strategy_c,
    classifier=clip_classifier_strategy_c,
    train_loader=train_loader_heavy,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=30,
    lr=5e-6,
    strategy_name="CLIP Strategy C"
)

stage2_clip_results.append({
    'strategy': 'Top 2 + Bottom 2 Blocks',
    'val_acc': clip_val_acc_c,
    'test_acc': clip_test_acc_c,
    'history': clip_history_c,
    'backbone': trained_clip_backbone_c,
    'classifier': clip_classifier_strategy_c
})

print(f"\n✓ CLIP Strategy C Complete - Test Acc: {clip_test_acc_c:.2f}%")

print(f"\n{'='*80}")
print("STAGE 2B COMPLETE (CLIP)")
print(f"{'='*80}")


STAGE 2B: CLIP PARTIAL BACKBONE TRAINING

CLIP STRATEGY A: Train Last 4 Transformer Blocks
[CLIP] Unfroze last 4 blocks out of 12
Trainable params: 28,351,488
Training for 30 epochs with lr=5e-06

Epoch 1/30 - Train: 50.66%, Val: 38.50% (Best: 38.50% @ Epoch 1)
Epoch 10/30 - Train: 79.62%, Val: 43.36% (Best: 47.79% @ Epoch 7)
Epoch 20/30 - Train: 90.25%, Val: 50.44% (Best: 50.44% @ Epoch 20)
Epoch 30/30 - Train: 90.69%, Val: 46.90% (Best: 50.44% @ Epoch 20)

Final - Best Val: 50.44%, Test: 42.04%

✓ CLIP Strategy A Complete - Test Acc: 42.04%

CLIP STRATEGY B: Train Last 6 Transformer Blocks
[CLIP] Unfroze last 6 blocks out of 12
Trainable params: 42,527,232
Training for 30 epochs with lr=5e-06

Epoch 1/30 - Train: 48.15%, Val: 37.61% (Best: 37.61% @ Epoch 1)
Epoch 10/30 - Train: 89.51%, Val: 50.00% (Best: 50.00% @ Epoch 10)
Epoch 20/30 - Train: 98.38%, Val: 51.33% (Best: 51.77% @ Epoch 13)


In [None]:
# Stage 2B CLIP Results Summary
print("\n" + "="*80)
print("STAGE 2B CLIP RESULTS SUMMARY")
print("="*80)

stage2_clip_df = pd.DataFrame([
    {
        'Strategy': r['strategy'],
        'Best Val Acc (%)': f"{r['val_acc']:.2f}",
        'Test Acc (%)': f"{r['test_acc']:.2f}"
    }
    for r in stage2_clip_results
])

print("\n", stage2_clip_df.to_string(index=False))

# Compare with Stage 1 best CLIP
print(f"\n{'='*80}")
print("COMPARISON WITH STAGE 1 CLIP")
print(f"{'='*80}")
print(f"Stage 1 Best CLIP (50 epochs): {best_clip_model['test_acc']:.2f}%")
print(f"\nStage 2B CLIP Results:")
for r in stage2_clip_results:
    improvement = r['test_acc'] - best_clip_model['test_acc']
    print(f"  {r['strategy']}: {r['test_acc']:.2f}% ({improvement:+.2f}%)")

best_stage2_clip = max(stage2_clip_results, key=lambda x: x['test_acc'])
print(f"\n{'='*80}")
print(f"BEST STAGE 2B CLIP MODEL:")
print(f"  Strategy: {best_stage2_clip['strategy']}")
print(f"  Val Acc: {best_stage2_clip['val_acc']:.2f}%")
print(f"  Test Acc: {best_stage2_clip['test_acc']:.2f}%")
print(f"{'='*80}")

In [None]:
# Combined Stage 2 Results: DINOv2 vs CLIP
print("\n" + "="*80)
print("STAGE 2 COMPLETE: DINOv2 vs CLIP COMPARISON")
print("="*80)

combined_stage2 = []

# DINOv2 results
for r in stage2_results:
    combined_stage2.append({
        'Backbone': 'DINOv2-Small',
        'Strategy': r['strategy'],
        'Val Acc (%)': f"{r['val_acc']:.2f}",
        'Test Acc (%)': f"{r['test_acc']:.2f}"
    })

# CLIP results
for r in stage2_clip_results:
    combined_stage2.append({
        'Backbone': 'CLIP-Base',
        'Strategy': r['strategy'],
        'Val Acc (%)': f"{r['val_acc']:.2f}",
        'Test Acc (%)': f"{r['test_acc']:.2f}"
    })

combined_df = pd.DataFrame(combined_stage2)
print("\n", combined_df.to_string(index=False))

# Overall best
all_stage2_results = stage2_results + stage2_clip_results
overall_best = max(all_stage2_results, key=lambda x: x['test_acc'])
best_backbone_name = 'DINOv2-Small' if overall_best in stage2_results else 'CLIP-Base'

print(f"\n{'='*80}")
print(f"OVERALL BEST STAGE 2 MODEL:")
print(f"  Backbone: {best_backbone_name}")
print(f"  Strategy: {overall_best['strategy']}")
print(f"  Val Acc: {overall_best['val_acc']:.2f}%")
print(f"  Test Acc: {overall_best['test_acc']:.2f}%")
print(f"{'='*80}")

In [None]:
# --- CLIP: Retrain classifiers with higher epochs (Stage 1, extended) ---
print("\n" + "#"*80)
print("RETRAINING CLIP CLASSIFIERS WITH HIGHER EPOCHS (50)")
print("#"*80 + "\n")

clip_stage1_results = []
epochs_extended = 50

for classifier_name, ClassifierClass in classifier_configs:
    print(f"\n--- CLIP: Testing {classifier_name} Classifier ---")

    # instantiate
    if classifier_name == "1-Layer":
        classifier = ClassifierClass(clip_dim, num_classes=10)
    elif classifier_name == "2-Layer":
        classifier = ClassifierClass(clip_dim, num_classes=10, hidden_dim=512, dropout=0.3)
    elif classifier_name == "3-Layer":
        classifier = ClassifierClass(clip_dim, num_classes=10, hidden_dim1=512, hidden_dim2=256, dropout=0.3)
    else:
        classifier = ClassifierClass(clip_dim, num_classes=10, hidden_dims=[512,256,128], dropout=0.3)

    trained_clf, best_val_acc, history = train_classifier_on_features(
        classifier=classifier,
        train_features=clip_train_features,
        train_labels=clip_train_labels,
        val_features=clip_val_features,
        val_labels=clip_val_labels,
        epochs=epochs_extended,
        lr=1e-3,
        batch_size=64
    )

    test_acc = evaluate_classifier_on_features(trained_clf, clip_test_features, clip_test_labels, batch_size=64)

    clip_stage1_results.append({
        'classifier': classifier_name,
        'model': trained_clf,
        'best_val_acc': best_val_acc,
        'test_acc': test_acc,
        'history': history
    })

    print(f"\n✓ CLIP + {classifier_name}: Best Val {best_val_acc:.2f}%, Test {test_acc:.2f}%")

# pick best CLIP classifier by val acc
best_clip = max(clip_stage1_results, key=lambda x: x['best_val_acc'])
print("\nBest CLIP classifier:", best_clip['classifier'], f"Val {best_clip['best_val_acc']:.2f}% Test {best_clip['test_acc']:.2f}%")

# save to a variable for Stage 2
best_clip_classifier = best_clip['model']

# --- Stage 2: Partial backbone training for CLIP with frozen classifier ---
print("\n" + "#"*80)
print("STAGE 2 (CLIP): Freeze classifier; unfreeze last N visual blocks and train backbone")
print("#"*80 + "\n")

# helper to unfreeze last N blocks of CLIP visual transformer
def unfreeze_clip_last_n(clip_visual, n=2):
    # clip_visual typically has attribute 'transformer.resblocks' or 'blocks'
    if hasattr(clip_visual, 'transformer') and hasattr(clip_visual.transformer, 'resblocks'):
        blocks = clip_visual.transformer.resblocks
    elif hasattr(clip_visual, 'blocks'):
        blocks = clip_visual.blocks
    else:
        raise RuntimeError('Unexpected CLIP visual structure: cannot find transformer blocks')

    # freeze all first
    for p in clip_visual.parameters():
        p.requires_grad = False

    # unfreeze last n blocks
    for block in list(blocks)[-n:]:
        for p in block.parameters():
            p.requires_grad = True

    # also unfreeze projection head if present
    if hasattr(clip_visual, 'proj'):
         if isinstance(clip_visual.proj, nn.Parameter):
            clip_visual.proj.requires_grad = True
        else:
            for p in clip_visual.proj.parameters():
                p.requires_grad = True
    if hasattr(clip_visual, 'ln_post'):
        for p in clip_visual.ln_post.parameters():
            p.requires_grad = True

    print(f"Unfroze last {n} blocks of CLIP visual encoder")


def train_clip_backbone_with_frozen_classifier(clip_visual, classifier, train_loader, val_loader, 
                                               epochs=10, lr=1e-4, unfreeze_last=2):
    device = torch.device('cuda')
    classifier = classifier.cuda()
    clip_visual = clip_visual.cuda()

    # freeze classifier
    for p in classifier.parameters():
        p.requires_grad = False

    # unfreeze last N
    unfreeze_clip_last_n(clip_visual, n=unfreeze_last)

    # collect trainable params (from clip_visual)
    trainable_params = [p for p in clip_visual.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(trainable_params, lr=lr, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    criterion = nn.CrossEntropyLoss()

    best_val = 0.0
    best_model_state = None

    for epoch in range(epochs):
        clip_visual.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0

        for images, labels in tqdm(train_loader, desc=f"Stage2 Train Epoch {epoch+1}/{epochs}"):
            images, labels = images.to(device), labels.to(device)
            features = clip_visual(images)
            outputs = classifier(features)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            _, preds = outputs.max(1)
            train_total += labels.size(0)
            train_correct += preds.eq(labels).sum().item()

        train_acc = 100.*train_correct/train_total
        train_loss /= len(train_loader)

        # val
        clip_visual.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                features = clip_visual(images)
                outputs = classifier(features)
                loss = criterion(outputs, labels)

                val_loss += loss.item()
                _, preds = outputs.max(1)
                val_total += labels.size(0)
                val_correct += preds.eq(labels).sum().item()

        val_acc = 100.*val_correct/val_total
        val_loss /= len(val_loader)

        if val_acc > best_val:
            best_val = val_acc
            best_model_state = {k: v.cpu() for k, v in clip_visual.state_dict().items()}

        scheduler.step()
        print(f"Epoch {epoch+1}/{epochs} - Train Acc: {train_acc:.2f}% Val Acc: {val_acc:.2f}% (Best: {best_val:.2f}%)")

    # load best weights back to device
    if best_model_state is not None:
        clip_visual.load_state_dict(best_model_state)

    return clip_visual, best_val

# Run Stage 2 for CLIP
unfreeze_last = 3
stage2_epochs = 10
stage2_lr = 1e-4

best_clip_visual_finetuned, best_clip_val = train_clip_backbone_with_frozen_classifier(
    clip_visual=clip_visual,
    classifier=best_clip_classifier,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=stage2_epochs,
    lr=stage2_lr,
    unfreeze_last=unfreeze_last
)

print(f"\nStage 2 (CLIP) complete. Best Val Acc during Stage2: {best_clip_val:.2f}%")

# Save the fine-tuned CLIP visual encoder for later stages
torch.save(best_clip_visual_finetuned.state_dict(), "clip_visual_stage2_finetuned.pth")
print("Saved CLIP visual Stage2 weights to clip_visual_stage2_finetuned.pth")



################################################################################
RETRAINING CLIP CLASSIFIERS WITH HIGHER EPOCHS (50)
################################################################################


--- CLIP: Testing 1-Layer Classifier ---
Epoch 1/50 - Train Loss: 2.2742, Train Acc: 14.62%, Val Loss: 2.2416, Val Acc: 17.26% (Best: 17.26% @ Epoch 1)
Epoch 5/50 - Train Loss: 2.0766, Train Acc: 27.62%, Val Loss: 2.1381, Val Acc: 22.12% (Best: 23.45% @ Epoch 4)
Epoch 10/50 - Train Loss: 1.9603, Train Acc: 33.97%, Val Loss: 2.0846, Val Acc: 25.22% (Best: 27.43% @ Epoch 9)
Epoch 15/50 - Train Loss: 1.8888, Train Acc: 34.56%, Val Loss: 2.0515, Val Acc: 28.32% (Best: 29.65% @ Epoch 12)
Epoch 10/50 - Train Loss: 1.9603, Train Acc: 33.97%, Val Loss: 2.0846, Val Acc: 25.22% (Best: 27.43% @ Epoch 9)
Epoch 15/50 - Train Loss: 1.8888, Train Acc: 34.56%, Val Loss: 2.0515, Val Acc: 28.32% (Best: 29.65% @ Epoch 12)
Epoch 20/50 - Train Loss: 1.8236, Train Acc: 39.14%, Val Loss: 2.0392,

Stage2 Train Epoch 1/10:   0%|          | 0/11 [00:00<?, ?it/s]

Epoch 1/10 - Train Acc: 17.73% Val Acc: 16.37% (Best: 16.37%)


Stage2 Train Epoch 2/10:   0%|          | 0/11 [00:00<?, ?it/s]

Epoch 2/10 - Train Acc: 13.29% Val Acc: 15.49% (Best: 16.37%)


Stage2 Train Epoch 3/10:   0%|          | 0/11 [00:00<?, ?it/s]

Epoch 3/10 - Train Acc: 16.54% Val Acc: 20.80% (Best: 20.80%)


Stage2 Train Epoch 4/10:   0%|          | 0/11 [00:00<?, ?it/s]

Epoch 4/10 - Train Acc: 26.88% Val Acc: 18.58% (Best: 20.80%)


Stage2 Train Epoch 5/10:   0%|          | 0/11 [00:00<?, ?it/s]

Epoch 5/10 - Train Acc: 29.69% Val Acc: 25.22% (Best: 25.22%)


Stage2 Train Epoch 6/10:   0%|          | 0/11 [00:00<?, ?it/s]

Epoch 6/10 - Train Acc: 31.91% Val Acc: 24.78% (Best: 25.22%)


Stage2 Train Epoch 7/10:   0%|          | 0/11 [00:00<?, ?it/s]

Epoch 7/10 - Train Acc: 34.42% Val Acc: 32.74% (Best: 32.74%)


Stage2 Train Epoch 8/10:   0%|          | 0/11 [00:00<?, ?it/s]

Epoch 8/10 - Train Acc: 39.44% Val Acc: 29.65% (Best: 32.74%)


Stage2 Train Epoch 9/10:   0%|          | 0/11 [00:00<?, ?it/s]

Epoch 9/10 - Train Acc: 39.59% Val Acc: 36.73% (Best: 36.73%)


Stage2 Train Epoch 10/10:   0%|          | 0/11 [00:00<?, ?it/s]

Epoch 10/10 - Train Acc: 43.43% Val Acc: 37.61% (Best: 37.61%)

Stage 2 (CLIP) complete. Best Val Acc during Stage2: 37.61%
Saved CLIP visual Stage2 weights to clip_visual_stage2_finetuned.pth


## Summary: CLIP Extended Training Complete

### Stage 1 Extended (50 epochs)
Best CLIP classifier trained on frozen backbone features

### Stage 2: Partial Backbone Training
- **Approach**: Freeze best classifier, unfreeze last 3 visual transformer blocks
- **Strategy**: Train backbone with frozen classification head
- **Best Val Accuracy**: 37.61%
- **Checkpoint saved**: `clip_visual_stage2_finetuned.pth`

In [15]:
# Install open_clip_torch (fresh cell to ensure availability)
%pip install -q open_clip_torch
print('open_clip_torch installed')

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m88.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [20]:
# Reload CLIP model fresh for retraining
print("Reloading CLIP Base model (fresh)...")
import open_clip

clip_model_fresh, _, clip_preprocess_fresh = open_clip.create_model_and_transforms('ViT-B-16', pretrained='openai')
clip_model_fresh = clip_model_fresh.cuda()
clip_model_fresh.eval()

clip_visual_fresh = clip_model_fresh.visual
with torch.no_grad():
    dummy_input = torch.randn(1, 3, 224, 224).cuda()
    clip_features_test = clip_visual_fresh(dummy_input)
    clip_dim_fresh = clip_features_test.shape[1]

print(f"✓ CLIP Base reloaded - Feature dimension: {clip_dim_fresh}")
print(f"  Parameters: {sum(p.numel() for p in clip_visual_fresh.parameters()):,}")

Reloading CLIP Base model (fresh)...
✓ CLIP Base reloaded - Feature dimension: 512
  Parameters: 86,192,640


In [21]:
# Extract features from fresh CLIP model
print("="*80)
print("RE-EXTRACTING CLIP FEATURES")
print("="*80)

clip_train_features_fresh, _ = extract_features(clip_visual_fresh, train_loader, "CLIP-Base-Fresh")
clip_val_features_fresh, _ = extract_features(clip_visual_fresh, val_loader, "CLIP-Base-Fresh")
clip_test_features_fresh, _ = extract_features(clip_visual_fresh, test_loader, "CLIP-Base-Fresh")

print(f"\n✓ Fresh CLIP features extracted: {clip_train_features_fresh.shape}")

RE-EXTRACTING CLIP FEATURES
Extracting features from CLIP-Base-Fresh...


Extracting CLIP-Base-Fresh features:   0%|          | 0/11 [00:00<?, ?it/s]

✓ Extracted 677 feature vectors of dim 512
Extracting features from CLIP-Base-Fresh...


Extracting CLIP-Base-Fresh features:   0%|          | 0/4 [00:00<?, ?it/s]

✓ Extracted 226 feature vectors of dim 512
Extracting features from CLIP-Base-Fresh...


Extracting CLIP-Base-Fresh features:   0%|          | 0/4 [00:00<?, ?it/s]

✓ Extracted 226 feature vectors of dim 512

✓ Fresh CLIP features extracted: torch.Size([677, 512])


In [22]:
# Retrain CLIP classifiers (Stage 1 with 50 epochs)
print("\n" + "#"*80)
print("RETRAINING CLIP CLASSIFIERS (50 EPOCHS)")
print("#"*80 + "\n")

clip_stage1_retrain = []

for classifier_name, ClassifierClass in classifier_configs:
    print(f"\n--- CLIP: Training {classifier_name} Classifier ---")

    if classifier_name == "1-Layer":
        classifier = ClassifierClass(clip_dim_fresh, num_classes=10)
    elif classifier_name == "2-Layer":
        classifier = ClassifierClass(clip_dim_fresh, num_classes=10, hidden_dim=512, dropout=0.3)
    elif classifier_name == "3-Layer":
        classifier = ClassifierClass(clip_dim_fresh, num_classes=10, hidden_dim1=512, hidden_dim2=256, dropout=0.3)
    else:
        classifier = ClassifierClass(clip_dim_fresh, num_classes=10, hidden_dims=[512,256,128], dropout=0.3)

    trained_clf, best_val_acc, history = train_classifier_on_features(
        classifier=classifier,
        train_features=clip_train_features_fresh,
        train_labels=clip_train_labels,
        val_features=clip_val_features_fresh,
        val_labels=clip_val_labels,
        epochs=50,
        lr=1e-3,
        batch_size=64
    )

    test_acc = evaluate_classifier_on_features(trained_clf, clip_test_features_fresh, clip_test_labels, batch_size=64)

    clip_stage1_retrain.append({
        'classifier': classifier_name,
        'model': trained_clf,
        'best_val_acc': best_val_acc,
        'test_acc': test_acc,
        'history': history
    })

    print(f"✓ CLIP + {classifier_name}: Val {best_val_acc:.2f}%, Test {test_acc:.2f}%")

# Select best
best_clip_retrain = max(clip_stage1_retrain, key=lambda x: x['best_val_acc'])
best_clip_classifier_retrain = best_clip_retrain['model']

print(f"\n{'='*80}")
print(f"Best CLIP Classifier: {best_clip_retrain['classifier']}")
print(f"  Val: {best_clip_retrain['best_val_acc']:.2f}%")
print(f"  Test: {best_clip_retrain['test_acc']:.2f}%")
print(f"{'='*80}")


################################################################################
RETRAINING CLIP CLASSIFIERS (50 EPOCHS)
################################################################################


--- CLIP: Training 1-Layer Classifier ---
Epoch 1/50 - Train Loss: 2.2958, Train Acc: 13.59%, Val Loss: 2.2878, Val Acc: 14.60% (Best: 14.60% @ Epoch 1)
Epoch 5/50 - Train Loss: 2.2178, Train Acc: 19.79%, Val Loss: 2.3189, Val Acc: 12.83% (Best: 14.60% @ Epoch 1)
Epoch 10/50 - Train Loss: 2.1726, Train Acc: 21.42%, Val Loss: 2.3389, Val Acc: 9.29% (Best: 14.60% @ Epoch 1)
Epoch 15/50 - Train Loss: 2.1285, Train Acc: 24.67%, Val Loss: 2.3584, Val Acc: 9.73% (Best: 14.60% @ Epoch 1)
Epoch 20/50 - Train Loss: 2.0936, Train Acc: 26.59%, Val Loss: 2.3813, Val Acc: 9.29% (Best: 14.60% @ Epoch 1)
Epoch 25/50 - Train Loss: 2.0759, Train Acc: 29.10%, Val Loss: 2.3938, Val Acc: 7.96% (Best: 14.60% @ Epoch 1)
Epoch 30/50 - Train Loss: 2.0552, Train Acc: 28.80%, Val Loss: 2.4040, Val Acc: 7.52% (

In [23]:
# Stage 2: Partial backbone training for CLIP (retrain)
print("\n" + "#"*80)
print("CLIP STAGE 2: PARTIAL BACKBONE TRAINING (RETRAIN)")
print("#"*80 + "\n")

# helper to unfreeze last N blocks
def unfreeze_clip_last_n_v2(clip_visual, n=2):
    if hasattr(clip_visual, 'transformer') and hasattr(clip_visual.transformer, 'resblocks'):
        blocks = clip_visual.transformer.resblocks
    elif hasattr(clip_visual, 'blocks'):
        blocks = clip_visual.blocks
    else:
        raise RuntimeError('Cannot find CLIP transformer blocks')

    # freeze all
    for p in clip_visual.parameters():
        p.requires_grad = False

    # unfreeze last n blocks
    for block in list(blocks)[-n:]:
        for p in block.parameters():
            p.requires_grad = True

    # unfreeze projection
    if hasattr(clip_visual, 'proj'):
        if isinstance(clip_visual.proj, nn.Parameter):
            clip_visual.proj.requires_grad = True
        else:
            for p in clip_visual.proj.parameters():
                p.requires_grad = True
    if hasattr(clip_visual, 'ln_post'):
        for p in clip_visual.ln_post.parameters():
            p.requires_grad = True

    print(f"Unfroze last {n} blocks of CLIP visual encoder")


def train_clip_stage2_v2(clip_visual, classifier, train_loader, val_loader, 
                         epochs=10, lr=1e-4, unfreeze_last=3):
    device = torch.device('cuda')
    classifier = classifier.cuda()
    clip_visual = clip_visual.cuda()

    # freeze classifier
    for p in classifier.parameters():
        p.requires_grad = False

    # unfreeze last N
    unfreeze_clip_last_n_v2(clip_visual, n=unfreeze_last)

    # trainable params
    trainable_params = [p for p in clip_visual.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(trainable_params, lr=lr, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    criterion = nn.CrossEntropyLoss()

    best_val = 0.0
    best_model_state = None

    for epoch in range(epochs):
        clip_visual.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0

        for images, labels in tqdm(train_loader, desc=f"Stage2 Epoch {epoch+1}/{epochs}"):
            images, labels = images.to(device), labels.to(device)
            features = clip_visual(images)
            outputs = classifier(features)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            _, preds = outputs.max(1)
            train_total += labels.size(0)
            train_correct += preds.eq(labels).sum().item()

        train_acc = 100.*train_correct/train_total
        train_loss /= len(train_loader)

        # val
        clip_visual.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                features = clip_visual(images)
                outputs = classifier(features)
                loss = criterion(outputs, labels)

                val_loss += loss.item()
                _, preds = outputs.max(1)
                val_total += labels.size(0)
                val_correct += preds.eq(labels).sum().item()

        val_acc = 100.*val_correct/val_total
        val_loss /= len(val_loader)

        if val_acc > best_val:
            best_val = val_acc
            best_model_state = {k: v.cpu().clone() for k, v in clip_visual.state_dict().items()}

        scheduler.step()
        print(f"Epoch {epoch+1}/{epochs} - Train: {train_acc:.2f}%, Val: {val_acc:.2f}% (Best: {best_val:.2f}%)")

    # restore best
    if best_model_state is not None:
        clip_visual.load_state_dict(best_model_state)
        clip_visual = clip_visual.cuda()

    return clip_visual, best_val


# Run Stage 2
clip_visual_stage2_retrain, best_val_stage2_retrain = train_clip_stage2_v2(
    clip_visual=clip_visual_fresh,
    classifier=best_clip_classifier_retrain,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=10,
    lr=1e-4,
    unfreeze_last=3
)

print(f"\n{'='*80}")
print(f"CLIP STAGE 2 COMPLETE (RETRAIN)")
print(f"Best Val Acc: {best_val_stage2_retrain:.2f}%")
print(f"{'='*80}")

# Save checkpoint
torch.save(clip_visual_stage2_retrain.state_dict(), "clip_visual_stage2_retrain.pth")
torch.save(best_clip_classifier_retrain.state_dict(), "clip_classifier_best_retrain.pth")
print("Saved: clip_visual_stage2_retrain.pth, clip_classifier_best_retrain.pth")


################################################################################
CLIP STAGE 2: PARTIAL BACKBONE TRAINING (RETRAIN)
################################################################################

Unfroze last 3 blocks of CLIP visual encoder


Stage2 Epoch 1/10:   0%|          | 0/11 [00:00<?, ?it/s]

Epoch 1/10 - Train: 9.90%, Val: 7.96% (Best: 7.96%)


Stage2 Epoch 2/10:   0%|          | 0/11 [00:00<?, ?it/s]

Epoch 2/10 - Train: 14.18%, Val: 12.83% (Best: 12.83%)


Stage2 Epoch 3/10:   0%|          | 0/11 [00:00<?, ?it/s]

Epoch 3/10 - Train: 17.28%, Val: 16.37% (Best: 16.37%)


Stage2 Epoch 4/10:   0%|          | 0/11 [00:00<?, ?it/s]

Epoch 4/10 - Train: 24.22%, Val: 20.35% (Best: 20.35%)


Stage2 Epoch 5/10:   0%|          | 0/11 [00:00<?, ?it/s]

Epoch 5/10 - Train: 24.08%, Val: 31.86% (Best: 31.86%)


Stage2 Epoch 6/10:   0%|          | 0/11 [00:00<?, ?it/s]

Epoch 6/10 - Train: 32.79%, Val: 29.65% (Best: 31.86%)


Stage2 Epoch 7/10:   0%|          | 0/11 [00:00<?, ?it/s]

Epoch 7/10 - Train: 33.53%, Val: 30.53% (Best: 31.86%)


Stage2 Epoch 8/10:   0%|          | 0/11 [00:00<?, ?it/s]

Epoch 8/10 - Train: 34.71%, Val: 31.42% (Best: 31.86%)


Stage2 Epoch 9/10:   0%|          | 0/11 [00:00<?, ?it/s]

Epoch 9/10 - Train: 37.37%, Val: 31.42% (Best: 31.86%)


Stage2 Epoch 10/10:   0%|          | 0/11 [00:00<?, ?it/s]

Epoch 10/10 - Train: 39.14%, Val: 32.74% (Best: 32.74%)

CLIP STAGE 2 COMPLETE (RETRAIN)
Best Val Acc: 32.74%
Saved: clip_visual_stage2_retrain.pth, clip_classifier_best_retrain.pth


In [None]:
# Save checkpoints for all trained models
import os
os.makedirs('checkpoints', exist_ok=True)

saved = []

def _safe_save(model, path):
    try:
        state = model.state_dict()
        # move tensors to CPU to avoid device issues
        state_cpu = {k: v.cpu() for k, v in state.items()}
        torch.save(state_cpu, path)
        print(f"Saved: {path}")
        saved.append(path)
    except Exception as e:
        try:
            # if it's already a state dict
            torch.save(model, path)
            print(f"Saved (fallback): {path}")
            saved.append(path)
        except Exception as e2:
            print(f"Failed to save {path}: {e} | {e2}")

# 1) Save classifiers from stage1_results (if present)
if 'stage1_results' in globals():
    for r in stage1_results:
        backbone = r.get('backbone', 'backbone').replace(' ', '')
        clf_name = r.get('classifier', 'clf').replace(' ', '')
        model = r.get('model', None)
        if model is not None:
            fname = f"checkpoints/stage1_{backbone}_{clf_name}.pth"
            _safe_save(model, fname)

# 2) Save CLIP stage1 results (if present)
if 'clip_stage1_results' in globals():
    for r in clip_stage1_results:
        clf_name = r.get('classifier', 'clf').replace(' ', '')
        model = r.get('model', None)
        if model is not None:
            fname = f"checkpoints/clip_stage1_{clf_name}.pth"
            _safe_save(model, fname)

# 3) Save CLIP retrain classifiers
if 'clip_stage1_retrain' in globals():
    for r in clip_stage1_retrain:
        clf_name = r.get('classifier', 'clf').replace(' ', '')
        model = r.get('model', None)
        if model is not None:
            fname = f"checkpoints/clip_retrain_{clf_name}.pth"
            _safe_save(model, fname)

# 4) Save best clip classifiers if defined
for varname in ['best_clip_classifier', 'best_clip_classifier_retrain', 'trained_classifier', 'trained_clf', 'trained_clf_retrain']:
    if varname in globals():
        try:
            _safe_save(globals()[varname], f"checkpoints/{varname}.pth")
        except Exception:
            pass

# 5) Save fine-tuned visual encoders if available
for varname, fname in [
    ('best_clip_visual_finetuned', 'checkpoints/clip_visual_stage2_finetuned.pth'),
    ('clip_visual_stage2_retrain', 'checkpoints/clip_visual_stage2_retrain.pth'),
    ('dinov2_small', 'checkpoints/dinov2_small_base.pth')
]:
    if varname in globals():
        model = globals()[varname]
        # don't overwrite dinov2 pretrained weights; save only if modified
        if varname == 'dinov2_small':
            # save state dict of backbone (may be pretrained) for reproducibility
            _safe_save(model, fname)
        else:
            _safe_save(model, fname)

# 6) Save any explicitly saved filenames created earlier
for filename in ['clip_visual_stage2_finetuned.pth', 'clip_visual_stage2_retrain.pth', 'clip_classifier_best_retrain.pth']:
    if os.path.exists(filename):
        dest = os.path.join('checkpoints', filename)
        if not os.path.exists(dest):
            try:
                import shutil
                shutil.copy2(filename, dest)
                print(f"Copied {filename} to {dest}")
                saved.append(dest)
            except Exception as e:
                print(f"Failed to copy {filename}: {e}")

print('\nAll checkpoint save attempts finished.')
print('Files saved:', saved)
