## Task 2 - CLIP Fine-Tuning on the Visual Encoder

In [None]:
#@title GPU / Python / Torch sanity
import os, sys, subprocess, json, platform, torch
print("Python :", sys.version)
print("CUDA   :", torch.version.cuda)
print("Torch  :", torch.__version__)
print("Device :", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
!nvidia-smi || true

In [None]:
# some imports
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
from transformers import CLIPProcessor, CLIPModel, CLIPVisionModel, logging
from peft import LoraConfig, get_peft_model, TaskType
from torchinfo import summary
from tqdm.autonotebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import json
import warnings

In [None]:
# --- Global run settings ---
# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Pre-trained CLIP model (ViT-L/14); this id is what gets passed to from_pretrained.
MODEL_ID = "openai/clip-vit-large-patch14"

# Adjust both of these based on your GPU memory.
BATCH_SIZE = 1
gradient_accumulation_steps = 1

# Number of epochs for Linear Probe & LoRA.
NUM_EPOCHS = 1

print(f"Using device: {DEVICE}")

In [None]:
# CLIP settings
# --- Load CLIP Processor ---
# Load the processor that belongs to the checkpoint named by MODEL_ID.
# It is handed to CLIPTransform further down in this cell.
processor = CLIPProcessor.from_pretrained(MODEL_ID)
# --- Define a transform to process images for CLIP ---
class CLIPTransform:
    """Callable that turns an image into the tensor found under ``pixel_values``.

    The wrapped ``processor`` is invoked on every call with
    ``images=<image>`` and ``return_tensors="pt"``.
    """

    def __init__(self, processor):
        """Store the processor that will be applied to each image."""
        self.processor = processor

    def __call__(self, image):
        """Process ``image`` and return ``pixel_values`` with dim 0 squeezed.

        The processor is expected to accept a PIL image (or a list of images)
        and to return a mapping that contains a ``"pixel_values"`` tensor.
        """
        encoded = self.processor(images=image, return_tensors="pt")
        pixel_values = encoded["pixel_values"]
        # Drop the batch dimension the processor adds in front.
        return pixel_values.squeeze(0)

# Transform instance built on the processor loaded at the top of this cell.
clip_transform = CLIPTransform(processor)

In [None]:
# dataset related imports
from torchvision.datasets import Flowers102 
from datasets import load_dataset

# --- Flowers102 ---
# prepare Flowers102 dataset
# The test images go through `clip_transform`, so every item is already a
# CLIP-ready tensor. `download=True` fetches the files into `root` when they
# are not present yet instead of failing at construction time.
flowers102_test_dts = Flowers102(root="", split="test", transform=clip_transform, download=True) # evaluation on this set
# TODO: you should prepare the training set, i.e. define
# `flowers102_train_dts` and `flowers102_val_dts` (the prints below need both).

print(f"Total training samples: {len(flowers102_train_dts)}")
print(f"Total validation samples: {len(flowers102_val_dts)}")
print(f"Total test samples: {len(flowers102_test_dts)}") # should be 6149

# TODO: prepare class names for Flowers102

# --- CUB-200-2011 ---
birds_200 = load_dataset("bentrevett/caltech-ucsd-birds-200-2011")
# NOTE(review): unlike the Flowers102 test set above, no transform is attached
# to this split here; presumably it has to be applied before batching - confirm.
cub_bird_test_dts = birds_200["test"]
# TODO: you should prepare the training set, i.e. define
# `cub_bird_train_dts` and `cub_bird_val_dts` (the prints below need both).

print(f"Total training samples: {len(cub_bird_train_dts)}")
print(f"Total validation samples: {len(cub_bird_val_dts)}")
print(f"Total test samples: {len(cub_bird_test_dts)}") # should be 5794

# TODO: prepare class names for CUB-200-2011

# === Create DataLoaders ===
# TODO: build the DataLoaders; the training cells iterate over `train_loader`.



In [None]:
print("--- Starting Method: Linear Probing ---")

# Linear-probing scaffold: the CLIP vision backbone and projection are frozen
# and only a classifier head is trained. Lines marked TODO are left for you.
import time  # wall-clock timing for the per-epoch report

# === 1. Load CLIP Vision Model (no text part) ===
# TODO: load the pre-trained CLIP weights (MODEL_ID) and bind `vision_model`
# and `visual_projection`; both names are used by the steps below.
model = ...

# === 2. Freeze backbone ===
for p in vision_model.parameters():
    p.requires_grad = False
for p in visual_projection.parameters():
    p.requires_grad = False

# === 3. Classifier head ===
# TODO: replace the placeholder with the trainable classification head
# (an nn.Module). It must provide .train() and .parameters().
head = ...

# === 4. Training setup ===
criterion = nn.CrossEntropyLoss()
# Only the head is optimised: everything else was frozen in step 2.
# TODO: `lr` has to be defined before this line runs.
optimizer = torch.optim.Adam(head.parameters(), lr=lr)
train_losses, val_losses, val_accuracies = [], [], []

# === 5. Training Loop ===
for epoch in range(NUM_EPOCHS):
    epoch_start = time.perf_counter()
    head.train()
    # TODO: per-epoch setup (e.g. running statistics).
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS} [Train]"):
        # TODO: forward pass, loss, backward pass and optimizer step.
        pass

    # TODO: run validation and append this epoch's values to `train_losses`,
    # `val_losses` and `val_accuracies`; the report below reads the last
    # entry of each list and fails while they are empty.
    epoch_end = time.perf_counter()

    print(f"Epoch {epoch+1} - Train Loss: {train_losses[-1]:.4f} | "
          f"Val Loss: {val_losses[-1]:.4f} | Val Acc: {val_accuracies[-1]*100:.2f}% | "
          f"Time: {epoch_end - epoch_start:.2f} sec")

# === 6. Plot curves ===


# === 7. Test ===


# === 8. Visualization ===


In [None]:
print("--- Starting Method: LoRA Fine-Tuning ---")

# LoRA scaffold: the vision model is wrapped with PEFT adapters, the projection
# stays frozen, and a fresh classifier head is trained together with the
# adapter weights. Lines marked TODO are left for you.
import time  # wall-clock timing for the per-epoch report

# === 1. Load CLIP Vision Model (no text part) ===
# TODO: load the pre-trained CLIP weights (MODEL_ID) and bind `vision_model`
# and `visual_projection`; both names are used by the steps below.
model = ...

# === 2. LoRA config (Q/V projections) ===
# TODO: replace the "..." placeholder with the real LoRA settings.
lora_config = LoraConfig(
    "..."
)

# === 3. Wrap with PEFT ===
vision_model_lora = get_peft_model(vision_model, lora_config)
print("LoRA Model - Trainable Parameters:")
vision_model_lora.print_trainable_parameters()

# === 4. Freeze projection ===
for p in visual_projection.parameters():
    p.requires_grad = False

# === 5. Classifier head ===
# TODO: build a new head for this run (an nn.Module). Without this step the
# loop below would reuse whatever `head` an earlier cell left behind, e.g. the
# one already trained during linear probing.
head = ...

# === 6. Training setup ===
criterion = nn.CrossEntropyLoss()
# Optimise whatever PEFT left trainable in the wrapped model, plus the head.
trainable_params = [p for p in vision_model_lora.parameters() if p.requires_grad]
trainable_params += list(head.parameters())
# TODO: `lr` has to be defined before this line runs.
optimizer = torch.optim.Adam(trainable_params, lr=lr)
train_losses, val_losses, val_accuracies = [], [], []

# === 7. Training Loop ===
for epoch in range(NUM_EPOCHS):
    epoch_start = time.perf_counter()
    # Both trainable modules go into training mode.
    vision_model_lora.train()
    head.train()
    # TODO: per-epoch setup (e.g. running statistics).
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS} [Train]"):
        # TODO: forward pass, loss, backward pass and optimizer step.
        pass

    # TODO: run validation and append this epoch's values to `train_losses`,
    # `val_losses` and `val_accuracies`; the report below reads the last
    # entry of each list and fails while they are empty.
    epoch_end = time.perf_counter()

    print(f"Epoch {epoch+1} - Train Loss: {train_losses[-1]:.4f} | "
          f"Val Loss: {val_losses[-1]:.4f} | Val Acc: {val_accuracies[-1]*100:.2f}% | "
          f"Time: {epoch_end - epoch_start:.2f} sec")

# === 8. Plot curves ===


# === 9. Test ===


# === 10. Visualization ===
