In [1]:
import torch
import wandb
import os
import numpy as np
import torch.nn as nn
import pytorch_lightning as pl
import os, gc, torch
from torch.utils.data import DataLoader, random_split, SubsetRandomSampler
from torchvision import datasets, transforms, models
from pathlib import Path
from typing import Optional, Tuple
import torch.optim as optim
import torch.nn.functional as F
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from torchsummary import summary
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.tuner.tuning import Tuner
from torchvision.models.vision_transformer import vit_b_16
from typing import Literal
# Authenticate with Weights & Biases.
# The API key must never be hard-coded in the notebook: it is read from the
# WANDB_API_KEY environment variable. When the variable is unset, `key=None`
# lets wandb fall back to its own credential lookup (e.g. a previous
# `wandb login`) or an interactive prompt.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/indramandal/.netrc
[34m[1mwandb[0m: Currently logged in as: [33med24s014[0m ([33med24s014-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:


class CustomDataModule(pl.LightningDataModule):
    """LightningDataModule for a folder-per-class image dataset.

    The directory at ``data_dir`` is read with ``torchvision.datasets.ImageFolder``
    and split into a training part and a validation part. The training part
    uses the (optionally augmenting) training transform, the validation part
    always uses the deterministic evaluation transform.
    """

    # Per-channel normalisation statistics shared by every pipeline (the values
    # conventionally used with ImageNet-pretrained torchvision models).
    _NORM_MEAN = [0.485, 0.456, 0.406]
    _NORM_STD = [0.229, 0.224, 0.225]

    def __init__(
        self,
        data_dir: str,
        image_size: Tuple[int, int] = (224, 224),
        batch_size: int = 64,
        val_split: float = 0.2,
        use_augmentation: bool = False,
        num_workers: int = 2,
        seed: int = 42
    ):
        """
        Custom Data Module for handling dataset loading, transformation, and splitting.

        Args:
            data_dir (str): Path to dataset directory.
            image_size (Tuple[int, int]): Target image size (height, width).
            batch_size (int): Batch size for DataLoader.
            val_split (float): Fraction of training data to use for validation.
                Must satisfy ``0 <= val_split < 1``.
            use_augmentation (bool): Whether to apply data augmentation.
            num_workers (int): Number of workers for DataLoader.
            seed (int): Random seed for reproducibility.

        Raises:
            ValueError: If ``val_split`` is outside ``[0, 1)``.
        """
        super().__init__()
        if not 0.0 <= val_split < 1.0:
            raise ValueError(f"val_split must be in [0, 1), got {val_split}")

        self.data_dir = Path(data_dir)
        self.image_size = image_size
        self.batch_size = batch_size
        self.val_split = val_split
        self.use_augmentation = use_augmentation
        self.num_workers = num_workers
        self.seed = seed
        self.class_names = []

        # Define transforms
        self.train_transform = self._get_train_transform()
        self.test_transform = self._get_test_transform()

    def _get_train_transform(self):
        """Builds the training pipeline (augmenting only if ``use_augmentation``)."""
        if self.use_augmentation:
            return transforms.Compose([
                transforms.RandomResizedCrop(self.image_size),
                transforms.RandomHorizontalFlip(),
                transforms.RandomVerticalFlip(),
                transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
                transforms.RandomAffine(degrees=15, translate=(0.1, 0.1), shear=10),
                transforms.ToTensor(),
                transforms.Normalize(mean=self._NORM_MEAN, std=self._NORM_STD)
            ])
        # Without augmentation the training pipeline equals the evaluation one.
        return self._get_test_transform()

    def _get_test_transform(self):
        """Defines transformation pipeline for validation and test data."""
        return transforms.Compose([
            transforms.Resize(self.image_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=self._NORM_MEAN, std=self._NORM_STD)
        ])

    def setup(self, stage: Optional[str] = None):
        """Loads the dataset and creates the train/validation subsets.

        Populates ``self.class_names``, ``self.train_dataset`` and
        ``self.val_dataset``. The split is reproducible for a given ``seed``.
        """
        # Set manual seeds for reproducibility
        torch.manual_seed(self.seed)
        np.random.seed(self.seed)

        # Two views of the same folder, one per transform. A single ImageFolder
        # cannot be shared: subsets keep a reference to their parent dataset, so
        # changing the parent's transform for validation would silently replace
        # the training transform as well (disabling augmentation).
        train_source = datasets.ImageFolder(root=self.data_dir, transform=self.train_transform)
        eval_source = datasets.ImageFolder(root=self.data_dir, transform=self.test_transform)
        self.class_names = train_source.classes

        # Compute split sizes
        total_size = len(train_source)
        val_size = int(total_size * self.val_split)
        train_size = total_size - val_size

        # One seeded permutation shared by both views keeps the two subsets
        # disjoint and independent of the global RNG state.
        split_rng = torch.Generator().manual_seed(self.seed)
        shuffled = torch.randperm(total_size, generator=split_rng).tolist()

        self.train_dataset = torch.utils.data.Subset(train_source, shuffled[:train_size])
        self.val_dataset = torch.utils.data.Subset(eval_source, shuffled[train_size:])

    def train_dataloader(self):
        """Returns DataLoader for training data."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
            # persistent_workers=True is rejected by DataLoader when num_workers == 0
            persistent_workers=self.num_workers > 0,
        )

    def val_dataloader(self):
        """Returns DataLoader for validation data."""
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            # persistent_workers=True is rejected by DataLoader when num_workers == 0
            persistent_workers=self.num_workers > 0,
        )

    def test_dataloader(self, test_dir: Optional[str] = None):
        """Returns DataLoader for test data.

        Args:
            test_dir (Optional[str]): Folder with the test images. Defaults to a
                ``val`` directory located next to ``data_dir``.
        """
        test_path = Path(test_dir) if test_dir else self.data_dir.parent / "val"
        test_dataset = datasets.ImageFolder(root=test_path, transform=self.test_transform)
        return DataLoader(test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers)


In [3]:
class LitClassifier(pl.LightningModule):
    """Fine-tunes a pre-trained torchvision classifier on a new label set.

    The backbone's final linear layer is replaced by one with ``num_classes``
    outputs, and a freezing strategy decides which parameters are trained.
    """

    _STRATEGIES = ('freeze_all', 'freeze_partial', 'unfreeze_all')

    def __init__(
        self,
        model_name: str,
        num_classes: int,
        lr: float = 1e-3,
        finetune_strategy: Literal['freeze_all', 'freeze_partial', 'unfreeze_all'] = 'freeze_all',
        k_layers: int = 0
    ):
        """
        Args:
            model_name: One of ``resnet50``, ``vgg16``, ``inception_v3``,
                ``googlenet``, ``efficientnet_v2_s`` or ``vit_b_16``.
            num_classes: Number of output classes of the new head.
            lr: Learning rate for Adam.
            finetune_strategy: ``freeze_all`` trains only the classifier head,
                ``freeze_partial`` additionally leaves the last ``k_layers``
                top-level child modules trainable, ``unfreeze_all`` trains
                every parameter.
            k_layers: Number of trailing top-level children kept trainable;
                only used by ``freeze_partial``.

        Raises:
            ValueError: For an unsupported model name or strategy, or a
                negative ``k_layers``.
        """
        super().__init__()
        self.save_hyperparameters()
        self.lr = lr

        # Load the pre-trained model and replace final layer
        self.model = self._load_model(model_name, num_classes)

        # Apply fine-tuning strategy
        self._apply_finetune_strategy(finetune_strategy, k_layers)

    def _load_model(self, model_name, num_classes):
        """Returns the pre-trained backbone with a freshly initialised head.

        Weights are requested through the ``weights=`` argument; the
        ``pretrained=True`` flag is deprecated in torchvision and resolves to
        the same ``IMAGENET1K_V1`` checkpoints.
        """
        weights = "IMAGENET1K_V1"
        if model_name == 'resnet50':
            model = models.resnet50(weights=weights)
            model.fc = nn.Linear(model.fc.in_features, num_classes)
        elif model_name == 'vgg16':
            model = models.vgg16(weights=weights)
            model.classifier[6] = nn.Linear(model.classifier[6].in_features, num_classes)
        elif model_name == 'inception_v3':
            # The checkpoint is loaded with aux_logits=True; the auxiliary
            # output is then switched off so forward() yields the main logits.
            model = models.inception_v3(weights=weights, aux_logits=True)
            model.fc = nn.Linear(model.fc.in_features, num_classes)
            model.aux_logits = False
        elif model_name == 'googlenet':
            # Same handling of the auxiliary heads as for inception_v3.
            model = models.googlenet(weights=weights, aux_logits=True)
            model.fc = nn.Linear(model.fc.in_features, num_classes)
            model.aux_logits = False
        elif model_name == 'efficientnet_v2_s':
            model = models.efficientnet_v2_s(weights=weights)
            model.classifier[1] = nn.Linear(model.classifier[1].in_features, num_classes)
        elif model_name == 'vit_b_16':
            model = vit_b_16(weights=weights)
            model.heads.head = nn.Linear(model.heads.head.in_features, num_classes)
        else:
            raise ValueError(f"Unsupported model: {model_name}")
        return model

    def _apply_finetune_strategy(self, strategy, k_layers):
        """Sets ``requires_grad`` on the backbone according to ``strategy``.

        For ``freeze_partial`` a "layer" is a direct child of the torchvision
        model, so the granularity depends on the architecture and can be
        coarse when the model has only a few top-level children.
        """
        if strategy not in self._STRATEGIES:
            # An unknown name used to fall through silently, leaving the whole
            # network trainable without any warning.
            raise ValueError(
                f"Unsupported finetune strategy: {strategy!r}; expected one of {self._STRATEGIES}"
            )
        if k_layers < 0:
            raise ValueError(f"k_layers must be non-negative, got {k_layers}")

        if strategy == 'freeze_all':
            for param in self.model.parameters():
                param.requires_grad = False
            self._unfreeze_final_classifier()

        elif strategy == 'freeze_partial':
            children = list(self.model.children())
            # Everything except the last k_layers children is frozen.
            num_frozen = max(len(children) - k_layers, 0)
            for child in children[:num_frozen]:
                for param in child.parameters():
                    param.requires_grad = False
            self._unfreeze_final_classifier()

        else:  # 'unfreeze_all'
            for param in self.model.parameters():
                param.requires_grad = True

    def _unfreeze_final_classifier(self):
        """Makes the classification head trainable.

        The head is looked up under the first attribute present among ``fc``,
        ``classifier`` and ``heads``.
        """
        for head_name in ('fc', 'classifier', 'heads'):
            if hasattr(self.model, head_name):
                for param in getattr(self.model, head_name).parameters():
                    param.requires_grad = True
                break

    def forward(self, x):
        """Returns the class logits for a batch of images."""
        output = self.model(x)
        if isinstance(output, tuple):  # Some models like Inception return (main, aux)
            output = output[0]
        return output

    def _shared_step(self, batch, stage):
        """Computes cross-entropy loss and accuracy and logs them as ``{stage}_*``."""
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        acc = (logits.argmax(dim=1) == y).float().mean()
        self.log(f"{stage}_loss", loss, prog_bar=True)
        self.log(f"{stage}_acc", acc, prog_bar=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Logs ``train_loss`` / ``train_acc`` and returns the loss."""
        return self._shared_step(batch, "train")

    def validation_step(self, batch, batch_idx):
        """Logs ``val_loss`` / ``val_acc``."""
        self._shared_step(batch, "val")

    def test_step(self, batch, batch_idx):
        """Logs ``test_loss`` / ``test_acc`` so the module can be used with ``Trainer.test``."""
        self._shared_step(batch, "test")

    def configure_optimizers(self):
        """Returns an Adam optimizer over the trainable parameters only."""
        trainable = [p for p in self.parameters() if p.requires_grad]
        return torch.optim.Adam(trainable, lr=self.lr)


In [7]:


# Step 1: Set hyperparameters here
MODEL_NAME = "efficientnet_v2_s"  # Options: resnet50, vgg16, inception_v3, googlenet, efficientnet_v2_s, vit_b_16
USE_AUGMENTATION = True
FINETUNE_STRATEGY = "freeze_partial"  # Options: freeze_all, freeze_partial, unfreeze_all
K_LAYERS = 2  # Only used when freeze_partial
LEARNING_RATE = 1e-4
BATCH_SIZE = 16
# Input resolution used for each supported backbone.
IMAGE_SIZE = {
    "resnet50": (224, 224),
    "vgg16": (224, 224),
    "inception_v3": (299, 299),
    "googlenet": (224, 224),
    "efficientnet_v2_s": (384, 384),
    "vit_b_16": (224, 224),
}[MODEL_NAME]
NUM_EPOCHS = 15
SEED = 42

# Step 2: Initialize the DataModule
data_module = CustomDataModule(
    data_dir="/Users/indramandal/Documents/VS_CODE/DA6401/DA6401_Assignment_2/inaturalist_12K/train",
    image_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    val_split=0.2,
    use_augmentation=USE_AUGMENTATION,
    seed=SEED
)
# setup() is called here (not only by the Trainer) because the class names
# are needed to size the model's output layer below.
data_module.setup(stage="fit")


os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
gc.collect()
torch.cuda.empty_cache()

# Step 3: Initialize the model
model = LitClassifier(
    model_name=MODEL_NAME,
    num_classes=len(data_module.class_names),
    lr=LEARNING_RATE,
    finetune_strategy=FINETUNE_STRATEGY,
    k_layers=K_LAYERS
)

# Step 4: Print model summary
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print("Detailed Model Summary:")
# torchsummary pushes a random batch through the network. Doing that in train
# mode would update the pre-trained BatchNorm running statistics with noise,
# so the pass runs in eval mode without gradients. The summary is purely
# informational and must not abort the run if its hooks fail on a module.
model.eval()
try:
    with torch.no_grad():
        summary(model, input_size=(3, *IMAGE_SIZE), device=device)
except Exception as summary_error:
    print(f"Model summary unavailable for {MODEL_NAME}: {summary_error}")
finally:
    model.train()


# Step 5: Configure Trainer
trainer = Trainer(
    max_epochs=NUM_EPOCHS,  # was hard-coded, which left NUM_EPOCHS unused
    precision="16-mixed",
    accumulate_grad_batches=2,  # new
    callbacks=[EarlyStopping(monitor="val_acc", mode="max", patience=3)]
)

# Step 6: Train the model
trainer.fit(model, datamodule=data_module)




Detailed Model Summary:


Using 16bit Automatic Mixed Precision (AMP)
You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 24, 192, 192]             648
       BatchNorm2d-2         [-1, 24, 192, 192]              48
              SiLU-3         [-1, 24, 192, 192]               0
            Conv2d-4         [-1, 24, 192, 192]           5,184
       BatchNorm2d-5         [-1, 24, 192, 192]              48
              SiLU-6         [-1, 24, 192, 192]               0
   StochasticDepth-7         [-1, 24, 192, 192]               0
       FusedMBConv-8         [-1, 24, 192, 192]               0
            Conv2d-9         [-1, 24, 192, 192]           5,184
      BatchNorm2d-10         [-1, 24, 192, 192]              48
             SiLU-11         [-1, 24, 192, 192]               0
  StochasticDepth-12         [-1, 24, 192, 192]               0
      FusedMBConv-13         [-1, 24, 192, 192]               0
           Conv2d-14           [-1, 96,


  | Name  | Type         | Params | Mode 
-----------------------------------------------
0 | model | EfficientNet | 20.2 M | train
-----------------------------------------------
12.8 K    Trainable params
20.2 M    Non-trainable params
20.2 M    Total params
80.761    Total estimated model params size (MB)
714       Modules in train mode
0         Modules in eval mode


Epoch 0:   3%|▎         | 13/500 [00:04<03:02,  2.67it/s, v_num=62, train_loss=2.320, train_acc=0.0625]


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

In [4]:
# Search space for the W&B hyperparameter sweep. Every parameter is a
# categorical choice; the sweep maximises the logged "val_acc" metric.
sweep_config = dict(
    method="bayes",  # or "grid"
    metric=dict(name="val_acc", goal="maximize"),
    parameters=dict(
        model_name=dict(
            values=["resnet50", "vgg16", "inception_v3", "googlenet", "efficientnet_v2_s", "vit_b_16"],
        ),
        lr=dict(values=[1e-3, 1e-4, 1e-5]),
        batch_size=dict(values=[16, 32]),
        finetune_strategy=dict(values=["freeze_all", "freeze_partial", "unfreeze_all"]),
        k_layers=dict(values=[1, 2, 3]),
        use_augmentation=dict(values=[True, False]),
    ),
)


In [None]:
def sweep_train(config=None):
    """Trains one model for a single W&B sweep run.

    Opens a W&B run, reads the hyperparameters chosen for it from
    ``wandb.config`` (``model_name``, ``batch_size``, ``use_augmentation``,
    ``lr``, ``finetune_strategy``, ``k_layers``), builds the data module and
    classifier, and fits them with a Lightning ``Trainer`` that logs to W&B.

    Args:
        config: Optional default configuration forwarded to ``wandb.init``.
    """
    with wandb.init(config=config):
        config = wandb.config

        # Input resolution used for each supported backbone.
        IMAGE_SIZE = {
            "resnet50": (224, 224),
            "vgg16": (224, 224),
            "inception_v3": (299, 299),
            "googlenet": (224, 224),
            "efficientnet_v2_s": (384, 384),
            "vit_b_16": (224, 224),
        }[config.model_name]

        #  Step 2: Initialize the DataModule
        data_module = CustomDataModule(
            data_dir="/Users/indramandal/Documents/VS_CODE/DA6401/DA6401_Assignment_2/inaturalist_12K/train",
            image_size=IMAGE_SIZE,
            batch_size=config.batch_size,
            val_split=0.2,
            use_augmentation=config.use_augmentation,
            seed=42
        )
        # setup() is needed up front: the class names size the model's head.
        data_module.setup(stage="fit")

        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
        gc.collect()
        torch.cuda.empty_cache()

        # Step 3: Initialize the model
        model = LitClassifier(
            model_name=config.model_name,
            num_classes=len(data_module.class_names),
            lr=config.lr,
            finetune_strategy=config.finetune_strategy,
            k_layers=config.k_layers
        )

        # Step 4: Print model summary
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)
        print("Detailed Model Summary:")
        # torchsummary pushes a random batch through the network. In train
        # mode that would update the pre-trained BatchNorm running statistics
        # with noise, so the pass runs in eval mode without gradients. The
        # summary is informational only: a failure inside its hooks is
        # reported instead of aborting the sweep run.
        model.eval()
        try:
            with torch.no_grad():
                summary(model, input_size=(3, *IMAGE_SIZE), device=device)
        except Exception as summary_error:
            print(f"Model summary unavailable for {config.model_name}: {summary_error}")
        finally:
            model.train()

        wandb_logger = WandbLogger(project="finetune", log_model=True)

        trainer = Trainer(
            max_epochs=15,
            precision="16-mixed",
            accumulate_grad_batches=2,
            callbacks=[EarlyStopping(monitor="val_acc", mode="max", patience=3)],
            logger=wandb_logger
        )

        trainer.fit(model, datamodule=data_module)


In [6]:
# Register the sweep definition under the "finetune" project, then let a
# local agent execute sweep_train for a fixed number of runs.
sweep_id = wandb.sweep(sweep=sweep_config, project="finetune")
wandb.agent(
    sweep_id=sweep_id,
    function=sweep_train,
    count=10,  # adjust count
)



Create sweep with ID: ahtwrnd7
Sweep URL: https://wandb.ai/ed24s014-indian-institute-of-technology-madras/finetune/sweeps/ahtwrnd7


[34m[1mwandb[0m: Agent Starting Run: 4ehyqna7 with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	finetune_strategy: unfreeze_all
[34m[1mwandb[0m: 	k_layers: 1
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	model_name: resnet50
[34m[1mwandb[0m: 	use_augmentation: True


Using 16bit Automatic Mixed Precision (AMP)
You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Detailed Model Summary:
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]           4,096
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]          16,384
      BatchNorm2d-12          [-1, 256, 56, 56]             512
           Conv2d-13          [-1, 256, 56, 56]          16,384
      BatchNorm

/Users/indramandal/Documents/VS_CODE/DA6401/DA6401_Assignment_2/env/lib/python3.9/site-packages/pytorch_lightning/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.

  | Name  | Type   | Params | Mode 
-----------------------------------------
0 | model | ResNet | 23.5 M | train
-----------------------------------------
23.5 M    Trainable params
0         Non-trainable params
23.5 M    Total params
94.114    Total estimated model params size (MB)
151       Modules in train mode
0         Modules in eval mode


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]



Epoch 4:   5%|▌         | 25/500 [00:12<03:50,  2.06it/s, v_num=qna7, train_loss=1.470, train_acc=0.500, val_loss=1.660, val_acc=0.421]  

[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


Traceback (most recent call last):
  File "/var/folders/16/2v843_gd4x1cty4mwj7r2t780000gn/T/ipykernel_74894/3351227780.py", line 55, in sweep_train
    trainer.fit(model, datamodule=data_module)
  File "/Users/indramandal/Documents/VS_CODE/DA6401/DA6401_Assignment_2/env/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
    call._call_and_handle_interrupt(
  File "/Users/indramandal/Documents/VS_CODE/DA6401/DA6401_Assignment_2/env/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py", line 48, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
  File "/Users/indramandal/Documents/VS_CODE/DA6401/DA6401_Assignment_2/env/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/Users/indramandal/Documents/VS_CODE/DA6401/DA6401_Assignment_2/env/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
    results = self._

In [13]:
# Explicitly finish any W&B run that is still active (e.g. one left open
# after the sweep above was interrupted).
wandb.finish()