In [1]:
# Standard PyTorch + Torchvision stack
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset
from sklearn.metrics import f1_score, confusion_matrix

# Reproducibility (essential for research and debugging)
import random
SEED = 1337
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Note: For complete reproducibility, you may also need:
# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = False

# Device (GPU if available)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', DEVICE)

Using device: cuda


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Let's examine a fresh ResNet-18 pretrained on ImageNet
res18 = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)


## 3) Why Transfer Learning? The Power of Pretrained Features

### The Transfer Learning Hypothesis

Networks trained on large datasets (like ImageNet with 1.2M images, 1000 classes) learn hierarchical features:

Early layers (conv1, layer1): Low-level features (edges, textures, colors)

Middle layers (layer2, layer3): Mid-level features (shapes, parts, patterns)

Deep layers (layer4): High-level, task-specific features (object parts)

Final layer (fc): Class-specific decision boundaries

Key Insight: Low and mid-level features are universal across vision tasks! We can reuse them and only adapt the high-level features to our new task.

## Fine-Tuning Strategies

### 1) Feature Extraction *(Freeze all, train head)*
- **Pros:** Fastest; lowest overfitting risk  
- **Use when:** Limited data; domain ≈ ImageNet  
- **Unfrozen:** `fc` only

---

### 2) Shallow Fine-Tuning *(Unfreeze layer4 + head)*
- **Pros:** Adapts high-level features; still efficient  
- **Use when:** Moderate data; somewhat different domain  
- **Unfrozen:** `layer4`, `fc`

---

### 3) Deep Fine-Tuning *(Unfreeze layer3 + layer4 + head)*
- **Pros:** Greater adaptation capacity  
- **Use when:** Sufficient data; noticeable domain shift  
- **Unfrozen:** `layer3`, `layer4`, `fc`

---

### 4) Full Fine-Tuning *(Unfreeze everything)*
- **Pros:** Maximum flexibility  
- **Cons:** Slowest; higher overfitting risk  
- **Use when:** Large dataset; very different domain  
- **Unfrozen:** all layers

---

### Practical Tips
- Prefer **smaller LR** for earlier layers (discriminative LRs).
- Add regularization when unfreezing more (augmentations, weight decay, label smoothing).
- Monitor validation; consider early stopping/checkpointing.





## 4) Data Preprocessing: Why ImageNet Statistics?

### Understanding ImageNet Normalization
Pretrained networks expect inputs with specific statistics because they were trained on normalized ImageNet data:

In [4]:
# ImageNet channel-wise statistics (computed over millions of images)
IMAGENET_MEAN = [0.485, 0.456, 0.406]  # Mean per channel (R, G, B)
IMAGENET_STD  = [0.229, 0.224, 0.225]  # Std dev per channel

# Why these specific values?
# - They center the data around 0 and scale to ~[-2, 2] range
# - This matches the distribution the network was trained on
# - Network weights are calibrated to these input scales

In [5]:
import kagglehub

path = kagglehub.dataset_download("grassknoted/asl-alphabet")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'asl-alphabet' dataset.
Path to dataset files: /kaggle/input/asl-alphabet


In [9]:
from google.colab import files
files.upload()

KeyboardInterrupt: 

In [None]:
!unzip /content/Train_Data.zip -d /content/

In [6]:
!cp -r "/content/drive/MyDrive/Adams.Daen429.Final.Proj/Given_Test" /content/

In [8]:
IMG_SIZE = 224          # Standard ImageNet size
BATCH_SIZE = 64

# Training transforms: Add variability to prevent overfitting
train_tf = transforms.Compose([
    # 1. Resize: ASL images need to be 224×224
    transforms.Resize((IMG_SIZE, IMG_SIZE)),

    # 3. Convert to tensor: PIL Image → Tensor, scales to [0,1]
    transforms.ToTensor(),

    # 4. Normalize: Match ImageNet statistics
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
    # This does: output = (input - mean) / std
])

# Validation transforms: No augmentation (we want consistent evaluation)
val_tf = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
])
print('here')
# Load ASL dataset
full_train_ds = datasets.ImageFolder(root="/kaggle/input/asl-alphabet/asl_alphabet_train/asl_alphabet_train",transform=train_tf)
print('in between')
given_test_ds = datasets.ImageFolder(root="/content/Given_Test",transform=val_tf)

#full_train_ds = datasets.ImageFolder(root="/content/drive/MyDrive/Adams.Daen429.Final.Proj/Train_Data", transform=train_tf)
#given_test_ds = datasets.ImageFolder(root="/content/drive/MyDrive/Adams.Daen429.Final.Proj/Given_Test",transform=val_tf)

print('here')
#Need to split full train into validation set
indices = np.arange(len( full_train_ds ) )
labels = np.array(full_train_ds.targets ) # ASL labels
train_idx , val_idx = train_test_split(indices , test_size =0.2 , stratify = labels , random_state =429)


train_subset_ds = Subset(full_train_ds, train_idx)

# Create the stratified Validation Subset
val_subset_ds = Subset(full_train_ds, val_idx)


train_loader = DataLoader(
    train_subset_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
    pin_memory=True
)

val_loader = DataLoader(
    train_subset_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
    pin_memory=True
)

test_loader = DataLoader(
    given_test_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
    pin_memory=True
)

NUM_CLASSES = 29
print(f' Dataset: {len(full_train_ds):,} train, {len(given_test_ds):,} val')
print(f' Classes: {full_train_ds.classes} and test classes {given_test_ds.classes}')

here
in between
here
 Dataset: 87,000 train, 28 val
 Classes: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'del', 'nothing', 'space'] and test classes ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'nothing', 'space']


## 5) Model Setup: Adapting ResNet-18 for ASL Translation

Replacing the Classification Head

The pretrained ResNet-18 outputs 1000 classes (ImageNet), but we need 28 (ASL no del):

In [10]:
# Start with ImageNet-pretrained weights
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

# Examine the original classifier
print(" Original FC layer:")
print(f"  Input features: {model.fc.in_features}")
print(f"  Output features: {model.fc.out_features} (ImageNet classes)")

# Replace with our custom classifier
# The in_features must match (512 for ResNet-18's final feature size)
# The NUM_CLASSES will change for other datasets
model.fc = nn.Linear(model.fc.in_features, NUM_CLASSES)

print("\n New FC layer:")
print(f"  Input features: {model.fc.in_features}")
print(f"  Output features: {model.fc.out_features} (our classes)")

# Move model to GPU if available
model = model.to(DEVICE)

 Original FC layer:
  Input features: 512
  Output features: 1000 (ImageNet classes)

 New FC layer:
  Input features: 512
  Output features: 29 (our classes)


## Understanding Parameter Names and Hierarchy
To selectively freeze/unfreeze layers, we need to understand PyTorch's parameter naming:

In [11]:
def explore_model_structure(model, max_depth=2):
    """Visualize the model's hierarchical structure"""

    print("\n Model Structure (hierarchical view):")

    def print_module(module, prefix="", depth=0):
        if depth >= max_depth:
            return
        for name, child in module.named_children():
            param_count = sum(p.numel() for p in child.parameters())
            trainable = sum(p.numel() for p in child.parameters() if p.requires_grad)
            print(f"{prefix}├── {name}: {child.__class__.__name__} "
                  f"({param_count:,} params, {trainable:,} trainable)")
            if depth < max_depth - 1:
                print_module(child, prefix + "│   ", depth + 1)

    print_module(model)

# Explore structure
explore_model_structure(model)


 Model Structure (hierarchical view):
├── conv1: Conv2d (9,408 params, 9,408 trainable)
├── bn1: BatchNorm2d (128 params, 128 trainable)
├── relu: ReLU (0 params, 0 trainable)
├── maxpool: MaxPool2d (0 params, 0 trainable)
├── layer1: Sequential (147,968 params, 147,968 trainable)
│   ├── 0: BasicBlock (73,984 params, 73,984 trainable)
│   ├── 1: BasicBlock (73,984 params, 73,984 trainable)
├── layer2: Sequential (525,568 params, 525,568 trainable)
│   ├── 0: BasicBlock (230,144 params, 230,144 trainable)
│   ├── 1: BasicBlock (295,424 params, 295,424 trainable)
├── layer3: Sequential (2,099,712 params, 2,099,712 trainable)
│   ├── 0: BasicBlock (919,040 params, 919,040 trainable)
│   ├── 1: BasicBlock (1,180,672 params, 1,180,672 trainable)
├── layer4: Sequential (8,393,728 params, 8,393,728 trainable)
│   ├── 0: BasicBlock (3,673,088 params, 3,673,088 trainable)
│   ├── 1: BasicBlock (4,720,640 params, 4,720,640 trainable)
├── avgpool: AdaptiveAvgPool2d (0 params, 0 trainable)
├── f

## 6) Freezing and Unfreezing: The Core Mechanism
### How Freezing Works
When we "freeze" a layer, we set requires_grad=False on its parameters:

Frozen parameters: No gradients computed, no updates during backprop

Unfrozen parameters: Gradients computed, weights updated

In [12]:
def set_requires_grad(module: nn.Module, requires_grad: bool):
    """
    Recursively set requires_grad for all parameters in a module.

    Args:
        module: PyTorch module (layer, block, or entire model)
        requires_grad: True to unfreeze (train), False to freeze
    """
    for param in module.parameters():
        param.requires_grad = requires_grad

    # Print status
    param_count = sum(p.numel() for p in module.parameters())
    status = "UNFROZEN (trainable)" if requires_grad else "FROZEN"
    print(f"  {module.__class__.__name__}: {param_count:,} parameters {status}")

# Example: Freeze entire model, then selectively unfreeze
print(" Freezing entire model...")
set_requires_grad(model, False)

print("\n Unfreezing only the FC layer...")
set_requires_grad(model.fc, True)

# Verify what's trainable
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"\n Trainable: {trainable_params:,} / {total_params:,} parameters "
      f"({100*trainable_params/total_params:.6f}%)")

 Freezing entire model...
  ResNet: 11,191,389 parameters FROZEN

 Unfreezing only the FC layer...
  Linear: 14,877 parameters UNFROZEN (trainable)

 Trainable: 14,877 / 11,191,389 parameters (0.132933%)


## 7) Training Infrastructure

Training and Evaluation Functions

In [16]:
criterion = nn.CrossEntropyLoss()

def train_one_epoch(model, loader, optimizer):
    """
    Train for one epoch.

    Returns:
        tuple: (average_loss, accuracy)
    """
    model.train()  # Enable dropout, batch norm training mode

    total_samples = 0
    correct_predictions = 0
    running_loss = 0.0

    for batch_idx, (images, labels) in enumerate(loader):
        # Move data to device (GPU/CPU)
        images, labels = images.to(DEVICE), labels.to(DEVICE)

        # Forward pass
        optimizer.zero_grad()  # Clear previous gradients
        logits = model(images)
        loss = criterion(logits, labels)

        # Backward pass
        loss.backward()  # Compute gradients
        optimizer.step()  # Update weights

        # Track metrics
        running_loss += loss.item() * images.size(0)
        predictions = logits.argmax(dim=1)
        correct_predictions += (predictions == labels).sum().item()
        total_samples += images.size(0)

        # Optional: Print progress
        if batch_idx % 100 == 0:
            print(f"    Batch {batch_idx}/{len(loader)}, "
                  f"Loss: {loss.item():.4f}")
        #print('Here 1')
    avg_loss = running_loss / total_samples
    accuracy = correct_predictions / total_samples
    return avg_loss, accuracy

@torch.no_grad()  # Decorator disables gradient computation
def evaluate(model, loader):
    """
    Evaluate model on validation/test set.

    Returns:
        tuple: (average_loss, accuracy)
    """
    print('Reached Evaluate')
    model.eval()  # Disable dropout, batch norm eval mode

    total_samples = 0
    correct_predictions = 0
    running_loss = 0.0
    all_preds = [] # <-- Collects all predicted labels
    all_targets = []

    for images, labels in loader:
        images, labels = images.to(DEVICE), labels.to(DEVICE)

        # Forward pass only (no backward)
        logits = model(images)
        loss = criterion(logits, labels)




        # Track metrics
        running_loss += loss.item() * images.size(0)
        predictions = logits.argmax(dim=1)
        correct_predictions += (predictions == labels).sum().item()
        total_samples += images.size(0)


        all_preds.extend(predictions.cpu().tolist())
        # FIX HERE: Use 'labels' instead of 'targets'
        all_targets.extend(labels.cpu().tolist())


    avg_loss = running_loss / total_samples
    accuracy = correct_predictions / total_samples
    return avg_loss, accuracy, all_targets, all_preds

In [14]:
#Define dictionary that is going to hold the loss accross epochs for each model for comparison purposes
total_loss_by_epoch_model = {}

## 8) Phase 1.1: Head-Only Fine-Tuning (Feature Extraction)
Strategy: Use ResNet as a Fixed Feature Extractor

In this phase, we:

1. Freeze all convolutional layers (keep ImageNet features)

2. Train only the new classifier head (learn new class boundaries)

3. Use higher learning rate (since we're training from scratch)

This is the safest approach with limited data!

In [17]:

#This is going to hold the losses for the different hyper paramate combinations
head_loss_epoch_train = {}
head_loss_epoch_val ={}
# Hyperparameters for Phase 1

EPOCHS_HEAD_ONLY = 3
LR_HEAD = [1e-3, 1e-2, .1]

print("\n" + "="*60)
print(" PHASE 1: HEAD-ONLY FINE-TUNING")
print("="*60)



BATCH_SIZE = [64, 128, 256]

for x in range(3):
    #redefine model for each set of hyper paramaters
    model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
    model.fc = nn.Linear(model.fc.in_features, NUM_CLASSES)
    model = model.to(DEVICE)

    #Freeze Layers every Loop iteration
    set_requires_grad(model, False)
    set_requires_grad(model.fc, True)

    #Create dictionary entry for losses by epoch across val and train
    head_loss_epoch_train[x]=[]
    head_loss_epoch_val[x]=[]

    train_loader = DataLoader(
    train_subset_ds,
    batch_size=BATCH_SIZE[x],
    shuffle=True,
    num_workers=2,
    pin_memory=True
    )

    val_loader = DataLoader(
    train_subset_ds,
    batch_size=BATCH_SIZE[x],
    shuffle=True,
    num_workers=2,
    pin_memory=True
    )

    trainable_params = filter(lambda p: p.requires_grad, model.parameters())
    if x == 0:
        optimizer = optim.Adam(trainable_params, lr=LR_HEAD[x])
    elif x==1:
        optimizer = optim.SGD(trainable_params, lr=LR_HEAD[x])

    elif x==2:
        optim.Adagrad(trainable_params, lr=LR_HEAD[x])
    print(f"\n Optimizer setup:")
    print(f"   Learning rate: {LR_HEAD[x]}")
    print(f"   Trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

    # Step 4: Training loop
    print("\n Training progress:")
    print("-" * 60)

    best_val_acc = 0.0
    for epoch in range(1, EPOCHS_HEAD_ONLY + 1):
        print(f"\nEpoch {epoch}/{EPOCHS_HEAD_ONLY}")

        # Train
        train_loss, train_acc = train_one_epoch(model, train_loader, optimizer)

        # Validate
        val_loss, val_acc, true_labels_e, pred_labels_e = evaluate(model, val_loader)

        #Append losses for plotting purposes
        head_loss_epoch_train[x].append(train_loss)
        head_loss_epoch_val[x].append(val_loss)

        # Track best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc

            best_true_labels = true_labels_e
            best_pred_labels = pred_labels_e
            # Optional: Save best model
            # torch.save(model.state_dict(), 'best_model_phase1.pth')

        print(f"   Train: Loss={train_loss:.4f}, Acc={train_acc:.3f}")
        print(f"   Val:   Loss={val_loss:.4f}, Acc={val_acc:.3f} "
          f"{' New best!' if val_acc == best_val_acc else ''}")

    print("\n Phase 1 Complete!")
    print(f"   Best validation accuracy: {best_val_acc:.3f}")

    #Best Epoch F1 Score
    macro_f1 = f1_score(best_true_labels, best_pred_labels,
                        average='macro', labels=list(range(len(full_train_ds.classes))))

    #Confustion Matrix
    conf_matrix = confusion_matrix(best_true_labels, best_pred_labels,
                                   labels=list(range(len(full_train_ds.classes))))

    print(f"Macro-F1 Score (Best Epoch): {macro_f1:.4f}")
    print("\nConfusion Matrix (True vs. Predicted):\n")
    print(conf_matrix)


 PHASE 1: HEAD-ONLY FINE-TUNING

 Freezing all layers...
  ResNet: 11,191,389 parameters FROZEN

 Unfreezing classifier head...
  Linear: 14,877 parameters UNFROZEN (trainable)

 Optimizer setup:
   Learning rate: 0.001
   Trainable params: 14,877

 Training progress:
------------------------------------------------------------

Epoch 1/3
    Batch 0/1088, Loss: 2.6674
    Batch 100/1088, Loss: 1.4809
    Batch 200/1088, Loss: 0.9902
    Batch 300/1088, Loss: 0.7204
    Batch 400/1088, Loss: 0.5434
    Batch 500/1088, Loss: 0.5824
    Batch 600/1088, Loss: 0.4506
    Batch 700/1088, Loss: 0.5051
    Batch 800/1088, Loss: 0.4005
    Batch 900/1088, Loss: 0.4230
    Batch 1000/1088, Loss: 0.4672
   Train: Loss=0.7441, Acc=0.856
   Val:   Loss=0.3098, Acc=0.941  New best!

Epoch 2/3
    Batch 0/1088, Loss: 0.3416
    Batch 100/1088, Loss: 0.3186
    Batch 200/1088, Loss: 0.2835
    Batch 300/1088, Loss: 0.2847
    Batch 400/1088, Loss: 0.3211
    Batch 500/1088, Loss: 0.3403
    Batch 60



    Batch 0/544, Loss: 0.1302
    Batch 100/544, Loss: 0.1012
    Batch 200/544, Loss: 0.1566
    Batch 300/544, Loss: 0.0885
    Batch 400/544, Loss: 0.1313
    Batch 500/544, Loss: 0.1392
   Train: Loss=0.1318, Acc=0.975
   Val:   Loss=0.1209, Acc=0.978  New best!

Epoch 2/3
    Batch 0/544, Loss: 0.1241
    Batch 100/544, Loss: 0.1378
    Batch 200/544, Loss: 0.1993
    Batch 300/544, Loss: 0.1158
    Batch 400/544, Loss: 0.1166
    Batch 500/544, Loss: 0.1436
   Train: Loss=0.1294, Acc=0.976
   Val:   Loss=0.1185, Acc=0.978  New best!

Epoch 3/3
    Batch 0/544, Loss: 0.1183
    Batch 100/544, Loss: 0.1886
    Batch 200/544, Loss: 0.1172
    Batch 300/544, Loss: 0.1059
    Batch 400/544, Loss: 0.1160
    Batch 500/544, Loss: 0.0675
   Train: Loss=0.1289, Acc=0.975
   Val:   Loss=0.1181, Acc=0.978 

 Phase 1 Complete!
   Best validation accuracy: 0.978
Macro-F1 Score (Best Epoch): 0.9784

Confusion Matrix (True vs. Predicted):

[[2361    7    0    0    7    0    0    0    2    0    



    Batch 0/272, Loss: 0.1425
    Batch 100/272, Loss: 0.1386
    Batch 200/272, Loss: 0.0773


KeyboardInterrupt: 

# TB Model

In [None]:
#Redefine model for phase 2
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
model.fc = nn.Linear(model.fc.in_features, NUM_CLASSES)
model = model.to(DEVICE)


#This is going to hold the losses for the different hyper paramate combinations
TB_loss_epoch_train = {}
TB_loss_epoch_val ={}
# Hyperparameters for Phase 1

EPOCHS_TB = 3
LR_TB = [1e-3, 1e-2, .1]

print("\n" + "="*60)
print(" PHASE 1: HEAD-ONLY FINE-TUNING")
print("="*60)

# Step 1: Freeze entire model
print("\n Freezing all layers...")
set_requires_grad(model, False)

# Step 2: Unfreeze the classifier head and last layer
print("\n Unfreezing classifier head...")
set_requires_grad(model.fc, True)
set_requires_grad(model.layer4, True)


# Step 3: Create optimizer for ONLY trainable parameters
# filter() ensures we only optimize parameters with requires_grad=True

BATCH_SIZE = [32, 64, 128]

for x in range(3):
    TB_loss_epoch_train[x]=[]
    TB_loss_epoch_val[x]=[]

    train_loader = DataLoader(
    train_subset_ds,
    batch_size=BATCH_SIZE[x],
    shuffle=True,
    num_workers=2,
    pin_memory=True
    )

    val_loader = DataLoader(
    train_subset_ds,
    batch_size=BATCH_SIZE[x],
    shuffle=True,
    num_workers=2,
    pin_memory=True
    )

    trainable_params = filter(lambda p: p.requires_grad, model.parameters())
    if x == 0:
        optimizer = optim.Adam(trainable_params, lr=LR_TB[x])
    elif x==1:
        optimizer = optim.SGD(trainable_params, lr=LR_TB[x])

    elif x==2:
        optim.Adagrad(trainable_params, lr=LR_TB[x])
    print(f"\n Optimizer setup:")
    print(f"   Learning rate: {LR_TB[x]}")
    print(f"   Trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

    # Step 4: Training loop
    print("\n Training progress:")
    print("-" * 60)

    best_val_acc = 0.0
    for epoch in range(1, EPOCHS_TB + 1):
        print(f"\nEpoch {epoch}/{EPOCHS_TB}")

        # Train
        train_loss, train_acc = train_one_epoch(model, train_loader, optimizer)

        # Validate
        val_loss, val_acc, true_labels_e, pred_labels_e = evaluate(model, val_loader)

        #Append losses for plotting purposes
        TB_loss_epoch_train[x].append(train_loss)
        TB_loss_epoch_val[x].append(val_loss)

        # Track best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc

            best_true_labels = true_labels_e
            best_pred_labels = pred_labels_e
            # Optional: Save best model
            # torch.save(model.state_dict(), 'best_model_phase1.pth')

        print(f"   Train: Loss={train_loss:.4f}, Acc={train_acc:.3f}")
        print(f"   Val:   Loss={val_loss:.4f}, Acc={val_acc:.3f} "
          f"{' New best!' if val_acc == best_val_acc else ''}")

    print("\n Phase 1 Complete!")
    print(f"   Best validation accuracy: {best_val_acc:.3f}")

    #Best Epoch F1 Score
    macro_f1 = f1_score(best_true_labels, best_pred_labels,
                        average='macro', labels=list(range(len(full_train_ds.classes))))

    #Confustion Matrix
    conf_matrix = confusion_matrix(best_true_labels, best_pred_labels,
                                   labels=list(range(len(full_train_ds.classes))))

    print(f"Macro-F1 Score (Best Epoch): {macro_f1:.4f}")
    print("\nConfusion Matrix (True vs. Predicted):\n")
    print(conf_matrix)

# Model TC

In [None]:
#Redefine model for phase 3
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
model.fc = nn.Linear(model.fc.in_features, NUM_CLASSES)
model = model.to(DEVICE)


#This is going to hold the losses for the different hyper paramate combinations
TC_loss_epoch_train = {}
TC_loss_epoch_val ={}
# Hyperparameters for Phase 1

EPOCHS_TC = 3
LR_TC = [1e-3, 1e-2, .1]

print("\n" + "="*60)
print(" PHASE 1: HEAD-ONLY FINE-TUNING")
print("="*60)

# Step 1: Freeze entire model
print("\n Freezing all layers...")
set_requires_grad(model, False)

# Step 2: Unfreeze the classifier head and last 2 layers
print("\n Unfreezing classifier head...")
set_requires_grad(model.fc, True)
set_requires_grad(model.layer4, True)
set_requires_grad(model.layer3, True)

# Step 3: Create optimizer for ONLY trainable parameters
# filter() ensures we only optimize parameters with requires_grad=True

BATCH_SIZE = [32, 64, 128]

for x in range(3):
    TC_loss_epoch_train[x]=[]
    TC_loss_epoch_val[x]=[]

    train_loader = DataLoader(
    train_subset_ds,
    batch_size=BATCH_SIZE[x],
    shuffle=True,
    num_workers=2,
    pin_memory=True
    )

    val_loader = DataLoader(
    train_subset_ds,
    batch_size=BATCH_SIZE[x],
    shuffle=True,
    num_workers=2,
    pin_memory=True
    )

    trainable_params = filter(lambda p: p.requires_grad, model.parameters())
    if x == 0:
        optimizer = optim.Adam(trainable_params, lr=LR_TC[x])
    elif x==1:
        optimizer = optim.SGD(trainable_params, lr=LR_TC[x])

    elif x==2:
        optim.Adagrad(trainable_params, lr=LR_TC[x])
    print(f"\n Optimizer setup:")
    print(f"   Learning rate: {LR_TC[x]}")
    print(f"   Trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

    # Step 4: Training loop
    print("\n Training progress:")
    print("-" * 60)

    best_val_acc = 0.0
    for epoch in range(1, EPOCHS_TC + 1):
        print(f"\nEpoch {epoch}/{EPOCHS_TC}")

        # Train
        train_loss, train_acc = train_one_epoch(model, train_loader, optimizer)

        # Validate
        val_loss, val_acc, true_labels_e, pred_labels_e = evaluate(model, val_loader)

        #Append losses for plotting purposes
        TC_loss_epoch_train[x].append(train_loss)
        TC_loss_epoch_val[x].append(val_loss)

        # Track best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc

            best_true_labels = true_labels_e
            best_pred_labels = pred_labels_e
            # Optional: Save best model
            # torch.save(model.state_dict(), 'best_model_phase1.pth')

        print(f"   Train: Loss={train_loss:.4f}, Acc={train_acc:.3f}")
        print(f"   Val:   Loss={val_loss:.4f}, Acc={val_acc:.3f} "
          f"{' New best!' if val_acc == best_val_acc else ''}")

    print("\n Phase 1 Complete!")
    print(f"   Best validation accuracy: {best_val_acc:.3f}")

    #Best Epoch F1 Score
    macro_f1 = f1_score(best_true_labels, best_pred_labels,
                        average='macro', labels=list(range(len(full_train_ds.classes))))

    #Confustion Matrix
    conf_matrix = confusion_matrix(best_true_labels, best_pred_labels,
                                   labels=list(range(len(full_train_ds.classes))))

    print(f"Macro-F1 Score (Best Epoch): {macro_f1:.4f}")
    print("\nConfusion Matrix (True vs. Predicted):\n")
    print(conf_matrix)

# Full Model

In [None]:
#Redefine model for phase 4
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
model.fc = nn.Linear(model.fc.in_features, NUM_CLASSES)
model = model.to(DEVICE)


#This is going to hold the losses for the different hyper paramate combinations
full_loss_epoch_train = {}
full_loss_epoch_val ={}
# Hyperparameters for Phase 1

EPOCHS_full = 3
LR_full = [1e-3, 1e-2, .1]

print("\n" + "="*60)
print(" PHASE 1: HEAD-ONLY FINE-TUNING")
print("="*60)

# Step 1: Freeze entire model
print("\n Freezing all layers...")
set_requires_grad(model, True)

# Step 2: None Frozen


# Step 3: Create optimizer for ONLY trainable parameters
# filter() ensures we only optimize parameters with requires_grad=True

BATCH_SIZE = [32, 64, 128]

for x in range(3):
    TC_loss_epoch_train[x]=[]
    TC_loss_epoch_val[x]=[]

    train_loader = DataLoader(
    train_subset_ds,
    batch_size=BATCH_SIZE[x],
    shuffle=True,
    num_workers=2,
    pin_memory=True
    )

    val_loader = DataLoader(
    train_subset_ds,
    batch_size=BATCH_SIZE[x],
    shuffle=True,
    num_workers=2,
    pin_memory=True
    )

    trainable_params = filter(lambda p: p.requires_grad, model.parameters())
    if x == 0:
        optimizer = optim.Adam(trainable_params, lr=LR_full[x])
    elif x==1:
        optimizer = optim.SGD(trainable_params, lr=LR_full[x])

    elif x==2:
        optim.Adagrad(trainable_params, lr=LR_full[x])
    print(f"\n Optimizer setup:")
    print(f"   Learning rate: {LR_full[x]}")
    print(f"   Trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

    # Step 4: Training loop
    print("\n Training progress:")
    print("-" * 60)

    best_val_acc = 0.0
    for epoch in range(1, EPOCHS_full + 1):
        print(f"\nEpoch {epoch}/{EPOCHS_full}")

        # Train
        train_loss, train_acc = train_one_epoch(model, train_loader, optimizer)

        # Validate
        val_loss, val_acc, true_labels_e, pred_labels_e = evaluate(model, val_loader)

        #Append losses for plotting purposes
        full_loss_epoch_train[x].append(train_loss)
        full_loss_epoch_val[x].append(val_loss)

        # Track best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc

            best_true_labels = true_labels_e
            best_pred_labels = pred_labels_e
            # Optional: Save best model
            # torch.save(model.state_dict(), 'best_model_phase1.pth')

        print(f"   Train: Loss={train_loss:.4f}, Acc={train_acc:.3f}")
        print(f"   Val:   Loss={val_loss:.4f}, Acc={val_acc:.3f} "
          f"{' New best!' if val_acc == best_val_acc else ''}")

    print("\n Phase 1 Complete!")
    print(f"   Best validation accuracy: {best_val_acc:.3f}")

    #Best Epoch F1 Score
    macro_f1 = f1_score(best_true_labels, best_pred_labels,
                        average='macro', labels=list(range(len(full_train_ds.classes))))

    #Confustion Matrix
    conf_matrix = confusion_matrix(best_true_labels, best_pred_labels,
                                   labels=list(range(len(full_train_ds.classes))))

    print(f"Macro-F1 Score (Best Epoch): {macro_f1:.4f}")
    print("\nConfusion Matrix (True vs. Predicted):\n")
    print(conf_matrix)