# Project Summary

## Dataset & Goal
- Dataset Source: RSNA Pneumonia Detection Challenge (Kaggle). This is a high-authority source (Radiological Society of North America)
- Total Images: $\sim$26,000 unique Chest X-ray images.
- Model Goal: Multi-Class Classification (3 classes), with a planned fallback to Binary Classification if performance is poor.
- Image Path Root: All images were successfully unzipped and are located in the Colab runtime environment under the folder path /content/train_images/stage_2_train_images/.

## Class Definitions & Mapping

| Class Name | Label (Target) | Pathological Status | Role in Classification |
| :--- | :--- | :--- | :--- |
| Normal | 0 | Healthy | True Negative (Healthy) |
| Lung Opacity | 1 | Pneumonia Present | True Positive (Pneumonia) |
| No Lung Opacity / Not Normal | 2 | Other Diseases/Issues | Hard Negative (Sick, but NOT Pneumonia) |

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Preparation

## Load Datasets

In [2]:
# 2. Install pydicom (needed for medical images)
!pip install pydicom

# 3. Unzip images into the local Colab environment (FAST)
!unzip -q "/content/drive/My Drive/STAT362 Final Project_RSNA/images.zip" -d "/content/train_images"

Collecting pydicom
  Downloading pydicom-3.0.1-py3-none-any.whl.metadata (9.4 kB)
Downloading pydicom-3.0.1-py3-none-any.whl (2.4 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m100.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-3.0.1


In [19]:
import pandas as pd

# Load the detailed class info
base = "/content/drive/MyDrive/STAT362 Final Project_RSNA"
detailed_class = pd.read_csv('/content/drive/My Drive/STAT362 Final Project_RSNA/stage_2_detailed_class_info.csv')
labels = pd.read_csv('/content/drive/My Drive/STAT362 Final Project_RSNA/stage_2_train_labels.csv')

## EDA

In [None]:
detailed_class.head()

Unnamed: 0,patientId,class
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,No Lung Opacity / Not Normal
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,No Lung Opacity / Not Normal
2,00322d4d-1c29-4943-afc9-b6754be640eb,No Lung Opacity / Not Normal
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,Normal
4,00436515-870c-4b36-a041-de91049b9ab4,Lung Opacity


In [None]:
labels.head()

Unnamed: 0,patientId,x,y,width,height,Target
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,,,,,0
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,,,,,0
2,00322d4d-1c29-4943-afc9-b6754be640eb,,,,,0
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,,,,,0
4,00436515-870c-4b36-a041-de91049b9ab4,264.0,152.0,213.0,379.0,1


In [None]:
# Merge the two datasets to explore the relationship between class and target
merge_df = pd.merge(detailed_class, labels[['patientId', 'Target']], on='patientId')
merge_df.head()

Unnamed: 0,patientId,class,Target
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,No Lung Opacity / Not Normal,0
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,No Lung Opacity / Not Normal,0
2,00322d4d-1c29-4943-afc9-b6754be640eb,No Lung Opacity / Not Normal,0
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,Normal,0
4,00436515-870c-4b36-a041-de91049b9ab4,Lung Opacity,1


In [None]:
merge_df.groupby(by=['class', 'Target']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,patientId
class,Target,Unnamed: 2_level_1
Lung Opacity,1,16957
No Lung Opacity / Not Normal,0,11821
Normal,0,8851


A consistent class-to-target mapping confirms that 'Lung Opacity' represents pneumonia, while 'Normal' and 'Not Normal' represent non-pneumonia cases. Consequently, we will implement a three-class CNN to distinguish between these three unique states: healthy lungs, pneumonia, and other lung pathologies.

## Process Labels (The Multi-Class Logic)

In [20]:
# 1. REMOVE DUPLICATES
# We only need one label per patientId
detailed_class = detailed_class.drop_duplicates(subset=['patientId'])

# 2. DEFINE 3-CLASS MAPPING
# Normal = 0, Pneumonia = 1, Other Disease = 2
class_mapping = {
    'Normal': 0,
    'Lung Opacity': 1,
    'No Lung Opacity / Not Normal': 2
}

detailed_class['target'] = detailed_class['class'].map(class_mapping)

# 3. Create the file path column
# The patientId in CSV does not have .dcm extension, so we add it
detailed_class['path'] = detailed_class['patientId'].apply(lambda x: f"/content/train_images/stage_2_train_images/{x}.dcm")

print(f"Total unique images: {len(detailed_class)}")
print(detailed_class['target'].value_counts())

Total unique images: 26684
target
2    11821
0     8851
1     6012
Name: count, dtype: int64


## The Custom Dataset Class

This is the most important part. This Python class tells PyTorch how to open a DICOM file and turn it into a tensor your model can understand.

In [21]:
import torch
from torch.utils.data import Dataset
import pydicom
import numpy as np
from PIL import Image
from pydicom.pixel_data_handlers.util import apply_voi_lut

class RSNADataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe.reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_path = self.dataframe.loc[idx, "path"]
        label = int(self.dataframe.loc[idx, "target"])

        ds = pydicom.dcmread(img_path)
        img = ds.pixel_array.astype(np.float32)

        # apply VOI LUT (better windowing when available)
        try:
            img = apply_voi_lut(img, ds).astype(np.float32)
        except Exception:
            pass

        # handle inverted grayscale
        if getattr(ds, "PhotometricInterpretation", "") == "MONOCHROME1":
            img = img.max() - img

        # robust normalize with percentile clipping
        lo, hi = np.percentile(img, (1, 99))
        img = np.clip(img, lo, hi)
        img = (img - lo) / (hi - lo + 1e-6)  # -> [0, 1]

        # convert to 8-bit and 3-channel PIL for torchvision transforms
        img = (img * 255.0).astype(np.uint8)
        image = Image.fromarray(img).convert("RGB")

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(label, dtype=torch.long)




In [23]:
# Define Transforms
# train: add light augmentation + simpler normalization for X-rays
import torchvision.transforms as transforms

train_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(7),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.25, 0.25, 0.25]),
])

# val/test: no augmentation
eval_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.25, 0.25, 0.25]),
])


## Train-Test-Validation Split



In [24]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# 1. Split (70/15/15) with stratify
train_df, temp_df = train_test_split(
    detailed_class, test_size=0.3, stratify=detailed_class["target"], random_state=42
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df["target"], random_state=42
)

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

# 2. Create Datasets
train_dataset = RSNADataset(train_df, transform=train_transforms)
val_dataset   = RSNADataset(val_df,   transform=eval_transforms)
test_dataset  = RSNADataset(test_df,  transform=eval_transforms)

# 3. Create DataLoaders (set num_workers=0 to prevent worker crashes with DICOM)
BATCH_SIZE = 64  # use 32 if you run out of GPU memory

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                          num_workers=2, pin_memory=True)
val_loader   = DataLoader(val_dataset,   batch_size=BATCH_SIZE, shuffle=False,
                          num_workers=0, pin_memory=True)
test_loader  = DataLoader(test_dataset,  batch_size=BATCH_SIZE, shuffle=False,
                          num_workers=0, pin_memory=True)

# Quick Test
images, labels = next(iter(train_loader))
print(f"Image batch shape: {images.shape}")
print(f"Labels batch shape: {labels.shape}")


Train: 18678, Val: 4003, Test: 4003
Image batch shape: torch.Size([64, 3, 224, 224])
Labels batch shape: torch.Size([64])


MUSE TUNING

In [25]:
import torch
import torch.nn as nn
from torchvision.models import densenet121, DenseNet121_Weights

# 1. Define the Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 2. Load DenseNet121 (The "Medical Standard" model)
# We use weights=DEFAULT to load ImageNet patterns
model = densenet121(weights=DenseNet121_Weights.DEFAULT)

# 3. Modify the Head for 3 Classes
# DenseNet's classifier is called 'classifier' (ResNet's was 'fc')
num_features = model.classifier.in_features

model.classifier = nn.Sequential(
    nn.Linear(num_features, 512),
    nn.ReLU(),
    nn.Dropout(0.4),  # added dropout to reduce overfitting
    nn.Linear(512, 3) # Output 3 classes
)

# 4. Move to GPU
model = model.to(device)

print("Model: DenseNet121 initialized and unfrozen.")

Using device: cuda
Model: DenseNet121 initialized and unfrozen.


In [26]:
import torch.optim as optim

# --- CONFIGURATION ---
# Lower LR because we are training the WHOLE model now
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 1e-4

# --- OPTIMIZER ---
# We pass model.parameters() to train everything (UNFROZEN)
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# --- LOSS FUNCTION ---
# Recalculate weights just to be safe
class_counts = train_df["target"].value_counts().sort_index().values
class_weights = 1.0 / torch.tensor(class_counts, dtype=torch.float32)
class_weights = (class_weights / class_weights.sum()) * len(class_counts)
criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))

# --- SCHEDULER ---
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="max", factor=0.5, patience=1
)

In [27]:
import copy
from tqdm import tqdm

# --- TRAINING LOOP ---
num_epochs = 10
patience = 3

best_val_acc = -1.0
best_state_dict = None
epochs_no_improve = 0

scaler = torch.cuda.amp.GradScaler()

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0

    # Optional: Wrap loader with tqdm for a progress bar
    loop = tqdm(train_loader, leave=True)
    for images, labels in loop:
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)

        with torch.cuda.amp.autocast():
            outputs = model(images)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item() * images.size(0)
        preds = outputs.argmax(dim=1)
        train_correct += (preds == labels).sum().item()
        train_total += labels.size(0)
        loop.set_postfix(loss=loss.item())

    train_loss /= train_total
    train_acc = train_correct / train_total

    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            with torch.cuda.amp.autocast():
                outputs = model(images)
                loss = criterion(outputs, labels)

            val_loss += loss.item() * images.size(0)
            preds = outputs.argmax(dim=1)
            val_correct += (preds == labels).sum().item()
            val_total += labels.size(0)

    val_loss /= val_total
    val_acc = val_correct / val_total

    scheduler.step(val_acc)

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        # This line was causing the error:
        best_state_dict = copy.deepcopy(model.state_dict())
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1

    print(
        f"Epoch [{epoch+1}/{num_epochs}] "
        f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} "
        f"| Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}"
    )

    if epochs_no_improve >= patience:
        print("Early stopping triggered")
        break

if best_state_dict is not None:
    model.load_state_dict(best_state_dict)
    print(f"Loaded best model with Val Acc = {best_val_acc:.4f}")

  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
100%|██████████| 292/292 [04:43<00:00,  1.03it/s, loss=0.579]
  with torch.cuda.amp.autocast():


Epoch [1/10] Train Loss: 0.6961 | Train Acc: 0.6651 | Val Loss: 0.6309 | Val Acc: 0.6995


100%|██████████| 292/292 [04:36<00:00,  1.06it/s, loss=0.583]


Epoch [2/10] Train Loss: 0.6138 | Train Acc: 0.7071 | Val Loss: 0.6212 | Val Acc: 0.6872


100%|██████████| 292/292 [04:38<00:00,  1.05it/s, loss=0.558]


Epoch [3/10] Train Loss: 0.5788 | Train Acc: 0.7242 | Val Loss: 0.6143 | Val Acc: 0.7160


100%|██████████| 292/292 [04:34<00:00,  1.06it/s, loss=0.51]


Epoch [4/10] Train Loss: 0.5509 | Train Acc: 0.7373 | Val Loss: 0.6420 | Val Acc: 0.6717


100%|██████████| 292/292 [04:35<00:00,  1.06it/s, loss=0.474]


Epoch [5/10] Train Loss: 0.5259 | Train Acc: 0.7520 | Val Loss: 0.6432 | Val Acc: 0.6927


100%|██████████| 292/292 [04:33<00:00,  1.07it/s, loss=0.452]


Epoch [6/10] Train Loss: 0.4676 | Train Acc: 0.7786 | Val Loss: 0.6308 | Val Acc: 0.6960
Early stopping triggered
Loaded best model with Val Acc = 0.7160


In [31]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision.models import densenet121, DenseNet121_Weights
import copy
from tqdm import tqdm

# 1. Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 2. Load DenseNet121
model = densenet121(weights=DenseNet121_Weights.DEFAULT)

# 3. Simpler, More Effective Head
# Your original was actually good - just needs minor tweaks
num_features = model.classifier.in_features
model.classifier = nn.Sequential(
    nn.Linear(num_features, 512),
    nn.ReLU(),
    nn.Dropout(0.45),  # Slightly increased from 0.4
    nn.Linear(512, 3)
)

model = model.to(device)

# 4. Configuration
LEARNING_RATE = 5e-5      # Slightly higher than before
WEIGHT_DECAY = 5e-4       # Moderate regularization
NUM_EPOCHS = 12
PATIENCE = 3
WARMUP_EPOCHS = 2

# 5. Smarter Class Weights
# Analyzing your confusion: Class 1→2 is your biggest issue (273 errors)
# Class 2→0 is second (329 errors)
class_counts = train_df["target"].value_counts().sort_index().values
class_weights = 1.0 / torch.tensor(class_counts, dtype=torch.float32)
class_weights = (class_weights / class_weights.sum()) * len(class_counts)

# Fine-tune weights based on confusion patterns
class_weights[1] *= 1.4  # Boost class 1 (most confused with class 2)
class_weights[2] *= 1.2  # Boost class 2 moderately

criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))
print(f"Class weights: {class_weights}")

# 6. Two-Stage Training Strategy
# Stage 1: Train only the head (freeze backbone)
for param in model.features.parameters():
    param.requires_grad = False

optimizer = optim.AdamW(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=LEARNING_RATE * 10,  # Higher LR for head-only training
    weight_decay=WEIGHT_DECAY
)

# 7. OneCycleLR Scheduler (often better than Cosine for medical imaging)
total_steps = len(train_loader) * NUM_EPOCHS
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=LEARNING_RATE * 10,
    total_steps=total_steps,
    pct_start=0.3,
    anneal_strategy='cos'
)

# 8. Training Loop
best_val_acc = -1.0
best_balanced_acc = -1.0
best_state_dict = None
epochs_no_improve = 0
backbone_unfrozen = False

scaler = torch.cuda.amp.GradScaler()

for epoch in range(NUM_EPOCHS):
    # Unfreeze backbone after warmup
    if epoch == WARMUP_EPOCHS and not backbone_unfrozen:
        print(f"\n{'='*60}")
        print("UNFREEZING BACKBONE - Fine-tuning entire model")
        print(f"{'='*60}\n")

        for param in model.features.parameters():
            param.requires_grad = True

        # Reset optimizer with lower LR for full model
        optimizer = optim.AdamW([
            {'params': model.features.parameters(), 'lr': LEARNING_RATE * 0.1},
            {'params': model.classifier.parameters(), 'lr': LEARNING_RATE}
        ], weight_decay=WEIGHT_DECAY)

        # Reset scheduler
        remaining_steps = len(train_loader) * (NUM_EPOCHS - epoch)
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=LEARNING_RATE,
            total_steps=remaining_steps,
            pct_start=0.2,
            anneal_strategy='cos'
        )
        backbone_unfrozen = True

    # Training Phase
    model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0

    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}")
    for images, labels in loop:
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)

        with torch.cuda.amp.autocast():
            outputs = model(images)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()  # Step per batch for OneCycleLR

        train_loss += loss.item() * images.size(0)
        preds = outputs.argmax(dim=1)
        train_correct += (preds == labels).sum().item()
        train_total += labels.size(0)

        # Update progress bar
        current_lr = optimizer.param_groups[0]['lr']
        loop.set_postfix(loss=loss.item(), lr=f"{current_lr:.2e}")

    train_loss /= train_total
    train_acc = train_correct / train_total

    # Validation Phase
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    all_preds = []
    all_labels = []
    class_correct = [0, 0, 0]
    class_total = [0, 0, 0]

    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            with torch.cuda.amp.autocast():
                outputs = model(images)
                loss = criterion(outputs, labels)

            val_loss += loss.item() * images.size(0)
            preds = outputs.argmax(dim=1)
            val_correct += (preds == labels).sum().item()
            val_total += labels.size(0)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            # Per-class accuracy
            for i in range(3):
                mask = labels == i
                class_correct[i] += (preds[mask] == labels[mask]).sum().item()
                class_total[i] += mask.sum().item()

    val_loss /= val_total
    val_acc = val_correct / val_total

    # Calculate balanced accuracy
    from sklearn.metrics import balanced_accuracy_score
    balanced_acc = balanced_accuracy_score(all_labels, all_preds)

    # Per-class recall
    class_recalls = [class_correct[i] / class_total[i] if class_total[i] > 0 else 0
                     for i in range(3)]

    # Model selection: prioritize balanced accuracy + minimum per-class recall
    min_recall = min(class_recalls)
    selection_metric = balanced_acc * 0.7 + min_recall * 0.3  # Weighted combo

    if selection_metric > best_balanced_acc:
        best_balanced_acc = selection_metric
        best_val_acc = val_acc
        best_state_dict = copy.deepcopy(model.state_dict())
        epochs_no_improve = 0
        print(f"✓ New best model! (metric: {selection_metric:.4f})")
    else:
        epochs_no_improve += 1

    print(
        f"Epoch [{epoch+1}/{NUM_EPOCHS}] "
        f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} "
        f"| Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}\n"
        f"  Balanced Acc: {balanced_acc:.4f} | "
        f"Class Recalls: [{class_recalls[0]:.3f}, {class_recalls[1]:.3f}, {class_recalls[2]:.3f}]"
    )

    if epochs_no_improve >= PATIENCE:
        print(f"Early stopping triggered after {epoch+1} epochs")
        break

# Load best model
if best_state_dict is not None:
    model.load_state_dict(best_state_dict)
    print(f"\n{'='*60}")
    print(f"Loaded best model with selection metric: {best_balanced_acc:.4f}")
    print(f"{'='*60}")

Using device: cuda


  scaler = torch.cuda.amp.GradScaler()


Class weights: tensor([0.9313, 1.9198, 0.8369])


  with torch.cuda.amp.autocast():
Epoch 1/12: 100%|██████████| 292/292 [04:35<00:00,  1.06it/s, loss=0.917, lr=1.06e-04]
  with torch.cuda.amp.autocast():


✓ New best model! (metric: 0.5421)
Epoch [1/12] Train Loss: 0.9823 | Train Acc: 0.4497 | Val Loss: 0.8570 | Val Acc: 0.5608
  Balanced Acc: 0.6013 | Class Recalls: [0.625, 0.775, 0.404]


  with torch.cuda.amp.autocast():
Epoch 2/12: 100%|██████████| 292/292 [04:32<00:00,  1.07it/s, loss=0.771, lr=3.02e-04]
  with torch.cuda.amp.autocast():


✓ New best model! (metric: 0.5448)
Epoch [2/12] Train Loss: 0.8277 | Train Acc: 0.5669 | Val Loss: 0.7968 | Val Acc: 0.5908
  Balanced Acc: 0.6324 | Class Recalls: [0.839, 0.717, 0.340]

UNFREEZING BACKBONE - Fine-tuning entire model



  with torch.cuda.amp.autocast():
Epoch 3/12: 100%|██████████| 292/292 [04:33<00:00,  1.07it/s, loss=0.565, lr=2.61e-05]
  with torch.cuda.amp.autocast():


✓ New best model! (metric: 0.5726)
Epoch [3/12] Train Loss: 0.7587 | Train Acc: 0.6116 | Val Loss: 0.7126 | Val Acc: 0.6153
  Balanced Acc: 0.6605 | Class Recalls: [0.836, 0.778, 0.368]


  with torch.cuda.amp.autocast():
Epoch 4/12: 100%|██████████| 292/292 [04:32<00:00,  1.07it/s, loss=0.719, lr=5.00e-05]
  with torch.cuda.amp.autocast():


✓ New best model! (metric: 0.6576)
Epoch [4/12] Train Loss: 0.6765 | Train Acc: 0.6685 | Val Loss: 0.6605 | Val Acc: 0.6802
  Balanced Acc: 0.6920 | Class Recalls: [0.817, 0.682, 0.577]


  with torch.cuda.amp.autocast():
Epoch 5/12: 100%|██████████| 292/292 [04:34<00:00,  1.06it/s, loss=0.594, lr=4.81e-05]
  with torch.cuda.amp.autocast():


Epoch [5/12] Train Loss: 0.6298 | Train Acc: 0.6981 | Val Loss: 0.6591 | Val Acc: 0.6767
  Balanced Acc: 0.6977 | Class Recalls: [0.823, 0.729, 0.541]


  with torch.cuda.amp.autocast():
Epoch 6/12: 100%|██████████| 292/292 [04:32<00:00,  1.07it/s, loss=0.632, lr=4.27e-05]
  with torch.cuda.amp.autocast():


✓ New best model! (metric: 0.6594)
Epoch [6/12] Train Loss: 0.5948 | Train Acc: 0.7111 | Val Loss: 0.6437 | Val Acc: 0.6787
  Balanced Acc: 0.7002 | Class Recalls: [0.777, 0.759, 0.564]


  with torch.cuda.amp.autocast():
Epoch 7/12: 100%|██████████| 292/292 [04:33<00:00,  1.07it/s, loss=0.512, lr=3.45e-05]
  with torch.cuda.amp.autocast():


Epoch [7/12] Train Loss: 0.5560 | Train Acc: 0.7367 | Val Loss: 0.6477 | Val Acc: 0.6660
  Balanced Acc: 0.6985 | Class Recalls: [0.823, 0.784, 0.489]


  with torch.cuda.amp.autocast():
Epoch 8/12: 100%|██████████| 292/292 [04:34<00:00,  1.06it/s, loss=0.661, lr=2.50e-05]
  with torch.cuda.amp.autocast():


✓ New best model! (metric: 0.6828)
Epoch [8/12] Train Loss: 0.5215 | Train Acc: 0.7513 | Val Loss: 0.6632 | Val Acc: 0.7080
  Balanced Acc: 0.7121 | Class Recalls: [0.873, 0.649, 0.614]


  with torch.cuda.amp.autocast():
Epoch 9/12: 100%|██████████| 292/292 [04:34<00:00,  1.06it/s, loss=0.544, lr=1.54e-05]
  with torch.cuda.amp.autocast():


Epoch [9/12] Train Loss: 0.4803 | Train Acc: 0.7789 | Val Loss: 0.6687 | Val Acc: 0.7005
  Balanced Acc: 0.7090 | Class Recalls: [0.821, 0.691, 0.615]


  with torch.cuda.amp.autocast():
Epoch 10/12: 100%|██████████| 292/292 [04:35<00:00,  1.06it/s, loss=0.501, lr=7.30e-06]
  with torch.cuda.amp.autocast():


✓ New best model! (metric: 0.6861)
Epoch [10/12] Train Loss: 0.4375 | Train Acc: 0.7989 | Val Loss: 0.6854 | Val Acc: 0.7080
  Balanced Acc: 0.7164 | Class Recalls: [0.844, 0.690, 0.616]


  with torch.cuda.amp.autocast():
Epoch 11/12: 100%|██████████| 292/292 [04:33<00:00,  1.07it/s, loss=0.478, lr=1.89e-06]
  with torch.cuda.amp.autocast():


Epoch [11/12] Train Loss: 0.4062 | Train Acc: 0.8170 | Val Loss: 0.7066 | Val Acc: 0.7055
  Balanced Acc: 0.7091 | Class Recalls: [0.847, 0.655, 0.625]


  with torch.cuda.amp.autocast():
Epoch 12/12: 100%|██████████| 292/292 [04:33<00:00,  1.07it/s, loss=0.307, lr=2.23e-10]
  with torch.cuda.amp.autocast():


Epoch [12/12] Train Loss: 0.3956 | Train Acc: 0.8198 | Val Loss: 0.7047 | Val Acc: 0.7020
  Balanced Acc: 0.7067 | Class Recalls: [0.830, 0.665, 0.625]

Loaded best model with selection metric: 0.6861


In [32]:
# Evaluating on test dataset (with extra metrics)

import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score

model.eval()
test_loss = 0.0
test_total = 0

all_preds = []
all_labels = []

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        outputs = model(images)
        loss = criterion(outputs, labels)

        test_loss += loss.item() * images.size(0)
        test_total += labels.size(0)

        preds = outputs.argmax(dim=1)
        all_preds.append(preds.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

test_loss /= test_total

all_preds = np.concatenate(all_preds)
all_labels = np.concatenate(all_labels)

test_acc = (all_preds == all_labels).mean()
bal_acc = balanced_accuracy_score(all_labels, all_preds)
cm = confusion_matrix(all_labels, all_preds)

print(f"  Test Loss:          {test_loss:.4f}")
print(f"  Test Accuracy:      {test_acc:.4f}")
print(f"  Balanced Accuracy:  {bal_acc:.4f}")
print("  Confusion Matrix:\n", cm)
print("\n  Classification Report:\n")
print(classification_report(all_labels, all_preds, digits=4))


  Test Loss:          0.6492
  Test Accuracy:      0.7162
  Balanced Accuracy:  0.7228
  Confusion Matrix:
 [[1148   13  167]
 [  27  614  261]
 [ 264  404 1105]]

  Classification Report:

              precision    recall  f1-score   support

           0     0.7978    0.8645    0.8298      1328
           1     0.5955    0.6807    0.6353       902
           2     0.7208    0.6232    0.6685      1773

    accuracy                         0.7162      4003
   macro avg     0.7047    0.7228    0.7112      4003
weighted avg     0.7181    0.7162    0.7145      4003



# CNN Model

## Instruction for building the model
Your crucial data preparation steps are now complete, and the data is ready for modeling. Your immediate focus should be on defining and implementing the custom CNN architecture. Use the provided train_loader to feed batches of data into your model, define your loss function as nn.CrossEntropyLoss() (since we're doing 3-class classification), and start building the training loop on the GPU (cuda). Remember that your final layer must output 3 neurons to match the target labels (0, 1, 2). Once the base model is training successfully, you can begin the process of iteration and architectural refinement.

In [None]:
#NEW CNN -> TEST ACCURACY 0.715
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms

N_CLASSES = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


# Improved CNN
class RSNACNN(nn.Module):
    def __init__(self, num_classes=N_CLASSES):
        super().__init__()

        def conv_block(in_channels, out_channels, dropout=0.15):
            return nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, bias=False),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(inplace=True),
                nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(inplace=True),
                nn.Dropout2d(dropout),
                nn.MaxPool2d(2)
            )

        # Deeper network with more filters
        self.features = nn.Sequential(
            conv_block(3, 64, dropout=0.1),      # 224->112: More filters in first layer
            conv_block(64, 128, dropout=0.15),   # 112->56
            conv_block(128, 256, dropout=0.2),   # 56->28
            conv_block(256, 512, dropout=0.25),  # 28->14: Added 5th layer
            conv_block(512, 512, dropout=0.3)    # 14->7: Keep channels, add depth
        )

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        # Bigger classifier with residual-like connection
        self.classifier = nn.Sequential(
            nn.Dropout(0.4),
            nn.Linear(512, 256),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(256),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )

        self._initialize_weights()

    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
                nn.init.constant_(m.bias, 0)


model = RSNACNN().to(device)
print(model)

# Better optimizer and learning rate
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)  # Label smoothing helps generalization
optimizer = optim.AdamW(model.parameters(), lr=5e-4, weight_decay=1e-3)  # AdamW better than Adam

# Better scheduler - cosine annealing with warmup
scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer, T_0=5, T_mult=2, eta_min=1e-6
)

Using device: cuda
RSNACNN(
  (features): Sequential(
    (0): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): ReLU(inplace=True)
      (6): Dropout2d(p=0.1, inplace=False)
      (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (1): Sequential(
      (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (4): BatchNorm2d(128, eps=1e-05, momentum

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Activation, BatchNormalization, MaxPool2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import GlobalAveragePooling2D
import tensorflow as tf

def create_model():
  model = keras.models.Sequential([
  # start building the model here

  keras.layers.Conv2D(32, (3,3), activation='relu', input_shape=(224, 224, 3)),
  keras.layers.MaxPooling2D (2,2),
  keras.layers.Conv2D(64, (3,3), activation='relu'),
  keras.layers.MaxPooling2D(2,2),

  keras.layers.Flatten(),
  keras.layers.Dense (128, activation='relu'),
  keras.layers.Dense(10, activation='softmax')

  # end building the model here
  ])

  # Compile with appropriate loss for multi-class classification
  model.compile(
      optimizer='adam',
      loss='sparse_categorical_crossentropy',
      metrics= ['accuracy'])
  return model

In [None]:
# create the untrained model
model = create_model()

# Display model architecture
model.summary()

# Count parameters
model.count_params()

# Train the model
history = model.fit(train_dataset, epochs = 20, validation_data = val_dataset)

NameError: name 'keras' is not defined

In [None]:
# Improved CNN Architecture - Test Accuracy: 0.715
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms

N_CLASSES = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


class RSNACNN(nn.Module):
    """Enhanced CNN with deeper architecture and regularization."""

    def __init__(self, num_classes=N_CLASSES):
        super().__init__()

        def conv_block(in_channels, out_channels, dropout=0.15):
            """Convolutional block with BatchNorm, ReLU, and dropout."""
            return nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, bias=False),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(inplace=True),
                nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(inplace=True),
                nn.Dropout2d(dropout),
                nn.MaxPool2d(kernel_size=2, stride=2)
            )

        # Progressive feature extraction: 64 -> 128 -> 256 -> 512 -> 512
        self.features = nn.Sequential(
            conv_block(3, 64, dropout=0.1),      # 224->112
            conv_block(64, 128, dropout=0.15),   # 112->56
            conv_block(128, 256, dropout=0.2),   # 56->28
            conv_block(256, 512, dropout=0.25),  # 28->14
            conv_block(512, 512, dropout=0.3)    # 14->7
        )

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        # Two-layer classifier with dropout and batch normalization
        self.classifier = nn.Sequential(
            nn.Dropout(0.4),
            nn.Linear(512, 256),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(256),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )

        self._initialize_weights()

    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

    def _initialize_weights(self):
        """Kaiming initialization for better convergence."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
                nn.init.constant_(m.bias, 0)


# Initialize model
model = RSNACNN().to(device)
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Model initialized with {total_params:,} trainable parameters\n")
print(model)

# Training configuration
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
optimizer = optim.AdamW(model.parameters(), lr=5e-4, weight_decay=1e-3)

# Cosine annealing with warm restarts for adaptive learning rate
scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer,
    T_0=5,        # Initial restart period
    T_mult=2,     # Multiply period after each restart
    eta_min=1e-6  # Minimum learning rate
)

Using device: cuda
Model initialized with 9,540,547 trainable parameters

RSNACNN(
  (features): Sequential(
    (0): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): ReLU(inplace=True)
      (6): Dropout2d(p=0.1, inplace=False)
      (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (1): Sequential(
      (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias

In [None]:
#OPTIMIZED TRAINING LOOP
from torch.cuda.amp import autocast, GradScaler

criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
optimizer = optim.AdamW(model.parameters(), lr=5e-4, weight_decay=1e-3)
scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=5, T_mult=2, eta_min=1e-6)

# Mixed precision for speed
scaler = GradScaler()

num_epochs = 30  # Increased since we have better regularization
best_val_acc = 0
patience = 7  # More patience with better scheduler
epochs_no_improve = 0
best_state_dict = None

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    correct = 0
    total = 0

    for images, labels in train_loader:
        images, labels = images.to(device, non_blocking=True), labels.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)

        # Mixed precision forward pass
        with autocast():
            outputs = model(images)
            loss = criterion(outputs, labels)

        # Mixed precision backward pass
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    train_acc = correct / total

    # Validation
    model.eval()
    val_correct = 0
    val_total = 0
    val_loss = 0

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device, non_blocking=True), labels.to(device, non_blocking=True)

            with autocast():
                outputs = model(images)
                loss = criterion(outputs, labels)

            val_loss += loss.item()
            _, predicted = outputs.max(1)
            val_total += labels.size(0)
            val_correct += predicted.eq(labels).sum().item()

    val_acc = val_correct / val_total

    # Step scheduler every epoch
    scheduler.step()
    current_lr = optimizer.param_groups[0]['lr']

    print(f"Epoch {epoch+1}/{num_epochs} | "
          f"Train Loss: {train_loss/len(train_loader):.4f} | "
          f"Train Acc: {train_acc:.4f} | "
          f"Val Acc: {val_acc:.4f} | "
          f"LR: {current_lr:.6f}")

    # Save best model
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_state_dict = model.state_dict().copy()
        epochs_no_improve = 0
        print(f"  ✓ New best validation accuracy: {best_val_acc:.4f}")
    else:
        epochs_no_improve += 1

    # Early stopping
    if epochs_no_improve >= patience:
        print(f"Early stopping triggered after {epoch+1} epochs.")
        break

# Load and save best model
if best_state_dict is not None:
    model.load_state_dict(best_state_dict)
    torch.save(model.state_dict(), "best_cnn_model.pth")
    print(f"Best model saved with validation accuracy: {best_val_acc:.4f}")
else:
    print("No improvement, model not saved.")

  scaler = GradScaler()
  with autocast():
  with autocast():


Epoch 1/30 | Train Loss: 1.3295 | Train Acc: 0.4244 | Val Acc: 0.5224 | LR: 0.000452
  ✓ New best validation accuracy: 0.5224
Epoch 2/30 | Train Loss: 1.0988 | Train Acc: 0.4804 | Val Acc: 0.5453 | LR: 0.000328
  ✓ New best validation accuracy: 0.5453
Epoch 3/30 | Train Loss: 1.0182 | Train Acc: 0.5124 | Val Acc: 0.5728 | LR: 0.000173
  ✓ New best validation accuracy: 0.5728
Epoch 4/30 | Train Loss: 0.9768 | Train Acc: 0.5476 | Val Acc: 0.6010 | LR: 0.000049
  ✓ New best validation accuracy: 0.6010
Epoch 5/30 | Train Loss: 0.9553 | Train Acc: 0.5683 | Val Acc: 0.6138 | LR: 0.000500
  ✓ New best validation accuracy: 0.6138
Epoch 6/30 | Train Loss: 0.9559 | Train Acc: 0.5611 | Val Acc: 0.6213 | LR: 0.000488
  ✓ New best validation accuracy: 0.6213
Epoch 7/30 | Train Loss: 0.9140 | Train Acc: 0.5892 | Val Acc: 0.6298 | LR: 0.000452
  ✓ New best validation accuracy: 0.6298
Epoch 8/30 | Train Loss: 0.8840 | Train Acc: 0.6145 | Val Acc: 0.6550 | LR: 0.000397
  ✓ New best validation accuracy:

In [None]:
#OG training loop -> WORKs
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)


num_epochs = 25
best_val_acc = 0
patience = 5
epochs_no_improve = 0
best_state_dict = None

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    correct = 0
    total = 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    train_acc = correct / total

    model.eval()
    val_correct = 0
    val_total = 0
    val_loss = 0

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            val_loss += loss.item()
            _, predicted = outputs.max(1)
            val_total += labels.size(0)
            val_correct += predicted.eq(labels).sum().item()

    val_acc = val_correct / val_total

    print(f"Epoch {epoch+1}/{num_epochs} | "
          f"Train Loss: {train_loss/len(train_loader):.4f} | "
          f"Train Acc: {train_acc:.4f} | "
          f"Val Acc: {val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_state_dict = model.state_dict()
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1

    if epochs_no_improve >= patience:
        print("Early stopping triggered.")
        break


if best_state_dict is not None:
    model.load_state_dict(best_state_dict)
    torch.save(model.state_dict(), "best_cnn_model.pth")
    print("Best model saved.")
else:
    print("No improvement, model not saved.")

KeyboardInterrupt: 

In [None]:
# Evaluating on test dataset

model.eval()
test_correct = 0
test_total = 0
test_loss = 0.0

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)

        outputs = model(images)
        loss = criterion(outputs, labels)

        test_loss += loss.item() * images.size(0)
        preds = outputs.argmax(dim=1)
        test_correct += (preds == labels).sum().item()
        test_total += labels.size(0)

test_loss = test_loss / test_total
test_acc = test_correct / test_total

print("  Test Loss:     {:.4f}".format(test_loss))
print("  Test Accuracy: {:.4f}".format(test_acc))


  Test Loss:     0.8254
  Test Accuracy: 0.6008


In [None]:
for name, param in best_state_dict.items():
    print(f"{name}: {param.shape}")

features.0.0.weight: torch.Size([64, 3, 3, 3])
features.0.1.weight: torch.Size([64])
features.0.1.bias: torch.Size([64])
features.0.1.running_mean: torch.Size([64])
features.0.1.running_var: torch.Size([64])
features.0.1.num_batches_tracked: torch.Size([])
features.0.3.weight: torch.Size([64, 64, 3, 3])
features.0.4.weight: torch.Size([64])
features.0.4.bias: torch.Size([64])
features.0.4.running_mean: torch.Size([64])
features.0.4.running_var: torch.Size([64])
features.0.4.num_batches_tracked: torch.Size([])
features.1.0.weight: torch.Size([128, 64, 3, 3])
features.1.1.weight: torch.Size([128])
features.1.1.bias: torch.Size([128])
features.1.1.running_mean: torch.Size([128])
features.1.1.running_var: torch.Size([128])
features.1.1.num_batches_tracked: torch.Size([])
features.1.3.weight: torch.Size([128, 128, 3, 3])
features.1.4.weight: torch.Size([128])
features.1.4.bias: torch.Size([128])
features.1.4.running_mean: torch.Size([128])
features.1.4.running_var: torch.Size([128])
feature