
PytorchModule2



import torch
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self, in_features=28*28, hidden=(256, 128), num_classes=10, p=0.2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, hidden[0]),
            nn.ReLU(),
            nn.Dropout(p),
            nn.Linear(hidden[0], hidden[1]),
            nn.ReLU(),
            nn.Dropout(p),
            nn.Linear(hidden[1], num_classes)
        )

        self._init_weights()

    def _init_weights(self):
        # Kaiming init for ReLU layers; biases to 0
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight, nonlinearity="relu")
                nn.init.constant_(m.bias, 0.)

    def forward(self, x):
        # x: (N, 1, 28, 28) → flatten to (N, 784)
        x = x.view(x.size(0), -1)
        return self.net(x)
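
A quick sanity check of the forward pass (a minimal sketch; the dummy batch size of 32 is arbitrary):

x = torch.randn(32, 1, 28, 28)   # dummy batch of greyscale 28x28 images
model = MLP()
logits = model(x)
print(logits.shape)              # torch.Size([32, 10]) -- one score per class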

Parameters and state_dict

model = MLP()
# Inspect parameters
total = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Parameters: {trainable:,} / {total:,}")

# Save weights (state_dict = Python dict of tensors)
torch.save(model.state_dict(), "mlp_init.pth")

# Load weights later
model.load_state_dict(torch.load("mlp_init.pth", map_location="cpu"))
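
To see what a state_dict actually contains, iterate over its entries; each key follows the attribute path to the parameter (a minimal sketch, assuming the model above):

for name, tensor in model.state_dict().items():
    print(f"{name:15s} {tuple(tensor.shape)}")
# net.0.weight (256, 784)
# net.0.bias   (256,)
# net.3.weight (128, 256)
# net.3.bias   (128,)
# net.6.weight (10, 128)
# net.6.bias   (10,)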
Next, a complete training run on Fashion-MNIST ties the module together with data loading, an optimiser, a learning-rate scheduler, and early stopping:
import time, random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms

# --- Reproducibility (good interview habit) ---
def seed_all(seed=42):
    random.seed(seed); torch.manual_seed(seed)
    torch.use_deterministic_algorithms(False)  # keep fast paths on CPU
seed_all(42)

# --- Data: Fashion-MNIST (28x28 greyscale, 10 classes) ---
transform = transforms.Compose([
    transforms.ToTensor(),                 # [0,1]
    transforms.Normalize((0.5,), (0.5,))   # zero-mean, unit-ish variance
])

root = "./data"
full_train = datasets.FashionMNIST(root, train=True, download=True, transform=transform)
test_set   = datasets.FashionMNIST(root, train=False, download=True, transform=transform)

# Split train into train/val
val_ratio = 0.1
n_val = int(len(full_train) * val_ratio)
n_train = len(full_train) - n_val
train_set, val_set = random_split(full_train, [n_train, n_val])

# Dataloaders (CPU friendly)
batch_size = 128
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=2)
val_loader   = DataLoader(val_set,   batch_size=batch_size, shuffle=False, num_workers=2)
test_loader  = DataLoader(test_set,  batch_size=batch_size, shuffle=False, num_workers=2)

# --- Model, loss, optimiser ---
class MLP(nn.Module):
    def __init__(self, in_features=28*28, hidden=(256, 128), num_classes=10, p=0.2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, hidden[0]),
            nn.ReLU(),
            nn.Dropout(p),
            nn.Linear(hidden[0], hidden[1]),
            nn.ReLU(),
            nn.Dropout(p),
            nn.Linear(hidden[1], num_classes)
        )
        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight, nonlinearity="relu")
                nn.init.constant_(m.bias, 0.)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        return self.net(x)

device = torch.device("cpu")
model = MLP().to(device)
criterion = nn.CrossEntropyLoss()               # expects class indices (0..9)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# Optional: LR scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)  # note: the 'verbose' argument is deprecated in recent PyTorch

# --- Helpers: accuracy, train/eval one epoch ---
def accuracy_from_logits(logits, y_true):
    preds = logits.argmax(dim=1)
    return (preds == y_true).float().mean().item()

def train_one_epoch(model, loader, optimizer, criterion, max_grad_norm=None):
    model.train()
    running_loss, running_acc, n = 0.0, 0.0, 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)

        optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()

        if max_grad_norm is not None:
            nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        optimizer.step()

        bs = x.size(0)
        running_loss += loss.item() * bs
        running_acc  += (logits.argmax(1) == y).float().sum().item()
        n += bs

    return running_loss / n, running_acc / n

@torch.inference_mode()
def evaluate(model, loader, criterion):
    model.eval()
    running_loss, running_acc, n = 0.0, 0.0, 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        logits = model(x)
        loss = criterion(logits, y)
        bs = x.size(0)
        running_loss += loss.item() * bs
        running_acc  += (logits.argmax(1) == y).float().sum().item()
        n += bs
    return running_loss / n, running_acc / n

# --- Early stopping ---
best_val_loss = float("inf")
best_path = "mlp_fmnist_best.pth"
patience = 3
min_delta = 1e-4
wait = 0

epochs = 5  # as requested
for epoch in range(1, epochs + 1):
    t0 = time.time()
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, max_grad_norm=1.0)
    val_loss, val_acc = evaluate(model, val_loader, criterion)

    # Scheduler step (ReduceLROnPlateau expects val metric)
    scheduler.step(val_loss)

    improved = (best_val_loss - val_loss) > min_delta
    if improved:
        best_val_loss = val_loss
        torch.save(model.state_dict(), best_path)
        wait = 0
    else:
        wait += 1

    dt = time.time() - t0
    lr_current = optimizer.param_groups[0]["lr"]
    print(f"Epoch {epoch:02d} | {dt:5.1f}s | "
          f"train: loss {train_loss:.4f}, acc {train_acc*100:5.1f}% | "
          f"val: loss {val_loss:.4f}, acc {val_acc*100:5.1f}% | lr {lr_current:.3g}")

    if wait >= patience:
        print("Early stopping triggered.")
        break

# --- Load best and evaluate on test set ---
model.load_state_dict(torch.load(best_path, map_location=device))
test_loss, test_acc = evaluate(model, test_loader, criterion)
print(f"Test: loss {test_loss:.4f}, acc {test_acc*100:5.1f}%")

Explanation

import torch
import torch.nn as nn
torch is the core tensor library; torch.nn (aliased as nn) provides the Module base class and the layer building blocks used below.
class MLP(nn.Module):
Subclassing nn.Module gives the class automatic parameter registration, state_dict support, and the __call__ machinery that dispatches to forward().

5: Constructor

def __init__(self, in_features=28*28, hidden=(256, 128), num_classes=10, p=0.2):
The defaults match Fashion-MNIST: 28*28 = 784 input features, hidden layers of 256 and 128 units, 10 output classes, and dropout probability p=0.2.

6: Call to superclass constructor

super().__init__()
Calling the nn.Module constructor first is mandatory; without it, submodules and parameters assigned in __init__ would not be registered.

7–14: Define the network

self.net = nn.Sequential(
    nn.Linear(in_features, hidden[0]),
    nn.ReLU(),
    nn.Dropout(p),
    nn.Linear(hidden[0], hidden[1]),
    nn.ReLU(),
    nn.Dropout(p),
    nn.Linear(hidden[1], num_classes)
)
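
nn.Sequential simply calls each child module in order, so the block above behaves like chaining the layers by hand. A minimal sketch with hypothetical layer names:

import torch
import torch.nn as nn

fc1, fc2, fc3 = nn.Linear(784, 256), nn.Linear(256, 128), nn.Linear(128, 10)
relu, drop = nn.ReLU(), nn.Dropout(0.2)

x = torch.randn(4, 784)
out = fc3(drop(relu(fc2(drop(relu(fc1(x)))))))   # same flow as self.net(x)
print(out.shape)                                  # torch.Size([4, 10])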

Student: what is numel()? Is it a built-in function?

Teacher C: numel() is a built-in PyTorch tensor method (short for "number of elements"); it returns the total number of elements in a tensor, i.e. the product of its dimensions. That is why summing p.numel() over model.parameters() gives the parameter count.
import torch

a = torch.tensor([1, 2, 3, 4])
print(a.shape)      # torch.Size([4])
print(a.numel())    # 4  (four elements total)

b = torch.ones(2, 3, 5)
print(b.shape)      # torch.Size([2, 3, 5])
print(b.numel())    # 2*3*5 = 30

c = torch.randn(784, 256)
print(c.numel())    # 200,704 (this would be a Linear layer weight matrix)
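
Applied to the MLP above, numel() gives the per-layer parameter counts directly (a minimal sketch, assuming model = MLP() is in scope):

for name, p in model.named_parameters():
    print(f"{name:15s} {tuple(p.shape)} -> {p.numel():,} elements")
# net.0.weight (256, 784) -> 200,704 elements
# net.0.bias   (256,)     -> 256 elements
# ...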


Student: regarding transforms.Compose(), is transforms a built-in class and is Compose its built-in method?

Teacher C: Not quite. transforms is a module inside torchvision, and Compose is a class defined in that module. Compose([...]) constructs a callable object that applies each transform in the list, in order.

from torchvision import transforms
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])
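
Normalize applies output = (input - mean) / std per channel, so with mean 0.5 and std 0.5 the [0, 1] range produced by ToTensor() maps to [-1, 1]. A quick check of that arithmetic:

import torch
t = torch.tensor([0.0, 0.5, 1.0])   # pixel values after ToTensor()
print((t - 0.5) / 0.5)              # tensor([-1., 0., 1.])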

Indexing a torchvision dataset returns a (transformed image, label) pair:
img, label = full_train[0]   # first sample
print(type(img), img.shape, label)
After ToTensor(), each sample is a torch.Tensor of shape (1, 28, 28) paired with an integer class label.

Fresh explanation of the code step-by-step

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight, nonlinearity="relu")
                nn.init.constant_(m.bias, 0.)

self.modules() (PyTorch Module method) – iterates submodules (including self).

isinstance (Python built-in function).

nn.init.kaiming_uniform_ (PyTorch initialiser; in-place) – good for ReLU nets.

nn.init.constant_ (PyTorch initialiser; in-place).
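
The same initialisers can be tried on a standalone layer to see their in-place effect (a minimal sketch; the trailing underscore is PyTorch's convention for in-place operations):

import torch.nn as nn

layer = nn.Linear(784, 256)
nn.init.kaiming_uniform_(layer.weight, nonlinearity="relu")
nn.init.constant_(layer.bias, 0.)
print(layer.weight.abs().max().item())   # within the Kaiming uniform bound
print(layer.bias.abs().sum().item())     # 0.0 -- all biases zeroed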

    def forward(self, x):
        x = x.view(x.size(0), -1)
        return self.net(x)
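
x.view(x.size(0), -1) keeps the batch dimension and flattens everything else; the -1 tells PyTorch to infer that dimension:

import torch
x = torch.randn(32, 1, 28, 28)
print(x.view(x.size(0), -1).shape)   # torch.Size([32, 784])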
model = MLP().to(device)
.to(device) moves the model's parameters and buffers to the target device; here device is the CPU, so it is effectively a no-op, but the call keeps the code portable to GPU.
criterion = nn.CrossEntropyLoss()               # expects class indices (0..9)
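
CrossEntropyLoss takes raw logits and integer class indices (it applies log-softmax internally, so the model needs no softmax layer). A tiny example:

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()
logits = torch.tensor([[2.0, 0.5, -1.0]])   # raw scores for 3 classes, batch of 1
target = torch.tensor([0])                  # class index, not one-hot
print(criterion(logits, target))            # small loss: class 0 already scores highest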
optimizer = optim.Adam(model.parameters(), lr=1e-3)
optim.Adam receives the iterable of trainable parameters from model.parameters() and uses a learning rate of 1e-3, a common default for this kind of network.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
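
ReduceLROnPlateau watches the metric passed to step() and, after patience epochs without improvement, multiplies the learning rate by factor. A minimal sketch with a dummy parameter:

import torch
import torch.optim as optim

param = torch.nn.Parameter(torch.zeros(1))
opt = optim.Adam([param], lr=1e-3)
sched = optim.lr_scheduler.ReduceLROnPlateau(opt, mode='min', factor=0.5, patience=2)

for val_loss in [1.0, 1.0, 1.0, 1.0]:   # a stagnant validation loss
    sched.step(val_loss)
print(opt.param_groups[0]["lr"])        # 0.0005 -- halved once patience ran out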
def accuracy_from_logits(logits, y_true):
    preds = logits.argmax(dim=1)
    return (preds == y_true).float().mean().item()
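
A quick usage check (assuming the function above and import torch):

import torch
logits = torch.tensor([[2.0, 0.1], [0.3, 1.5], [1.2, 0.4]])
y_true = torch.tensor([0, 1, 1])
print(accuracy_from_logits(logits, y_true))   # 0.666... (2 of 3 predictions correct)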
def train_one_epoch(model, loader, optimizer, criterion, max_grad_norm=None):
    model.train()
