PytorchModule2
```python
import torch
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self, in_features=28*28, hidden=(256, 128), num_classes=10, p=0.2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, hidden[0]),
            nn.ReLU(),
            nn.Dropout(p),
            nn.Linear(hidden[0], hidden[1]),
            nn.ReLU(),
            nn.Dropout(p),
            nn.Linear(hidden[1], num_classes)
        )
        self._init_weights()

    def _init_weights(self):
        # Kaiming init for ReLU layers; biases to 0
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight, nonlinearity="relu")
                nn.init.constant_(m.bias, 0.)

    def forward(self, x):
        # x: (N, 1, 28, 28) → flatten to (N, 784)
        x = x.view(x.size(0), -1)
        return self.net(x)

model = MLP()

# Inspect parameters
total = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Parameters: {trainable:,} / {total:,}")

# Save weights (state_dict = Python dict of tensors)
torch.save(model.state_dict(), "mlp_init.pth")

# Load weights later
model.load_state_dict(torch.load("mlp_init.pth", map_location="cpu"))
```
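A quick sanity check on the freshly initialised model: pushing a small dummy batch through it confirms the flatten step and the output shape. The `dummy` tensor below is purely illustrative.

```python
# Illustrative check: a fake batch of 4 greyscale 28x28 images
dummy = torch.randn(4, 1, 28, 28)
with torch.no_grad():
    logits = model(dummy)
print(logits.shape)  # torch.Size([4, 10]) -- one raw score (logit) per class
```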
```python
import os, math, time, random

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms

# --- Reproducibility (good interview habit) ---
def seed_all(seed=42):
    random.seed(seed); torch.manual_seed(seed)
    torch.use_deterministic_algorithms(False)  # keep fast paths on CPU

seed_all(42)

# --- Data: Fashion-MNIST (28x28 greyscale, 10 classes) ---
transform = transforms.Compose([
    transforms.ToTensor(),                # [0,1]
    transforms.Normalize((0.5,), (0.5,))  # zero-mean, unit-ish variance
])

root = "./data"
full_train = datasets.FashionMNIST(root, train=True, download=True, transform=transform)
test_set = datasets.FashionMNIST(root, train=False, download=True, transform=transform)

# Split train into train/val
val_ratio = 0.1
n_val = int(len(full_train) * val_ratio)
n_train = len(full_train) - n_val
train_set, val_set = random_split(full_train, [n_train, n_val])

# Dataloaders (CPU friendly)
batch_size = 128
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=2)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False, num_workers=2)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=2)
```
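As a quick check on the data pipeline, grabbing one batch shows exactly what the loaders yield: float image tensors of shape (batch, 1, 28, 28) and integer class labels in 0..9. The `xb`/`yb` names here are illustrative.

```python
# Peek at a single training batch
xb, yb = next(iter(train_loader))
print(xb.shape, xb.dtype)  # torch.Size([128, 1, 28, 28]) torch.float32
print(yb.shape, yb.dtype)  # torch.Size([128]) torch.int64
print(yb[:8])              # first few class indices
```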
```python
# --- Model, loss, optimiser ---
class MLP(nn.Module):
    def __init__(self, in_features=28*28, hidden=(256, 128), num_classes=10, p=0.2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, hidden[0]),
            nn.ReLU(),
            nn.Dropout(p),
            nn.Linear(hidden[0], hidden[1]),
            nn.ReLU(),
            nn.Dropout(p),
            nn.Linear(hidden[1], num_classes)
        )
        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight, nonlinearity="relu")
                nn.init.constant_(m.bias, 0.)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        return self.net(x)

device = torch.device("cpu")
model = MLP().to(device)

criterion = nn.CrossEntropyLoss()  # expects class indices (0..9)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Optional: LR scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=False)
```
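On the "expects class indices" comment: `CrossEntropyLoss` takes raw logits of shape (N, C) and integer targets of shape (N,); it applies log-softmax internally, which is why the model ends in a plain `Linear` layer with no softmax. A toy illustration (the `toy_*` tensors are made up for the example):

```python
# CrossEntropyLoss wants raw logits (N, C) and integer class targets (N,)
toy_logits = torch.tensor([[2.0, 0.5, -1.0],
                           [0.1, 0.2,  3.0]])  # N=2 samples, C=3 classes
toy_targets = torch.tensor([0, 2])             # class indices, not one-hot
print(nn.CrossEntropyLoss()(toy_logits, toy_targets))  # small loss: both rows favour the right class
```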
```python
# --- Helpers: accuracy, train/eval one epoch ---
def accuracy_from_logits(logits, y_true):
    preds = logits.argmax(dim=1)
    return (preds == y_true).float().mean().item()

def train_one_epoch(model, loader, optimizer, criterion, max_grad_norm=None):
    model.train()
    running_loss, running_acc, n = 0.0, 0.0, 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()
        if max_grad_norm is not None:
            nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        bs = x.size(0)
        running_loss += loss.item() * bs
        running_acc += (logits.argmax(1) == y).float().sum().item()
        n += bs
    return running_loss / n, running_acc / n

@torch.inference_mode()
def evaluate(model, loader, criterion):
    model.eval()
    running_loss, running_acc, n = 0.0, 0.0, 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        logits = model(x)
        loss = criterion(logits, y)
        bs = x.size(0)
        running_loss += loss.item() * bs
        running_acc += (logits.argmax(1) == y).float().sum().item()
        n += bs
    return running_loss / n, running_acc / n
```
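Why the `model.train()` / `model.eval()` toggles matter: dropout samples a fresh mask on every forward pass in training mode and is switched off in eval mode. A small illustrative check (the `probe` tensor is made up for the demo):

```python
probe = torch.randn(1, 1, 28, 28)

model.train()
out1, out2 = model(probe), model(probe)
print(torch.equal(out1, out2))   # False (almost surely) -- different dropout masks

model.eval()
with torch.inference_mode():
    out3, out4 = model(probe), model(probe)
print(torch.equal(out3, out4))   # True -- dropout disabled, forward is deterministic
```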
```python
# --- Early stopping ---
best_val_loss = float("inf")
best_path = "mlp_fmnist_best.pth"
patience = 3
min_delta = 1e-4
wait = 0
epochs = 5  # as requested

for epoch in range(1, epochs + 1):
    t0 = time.time()
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, max_grad_norm=1.0)
    val_loss, val_acc = evaluate(model, val_loader, criterion)

    # Scheduler step (ReduceLROnPlateau expects val metric)
    scheduler.step(val_loss)

    improved = (best_val_loss - val_loss) > min_delta
    if improved:
        best_val_loss = val_loss
        torch.save(model.state_dict(), best_path)
        wait = 0
    else:
        wait += 1

    dt = time.time() - t0
    lr_current = optimizer.param_groups[0]["lr"]
    print(f"Epoch {epoch:02d} | {dt:5.1f}s | "
          f"train: loss {train_loss:.4f}, acc {train_acc*100:5.1f}% | "
          f"val: loss {val_loss:.4f}, acc {val_acc*100:5.1f}% | lr {lr_current:.3g}")

    if wait >= patience:
        print("Early stopping triggered.")
        break

# --- Load best and evaluate on test set ---
model.load_state_dict(torch.load(best_path, map_location=device))
test_loss, test_acc = evaluate(model, test_loader, criterion)
print(f"Test: loss {test_loss:.4f}, acc {test_acc*100:5.1f}%")
```
```python
import torch
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self, in_features=28*28, hidden=(256, 128), num_classes=10, p=0.2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, hidden[0]),
            nn.ReLU(),
            nn.Dropout(p),
            nn.Linear(hidden[0], hidden[1]),
            nn.ReLU(),
            nn.Dropout(p),
            nn.Linear(hidden[1], num_classes)
        )
```
```python
import torch

a = torch.tensor([1, 2, 3, 4])
print(a.shape)    # torch.Size([4])
print(a.numel())  # 4 (four elements total)

b = torch.ones(2, 3, 5)
print(b.shape)    # torch.Size([2, 3, 5])
print(b.numel())  # 2*3*5 = 30

c = torch.randn(784, 256)
print(c.numel())  # 200,704 (this would be a Linear layer weight matrix)
```
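Tying `numel()` back to the MLP above: each Linear layer holds `weight.numel() + bias.numel()` parameters, so the model has 784*256 + 256 + 256*128 + 128 + 128*10 + 10 = 235,146 parameters in total. A quick per-layer breakdown (assuming the `MLP` class defined above is in scope):

```python
model = MLP()
for name, p in model.named_parameters():
    print(f"{name:15s} {str(tuple(p.shape)):12s} {p.numel():>8,}")
print("total:", sum(p.numel() for p in model.parameters()))  # 235,146
```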
Student: `transforms.Compose()` – is `transforms` a built-in class and `Compose` one of its built-in methods?

Teacher C: Neither is a Python built-in. `transforms` is a module inside the `torchvision` package, and `Compose` is a class defined in that module. Calling `transforms.Compose([...])` builds an object that applies the listed transforms in order.

```python
from torchvision import transforms

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

img, label = full_train[0]  # first sample
print(type(img), img.shape, label)
```
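A follow-up check worth knowing: `ToTensor` scales pixels to [0, 1], and `Normalize((0.5,), (0.5,))` then maps them to roughly [-1, 1] via (x - 0.5) / 0.5. Printing the `Compose` object also shows the transforms it chains.

```python
print(img.min().item(), img.max().item())  # close to -1.0 and 1.0 after normalisation
print(transform)                           # Compose(...) lists the chained transforms
```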
```python
def _init_weights(self):
    for m in self.modules():
        if isinstance(m, nn.Linear):
            nn.init.kaiming_uniform_(m.weight, nonlinearity="relu")
            nn.init.constant_(m.bias, 0.)
```

- `self.modules()` (PyTorch Module method) – iterates submodules (including self).
- `isinstance` (Python built-in function).
- `nn.init.kaiming_uniform_` (PyTorch initialiser; in-place) – good for ReLU nets.
- `nn.init.constant_` (PyTorch initialiser; in-place).

```python
def forward(self, x):
    x = x.view(x.size(0), -1)
    return self.net(x)
```
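To see the two initialisers in isolation, here is a standalone sketch (separate from the model code) that applies them to a fresh `nn.Linear`:

```python
layer = nn.Linear(784, 256)
nn.init.kaiming_uniform_(layer.weight, nonlinearity="relu")  # in-place: weights overwritten
nn.init.constant_(layer.bias, 0.)                            # in-place: all biases set to 0
print(layer.weight.abs().max().item())  # bounded by the Kaiming uniform limit
print(layer.bias.abs().max().item())    # 0.0
```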
```python
model = MLP().to(device)

criterion = nn.CrossEntropyLoss()  # expects class indices (0..9)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=False)
```
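On the scheduler: `ReduceLROnPlateau` watches the metric passed to `step()` and multiplies the learning rate by `factor` once that metric has failed to improve for more than `patience` consecutive steps. A toy run, independent of the model above, makes the behaviour visible (the `toy_*` names are made up for the demo):

```python
# Feed the scheduler a metric that never improves and watch the lr halve
toy_opt = optim.Adam([nn.Parameter(torch.zeros(1))], lr=1e-3)
toy_sched = optim.lr_scheduler.ReduceLROnPlateau(toy_opt, mode='min', factor=0.5, patience=2)
for step in range(1, 6):
    toy_sched.step(1.0)   # constant "val loss" -> no improvement after the first step
    print(step, toy_opt.param_groups[0]["lr"])
# lr stays at 1e-3 while the stall is within `patience`, then drops to 5e-4
```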
```python
def accuracy_from_logits(logits, y_true):
    preds = logits.argmax(dim=1)
    return (preds == y_true).float().mean().item()

def train_one_epoch(model, loader, optimizer, criterion, max_grad_norm=None):
    model.train()
```