PytorchModule2
```python
import torch
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self, in_features=28*28, hidden=(256, 128), num_classes=10, p=0.2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, hidden[0]),
            nn.ReLU(),
            nn.Dropout(p),
            nn.Linear(hidden[0], hidden[1]),
            nn.ReLU(),
            nn.Dropout(p),
            nn.Linear(hidden[1], num_classes)
        )
        self._init_weights()

    def _init_weights(self):
        # Kaiming init for ReLU layers; biases to 0
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight, nonlinearity="relu")
                nn.init.constant_(m.bias, 0.)

    def forward(self, x):
        # x: (N, 1, 28, 28) → flatten to (N, 784)
        x = x.view(x.size(0), -1)
        return self.net(x)

model = MLP()

# Inspect parameters
total = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Parameters: {trainable:,} / {total:,}")

# Save weights (state_dict = Python dict of tensors)
torch.save(model.state_dict(), "mlp_init.pth")

# Load weights later
model.load_state_dict(torch.load("mlp_init.pth", map_location="cpu"))
```
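A quick sanity check on the freshly initialised model: pushing a small dummy batch through it confirms the flatten step and the output shape. The `dummy` tensor below is purely illustrative.

```python
# Illustrative check: a fake batch of 4 greyscale 28x28 images
dummy = torch.randn(4, 1, 28, 28)
with torch.no_grad():
    logits = model(dummy)
print(logits.shape)  # torch.Size([4, 10]) -- one raw score (logit) per class
```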
```python
import os, math, time, random

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms

# --- Reproducibility (good interview habit) ---
def seed_all(seed=42):
    random.seed(seed); torch.manual_seed(seed)
    torch.use_deterministic_algorithms(False)  # keep fast paths on CPU

seed_all(42)

# --- Data: Fashion-MNIST (28x28 greyscale, 10 classes) ---
transform = transforms.Compose([
    transforms.ToTensor(),                # [0,1]
    transforms.Normalize((0.5,), (0.5,))  # zero-mean, unit-ish variance
])

root = "./data"
full_train = datasets.FashionMNIST(root, train=True, download=True, transform=transform)
test_set = datasets.FashionMNIST(root, train=False, download=True, transform=transform)

# Split train into train/val
val_ratio = 0.1
n_val = int(len(full_train) * val_ratio)
n_train = len(full_train) - n_val
train_set, val_set = random_split(full_train, [n_train, n_val])

# Dataloaders (CPU friendly)
batch_size = 128
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=2)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False, num_workers=2)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=2)
```
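As a quick check on the data pipeline, grabbing one batch shows exactly what the loaders yield: float image tensors of shape (batch, 1, 28, 28) and integer class labels in 0..9. The `xb`/`yb` names here are illustrative.

```python
# Peek at a single training batch
xb, yb = next(iter(train_loader))
print(xb.shape, xb.dtype)  # torch.Size([128, 1, 28, 28]) torch.float32
print(yb.shape, yb.dtype)  # torch.Size([128]) torch.int64
print(yb[:8])              # first few class indices
```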
```python
# --- Model, loss, optimiser ---
class MLP(nn.Module):
    def __init__(self, in_features=28*28, hidden=(256, 128), num_classes=10, p=0.2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, hidden[0]),
            nn.ReLU(),
            nn.Dropout(p),
            nn.Linear(hidden[0], hidden[1]),
            nn.ReLU(),
            nn.Dropout(p),
            nn.Linear(hidden[1], num_classes)
        )
        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight, nonlinearity="relu")
                nn.init.constant_(m.bias, 0.)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        return self.net(x)

device = torch.device("cpu")
model = MLP().to(device)

criterion = nn.CrossEntropyLoss()  # expects class indices (0..9)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Optional: LR scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=False)
```
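On the "expects class indices" comment: `CrossEntropyLoss` takes raw logits of shape (N, C) and integer targets of shape (N,); it applies log-softmax internally, which is why the model ends in a plain `Linear` layer with no softmax. A toy illustration (the `toy_*` tensors are made up for the example):

```python
# CrossEntropyLoss wants raw logits (N, C) and integer class targets (N,)
toy_logits = torch.tensor([[2.0, 0.5, -1.0],
                           [0.1, 0.2,  3.0]])  # N=2 samples, C=3 classes
toy_targets = torch.tensor([0, 2])             # class indices, not one-hot
print(nn.CrossEntropyLoss()(toy_logits, toy_targets))  # small loss: both rows favour the right class
```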
```python
# --- Helpers: accuracy, train/eval one epoch ---
def accuracy_from_logits(logits, y_true):
    preds = logits.argmax(dim=1)
    return (preds == y_true).float().mean().item()

def train_one_epoch(model, loader, optimizer, criterion, max_grad_norm=None):
    model.train()
    running_loss, running_acc, n = 0.0, 0.0, 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()
        if max_grad_norm is not None:
            nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        bs = x.size(0)
        running_loss += loss.item() * bs
        running_acc += (logits.argmax(1) == y).float().sum().item()
        n += bs
    return running_loss / n, running_acc / n

@torch.inference_mode()
def evaluate(model, loader, criterion):
    model.eval()
    running_loss, running_acc, n = 0.0, 0.0, 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        logits = model(x)
        loss = criterion(logits, y)
        bs = x.size(0)
        running_loss += loss.item() * bs
        running_acc += (logits.argmax(1) == y).float().sum().item()
        n += bs
    return running_loss / n, running_acc / n
```
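Why the `model.train()` / `model.eval()` toggles matter: dropout samples a fresh mask on every forward pass in training mode and is switched off in eval mode. A small illustrative check (the `probe` tensor is made up for the demo):

```python
probe = torch.randn(1, 1, 28, 28)

model.train()
out1, out2 = model(probe), model(probe)
print(torch.equal(out1, out2))   # False (almost surely) -- different dropout masks

model.eval()
with torch.inference_mode():
    out3, out4 = model(probe), model(probe)
print(torch.equal(out3, out4))   # True -- dropout disabled, forward is deterministic
```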
```python
# --- Early stopping ---
best_val_loss = float("inf")
best_path = "mlp_fmnist_best.pth"
patience = 3
min_delta = 1e-4
wait = 0
epochs = 5  # as requested

for epoch in range(1, epochs + 1):
    t0 = time.time()
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, max_grad_norm=1.0)
    val_loss, val_acc = evaluate(model, val_loader, criterion)

    # Scheduler step (ReduceLROnPlateau expects val metric)
    scheduler.step(val_loss)

    improved = (best_val_loss - val_loss) > min_delta
    if improved:
        best_val_loss = val_loss
        torch.save(model.state_dict(), best_path)
        wait = 0
    else:
        wait += 1

    dt = time.time() - t0
    lr_current = optimizer.param_groups[0]["lr"]
    print(f"Epoch {epoch:02d} | {dt:5.1f}s | "
          f"train: loss {train_loss:.4f}, acc {train_acc*100:5.1f}% | "
          f"val: loss {val_loss:.4f}, acc {val_acc*100:5.1f}% | lr {lr_current:.3g}")

    if wait >= patience:
        print("Early stopping triggered.")
        break

# --- Load best and evaluate on test set ---
model.load_state_dict(torch.load(best_path, map_location=device))
test_loss, test_acc = evaluate(model, test_loader, criterion)
print(f"Test: loss {test_loss:.4f}, acc {test_acc*100:5.1f}%")
```
```python
import torch
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self, in_features=28*28, hidden=(256, 128), num_classes=10, p=0.2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, hidden[0]),
            nn.ReLU(),
            nn.Dropout(p),
            nn.Linear(hidden[0], hidden[1]),
            nn.ReLU(),
            nn.Dropout(p),
            nn.Linear(hidden[1], num_classes)
        )
```
```python
import torch

a = torch.tensor([1, 2, 3, 4])
print(a.shape)    # torch.Size([4])
print(a.numel())  # 4 (four elements total)

b = torch.ones(2, 3, 5)
print(b.shape)    # torch.Size([2, 3, 5])
print(b.numel())  # 2*3*5 = 30

c = torch.randn(784, 256)
print(c.numel())  # 200,704 (this would be a Linear layer weight matrix)
```
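Tying `numel()` back to the MLP above: each Linear layer holds `weight.numel() + bias.numel()` parameters, so the model has 784*256 + 256 + 256*128 + 128 + 128*10 + 10 = 235,146 parameters in total. A quick per-layer breakdown (assuming the `MLP` class defined above is in scope):

```python
model = MLP()
for name, p in model.named_parameters():
    print(f"{name:15s} {str(tuple(p.shape)):12s} {p.numel():>8,}")
print("total:", sum(p.numel() for p in model.parameters()))  # 235,146
```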
Student: `transforms.Compose()` – is `transforms` a built-in class and `Compose` one of its built-in methods?

Teacher C: Neither is a Python built-in. `transforms` is a module inside the `torchvision` package, and `Compose` is a class defined in that module. Calling `transforms.Compose([...])` builds an object that applies the listed transforms in order.

```python
from torchvision import transforms

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

img, label = full_train[0]  # first sample
print(type(img), img.shape, label)
```
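A follow-up check worth knowing: `ToTensor` scales pixels to [0, 1], and `Normalize((0.5,), (0.5,))` then maps them to roughly [-1, 1] via (x - 0.5) / 0.5. Printing the `Compose` object also shows the transforms it chains.

```python
print(img.min().item(), img.max().item())  # close to -1.0 and 1.0 after normalisation
print(transform)                           # Compose(...) lists the chained transforms
```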
```python
def _init_weights(self):
    for m in self.modules():
        if isinstance(m, nn.Linear):
            nn.init.kaiming_uniform_(m.weight, nonlinearity="relu")
            nn.init.constant_(m.bias, 0.)
```

- `self.modules()` (PyTorch Module method) – iterates submodules (including self).
- `isinstance` (Python built-in function).
- `nn.init.kaiming_uniform_` (PyTorch initialiser; in-place) – good for ReLU nets.
- `nn.init.constant_` (PyTorch initialiser; in-place).

```python
def forward(self, x):
    x = x.view(x.size(0), -1)
    return self.net(x)
```
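To see the two initialisers in isolation, here is a standalone sketch (separate from the model code) that applies them to a fresh `nn.Linear`:

```python
layer = nn.Linear(784, 256)
nn.init.kaiming_uniform_(layer.weight, nonlinearity="relu")  # in-place: weights overwritten
nn.init.constant_(layer.bias, 0.)                            # in-place: all biases set to 0
print(layer.weight.abs().max().item())  # bounded by the Kaiming uniform limit
print(layer.bias.abs().max().item())    # 0.0
```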
```python
model = MLP().to(device)

criterion = nn.CrossEntropyLoss()  # expects class indices (0..9)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=False)
```
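On the scheduler: `ReduceLROnPlateau` watches the metric passed to `step()` and multiplies the learning rate by `factor` once that metric has failed to improve for more than `patience` consecutive steps. A toy run, independent of the model above, makes the behaviour visible (the `toy_*` names are made up for the demo):

```python
# Feed the scheduler a metric that never improves and watch the lr halve
toy_opt = optim.Adam([nn.Parameter(torch.zeros(1))], lr=1e-3)
toy_sched = optim.lr_scheduler.ReduceLROnPlateau(toy_opt, mode='min', factor=0.5, patience=2)
for step in range(1, 6):
    toy_sched.step(1.0)   # constant "val loss" -> no improvement after the first step
    print(step, toy_opt.param_groups[0]["lr"])
# lr stays at 1e-3 while the stall is within `patience`, then drops to 5e-4
```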
```python
def accuracy_from_logits(logits, y_true):
    preds = logits.argmax(dim=1)
    return (preds == y_true).float().mean().item()

def train_one_epoch(model, loader, optimizer, criterion, max_grad_norm=None):
    model.train()
```