## Architecture

```
Stem:    Conv(3→16, 3×3) + BN + ReLU             32×32
Stage 1: BasicBlock(16→16) × 2                   32×32  [residual]
Stage 2: BasicBlock(16→32, stride=2) +           16×16
         BasicBlock(32→32)                       16×16  [residual]
Stage 3: BasicBlock(32→40, stride=2) +           8×8
         BasicBlock(40→40)                       8×8  [residual]
Head:    GlobalAvgPool → Dropout(0.1) → Linear(40→10)
```

Each **BasicBlock** follows the original (post-activation) residual pattern:
`Conv(3×3) + BN + ReLU + Conv(3×3) + BN`, summed with an identity or 1×1 projection shortcut and followed by a final ReLU.
Downsampling uses a stride-2 convolution in the first block of stages 2 and 3; stage 1 keeps the 32×32 resolution.

In [4]:
# Make modules that live in the notebook's working directory importable.
# os.path.abspath() resolves the bare filename against the current working
# directory, so dirname(...) is simply the cwd; the file need not exist.
# NOTE(review): the path mentions 'shared.py' but the helpers below are imported
# from `source` — confirm which module name is intended.
import sys, os
sys.path.insert(0, os.path.dirname(os.path.abspath('shared.py')))

import torch
import torch.nn as nn
from source import (
    get_dataloaders, train, evaluate,
    count_parameters, model_size_kb, print_summary,
    evaluate_pytorch,
    SEED,
)

# Seed PyTorch's global RNG before any model is constructed so that weight
# initialisation is repeatable from run to run.
torch.manual_seed(SEED)
# Everything in this notebook is placed on the CPU device.
DEVICE = torch.device('cpu')
print(f'PyTorch {torch.__version__} | Device: {DEVICE}')

# presumably the train and test DataLoaders — verify against get_dataloaders in `source`
trainloader, testloader = get_dataloaders()

PyTorch 2.10.0 | Device: cpu


  entry = pickle.load(f, encoding="latin1")


In [3]:
class BasicBlock(nn.Module):
    """Two-convolution residual block with a ReLU after the addition.

    Main path: 3x3 conv -> BN -> ReLU -> 3x3 conv -> BN. Its output is summed
    with the shortcut branch and passed through a final ReLU.

    Args:
        in_ch: Number of input channels.
        out_ch: Number of output channels.
        stride: Stride of the first 3x3 convolution; the projection shortcut
            uses the same stride so both branches agree spatially.
    """
    def __init__(self, in_ch, out_ch, stride=1):
        super().__init__()
        # Main branch. Submodule names are part of the state_dict keys, so they
        # must stay exactly as they are for saved checkpoints to load.
        self.conv1 = nn.Conv2d(
            in_ch, out_ch, kernel_size=3, stride=stride, padding=1, bias=False,
        )
        self.bn1 = nn.BatchNorm2d(out_ch)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(
            out_ch, out_ch, kernel_size=3, padding=1, bias=False,
        )
        self.bn2 = nn.BatchNorm2d(out_ch)

        # Shortcut branch: an empty Sequential acts as the identity; otherwise
        # a 1x1 conv + BN reshapes the input to match the main branch.
        shape_preserved = stride == 1 and in_ch == out_ch
        if shape_preserved:
            self.shortcut = nn.Sequential()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_ch, out_ch, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_ch),
            )

    def forward(self, x):
        """Return ReLU(main_path(x) + shortcut(x))."""
        main = self.conv1(x)
        main = self.bn1(main)
        main = self.relu(main)
        main = self.conv2(main)
        main = self.bn2(main)
        return self.relu(main + self.shortcut(x))


class TinyResNet(nn.Module):
    """Small three-stage ResNet classifier for CIFAR-10.

    About 99K parameters. A 3x3 conv stem feeds three stages of two
    BasicBlocks each, with channel widths (16, 32, 40); stages 2 and 3 start
    with a stride-2 block. The head is global average pooling, dropout and a
    linear layer. Modelled on ResNet-20 but shrunk to stay under 500 KB.

    Args:
        num_classes: Size of the output logit vector.
    """
    def __init__(self, num_classes=10):
        super().__init__()
        # Channel width of each stage. Submodules are created in the same
        # order as they are applied, which keeps seeded initialisation stable.
        w1, w2, w3 = 16, 32, 40

        self.stem = nn.Sequential(
            nn.Conv2d(3, w1, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(w1),
            nn.ReLU(inplace=True),
        )
        self.stage1 = nn.Sequential(BasicBlock(w1, w1), BasicBlock(w1, w1))
        self.stage2 = nn.Sequential(BasicBlock(w1, w2, stride=2), BasicBlock(w2, w2))
        self.stage3 = nn.Sequential(BasicBlock(w2, w3, stride=2), BasicBlock(w3, w3))

        # Classification head.
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(w3, num_classes)

    def forward(self, x):
        """Map an image batch to class logits."""
        feats = self.stage3(self.stage2(self.stage1(self.stem(x))))
        pooled = self.pool(feats)
        # Collapse the 1x1 spatial dims left by the adaptive pool.
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(self.dropout(flat))


# Instantiate the network and move it to the configured device.
model = TinyResNet().to(DEVICE)

# Check the untrained model against the budget (fewer than 125,000 parameters
# and under 500 KB) so a violation is caught before any training time is spent.
n  = count_parameters(model)
kb = model_size_kb(model)
print(f'Parameters: {n:,}')
print(f'Model size: {kb:.1f} KB')
assert n < 125_000 and kb < 500, 'Constraint violated before training!'

Parameters: 99,706
Model size: 393.0 KB


In [None]:
# Run training; the value returned by train() is kept as final_acc and handed
# to print_summary together with the model.
# NOTE(review): save_path here is 'best_resnet.pth', while the evaluation cell
# loads '../models/best_resnet.pth' — confirm where train() actually writes.
final_acc = train(
    model, trainloader, testloader, DEVICE,
    save_path='best_resnet.pth',
)
print_summary(model, final_acc, label='TinyResNet')

In [6]:
# Load best model and final evaluation
# weights_only=True restricts unpickling to tensors and plain containers, and
# map_location="cpu" places the loaded tensors on the CPU.
model.load_state_dict(torch.load("../models/best_resnet.pth", map_location="cpu", weights_only=True))
final_acc = evaluate_pytorch(model, testloader, DEVICE)
# NOTE(review): final_size is computed here but not reported in this cell.
final_size = model_size_kb(model)

# Banner-style report of the test accuracy.
print(f"\n{'='*50}")
print(f"  Test Accuracy: {final_acc:.2f}%")
print(f"{'='*50}")


  Test Accuracy: 90.73%


**Channel width (16→32→40):** Narrower than the original ResNet-20 (16→32→64) to
respect the 125K parameter budget. The final stage uses 40 instead of 64 channels,
saving ~75K parameters (stage 3 plus the linear head) while retaining enough capacity to represent complex features.

**Shortcut projection:** When channel width or stride changes between stages, a 1×1
conv + BN projects the identity to the correct shape. This adds only ~0.6K parameters
at the 16→32 transition and ~1.4K at the 32→40 transition.