In [None]:
# gtsrb_loader.py
# DL-ready GTSRB Dataset + DataLoader (on-the-fly loading, with resizing via transforms)

from pathlib import Path
import numpy as np
import pandas as pd
from PIL import Image

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms


class GTSRBDataset(Dataset):
    """
    Map-style dataset over GTSRB images; files are indexed up front and
    decoded only when a sample is requested.

    Expects this structure:

    root/
      Final_Training_Images/
        00000/*.ppm
        ...
      Final_Test_Images/
        *.ppm
        GT-final_test.csv   (must contain Filename and ClassId)

    Args:
        root: directory that contains the two ``Final_*_Images`` folders.
        split: "train" or "test" (case-insensitive, surrounding whitespace ignored).
        transform: optional callable applied to the RGB PIL image; whatever it
            returns becomes the sample. With ``None`` the image is converted to
            a tensor at its native size.

    Returns: (img_tensor, label_tensor)
    - img_tensor: float32 (3,H,W) in [0,1] on the default path (and optionally
      normalized if you add Normalize in transforms)
    - label_tensor: int64

    Raises:
        FileNotFoundError: the split folder (or the test CSV) does not exist.
        ValueError: unknown split, or the test CSV lacks Filename/ClassId.
    """
    def __init__(self, root: str, split: str = "train", transform=None):
        self.root = Path(root)
        self.split = split.lower().strip()
        self.transform = transform

        if self.split == "train":
            base = self.root / "Final_Training_Images"
            if not base.exists():
                raise FileNotFoundError(base)

            paths, labels = [], []
            for class_dir in sorted(d for d in base.iterdir() if d.is_dir()):
                # The class id is the folder name ("00000", "00001", ...).
                # Folders that are not integers (e.g. ".ipynb_checkpoints",
                # which Jupyter drops into browsed directories) are skipped
                # instead of aborting the whole scan with a ValueError.
                try:
                    y = int(class_dir.name)
                except ValueError:
                    continue
                # Sorted so the sample order is deterministic across runs.
                for p in sorted(class_dir.glob("*.ppm")):
                    paths.append(p)
                    labels.append(y)

            self.paths = paths
            self.labels = np.asarray(labels, dtype=np.int64)

        elif self.split == "test":
            base = self.root / "Final_Test_Images"
            csv_path = base / "GT-final_test.csv"
            if not base.exists():
                raise FileNotFoundError(base)
            if not csv_path.exists():
                raise FileNotFoundError(csv_path)

            # sep=None lets pandas sniff the delimiter (needs the python engine).
            df = pd.read_csv(csv_path, sep=None, engine="python")
            # Header cells may carry stray whitespace; normalise before lookup.
            df.columns = [c.strip() for c in df.columns]
            if "Filename" not in df.columns or "ClassId" not in df.columns:
                raise ValueError(f"Need Filename and ClassId, got {df.columns.tolist()}")

            # Sample order follows the CSV row order.
            self.paths = [base / fn for fn in df["Filename"].astype(str).tolist()]
            self.labels = df["ClassId"].astype(int).to_numpy(dtype=np.int64)

        else:
            raise ValueError("split must be 'train' or 'test'")

    def __len__(self):
        """Number of indexed samples."""
        return len(self.labels)

    def __getitem__(self, idx: int):
        """Decode sample ``idx`` and return ``(image, label)``."""
        p = self.paths[idx]
        y = int(self.labels[idx])

        # The context manager closes the file handle once the pixels are read.
        with Image.open(p) as im:
            im = im.convert("RGB")
            if self.transform is not None:
                x = self.transform(im)
            else:
                # default: tensor in [0,1], shape (3,H,W) without resizing
                x = torch.from_numpy(np.array(im)).permute(2, 0, 1).float() / 255.0

        return x, torch.tensor(y, dtype=torch.long)


def get_gtsrb_dataloaders(
    root: str,
    img_size=(32, 32),
    batch_size: int = 128,
    num_workers: int = 0,          # Windows default: 0 is often fastest
    normalize: bool = False,
):
    """
    Build the GTSRB train and test DataLoaders.

    Args:
        root: dataset root handed to GTSRBDataset.
        img_size: target size passed to transforms.Resize.
        batch_size: train batch size; the test loader uses twice this value.
        num_workers: worker count for both loaders.
        normalize: when True, append Normalize with mean = std = 0.5 per channel.

    Returns:
        (train_loader, test_loader). The train loader shuffles, the test loader
        does not; memory pinning is enabled only when CUDA is available.
    """
    # Resize first, then ToTensor -> float in [0,1]
    pipeline = [transforms.Resize(img_size), transforms.ToTensor()]

    if normalize:
        # Placeholder statistics. For transfer learning use ImageNet stats;
        # for training from scratch compute the dataset mean/std instead.
        pipeline.append(transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)))

    # Both splits share the same steps, each wrapped in its own Compose.
    train_set = GTSRBDataset(root=root, split="train", transform=transforms.Compose(pipeline))
    test_set = GTSRBDataset(root=root, split="test", transform=transforms.Compose(pipeline))

    shared = {"num_workers": num_workers, "pin_memory": torch.cuda.is_available()}

    return (
        DataLoader(train_set, batch_size=batch_size, shuffle=True, **shared),
        DataLoader(test_set, batch_size=batch_size * 2, shuffle=False, **shared),
    )

In [4]:
# Smoke test: build both loaders at 64x64 without normalization.
train_loader, test_loader = get_gtsrb_dataloaders(
    root="./data/GTSRB",
    img_size=(64, 64),
    batch_size=128,
    num_workers=0,
    normalize=False,
)

# Pull one batch per split and print shapes plus the value range
# (normalize=False, so pixel values should stay within [0, 1]).
x, y = next(iter(train_loader))
print("TRAIN:", x.shape, y.shape, float(x.min()), float(x.max()))

# The test loader is built with batch_size * 2, hence the larger batch.
x2, y2 = next(iter(test_loader))
print("TEST :", x2.shape, y2.shape, float(x2.min()), float(x2.max()))

TRAIN: torch.Size([128, 3, 64, 64]) torch.Size([128]) 0.0 1.0
TEST : torch.Size([256, 3, 64, 64]) torch.Size([256]) 0.0 1.0


In [11]:
# load_all_gtsrb.py
# Iterates over ALL GTSRB images (train + test) through the DataLoaders; time the call (e.g. with %%time) to see the runtime.

import torch

# assumes get_gtsrb_dataloaders is already defined (see the gtsrb_loader.py cell earlier in this notebook)
# from gtsrb_loader import get_gtsrb_dataloaders

def load_all_gtsrb(root="./data/GTSRB", img_size=(64, 64), batch_size=256, num_workers=0):
    """
    Run one full pass over the train and test loaders and print how many
    images each produced. Nothing is kept; every batch is only counted.

    Args:
        root: dataset root forwarded to get_gtsrb_dataloaders.
        img_size: resize target forwarded to get_gtsrb_dataloaders.
        batch_size: train batch size forwarded to get_gtsrb_dataloaders.
        num_workers: worker count forwarded to get_gtsrb_dataloaders.
    """
    loaders = get_gtsrb_dataloaders(
        root=root,
        img_size=img_size,
        batch_size=batch_size,
        num_workers=num_workers,
        normalize=False,
    )
    train_loader, test_loader = loaders

    # Each batch is (images, labels); images.size(0) is the batch length.
    train_total = sum(images.size(0) for images, _ in train_loader)
    print("Loaded train images:", train_total)

    test_total = sum(images.size(0) for images, _ in test_loader)
    print("Loaded test images :", test_total)


# Full pass over both splits; prints the number of images seen per split.
load_all_gtsrb(
    root="./data/GTSRB",   # adjust if needed
    img_size=(64, 64),     # try (32,32), (64,64), (224,224)
    batch_size=256,
    num_workers=0,
)


Loaded train images: 39209
Loaded test images : 12630


In [7]:
from pathlib import Path
import torch
from datetime import datetime

# assumes you have get_gtsrb_dataloaders available
# from gtsrb_loader import get_gtsrb_dataloaders

def _cache_split(loader, out_x: Path, out_y: Path):
    xs, ys = [], []
    for xb, yb in loader:
        xs.append(xb.cpu())
        ys.append(yb.cpu())
    X = torch.cat(xs, dim=0)
    y = torch.cat(ys, dim=0)
    torch.save(X, out_x)
    torch.save(y, out_y)
    return X.shape[0]

def build_gtsrb_cache(
    root: str = "./data/GTSRB",
    img_size=(32, 32),
    batch_size: int = 256,
    num_workers: int = 0,
    cache_root: str = "./cache_gtsrb",
):
    """
    Materialise both GTSRB splits as tensors on disk for one image size.

    Writes X_train.pt, y_train.pt, X_test.pt, y_test.pt and meta.pt into
    ``<cache_root>/gtsrb_<H>x<W>`` (created if missing), then prints the
    location and the metadata.

    Args:
        root: dataset root forwarded to get_gtsrb_dataloaders.
        img_size: two-element size; also determines the cache folder name.
        batch_size: batch size used while reading the data.
        num_workers: worker count used while reading the data.
        cache_root: parent directory for all cache variants.
    """
    # One sub-folder per image-size variant.
    target_dir = Path(cache_root) / f"gtsrb_{img_size[0]}x{img_size[1]}"
    target_dir.mkdir(parents=True, exist_ok=True)

    # Cache WITHOUT normalization, so it can be applied at training time.
    train_loader, test_loader = get_gtsrb_dataloaders(
        root=root,
        img_size=img_size,
        batch_size=batch_size,
        num_workers=num_workers,
        normalize=False,
    )

    outputs = (
        (train_loader, "X_train.pt", "y_train.pt"),
        (test_loader, "X_test.pt", "y_test.pt"),
    )
    # Train is written first, then test.
    n_train, n_test = [
        _cache_split(loader, target_dir / x_name, target_dir / y_name)
        for loader, x_name, y_name in outputs
    ]

    meta = dict(
        dataset="GTSRB",
        root=str(root),
        img_size=tuple(img_size),
        cached_normalized=False,
        dtype="float32 in [0,1]",
        n_train=int(n_train),
        n_test=int(n_test),
        created_at=datetime.now().isoformat(timespec="seconds"),
    )
    torch.save(meta, target_dir / "meta.pt")

    print(f"Saved cache to: {target_dir}")
    print(f"Meta: {meta}")


In [None]:
# Build the 32x32 cache variant (all other arguments keep their defaults).
build_gtsrb_cache(img_size=(32, 32))

Saved cache to: cache_gtsrb\gtsrb_32x32
Meta: {'dataset': 'GTSRB', 'root': './data/GTSRB', 'img_size': (32, 32), 'cached_normalized': False, 'dtype': 'float32 in [0,1]', 'n_train': 39209, 'n_test': 12630, 'created_at': '2026-01-24T13:26:53'}


In [9]:
# Build the 64x64 cache variant (all other arguments keep their defaults).
build_gtsrb_cache(img_size=(64, 64))

Saved cache to: cache_gtsrb\gtsrb_64x64
Meta: {'dataset': 'GTSRB', 'root': './data/GTSRB', 'img_size': (64, 64), 'cached_normalized': False, 'dtype': 'float32 in [0,1]', 'n_train': 39209, 'n_test': 12630, 'created_at': '2026-01-24T13:27:25'}


In [12]:
from pathlib import Path
# Sanity check: confirm the 64x64 train tensor file exists and print its size in bytes.
p = Path("./cache_gtsrb/gtsrb_64x64/X_train.pt")
print(p.exists(), p.stat().st_size)


True 1927202345
