# Data Processing for Simpsons-MNIST
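This notebook loads the Simpsons-MNIST dataset, scales pixel values to the range [0, 1], flattens each image into a 1-D vector, and splits the training images into stratified train/validation sets. The test set is read as-is from the dataset's own `test` folder.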

In [1]:
import os
from typing import Tuple, List

import numpy as np
from PIL import Image


In [2]:
def _load_images_from_folder(folder: str,
                             mode: str,
                             extensions: Tuple[str, ...] = ('.jpg',)) -> np.ndarray:
    """Load every matching image in ``folder`` as a flattened float32 row.

    Files are visited in sorted filename order so the row order is
    deterministic. Each image is converted to 8-bit grayscale (PIL mode
    ``'L'``) when ``mode == 'grayscale'`` and to ``'RGB'`` for any other
    value of ``mode``, divided by 255 and flattened to one dimension.

    Args:
        folder: Directory whose files are scanned (not recursive).
        mode: ``'grayscale'`` for single-channel images; any other value
            yields three-channel RGB.
        extensions: Filename suffixes to accept, matched
            case-insensitively. A single string is treated as one suffix.
            Defaults to ``('.jpg',)``.

    Returns:
        Array of shape ``(n_images, values_per_image)`` and dtype float32.

    Raises:
        ValueError: If no file in ``folder`` matches ``extensions``, or
            (from ``np.stack``) if the images do not all flatten to the
            same length.
    """
    if isinstance(extensions, str):
        # A bare string would otherwise be iterated character by character.
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)
    # The target PIL mode does not change per file, so pick it once.
    pil_mode = 'L' if mode == 'grayscale' else 'RGB'

    images: List[np.ndarray] = []
    for file in sorted(os.listdir(folder)):
        if not file.lower().endswith(suffixes):
            continue
        path = os.path.join(folder, file)
        # Image.open is lazy; the context manager guarantees the underlying
        # file handle is released even if conversion fails. The array is
        # materialised inside the block, before the source image is closed.
        with Image.open(path) as img:
            arr = np.asarray(img.convert(pil_mode), dtype=np.float32) / 255.0
        images.append(arr.flatten())

    if not images:
        # np.stack([]) would raise a ValueError that does not say which
        # folder was empty; fail with the same exception type but a
        # message that points at the problem.
        raise ValueError(
            f"No image files with extensions {suffixes!r} found in {folder!r}"
        )
    return np.stack(images, axis=0)


def load_simpsons_mnist(base_dir: str,
                        mode: str = 'rgb',
                        val_ratio: float = 0.2,
                        seed: int = 42
                        ) -> Tuple[Tuple[np.ndarray, np.ndarray],
                                   Tuple[np.ndarray, np.ndarray],
                                   Tuple[np.ndarray, np.ndarray],
                                   List[str]]:
    """Load Simpsons-MNIST and carve a stratified validation set out of train.

    Expects the layout ``<base_dir>/<mode>/{train,test}/<class>/<image>``.
    Class names are the sub-directories of the train folder (hidden entries
    skipped), sorted alphabetically; a class's integer label is its position
    in that sorted list. Images are loaded with ``_load_images_from_folder``.

    The train images of each class are shuffled independently and the last
    ``val_ratio`` fraction of each class goes to the validation set, so both
    sets keep the per-class proportions. Returned train/validation rows are
    grouped by class (not shuffled across classes). The test set is returned
    exactly as loaded from the test folder.

    Args:
        base_dir: Dataset root containing the ``<mode>`` sub-directory.
        mode: Sub-directory name, also forwarded to the image loader
            (``'grayscale'`` selects single-channel loading).
        val_ratio: Fraction of each class's train images to hold out for
            validation; must lie in ``[0, 1]``.
        seed: Seed for the NumPy generator that shuffles each class.

    Returns:
        ``(X_train, y_train), (X_val, y_val), (X_test, y_test), classes``
        where each ``X`` has one flattened image per row, each ``y`` is an
        int32 label vector, and ``classes`` is the list of class names.

    Raises:
        ValueError: If ``val_ratio`` is outside ``[0, 1]`` or the train
            folder contains no class directories.
    """
    # Validate before touching the filesystem so bad arguments fail fast.
    # (The chained comparison is also False for NaN.)
    if not 0.0 <= val_ratio <= 1.0:
        raise ValueError(f"val_ratio must be in [0, 1], got {val_ratio!r}")

    rng = np.random.default_rng(seed)
    train_dir = os.path.join(base_dir, mode, 'train')
    test_dir = os.path.join(base_dir, mode, 'test')

    # Only directories are classes; a stray file (README, .csv, ...) in the
    # train folder would otherwise be handed to os.listdir and crash.
    classes = sorted(
        d for d in os.listdir(train_dir)
        if not d.startswith('.') and os.path.isdir(os.path.join(train_dir, d))
    )
    if not classes:
        raise ValueError(f"No class folders found in {train_dir!r}")

    train_data, train_labels, test_data, test_labels = [], [], [], []
    for label, cls in enumerate(classes):
        imgs_train = _load_images_from_folder(os.path.join(train_dir, cls), mode)
        imgs_test = _load_images_from_folder(os.path.join(test_dir, cls), mode)
        train_data.append(imgs_train)
        train_labels.append(np.full(imgs_train.shape[0], label, dtype=np.int32))
        test_data.append(imgs_test)
        test_labels.append(np.full(imgs_test.shape[0], label, dtype=np.int32))

    X = np.vstack(train_data)
    y = np.concatenate(train_labels)
    X_test = np.vstack(test_data)
    y_test = np.concatenate(test_labels)

    # Tolerance added before truncation: 1 - val_ratio is not exact in
    # binary floating point (e.g. 10 * (1 - 0.9) == 0.9999999999999998), and
    # a bare int() would silently drop one training sample per class.
    eps = 1e-9
    train_parts, val_parts = [], []
    for label in np.unique(y):
        idx = np.where(y == label)[0]
        rng.shuffle(idx)
        split = int(len(idx) * (1 - val_ratio) + eps)
        train_parts.append(idx[:split])
        val_parts.append(idx[split:])
    train_indices = np.concatenate(train_parts)
    val_indices = np.concatenate(val_parts)

    X_train, y_train = X[train_indices], y[train_indices]
    X_val, y_val = X[val_indices], y[val_indices]
    return (X_train, y_train), (X_val, y_val), (X_test, y_test), classes


In [3]:
# Load the grayscale variant and hold out 10% of each class's training
# images for validation; the splits are unpacked straight into the
# notebook-level names.
(train_X, train_y), (val_X, val_y), (test_X, test_y), classes = load_simpsons_mnist(
    'simpsons-mnist-0.1-rgb/dataset',
    mode='grayscale',
    val_ratio=0.1,
)

# Sanity check: report how many samples (rows) and features (columns)
# each split contains, and how many classes were discovered.
print(f"Train set shape: {train_X.shape}")
print(f"Validation set shape: {val_X.shape}")
print(f"Test set shape: {test_X.shape}")
print(f"Number of classes: {len(classes)}")


Train set shape: (7200, 784)
Validation set shape: (800, 784)
Test set shape: (2000, 784)
Number of classes: 10
