# Module 2 — Experiment 1: Regularization & Batch Normalization (MNIST)

**Question (L3 – Apply):**  
Implement and compare the effects of **L2 regularization**, **Dropout**, and **Batch Normalization** in a deep neural network. Train on **MNIST**, and analyze how these techniques impact **overfitting**, **training stability**, and **generalization**.

**Learning Targets:** Regularization, Dropout, BatchNorm, Overfitting/Generalization, Validation curves.


In [None]:
# Imports & data
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# Load the MNIST train/test splits shipped with Keras.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Scale pixels by 1/255 as float32 and append a trailing channel axis,
# which the Conv layers expect.
x_train, x_test = (
    np.expand_dims(images.astype('float32') / 255.0, axis=-1)
    for images in (x_train, x_test)
)

# Sanity check of the resulting array shapes.
print('Train:', x_train.shape, y_train.shape, '| Test:', x_test.shape, y_test.shape)

In [None]:
# Common utility: plot history
def plot_histories(histories, metric='val_accuracy', title='Validation Accuracy'):
    """Draw one curve per run for a single recorded metric and show the figure.

    Args:
        histories: Mapping from run label to an object exposing a
            ``history`` dict (e.g. the value returned by ``model.fit``);
            ``history[metric]`` is the per-epoch series that gets plotted.
        metric: Key looked up in every ``history`` dict.
        title: Text placed above the axes.
    """
    # 'val_accuracy' -> 'Val Accuracy', used as the y-axis label.
    y_label = metric.replace('_', ' ').title()

    _, axes = plt.subplots()
    for run_name, run in histories.items():
        axes.plot(run.history[metric], label=run_name)

    axes.set(xlabel='Epoch', ylabel=y_label, title=title)
    axes.legend()
    plt.show()

In [None]:
# Build model variants
from tensorflow.keras import layers, models, regularizers

def make_baseline():
    """Build and compile the unregularized reference CNN.

    Architecture: two Conv2D (3x3, ReLU) + MaxPooling2D stages with 32 and
    64 filters, then Flatten, Dense(128, ReLU) and a 10-unit softmax output.

    Returns:
        A compiled Sequential model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric) for 28x28x1 inputs.
    """
    m = models.Sequential([
        # Explicit Input layer instead of `input_shape=` on the first layer;
        # recent Keras releases warn about the latter.
        layers.Input(shape=(28, 28, 1)),
        layers.Conv2D(32, 3, activation='relu'),
        layers.MaxPooling2D(),
        layers.Conv2D(64, 3, activation='relu'),
        layers.MaxPooling2D(),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(10, activation='softmax'),
    ])
    m.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return m

def make_l2(l2=1e-4):
    """Build and compile the CNN with an L2 penalty on its hidden kernels.

    Same layer stack as the baseline, but both Conv2D layers and the
    Dense(128) layer carry an L2 kernel regularizer. The softmax output
    layer is left unregularized, and only kernels (not biases) are penalized.

    Args:
        l2: L2 regularization factor passed to ``regularizers.l2``.

    Returns:
        A compiled Sequential model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric) for 28x28x1 inputs.
    """
    # Bind the regularizer once; the name `l2` is the numeric factor here.
    penalty = regularizers.l2(l2)
    m = models.Sequential([
        # Explicit Input layer instead of `input_shape=` on the first layer;
        # recent Keras releases warn about the latter.
        layers.Input(shape=(28, 28, 1)),
        layers.Conv2D(32, 3, activation='relu', kernel_regularizer=penalty),
        layers.MaxPooling2D(),
        layers.Conv2D(64, 3, activation='relu', kernel_regularizer=penalty),
        layers.MaxPooling2D(),
        layers.Flatten(),
        layers.Dense(128, activation='relu', kernel_regularizer=penalty),
        layers.Dense(10, activation='softmax'),
    ])
    m.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return m

def make_dropout(p=0.5):
    """Build and compile the CNN with Dropout after each stage.

    Same layer stack as the baseline, with a Dropout layer inserted after
    each MaxPooling2D and after the Dense(128) layer.

    Args:
        p: Dropout rate used by all three Dropout layers.

    Returns:
        A compiled Sequential model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric) for 28x28x1 inputs.
    """
    m = models.Sequential([
        # Explicit Input layer instead of `input_shape=` on the first layer;
        # recent Keras releases warn about the latter.
        layers.Input(shape=(28, 28, 1)),
        layers.Conv2D(32, 3, activation='relu'),
        layers.MaxPooling2D(),
        layers.Dropout(p),
        layers.Conv2D(64, 3, activation='relu'),
        layers.MaxPooling2D(),
        layers.Dropout(p),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(p),
        layers.Dense(10, activation='softmax'),
    ])
    m.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return m

def make_batchnorm():
    """Build and compile the CNN with Batch Normalization.

    Same layer stack as the baseline, except that each Conv2D / Dense(128)
    layer is created without an activation and is followed by
    BatchNormalization and then a separate ReLU activation, so
    normalization is applied before the non-linearity.

    Returns:
        A compiled Sequential model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric) for 28x28x1 inputs.
    """
    m = models.Sequential([
        # Explicit Input layer instead of `input_shape=` on the first layer;
        # recent Keras releases warn about the latter.
        layers.Input(shape=(28, 28, 1)),
        layers.Conv2D(32, 3),
        layers.BatchNormalization(),
        layers.Activation('relu'),
        layers.MaxPooling2D(),
        layers.Conv2D(64, 3),
        layers.BatchNormalization(),
        layers.Activation('relu'),
        layers.MaxPooling2D(),
        layers.Flatten(),
        layers.Dense(128),
        layers.BatchNormalization(),
        layers.Activation('relu'),
        layers.Dense(10, activation='softmax'),
    ])
    m.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return m

In [None]:
# Train models (only a few epochs here; increase for better separation)
EPOCHS = 5
BATCH = 128
SEED = 42  # shared seed so the variants are compared under the same randomness
hist = {}  # variant name -> History object returned by model.fit
for name, maker in [
    ('baseline', make_baseline),
    ('l2', make_l2),
    ('dropout', make_dropout),
    ('batchnorm', make_batchnorm)
]:
    # Re-seed before building each variant so that differences between the
    # curves come from the regularization technique rather than from a lucky
    # or unlucky initialisation/shuffle, and so that reruns are repeatable.
    # NOTE: some GPU kernels may still be non-deterministic.
    tf.keras.utils.set_random_seed(SEED)
    model = maker()
    # validation_split holds out 20% of the training data for the
    # per-epoch val_* metrics that are plotted afterwards.
    h = model.fit(x_train, y_train, validation_split=0.2, epochs=EPOCHS, batch_size=BATCH, verbose=1)
    hist[name] = h
    # evaluate() returns [loss, accuracy] for the metrics the models are compiled with.
    test_loss, test_acc = model.evaluate(x_test, y_test, verbose=0)
    print(name, 'test acc:', test_acc)

In [None]:
# Visualize validation accuracy & loss: one figure per metric, one curve per variant
for metric_key, heading in (
    ('val_accuracy', 'Validation Accuracy (Regularization Variants)'),
    ('val_loss', 'Validation Loss (Regularization Variants)'),
):
    plot_histories(hist, metric_key, heading)

### Result & Inference (to be written by student)
- Which variant overfit least and why?
- Which variant trained fastest or most stably?
- Summarize the impact of L2, Dropout, and BatchNorm on generalization.
