# Module 2 — Experiment 2: Optimizers & Learning Rate Schedules (Fashion‑MNIST)

**Question (L3 – Apply):**  
Apply and compare **SGD**, **SGD+Momentum**, **RMSProp**, and **Adam**, along with a **learning‑rate decay** strategy. Use **Fashion‑MNIST** and evaluate **convergence speed**, **stability**, and **accuracy**.

**Learning Targets:** Mini‑batch GD, Momentum, RMSProp, Adam, LR decay, awareness of vanishing/exploding gradients.


In [None]:
# Imports & data
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()

# Cast to float32, divide by 255, and append a trailing channel axis so the
# images match the (28, 28, 1) input shape the CNN is built with.
x_train = np.expand_dims(x_train.astype('float32') / 255.0, axis=-1)
x_test = np.expand_dims(x_test.astype('float32') / 255.0, axis=-1)

print('Train:', x_train.shape, y_train.shape, '| Test:', x_test.shape, y_test.shape)

In [None]:
# Model definition (same backbone for fairness)
from tensorflow.keras import layers, models

def make_cnn():
    """Build the small CNN used as the common backbone for every run.

    Layers, in order: Conv2D(32, 3x3, ReLU) -> MaxPooling2D ->
    Conv2D(64, 3x3, ReLU) -> MaxPooling2D -> Flatten -> Dense(128, ReLU) ->
    Dense(10, softmax). The input is a 28x28 single-channel image.

    Returns:
        A new, uncompiled ``Sequential`` model; each call creates a fresh
        instance so runs do not share weights.
    """
    return models.Sequential([
        # Declare the input with an explicit Input layer instead of passing
        # `input_shape=` to the first Conv2D; this is the form Keras
        # recommends for Sequential models and avoids the warning newer
        # Keras versions emit for the kwarg.
        layers.Input(shape=(28, 28, 1)),
        layers.Conv2D(32, 3, activation='relu'),
        layers.MaxPooling2D(),
        layers.Conv2D(64, 3, activation='relu'),
        layers.MaxPooling2D(),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(10, activation='softmax'),
    ])

# Optimizer factory: `lr` is forwarded as-is, so it can be a plain float or a
# learning-rate schedule object (e.g. the ExponentialDecay built by the
# training utility).
def make_optimizer(name='sgd', lr=0.01, momentum=0.9):
    """Create the Keras optimizer selected by ``name``.

    Args:
        name: One of ``'sgd'``, ``'momentum'``, ``'rmsprop'``, ``'adam'``.
        lr: Value passed to the optimizer as ``learning_rate``.
        momentum: Momentum coefficient; used only when ``name`` is
            ``'momentum'`` and ignored for every other optimizer.

    Returns:
        A new ``tf.keras.optimizers`` instance.

    Raises:
        ValueError: If ``name`` is not one of the supported optimizer names.
    """
    # Lambdas defer construction so only the requested optimizer is built.
    builders = {
        'sgd': lambda: tf.keras.optimizers.SGD(learning_rate=lr),
        'momentum': lambda: tf.keras.optimizers.SGD(learning_rate=lr, momentum=momentum),
        'rmsprop': lambda: tf.keras.optimizers.RMSprop(learning_rate=lr, rho=0.9),
        'adam': lambda: tf.keras.optimizers.Adam(learning_rate=lr),
    }
    try:
        build = builders[name]
    except (KeyError, TypeError):
        # TypeError covers unhashable names so every bad input still surfaces
        # as ValueError, as it did with the original equality checks.
        raise ValueError(
            f"Unknown optimizer {name!r}; expected one of: {', '.join(builders)}"
        ) from None
    return build()

In [None]:
# Training utility with optional LR scheduler
def train_with_opt(opt_name, base_lr=0.01, epochs=5, batch=128, use_exp_decay=False,
                   decay_steps=200, decay_rate=0.96):
    """Train a fresh CNN with the chosen optimizer and report test accuracy.

    Uses the module-level ``x_train``/``y_train`` for fitting (20% of it is
    held out for validation via ``validation_split``) and
    ``x_test``/``y_test`` for the final evaluation.

    Args:
        opt_name: Optimizer name understood by ``make_optimizer``.
        base_lr: Constant learning rate, or the initial learning rate when
            ``use_exp_decay`` is true.
        epochs: Number of training epochs.
        batch: Mini-batch size.
        use_exp_decay: If true, wrap ``base_lr`` in a staircase
            ``ExponentialDecay`` schedule.
        decay_steps: Number of optimizer steps between decay applications
            (only used when ``use_exp_decay`` is true).
        decay_rate: Multiplicative factor applied every ``decay_steps``
            steps (only used when ``use_exp_decay`` is true).

    Returns:
        Tuple ``(history, test_acc)``: the object returned by ``model.fit``
        and the accuracy measured on the test set.

    Raises:
        ValueError: Propagated from ``make_optimizer`` for an unsupported
            ``opt_name``.
    """
    if use_exp_decay:
        learning_rate = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=base_lr,
            decay_steps=decay_steps,
            decay_rate=decay_rate,
            staircase=True,
        )
    else:
        learning_rate = base_lr

    # Build the optimizer first so an unsupported opt_name fails before any
    # network is constructed.
    optimizer = make_optimizer(opt_name, lr=learning_rate)

    model = make_cnn()
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    h = model.fit(x_train, y_train, validation_split=0.2, epochs=epochs, batch_size=batch, verbose=1)
    # evaluate() returns the loss followed by the compiled metrics; index 1 is
    # the single 'accuracy' metric requested in compile().
    test_acc = model.evaluate(x_test, y_test, verbose=0)[1]
    return h, test_acc

In [None]:
# Run experiments
EPOCHS = 6

# Train one model per optimizer at the same constant learning rate, keeping
# each run's training history and test accuracy keyed by optimizer name.
hist, test_accs = {}, {}
for name in ('sgd', 'momentum', 'rmsprop', 'adam'):
    hist[name], test_accs[name] = train_with_opt(
        name, base_lr=0.01, epochs=EPOCHS, use_exp_decay=False
    )

print('Test accuracies:', {opt: round(score, 4) for opt, score in test_accs.items()})

In [None]:
# Add learning-rate decay with Adam (as an example)
# NOTE: this run starts from base_lr=0.005, whereas the constant-rate Adam run
# in the optimizer sweep uses 0.01, so differences between the two reflect
# both the starting rate and the decay schedule.
h_decay, acc_decay = train_with_opt(
    'adam',
    base_lr=0.005,
    epochs=EPOCHS,
    use_exp_decay=True,
)
# Record the run alongside the baseline results so it appears in the plots.
hist.update({'adam+expdecay': h_decay})
test_accs.update({'adam+expdecay': acc_decay})
print('Adam+ExpDecay test acc:', round(acc_decay, 4))

In [None]:
# Plot validation accuracy and loss for all runs
def plot_histories(histories, metric='val_accuracy', title='Validation Accuracy'):
    """Draw one curve per run for a single recorded training metric.

    Args:
        histories: Mapping of run label to an object whose ``.history`` dict
            holds per-epoch lists of metric values (such as the value
            returned by ``model.fit``).
        metric: Key to read from each ``.history`` dict; also used, with
            underscores replaced by spaces and title-cased, as the y-axis
            label.
        title: Title shown above the plot.

    The curves are plotted against their list index, so the x-axis is the
    zero-based epoch number. The figure is displayed with ``plt.show()``;
    nothing is returned.
    """
    import matplotlib.pyplot as plt

    _, axes = plt.subplots()
    for run_label, run in histories.items():
        axes.plot(run.history[metric], label=run_label)

    axes.set_xlabel('Epoch')
    axes.set_ylabel(metric.replace('_', ' ').title())
    axes.set_title(title)
    axes.legend()
    plt.show()

# One figure per validation metric, each overlaying every recorded run.
plot_histories(hist, metric='val_accuracy', title='Validation Accuracy (Optimizers & LR)')
plot_histories(hist, metric='val_loss', title='Validation Loss (Optimizers & LR)')

### Result & Inference (to be written by student)
- Which optimizer converged fastest and which gave best accuracy?
- Did learning‑rate decay improve stability or final performance?
- Any signs of vanishing/exploding gradients (e.g., unstable/flat loss)?
