# Lab 2 : Training an MLP (Keras)

## 1.Load and preprocess data 

In [1]:
from tensorflow.keras.datasets import mnist

(X_train, y_train), (X_test, y_test) = mnist.load_data()

In [2]:
from tensorflow.keras.utils import to_categorical
# first we do normalization to our data to be in [0,1] and reshape it to 1D array the fed in the mlp model like this:
#X_train = X_train.reshape(-1, 28*28) / 255.0
#X_test = X_test.reshape(-1, 28*28) / 255.0
# but when we will do MLP architecture with keras i will just normilize data witout reshaping to do the reshape with a new function
y_train = to_categorical(y_train, 10) 
y_test = to_categorical(y_test, 10)

## 2. Split the train set into Train and validations sets (20% for validation).

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, 
    test_size=0.2, 
    random_state=42
)

# 3. Design an MLP architecture

In [8]:
import tensorflow as tf
from tensorflow import keras

model = keras.Sequential([
    keras.layers.Flatten(input_shape=(28, 28)),  
    keras.layers.Dense(128, activation='relu'),  
    keras.layers.Dense(64, activation='relu'),   
    keras.layers.Dense(10, activation='softmax') 
])

# 4. Train the designed architecture

In [10]:
import time
# Training configurations
strategies = [
    {'name': 'SGD (batch=1)', 'batch_size': 1, 'optimizer': keras.optimizers.SGD(learning_rate=0.01)},
    {'name': 'Mini-batch SGD (64)', 'batch_size': 64, 'optimizer': keras.optimizers.SGD(learning_rate=0.01)},
    {'name': 'Batch SGD', 'batch_size': len(X_train), 'optimizer': keras.optimizers.SGD(learning_rate=0.01)}
]

# Train and plot learning curves
histories = []
training_times = []

for strategy in strategies:
    model.compile(optimizer=strategy['optimizer'],
                 loss='sparse_categorical_crossentropy',
                 metrics=['accuracy'])
    
    start_time = time.time()
    history = model.fit(
        X_train, y_train,
        batch_size=strategy['batch_size'],
        epochs=50,
        validation_data=(X_val, y_val),
        verbose=0
    )
    training_time = time.time() - start_time
    
    histories.append(history)
    training_times.append(training_time)
    print(f"Completed {strategy['name']} in {training_time:.2f}s")

# Plot learning curves
def plot_curves(history, title):
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Val Loss')
    plt.title(title + ' Loss')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(history.history['accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_accuracy'], label='Val Accuracy')
    plt.title(title + ' Accuracy')
    plt.legend()
    plt.show()

for i, history in enumerate(histories):
    plot_curves(history, strategies[i]['name'])

ValueError: Argument `output` must have rank (ndim) `target.ndim - 1`. Received: target.shape=(1, 10), output.shape=(1, 10)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint

# 1. Load and preprocess MNIST dataset
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Normalize and reshape
X_train = X_train.reshape(-1, 28*28).astype('float32') / 255.0
X_test = X_test.reshape(-1, 28*28).astype('float32') / 255.0
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

# 2. Split train set into train and validation (80-20)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Function to create the MLP model
def create_model(optimizer):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(784,)),
        Dense(64, activation='relu'),
        Dense(10, activation='softmax')
    ])
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# 4. Train with different batch strategies
def train_and_plot(optimizer, batch_size, epochs=50, label=''):
    model = create_model(optimizer)
    start = time.time()
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                        epochs=epochs, batch_size=batch_size, verbose=0)
    duration = time.time() - start
    
    # Plot learning curves
    plt.figure(figsize=(12,4))
    plt.subplot(1,2,1)
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Val Loss')
    plt.title(f'Loss Curves - {label}')
    plt.legend()
    
    plt.subplot(1,2,2)
    plt.plot(history.history['accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_accuracy'], label='Val Accuracy')
    plt.title(f'Accuracy Curves - {label}')
    plt.legend()
    plt.show()
    
    return history, duration

# Point 4: SGD, Mini-batch (64), Batch SGD
sgd_optimizer = SGD(learning_rate=0.01)
hist_sgd, time_sgd = train_and_plot(sgd_optimizer, batch_size=1, label='SGD (batch=1)')
hist_minibatch, time_minibatch = train_and_plot(sgd_optimizer, batch_size=64, label='Mini-batch SGD (64)')
hist_batch, time_batch = train_and_plot(sgd_optimizer, batch_size=len(X_train), label='Batch SGD')

# 5. Compare Mini-batch SGD variants
optimizers = {
    'Mini-batch SGD': SGD(learning_rate=0.01),
    'Mini-batch SGD + Decay': SGD(learning_rate=0.01, decay=1e-6),
    'Mini-batch SGD + Decay + Momentum': SGD(learning_rate=0.01, decay=1e-6, momentum=0.9)
}

histories_decay = {}
times_decay = {}
for name, opt in optimizers.items():
    hist, t = train_and_plot(opt, batch_size=64, label=name)
    histories_decay[name] = hist
    times_decay[name] = t

# 6. Compare different optimizers
optimizers_compare = {
    'SGD (lr=0.01)': SGD(learning_rate=0.01),
    'Adam (lr=0.001)': Adam(learning_rate=0.001),
    'RMSprop (lr=0.001)': RMSprop(learning_rate=0.001)
}

histories_opt = {}
times_opt = {}
for name, opt in optimizers_compare.items():
    hist, t = train_and_plot(opt, batch_size=64, label=name)
    histories_opt[name] = hist
    times_opt[name] = t

# 7. Save the best model based on validation accuracy
best_val_accuracy = max([max(hist.history['val_accuracy']) for hist in histories_opt.values()])
for name, hist in histories_opt.items():
    if max(hist.history['val_accuracy']) == best_val_accuracy:
        model = create_model(optimizers_compare[name])
        model.fit(X_train, y_train, epochs=50, batch_size=64, verbose=0)  # Retrain to get the model
        model.save('best_model.h5')
        break

# Load the best model and evaluate
best_model = load_model('best_model.h5')
test_loss, test_acc = best_model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {test_acc:.4f}")

# Predict on some test samples
sample_images = X_test[:5]
predictions = best_model.predict(sample_images)
predicted_classes = np.argmax(predictions, axis=1)
print("Predicted Classes:", predicted_classes)
print("True Classes:", np.argmax(y_test[:5], axis=1))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
