# MNIST Classification with Feedforward Neural Network in Keras

### 1. Data Loading and Preprocessing

In [1]:
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.utils import to_categorical

In [2]:
# Fetch the MNIST train/test splits (images plus integer digit labels)
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Cast the images to float32 and divide by 255 so pixel values lie in [0, 1]
x_train, x_test = (
    images.astype("float32") / 255.0 for images in (x_train, x_test)
)

# Turn the integer labels into one-hot vectors over the 10 digit classes
y_train, y_test = (
    to_categorical(labels, 10) for labels in (y_train, y_test)
)

### 2. Baseline Model with ReLU + Adam + Softmax

In [3]:
def build_model(activation="relu", optimizer="adam", hidden_layers=2):
    """Build and compile a feedforward classifier for 28x28 inputs.

    Args:
        activation: Activation used by every hidden ``Dense`` layer.
        optimizer: Optimizer (name or instance) passed to ``model.compile``.
        hidden_layers: Number of hidden layers. ``1`` gives a single
            128-unit layer; ``2`` adds a 64-unit layer after it.

    Returns:
        A compiled ``Sequential`` model with a 10-unit softmax output,
        categorical cross-entropy loss and an accuracy metric.

    Raises:
        ValueError: If ``hidden_layers`` is neither 1 nor 2.
    """
    # Fail loudly: any value other than 2 used to fall through silently to a
    # one-hidden-layer network, which made typos in experiments invisible.
    if hidden_layers not in (1, 2):
        raise ValueError(
            f"hidden_layers must be 1 or 2, got {hidden_layers!r}"
        )

    model = Sequential()
    # Declare the input with an explicit Input object instead of passing
    # `input_shape` to the first layer, which recent Keras versions warn about.
    model.add(keras.Input(shape=(28, 28)))
    model.add(Flatten())

    # First hidden layer
    model.add(Dense(128, activation=activation))

    if hidden_layers == 2:
        model.add(Dense(64, activation=activation))  # Second hidden layer

    model.add(Dense(10, activation="softmax"))  # Output layer
    model.compile(optimizer=optimizer,
                  loss="categorical_crossentropy",
                  metrics=["accuracy"])
    return model

# Construct the default network (ReLU hidden layers, Adam, softmax output)
model = build_model()
model.summary()

# Fit for 5 epochs, reserving 10% of the training data for validation
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=5,
    validation_split=0.1,
)

# Score the trained network on the test split and report its accuracy
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")

  super().__init__(**kwargs)


Epoch 1/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.9237 - loss: 0.2614 - val_accuracy: 0.9677 - val_loss: 0.1141
Epoch 2/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.9663 - loss: 0.1094 - val_accuracy: 0.9722 - val_loss: 0.0896
Epoch 3/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.9767 - loss: 0.0746 - val_accuracy: 0.9752 - val_loss: 0.0930
Epoch 4/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.9820 - loss: 0.0562 - val_accuracy: 0.9783 - val_loss: 0.0769
Epoch 5/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.9859 - loss: 0.0433 - val_accuracy: 0.9767 - val_loss: 0.0882
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9735 - loss: 0.0866
Test Accuracy: 0.9735


### 3. Experiment 1: Sigmoid vs ReLU
> Goal: Understand the effect of using **sigmoid** instead of **relu** in hidden layers.

In [4]:
# Re-seed before each build so both networks start from the same random
# state; the hidden activation is then the only intended difference.
SEED = 42

# Sigmoid model
keras.utils.set_random_seed(SEED)
sigmoid_model = build_model(activation="sigmoid", optimizer="adam")
sigmoid_model.fit(x_train, y_train, epochs=5, batch_size=32, validation_split=0.1)
_, sigmoid_acc = sigmoid_model.evaluate(x_test, y_test)

# ReLU model (again for comparison)
keras.utils.set_random_seed(SEED)
relu_model = build_model(activation="relu", optimizer="adam")
relu_model.fit(x_train, y_train, epochs=5, batch_size=32, validation_split=0.1)
_, relu_acc = relu_model.evaluate(x_test, y_test)

print(f"ReLU Accuracy: {relu_acc:.4f}")
print(f"Sigmoid Accuracy: {sigmoid_acc:.4f}")

Epoch 1/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.8731 - loss: 0.5000 - val_accuracy: 0.9507 - val_loss: 0.1802
Epoch 2/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.9458 - loss: 0.1851 - val_accuracy: 0.9638 - val_loss: 0.1316
Epoch 3/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.9620 - loss: 0.1286 - val_accuracy: 0.9707 - val_loss: 0.0992
Epoch 4/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.9721 - loss: 0.0958 - val_accuracy: 0.9723 - val_loss: 0.0875
Epoch 5/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.9779 - loss: 0.0737 - val_accuracy: 0.9745 - val_loss: 0.0833
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9729 - loss: 0.0854
Epoch 1/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0

#### Observation:
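- ReLU usually trains faster and reaches higher accuracy because it does not saturate for positive inputs, which mitigates vanishing gradients.
- Sigmoid may lead to slower convergence: in the runs above, the sigmoid network reached 0.8731 training accuracy after epoch 1, versus 0.9237 for the ReLU baseline.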

### 4. Experiment 2: Softmax vs Other Output Activations
> Goal: Compare **softmax** with **sigmoid** in output layer.

In [5]:
# Incorrect output activation for multi-class: sigmoid
def build_bad_output_model():
    """Build and compile the 128-64 ReLU network with a sigmoid output layer.

    The output layer deliberately uses ``sigmoid`` instead of ``softmax`` so
    the two output activations can be compared. The model is compiled with
    Adam, categorical cross-entropy loss and an accuracy metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    # Declare the input with an explicit Input object instead of passing
    # `input_shape` to the first layer, which recent Keras versions warn about.
    model.add(keras.Input(shape=(28, 28)))
    model.add(Flatten())
    model.add(Dense(128, activation="relu"))
    model.add(Dense(64, activation="relu"))
    model.add(Dense(10, activation="sigmoid"))  # Not ideal for multi-class!
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    return model

# Train the sigmoid-output variant with the same settings as the other runs
bad_output_model = build_bad_output_model()
bad_output_model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=5,
    validation_split=0.1,
)
_, bad_acc = bad_output_model.evaluate(x_test, y_test)

# relu_acc is the test accuracy of the ReLU/softmax model from Experiment 1
print(f"Softmax Accuracy (from earlier): {relu_acc:.4f}")
print(f"Sigmoid Output Accuracy: {bad_acc:.4f}")

Epoch 1/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.9250 - loss: 0.2594 - val_accuracy: 0.9620 - val_loss: 0.1274
Epoch 2/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.9666 - loss: 0.1092 - val_accuracy: 0.9668 - val_loss: 0.1211
Epoch 3/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.9758 - loss: 0.0768 - val_accuracy: 0.9745 - val_loss: 0.0852
Epoch 4/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.9815 - loss: 0.0585 - val_accuracy: 0.9755 - val_loss: 0.0828
Epoch 5/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.9858 - loss: 0.0450 - val_accuracy: 0.9800 - val_loss: 0.0743
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9770 - loss: 0.0728
Softmax Accuracy (from earlier): 0.9782
Sigmoid Output Accuracy: 0.9770


#### Observation:

- Softmax outputs probabilities that sum to 1, ideal for categorical classification.
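- Sigmoid outputs an independent probability per class, so the outputs need not sum to 1 and do not form a proper distribution for single-label (one-hot) multi-class targets. In this run the accuracy gap was nevertheless small (0.9782 with softmax vs 0.9770 with sigmoid), because accuracy only depends on which output is largest.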

### 5. Experiment 3: Depth: 1 vs 2 Hidden Layers

In [6]:
# Re-seed before each build so both networks start from the same random
# state; network depth is then the only intended difference.
SEED = 42

# 1 hidden layer
keras.utils.set_random_seed(SEED)
model_1hl = build_model(hidden_layers=1)
model_1hl.fit(x_train, y_train, epochs=5, batch_size=32, validation_split=0.1)
_, acc_1hl = model_1hl.evaluate(x_test, y_test)

# 2 hidden layers
keras.utils.set_random_seed(SEED)
model_2hl = build_model(hidden_layers=2)
model_2hl.fit(x_train, y_train, epochs=5, batch_size=32, validation_split=0.1)
_, acc_2hl = model_2hl.evaluate(x_test, y_test)

print(f"1 Hidden Layer Accuracy: {acc_1hl:.4f}")
print(f"2 Hidden Layers Accuracy: {acc_2hl:.4f}")

Epoch 1/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.9204 - loss: 0.2766 - val_accuracy: 0.9665 - val_loss: 0.1271
Epoch 2/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.9652 - loss: 0.1190 - val_accuracy: 0.9740 - val_loss: 0.0943
Epoch 3/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - accuracy: 0.9757 - loss: 0.0811 - val_accuracy: 0.9730 - val_loss: 0.0905
Epoch 4/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.9818 - loss: 0.0600 - val_accuracy: 0.9770 - val_loss: 0.0838
Epoch 5/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step - accuracy: 0.9855 - loss: 0.0468 - val_accuracy: 0.9772 - val_loss: 0.0828
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9762 - loss: 0.0763
Epoch 1/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s

#### Observation:
> Deeper networks can capture more complex patterns, but they may overfit or train more slowly without enough data or regularization.

### 6. Experiment 4: Optimizer Comparison (adam, sgd, rmsprop)

In [7]:
# Every optimizer is run from the same seed so the runs start from the same
# random state and differences are attributable to the optimizer.
SEED = 42

optimizers = ["adam", "sgd", "rmsprop"]
results = {}  # optimizer name -> test accuracy

for opt in optimizers:
    print(f"\nTraining with optimizer: {opt}")
    keras.utils.set_random_seed(SEED)
    model = build_model(activation="relu", optimizer=opt)
    model.fit(x_train, y_train, epochs=5, batch_size=32, validation_split=0.1)
    _, acc = model.evaluate(x_test, y_test)
    results[opt] = acc

# Summarize all runs together once training has finished
for opt, acc in results.items():
    print(f"{opt.upper()} Test Accuracy: {acc:.4f}")


Training with optimizer: adam
Epoch 1/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9249 - loss: 0.2558 - val_accuracy: 0.9623 - val_loss: 0.1166
Epoch 2/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.9676 - loss: 0.1063 - val_accuracy: 0.9700 - val_loss: 0.0973
Epoch 3/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.9770 - loss: 0.0739 - val_accuracy: 0.9732 - val_loss: 0.0910
Epoch 4/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.9817 - loss: 0.0561 - val_accuracy: 0.9763 - val_loss: 0.0840
Epoch 5/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.9858 - loss: 0.0441 - val_accuracy: 0.9730 - val_loss: 0.0886
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9749 - loss: 0.0863

Training with optimizer: sgd
Epoch 1/5
[1

#### Observation:
- Adam usually performs best out of the box.
- SGD might converge slowly but can be improved with tuning (like learning rate schedules).
- RMSprop can work well on some datasets, especially with recurrent networks.