<a href="https://colab.research.google.com/github/jackiekuen2/notes-handson-ml-tf/blob/master/ch4_ExerciseQ12_EarlyStopping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Q12. Batch Gradient Descent with early stopping for Softmax Regression (without using Scikit-Learn)

## I. Load dataset

In [0]:
from sklearn import datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [39]:
# Load the Iris dataset through scikit-learn and list the fields of the returned object
iris = datasets.load_iris()
list(iris.keys())

['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename']

In [0]:
# Use only columns 2 and 3 of the feature matrix as model inputs
X = iris.data[:, (2, 3)] # petal length, petal width
y = iris.target # class label per instance (used later as an integer column index)

In [0]:
# Prepend a column of ones (x0 = 1) to every instance so the bias term is
# learned as the first row of the parameter matrix
X_with_bias = np.c_[np.ones([len(X), 1]), X]

In [42]:
# Sanity check: the first column of each row should be the constant 1
X_with_bias[:5]

array([[1. , 1.4, 0.2],
       [1. , 1.4, 0.2],
       [1. , 1.3, 0.2],
       [1. , 1.5, 0.2],
       [1. , 1.4, 0.2]])

In [0]:
# Seed NumPy's global RNG so the shuffle (np.random.permutation) and the
# random Theta initializations (np.random.randn) below are reproducible
np.random.seed(42)

## II. Train / validation / test split

In [0]:
total_size = len(X_with_bias)
test_ratio = 0.2
validation_ratio = 0.2

# Hold-out sizes are truncated to whole instances; the training set takes
# whatever is left over
test_size, validation_size = (
    int(total_size * ratio) for ratio in (test_ratio, validation_ratio)
)
train_size = total_size - (test_size + validation_size)

In [0]:
# Shuffle the dataset
rnd_indices = np.random.permutation(total_size)

# Slice with explicit, positive boundaries. Negative-index slices such as
# [train_size:-test_size] and [-test_size:] silently misbehave when
# test_size == 0 (-0 == 0: the validation set becomes empty and the "test"
# set becomes the whole dataset).
valid_end = train_size + validation_size
train_idx = rnd_indices[:train_size]
valid_idx = rnd_indices[train_size:valid_end]
test_idx = rnd_indices[valid_end:]

X_train = X_with_bias[train_idx]
y_train = y[train_idx]
X_valid = X_with_bias[valid_idx]
y_valid = y[valid_idx]
X_test = X_with_bias[test_idx]
y_test = y[test_idx]

# The three subsets must partition the shuffled data exactly
assert len(X_train) + len(X_valid) + len(X_test) == total_size

In [0]:
# Convert integer class labels to a one-hot encoded matrix
def to_one_hot(y, n_classes=None):
    """Return the one-hot encoding of the integer label vector `y`.

    Parameters
    ----------
    y : array-like of int, shape (m,)
        Class indices, used directly as column indices (so they start at 0).
    n_classes : int, optional
        Number of columns of the result. Defaults to ``y.max() + 1``.
        Pass it explicitly when `y` may not contain the highest class
        (e.g. a small validation/test split) so that every split gets a
        matrix of the same width.

    Returns
    -------
    ndarray of float, shape (m, n_classes)
        Row i holds a single 1 in column ``y[i]`` and 0 everywhere else.

    Raises
    ------
    ValueError
        If `y` is empty and `n_classes` is not given (the width cannot be
        inferred).
    IndexError
        If a label is >= `n_classes`.
    """
    y = np.asarray(y)
    if n_classes is None:
        if y.size == 0:
            raise ValueError("n_classes must be given when y is empty")
        n_classes = y.max() + 1 # (0, 1, 2) --> total 3 classes
    m = len(y)
    Y_one_hot = np.zeros((m, n_classes)) # label all 0's first
    Y_one_hot[np.arange(m), y] = 1 # then set column y[i] of row i
    return Y_one_hot


# One-hot encode the targets of each of the three subsets
Y_train_one_hot, Y_valid_one_hot, Y_test_one_hot = (
    to_one_hot(labels) for labels in (y_train, y_valid, y_test)
)

## III. Training a Softmax model

In [0]:
def softmax(logits):
    """Row-wise softmax of a 2-D array of class scores.

    Parameters
    ----------
    logits : ndarray, shape (m, n_classes)
        One row of raw class scores per instance.

    Returns
    -------
    ndarray, shape (m, n_classes)
        Class probabilities; every row is non-negative and sums to 1.
    """
    # Subtracting each row's maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing to inf (and the ratio from
    # becoming NaN) when the logits are large.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exps = np.exp(shifted)
    exp_sums = np.sum(exps, axis=1, keepdims=True)
    return exps / exp_sums

In [0]:
# Dimensions of the parameter matrix Theta: (n_inputs, n_outputs)
n_inputs = X_train.shape[1] # 2 features + bias term = 3 inputs
n_outputs = len(np.unique(y_train)) # 3 classes

In [49]:
# Batch gradient descent on the cross-entropy loss (no regularization)
eta = 0.01            # learning rate
n_iterations = 10001
m = len(X_train)
epsilon = 1e-7        # added inside log() so that log(0) can never occur

Theta = np.random.randn(n_inputs, n_outputs)  # random starting parameters

for iteration in range(n_iterations):
    # Forward pass over the full training set
    logits = np.dot(X_train, Theta)
    Y_proba = softmax(logits)

    # Mean cross-entropy between predicted probabilities and one-hot targets
    log_likelihood = (Y_train_one_hot * np.log(Y_proba + epsilon)).sum(axis=1)
    loss = -log_likelihood.mean()
    if iteration % 500 == 0:
        print(iteration, loss)

    # Gradient step
    error = Y_proba - Y_train_one_hot
    gradients = 1/m * np.dot(X_train.T, error)
    Theta = Theta - eta * gradients

0 3.5356045081790177
500 0.7698276617097016
1000 0.6394784332731978
1500 0.5618741363839648
2000 0.5095831080853221
2500 0.47127377559909306
3000 0.44155863305230325
3500 0.41755986648041216
4000 0.3975941721521857
4500 0.38060484552797946
5000 0.3658905593000994
5500 0.35296466414435634
6000 0.34147705259255917
6500 0.33116823861572764
7000 0.3218410572511107
7500 0.3133425076366726
8000 0.30555169703144486
8500 0.298371595843712
9000 0.29172325270873556
9500 0.2855416438026909
10000 0.2797726355460243


In [50]:
# Learned parameter matrix, shape (n_inputs, n_outputs)
Theta

array([[ 3.62016071, -1.50976478, -4.92912459],
       [-0.846549  ,  0.69453656,  0.25669351],
       [-1.37970405, -0.04055292,  3.33700883]])

In [51]:
# Validation-set accuracy: the predicted class is the one with the highest
# softmax probability
logits = np.dot(X_valid, Theta)
Y_proba = softmax(logits)
y_pred = Y_proba.argmax(axis=1)

accuracy_score = (y_pred == y_valid).mean()
accuracy_score

0.9333333333333333

## IV. Softmax model + L2 Regularization



In [52]:
# Batch gradient descent on cross-entropy + L2 penalty
eta = 0.01            # learning rate
n_iterations = 10001
m = len(X_train)
epsilon = 1e-7        # added inside log() so that log(0) can never occur
alpha = 0.1           # regularization hyperparameter

Theta = np.random.randn(n_inputs, n_outputs)

for iteration in range(n_iterations):
    # Forward pass over the full training set
    logits = np.dot(X_train, Theta)
    Y_proba = softmax(logits)

    # Loss = mean cross-entropy + alpha * L2 penalty; Theta[1:] skips the
    # first row (the bias weights), which is not regularized
    xentropy_loss = -(Y_train_one_hot * np.log(Y_proba + epsilon)).sum(axis=1).mean()
    l2_loss = 1/2 * np.square(Theta[1:]).sum()
    loss = xentropy_loss + alpha * l2_loss
    if iteration % 500 == 0:
        print(iteration, loss)

    # Gradient step; the penalty gradient gets a zero row for the bias weights
    error = Y_proba - Y_train_one_hot
    l2_gradients = np.r_[np.zeros([1, n_outputs]), alpha * Theta[1:]]
    gradients = 1/m * np.dot(X_train.T, error) + l2_gradients
    Theta = Theta - eta * gradients

0 4.074160805836161
500 0.842423506343155
1000 0.7077659962126894
1500 0.6367414013918682
2000 0.5952648892708439
2500 0.5689591903221461
3000 0.551156976597229
3500 0.5384699412305943
4000 0.529040469145154
4500 0.5217862918782573
5000 0.5160448760335734
5500 0.5113933844416791
6000 0.5075519836677346
6500 0.5043293709329311
7000 0.5015908506698222
7500 0.4992389882701472
8000 0.49720153694764324
8500 0.49542370257763185
9000 0.4938630670775326
9500 0.4924861812387983
10000 0.49126622945561005


In [53]:
# Validation-set accuracy of the L2-regularized model: the predicted class is
# the one with the highest softmax probability
logits = np.dot(X_valid, Theta)
Y_proba = softmax(logits)
y_pred = Y_proba.argmax(axis=1)

accuracy_score = (y_pred == y_valid).mean()
accuracy_score

0.9333333333333333

## V. Softmax model + L2 Regularization + Early Stopping

In [54]:
# Batch gradient descent with L2 regularization, stopped as soon as the
# regularized loss on the validation set stops improving
eta = 0.01
n_iterations = 10001
m = len(X_train)
epsilon = 1e-7
alpha = 0.1  # regularization hyperparameter
best_loss = np.inf  # np.inf, not np.infty: that alias was removed in NumPy 2.0

Theta = np.random.randn(n_inputs, n_outputs)
best_Theta = Theta  # parameters that achieved best_loss

for iteration in range(n_iterations):
    # Training step. The training loss is not computed here: nothing reads it
    # before the validation loss below is assigned to the same names.
    logits = X_train.dot(Theta)
    Y_proba = softmax(logits)
    error = Y_proba - Y_train_one_hot
    gradients = 1/m * X_train.T.dot(error) + np.r_[np.zeros([1, n_outputs]), alpha * Theta[1:]]
    Theta = Theta - eta * gradients

    # Early stopping: regularized loss of the updated parameters on the
    # validation set
    logits = X_valid.dot(Theta)
    Y_proba = softmax(logits)
    xentropy_loss = -np.mean(np.sum(Y_valid_one_hot * np.log(Y_proba + epsilon), axis=1))
    l2_loss = 1/2 * np.sum(np.square(Theta[1:]))
    loss = xentropy_loss + alpha * l2_loss

    if iteration % 500 == 0:
        print(iteration, loss)
    if loss < best_loss:
        best_loss = loss
        # Keeping a reference is enough: Theta is rebound to a new array on
        # every step and never modified in place
        best_Theta = Theta
    else:
        print(iteration - 1, best_loss)
        print(iteration, loss, "Reaching the best loss, Early stopping!")
        break

# Roll back to the parameters with the lowest validation loss. After an early
# stop, Theta is one update past that point; if the loop ran to completion the
# two are the same array.
Theta = best_Theta

0 2.1499523939452976
500 1.1331943621656417
1000 0.8449214146527371
1500 0.7191016939830108
2000 0.6595407268250023
2500 0.626994603586099
3000 0.6069434147592495
3500 0.5934704952736416
4000 0.5838241988882021
4500 0.5765729480190931
5000 0.570907156969735
5500 0.5663401875471603
6000 0.562565511240924
6500 0.5593824514592203
7000 0.5566550855491896
7500 0.5542884213522633
8000 0.5522140869292202
8500 0.5503814821687892
9000 0.5487521696314078
9500 0.5472962380511188
10000 0.5459898947672274


In [55]:
# Validation-set accuracy of the early-stopped model: the predicted class is
# the one with the highest softmax probability
logits = np.dot(X_valid, Theta)
Y_proba = softmax(logits)
y_pred = Y_proba.argmax(axis=1)

accuracy_score = (y_pred == y_valid).mean()
accuracy_score

0.9333333333333333