# Softmax Regression with Early Stopping
### (Batch Gradient Descent)

In [1]:
# libraries
from sklearn import datasets
import pandas as pd
import numpy as np

In [80]:
iris = datasets.load_iris()
# list(iris.keys()) lists the fields available on the loaded data set

# Keep only feature columns 2 and 3 -> 150 x 2
X = iris['data'][:, [2, 3]]
# Prepend a column of ones so the first row of theta acts as the bias -> 150 x 3
X = np.hstack((np.ones((X.shape[0], 1)), X))
# Integer class labels, one per sample -> shape (150,)
y = iris['target']

class SoftmaxClassifier():
    """Softmax regression trained with batch gradient descent and early stopping.

    On construction the data is shuffled and split into training (60%),
    test (20%) and validation (20%) subsets. The training labels are stored
    one-hot encoded; test and validation labels stay as integer class ids.

    Assumptions:
        * ``X`` already contains a leading bias column of ones (the first
          row of ``theta`` is excluded from regularisation).
        * ``y`` holds integer class ids ``0 .. K-1``.

    Args:
        X: 2-D feature array, one row per sample.
        y: 1-D array of integer class labels, one per sample.
        random_state: Optional seed. When ``None`` (default) NumPy's global
            generator is used, so ``np.random.seed`` keeps working as before.
    """

    def __init__(self, X, y, random_state=None):
        self.X = X
        self.y = y
        self.m = X.shape[0]                 # total number of samples
        self.n = X.shape[1]                 # number of feature columns
        self.n_inputs = X.shape[1]          # rows of theta
        self.n_outputs = len(np.unique(y))  # columns of theta (distinct classes)

        # Without a seed keep drawing from the global generator so existing
        # callers that rely on np.random.seed(...) are unaffected.
        if random_state is None:
            self._rng = np.random
        else:
            self._rng = np.random.RandomState(random_state)

        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.y_test = None
        self.X_valid = None
        self.y_valid = None

        self.train_ratio = 0.6
        self.test_ratio = 0.2
        self.validation_ratio = 0.2

        # Create the three subsets before any other manipulation.
        self.train_test_split()

    def encode_classes(self, y):
        """Return the one-hot encoding of the integer label vector ``y``.

        The matrix always has at least ``self.n_outputs`` columns, so a subset
        that happens to lack the highest class still lines up with the
        probability matrix produced by the model.
        """
        y = np.asarray(y)
        n_samples = y.shape[0]
        n_classes = self.n_outputs
        if y.size:
            # Never produce fewer columns than the labels themselves require.
            n_classes = max(n_classes, int(y.max()) + 1)

        one_hot_vector = np.zeros((n_samples, n_classes))
        one_hot_vector[np.arange(n_samples), y] = 1
        return one_hot_vector

    def train_test_split(self):
        """Shuffle the samples and fill the train / test / validation attributes."""
        total_size = self.m
        test_size = int(total_size * self.test_ratio)
        validation_size = int(total_size * self.validation_ratio)
        # The training set absorbs whatever the integer truncation left over.
        train_size = total_size - test_size - validation_size

        shuffled = self._rng.permutation(total_size)
        train_idx = shuffled[:train_size]
        test_idx = shuffled[train_size:train_size + test_size]
        valid_idx = shuffled[train_size + test_size:train_size + test_size + validation_size]

        self.X_train, self.y_train = self.X[train_idx], self.y[train_idx]
        self.X_test, self.y_test = self.X[test_idx], self.y[test_idx]
        self.X_valid, self.y_valid = self.X[valid_idx], self.y[valid_idx]

        # Only the training labels are needed in one-hot form by the gradient.
        self.y_train = self.encode_classes(self.y_train)

    def calculate_softmax_score(self, X, theta):
        """Return the class scores (logits) ``X @ theta``."""
        return X.dot(theta)

    def calculate_softmax_function(self, logits):
        """Turn a 2-D array of logits into row-wise class probabilities."""
        # Subtracting the row maximum leaves the result unchanged but keeps
        # np.exp from overflowing to inf (which would yield inf / inf = nan).
        shifted = logits - np.max(logits, axis=1, keepdims=True)
        numerator = np.exp(shifted)
        denominator = np.sum(numerator, axis=1, keepdims=True)
        return numerator / denominator

    def train_model(self, eta=0.01, n_iterations=10000, alpha=0.1, epsilon=1e-6):
        """Fit theta by batch gradient descent with L2 regularisation.

        Training stops at the first iteration whose regularised validation
        loss does not improve on the best one seen so far, and the parameters
        that achieved that best loss are returned. If the validation split is
        empty, early stopping is skipped and all iterations are run.

        Args:
            eta: Learning rate.
            n_iterations: Maximum number of gradient steps.
            alpha: L2 regularisation strength (bias row is not penalised).
            epsilon: Small constant added inside the log to avoid log(0).

        Returns:
            Array of shape ``(n_inputs, n_outputs)`` with the fitted parameters.
        """
        n_train = self.X_train.shape[0]
        theta = self._rng.rand(self.n_inputs, self.n_outputs)
        best_theta = theta
        best_loss = np.inf

        # Loop invariants: validation targets and the zero row that exempts
        # the bias parameters from regularisation.
        early_stopping = self.X_valid.shape[0] > 0
        y_valid_encode = self.encode_classes(self.y_valid) if early_stopping else None
        bias_row = np.zeros([1, self.n_outputs])

        for _ in range(n_iterations):
            # Gradient step on the training split. The data term is averaged
            # over the training samples actually used, not the full data set.
            logits = self.calculate_softmax_score(self.X_train, theta)
            y_proba = self.calculate_softmax_function(logits)
            data_grad = self.X_train.T.dot(y_proba - self.y_train) / n_train
            reg_grad = np.r_[bias_row, alpha * theta[1:]]
            theta = theta - eta * (data_grad + reg_grad)

            if not early_stopping:
                best_theta = theta
                continue

            # Regularised cross-entropy on the validation split.
            logits_valid = self.calculate_softmax_score(self.X_valid, theta)
            y_proba_valid = self.calculate_softmax_function(logits_valid)
            cross_entropy = -np.mean(
                np.sum(y_valid_encode * np.log(y_proba_valid + epsilon), axis=1)
            )
            loss_score = cross_entropy + alpha * 0.5 * np.sum(np.square(theta[1:]))

            if loss_score < best_loss:
                best_loss = loss_score
                best_theta = theta  # theta is rebound each step, never mutated
            else:
                break  # early stopping

        return best_theta

    def _train_and_score(self, X, y):
        """Train a fresh model and return its accuracy on ``(X, y)``."""
        theta = self.train_model()
        logits = self.calculate_softmax_score(X, theta)
        y_proba = self.calculate_softmax_function(logits)
        y_predict = np.argmax(y_proba, axis=1)
        return np.mean(y_predict == y)

    def valid_predict(self):
        """Train a model and return its accuracy on the validation split."""
        return self._train_and_score(self.X_valid, self.y_valid)

    def test_predict(self):
        """Train a model and return its accuracy on the test split."""
        return self._train_and_score(self.X_test, self.y_test)
        
# Build the classifier from the X / y arrays prepared above; the constructor
# also performs the train / test / validation split.
softmax_classifier_iris = SoftmaxClassifier(X, y)
# test_predict() trains a model and returns its accuracy on the test split.
# As the last expression of the cell, this value is presumably what the
# notebook displays below.
softmax_classifier_iris.test_predict()

0.9666666666666667