In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import GridSearchCV, KFold
from IPython.core.debugger import set_trace
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Data Collection

In [89]:
# Gradient Optimizer

def softmax(z, axis=1):
    """Numerically stable softmax of `z` along `axis`.

    The maximum along `axis` is subtracted before exponentiating so that large
    scores do not overflow; this shift does not change the result.

    Parameters
    ----------
    z : array-like
        Scores. With the default `axis=1` this must have at least two
        dimensions, and each row is normalised independently.
    axis : int, default 1
        Axis along which the outputs sum to one.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as `z` whose entries sum to 1 along `axis`.
    """
    z = np.asarray(z)
    # Compute the shifted exponentials once and reuse them for the normaliser.
    shifted_exp = np.exp(z - z.max(axis=axis, keepdims=True))
    return shifted_exp / shifted_exp.sum(axis=axis, keepdims=True)

class Optimizer:
    """Mini-batch gradient descent for the softmax model's weight matrix.

    Parameters
    ----------
    batch_size : int, default 10
        Number of rows sampled (without replacement) for each gradient
        estimate. Clamped to the number of available rows.
    learning_rate : float, default 0.1
        Step size. The special value 0 selects a decaying step of 1 / (t + 1).
    momentum : float, default 0.1
        Stored on the instance only. NOTE: `optimize` does not currently apply
        momentum to the update.
    max_iters : int or float, default 1e4
        Upper bound on the number of update steps.
    epsilon : float, default 1e-8
        Iteration stops once the norm of the last mini-batch gradient is not
        greater than this value.
    record_history : bool, default False
        When true, the weights after every step are appended to `w_history`.
    verbose : bool, default False
        When true, print `cost_fn(x, y, w)` before every step. This evaluates
        the cost on the full data each iteration and requires a module-level
        `cost_fn` to be defined before `optimize` is called.

    Attributes
    ----------
    w_history : list
        Weight matrices recorded by `optimize` (stays empty unless
        `record_history` is true). Successive `optimize` calls keep appending.
    """

    def __init__(self, batch_size=10, learning_rate=0.1, momentum=0.1, max_iters=1e4, epsilon=1e-8, record_history=False, verbose=False):
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.max_iters = max_iters
        self.epsilon = epsilon
        self.record_history = record_history
        self.verbose = verbose
        # Always present so readers of the attribute never hit AttributeError.
        self.w_history = []                 # weight history for visualization

    def optimize(self, x, y, w):
        """Run gradient descent starting from `w`.

        Parameters
        ----------
        x : numpy.ndarray
            Design matrix, one row per sample.
        y : numpy.ndarray
            Targets with one row per sample and one column per row of `w`
            (presumably one-hot labels -- confirm against the caller).
        w : numpy.ndarray
            Initial weights; not modified in place.

        Returns
        -------
        list
            `[w, t, grad_norm]`: final weights, number of steps taken, and the
            norm of the last mini-batch gradient (`inf` if no step was taken).
        """
        grad = np.inf
        t = 0
        while np.linalg.norm(grad) > self.epsilon and t < self.max_iters:
            if self.verbose:
                print(cost_fn(x, y, w))
            grad = self.gradient(x, y, w)       # gradient at the present weights
            if self.learning_rate == 0:         # special case: decaying step size
                step = 1 / (t + 1)
            else:
                step = self.learning_rate
            w = w - step * grad                 # rebinding creates a new array each step
            if self.record_history:
                self.w_history.append(w)
            t += 1
        return [w, t, np.linalg.norm(grad)]

    def gradient(self, x, y, w):
        """Return the mini-batch gradient with the same shape as `w`.

        Rows are sampled without replacement. The result is the sum (not the
        mean) over the batch of `(softmax(x_i @ w.T) - y_i)` outer `x_i`.
        """
        n_samples = x.shape[0]
        # Sampling without replacement cannot draw more rows than exist.
        n_batch = min(self.batch_size, n_samples)
        index = np.random.choice(n_samples, n_batch, replace=False)
        x_batch = x[index]
        y_batch = y[index]
        residual = softmax(x_batch @ w.T) - y_batch
        # Entry (row, col) is sum_n residual[n, row] * x_batch[n, col].
        return residual.T @ x_batch
        

In [90]:
# Cost Function

def cost_fn(x, y, w):
    """Mean softmax cross-entropy of weights `w` on the data `(x, y)`.

    Parameters
    ----------
    x : numpy.ndarray
        Design matrix, one row per sample.
    y : numpy.ndarray
        Targets with one row per sample and one column per row of `w`
        (presumably one-hot labels -- confirm against the caller).
    w : numpy.ndarray
        Weight matrix with one row per class.

    Returns
    -------
    float
        Mean over samples of `logsumexp(z_n) - sum_c y[n, c] * z[n, c]`
        where `z = x @ w.T`.
    """
    z = x @ w.T
    # Stable log-sum-exp: subtract the row maximum before exponentiating.
    z_max = z.max(axis=1)
    log_norm = z_max + np.log(np.exp(z - z_max[:, None]).sum(axis=1))
    # Row-wise dot product; avoids building the N x N matrix y @ z.T just to
    # read its diagonal.
    target_score = (y * z).sum(axis=1)
    J = -(target_score - log_norm).mean()
    return J

In [88]:
# Softmax Classifier

class softmax_classifier:
    """Linear multiclass classifier whose weights are found by an external optimizer.

    Parameters
    ----------
    add_bias : bool, default True
        Append a column of ones to the inputs in both `fit` and `predict`.
    verbose : bool, default False
        Print the optimizer's termination summary and the fitted weights.
    """

    def __init__(self, add_bias=True, verbose=False):
        self.verbose = verbose
        self.add_bias = add_bias

    def _design_matrix(self, x):
        """Promote 1-D input to a single column and append the bias column if enabled."""
        if x.ndim == 1:
            x = x[:, np.newaxis]
        if not self.add_bias:
            return x
        n_rows = x.shape[0]
        return np.column_stack([x, np.ones(n_rows)])

    def fit(self, x, y, optimizer):
        """Fit the weights and return `self`.

        The weights start as zeros of shape `(y.shape[1], n_features)` and are
        replaced by the first element of `optimizer.optimize(x, y, w)`.
        """
        x = self._design_matrix(x)
        _, n_features = x.shape
        self.w = np.zeros((y.shape[1], n_features))
        results = optimizer.optimize(x, y, self.w)
        self.w = results[0]

        if self.verbose:
            print(f'terminated after {results[1]} iterations, with norm of the gradient equal to {results[2]}')
            print(f'the weight found: {self.w}')
        return self

    def predict(self, x):
        """Return, for each row of `x`, the index of the class with the largest softmax probability."""
        scores = self._design_matrix(x) @ self.w.T
        probabilities = softmax(scores)
        return probabilities.argmax(axis=1)

In [93]:
# Hyperparameter Optimization


In [24]:
# Model for comparison