In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
import warnings
warnings.filterwarnings('ignore')

In [7]:
# Data collection: load the raw penguin measurements
penguins = pd.read_csv('penguins.csv')

# Drop every row whose `sex` entry is one of the placeholder tokens '?' or '_'
penguins = penguins[~penguins['sex'].isin(['?', '_'])]

# One-hot encode the three categorical columns (island, sex, species)
dummies1 = pd.get_dummies(penguins['island'])
dummies2 = pd.get_dummies(penguins['sex'])
dummies3 = pd.get_dummies(penguins['species'])

# Append the indicator columns, then remove the categorical originals
penguins = pd.concat([penguins, dummies1, dummies2, dummies3], axis=1)
penguins = penguins.drop(columns=['species', 'island', 'sex'])
display(penguins)

# Positional split: the first nine columns are the model inputs, every
# remaining column (the species indicators, appended last) is the target.
penguinsX = penguins.iloc[:, :9]
penguinsy = penguins.iloc[:, 9:]
print("penguinsX")
display(penguinsX)
print("penguinsy")
display(penguinsy)

# Convert both frames to float64 numpy arrays for the hand-written optimizer
penguinsX = np.array(penguinsX).astype('float64')
penguinsy = np.array(penguinsy).astype('float64')

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,Biscoe,Dream,Torgersen,FEMALE,MALE,Adelie,Chinstrap,Gentoo
0,39.1,18.7,181.0,3750.0,0,0,1,0,1,1,0,0
1,39.5,17.4,186.0,3800.0,0,0,1,1,0,1,0,0
2,40.3,18.0,195.0,3250.0,0,0,1,1,0,1,0,0
4,36.7,19.3,193.0,3450.0,0,0,1,1,0,1,0,0
5,39.3,20.6,190.0,3650.0,0,0,1,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
338,47.2,13.7,214.0,4925.0,1,0,0,1,0,0,0,1
340,46.8,14.3,215.0,4850.0,1,0,0,1,0,0,0,1
341,50.4,15.7,222.0,5750.0,1,0,0,0,1,0,0,1
342,45.2,14.8,212.0,5200.0,1,0,0,1,0,0,0,1


penguinsX


Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,Biscoe,Dream,Torgersen,FEMALE,MALE
0,39.1,18.7,181.0,3750.0,0,0,1,0,1
1,39.5,17.4,186.0,3800.0,0,0,1,1,0
2,40.3,18.0,195.0,3250.0,0,0,1,1,0
4,36.7,19.3,193.0,3450.0,0,0,1,1,0
5,39.3,20.6,190.0,3650.0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...
338,47.2,13.7,214.0,4925.0,1,0,0,1,0
340,46.8,14.3,215.0,4850.0,1,0,0,1,0
341,50.4,15.7,222.0,5750.0,1,0,0,0,1
342,45.2,14.8,212.0,5200.0,1,0,0,1,0


penguinsy


Unnamed: 0,Adelie,Chinstrap,Gentoo
0,1,0,0
1,1,0,0
2,1,0,0
4,1,0,0
5,1,0,0
...,...,...,...
338,0,0,1
340,0,0,1
341,0,0,1
342,0,0,1


In [8]:
# Gradient Optimizer

def softmax(z):
    """Row-wise softmax of a 2-D array of scores.

    Each row's maximum is subtracted before exponentiating, which does not
    change the result but keeps ``np.exp`` from overflowing on large scores.
    Returns an array of the same shape whose rows sum to 1.
    """
    exponentials = np.exp(z - z.max(axis=1, keepdims=True))
    return exponentials / exponentials.sum(axis=1)[:, None]

class Optimizer:
    """Mini-batch gradient descent with optional momentum and L1/L2 penalties.

    The gradient implemented here is the softmax-regression gradient, so ``y``
    must be one-hot encoded. The LAST column of ``x`` is treated as the bias
    column (``softmax_classifier`` appends a column of ones there) and its
    weights are never regularized.

    Parameters
    ----------
    batch_size : int
        Number of rows sampled without replacement for each gradient
        estimate. If it exceeds the number of rows, all rows are used.
    learning_rate : float
        Step size. The value 0 is a sentinel that selects the decaying
        schedule ``1 / (t + 1)`` instead of a constant step.
    momentum : float
        Weight of the previous (smoothed) gradient in the running average;
        0 disables momentum.
    max_iters : int or float
        Upper bound on the number of update steps.
    epsilon : float
        Iteration stops once the norm of the (smoothed) gradient is no longer
        greater than this value.
    record_history : bool
        When true, every weight matrix is appended to ``self.w_history``.
    l1, l2 : float
        Strengths of the L1 and L2 penalties on the non-bias weights.
    """

    def __init__(self, batch_size=10, learning_rate=0.1, momentum=0, max_iters=1e4, epsilon=1e-8, record_history=False, l1=0, l2=0):
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.max_iters = max_iters
        self.epsilon = epsilon
        self.record_history = record_history
        self.l1 = l1
        self.l2 = l2
        if record_history:
            self.w_history = []                 # weight matrices, one per step, for visualization

    def _regularization_gradient(self, w):
        """Return the (sub)gradient of the L1 + L2 penalty, aligned with ``w``.

        The bias weights live in the last column of ``w`` and are left
        unpenalized, so that column of the result is zero.
        """
        penalty = self.l1 * np.sign(w) + self.l2 * w
        penalty[:, -1] = 0.0
        return penalty

    def optimize(self, x, y):
        """Run gradient descent from an all-zero weight matrix.

        Parameters
        ----------
        x : ndarray of shape (N, D)
            Design matrix whose last column is the bias column.
        y : ndarray of shape (N, C)
            One-hot targets.

        Returns
        -------
        list
            ``[w, t, grad_norm]``: the final (C, D) weights, the number of
            steps taken, and the norm of the last (smoothed) gradient.
        """
        _, D = x.shape
        w = np.zeros((y.shape[1], D))
        grad = np.inf                           # guarantees the loop is entered
        t = 0
        while np.linalg.norm(grad) > self.epsilon and t < self.max_iters:
            grad_prev = grad
            grad = self.gradient(x, y, w) + self._regularization_gradient(w)
            if t > 0:                           # no previous gradient exists on the first step
                grad = self.momentum * grad_prev + (1 - self.momentum) * grad
            if self.learning_rate == 0:         # sentinel: use the decaying 1 / (t + 1) step
                w = w - (1 / (t + 1)) * grad
            else:
                w = w - self.learning_rate * grad
            if self.record_history:
                self.w_history.append(w)
            t += 1
        return [w, t, np.linalg.norm(grad)]

    def gradient(self, x, y, w):
        """Softmax cross-entropy gradient on a random mini-batch.

        The result is summed (not averaged) over the sampled rows and has the
        same shape as ``w``.
        """
        n_rows = x.shape[0]
        # Sampling without replacement cannot return more rows than exist, so
        # an oversized batch_size falls back to using every row.
        size = min(self.batch_size, n_rows)
        index = np.random.choice(n_rows, size, replace=False)
        x_batch = x[index]
        y_batch = y[index]
        z = softmax(x_batch @ w.T) - y_batch
        return z.T @ x_batch
        

In [9]:
# Cost Function

def cost_fn(x, y, w):
    """Mean softmax cross-entropy of weights ``w`` on data ``(x, y)``.

    ``x`` is (N, D), ``w`` is (C, D) and ``y`` is (N, C). The log-partition
    term is computed with the row maximum factored out so that ``np.exp``
    does not overflow.
    """
    logits = x @ w.T
    row_max = logits.max(axis=1)
    # log(sum(exp(logits))) per row, evaluated in the shifted (stable) form
    log_partition = row_max + np.log(np.exp(logits - row_max[:, None]).sum(axis=1))
    target_logit = (y * logits).sum(axis=1)
    return -(target_logit - log_partition).mean()

In [10]:
# Softmax Classifier

class softmax_classifier:
    """Multiclass softmax-regression model whose weights come from an external optimizer."""

    def __init__(self, verbose=False):
        # verbose: print the optimizer's termination summary after fitting
        self.verbose = verbose

    @staticmethod
    def _with_bias(x):
        """Return ``x`` as a 2-D array with a trailing column of ones (the bias input)."""
        if x.ndim == 1:
            x = x[:, None]
        return np.column_stack([x, np.ones(x.shape[0])])

    def fit(self, x, y, optimizer):
        """Learn the weights by handing the bias-augmented data to ``optimizer.optimize``.

        The optimizer must return a sequence whose first element is the
        weight matrix; elements 1 and 2 (iteration count and gradient norm)
        are only read when ``verbose`` is set. Returns ``self``.
        """
        outcome = optimizer.optimize(self._with_bias(x), y)
        self.w = outcome[0]

        if self.verbose:
            print(f'terminated after {outcome[1]} iterations, with norm of the gradient equal to {outcome[2]}')
            print(f'the weight found: {self.w}')
        return self

    def predict(self, x):
        """Return, for each row of ``x``, the index of the most probable class."""
        probabilities = softmax(self._with_bias(x) @ self.w.T)
        return probabilities.argmax(axis=1)

In [9]:
# Hyperparameter Optimization
# Sanity run on the sklearn digits data: 5-fold CV of the softmax classifier
digits = load_digits()
X, targets = digits.data, digits.target

# One-hot encode the ten digit classes
y = np.zeros((X.shape[0], 10))
y[np.arange(X.shape[0]), targets] = 1

for train, test in KFold().split(X):
    # learning_rate=0 selects the optimizer's decaying 1 / (t + 1) step size
    optimizer = Optimizer(learning_rate=0, momentum=0.9, l1=0.1, l2=0.1)
    cl = softmax_classifier()
    cl.fit(X[train], y[train], optimizer)
    print(accuracy_score(targets[test], cl.predict(X[test])))

0.8083333333333333
0.8
0.8579387186629527
0.7771587743732591
0.7743732590529248


In [11]:
def get_softmax_score(X_train, X_test, y_train, y_test, optimizer):
    """Train a fresh softmax_classifier and return its accuracy on the test split.

    ``y_test`` is one-hot; it is decoded with ``argmax`` so it can be compared
    with the class indices returned by ``predict``.
    """
    model = softmax_classifier()
    model.fit(X_train, y_train, optimizer)
    predicted_labels = model.predict(X_test)
    true_labels = y_test.argmax(axis=1)
    return accuracy_score(true_labels, predicted_labels)

In [19]:
def softmax_k_fold(X, y, optimiser, k=5):
    """Mean stratified k-fold accuracy of the softmax classifier.

    Parameters
    ----------
    X : ndarray of shape (N, D)
        Feature matrix.
    y : ndarray of shape (N, C)
        One-hot targets; the class index (``argmax``) drives the stratification.
    optimiser : Optimizer
        Optimizer used to fit the classifier on every fold.
    k : int
        Number of folds.

    Returns
    -------
    float
        Average of the per-fold accuracies.
    """
    stkf = StratifiedKFold(n_splits=k)
    scores = []
    for train_index, test_index in stkf.split(X, y.argmax(axis=1)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Use the optimizer that was passed in, not whatever global happens
        # to be named `optimizer` in the notebook at call time.
        scores.append(get_softmax_score(X_train, X_test, y_train, y_test, optimiser))
    return np.average(np.array(scores))

In [None]:
# Candidate values for the exhaustive sweep
b_sizes = [1, 5, 10, 20, 50, 100, 200, 300]      # mini-batch sizes
lrs = [0, 0.1, 0.25, 0.5, 0.75, 0.9, 1]          # learning rates (0 = decaying 1 / (t + 1) schedule)
betas = [0, 0.1, 0.25, 0.5, 0.75, 0.9]           # momentum coefficients

# Every (batch size, learning rate, momentum) combination, batch size varying slowest
grid = [(bs, lr, beta) for bs in b_sizes for lr in lrs for beta in betas]

for bs, lr, beta in grid:
    optimizer = Optimizer(batch_size=bs, learning_rate=lr, momentum=beta)
    # The banner is printed before the cross-validation for this setting runs;
    # the mean 5-fold accuracy follows on the next line.
    print(f'done with size {bs}, lr {lr}, beta {beta}')
    print(softmax_k_fold(penguinsX, penguinsy, optimizer, k=5))

done with size 1, lr 0, beta 0
0.40836725463591134
done with size 1, lr 0, beta 0.1
0.594843962008141
done with size 1, lr 0, beta 0.25
0.5919041157847127
done with size 1, lr 0, beta 0.5
0.5952510176390773
done with size 1, lr 0, beta 0.75
0.5643600180913613
done with size 1, lr 0, beta 0.9
0.6219357756671189
done with size 1, lr 0.1, beta 0
0.37177747625508817
done with size 1, lr 0.1, beta 0.1


In [24]:
# Model for comparison