In [1]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
class BatchGradientDescentClassifier(BaseEstimator, TransformerMixin):
    '''
    Binary logistic-regression classifier trained with batch gradient descent.

    Every epoch evaluates sigmoid(X.W) on the whole training set, takes one
    gradient step on the mean log loss and records that loss. A column of
    ones is prepended to the features internally, so weights[0] is the bias.
    Supports only binary classification; the loss formula assumes labels
    encoded as 0 / 1.
    It is required to scale the input features before fitting the model.

    NOTE(review): the class inherits TransformerMixin although it defines no
    transform(); ClassifierMixin looks like the intended base. Confirm before
    switching, because scikit-learn treats classifiers differently in
    cross-validation and scoring.
    NOTE(review): the predict_proba flag uses the name scikit-learn reserves
    for the predict_proba() method; code probing for that method will find a
    bool instead - verify against callers.

    Params:
    eta (float): Learning rate
    epochs (int): maximum number of epochs
    tol (float): minimum change in loss between epochs required to continue training
    threshold (float): threshold value to convert prediction probabilities into labels
    predict_proba (bool): return probabilities or labels as predictions
    verbose (bool): controls verbosity
    random_state (int): seed for the random weight initialization

    Attributes populated by fit:
    weights (np.ndarray): learned coefficients, bias first
    loss (float): mean log loss recorded in the last epoch that ran
    loss_history (list): mean log loss of every epoch of the most recent fit
    '''

    # np.exp overflows float64 slightly above this magnitude, so logits are
    # clipped to it before exponentiation; sigmoid is already saturated there.
    _MAX_LOGIT = 709.0

    def __init__(self, eta: float = 0.001, epochs: int=1000, tol: float=0.001,
                 threshold: float=0.5, predict_proba: bool=False,
                 verbose: bool=True, random_state: int=42):

        self.eta = eta
        self.epochs = epochs
        self.tol = tol
        self.threshold = threshold
        self.predict_proba = predict_proba
        self.verbose = verbose
        self.random_state = random_state
        self.weights = None  # None marks an unfitted model
        self.loss = 1
        self.epsilon = 0.0000001 # prevents divisionbyzeroerror and log(0) conditions
        self.loss_history = []
        self._af = None  # sigmoid activations of the latest forward pass


    @staticmethod
    def _add_bias(x) -> np.ndarray:
        '''Return x as a float array with a leading column of ones (bias feature).'''
        x = np.asarray(x, dtype=float)
        if x.ndim != 2:
            raise ValueError(f"Expected a 2-D feature array, got {x.ndim} dimension(s)")
        return np.hstack((np.ones((x.shape[0], 1)), x))


    @classmethod
    def _sigmoid(cls, z: np.ndarray) -> np.ndarray:
        '''Sigmoid function 1 / (1 + e^-z), guarded against overflow in np.exp.'''
        return 1 / (1 + np.exp(-1 * np.clip(z, -cls._MAX_LOGIT, cls._MAX_LOGIT)))


    def _initialise_weights(self, n_col: int): # random weight initialization
        # A private RandomState draws the same numbers that np.random.seed +
        # np.random.randn would, without reseeding the global NumPy generator.
        self.weights = np.random.RandomState(self.random_state).randn(n_col)


    def _sum_function(self, x: np.ndarray):
        z = np.dot(x, self.weights.reshape(-1, 1)) # X.W
        self._af = self._sigmoid(z.ravel())


    def _weight_update(self, x: np.ndarray, y: np.ndarray): # update weights
        n_row = len(self._af)
        errors = self._af - y # y_hat - y
        self.weights = self.weights - (self.eta * (1 / n_row) * np.dot(errors, x)) # weight updation


    def _loss_update(self, y: np.ndarray): # update loss
        # log loss = (-1/n).(y.log(y_hat) + (1-y).log(1-y_hat))
        # Uses the activations of the latest forward pass, i.e. the loss of the
        # weights as they were before this epoch's update.
        sample_loss = -1 * ((y * np.log(self._af + self.epsilon)) + ((1 - y) * np.log(1 - self._af + self.epsilon)))
        self.loss = np.mean(sample_loss)


    def fit(self, x: np.array, y: np.array):
        '''
        Train the model on features x and binary labels y.

        Runs at most `epochs` gradient steps and stops early once the loss
        changes by less than `tol` between two consecutive epochs. Training
        state from any earlier fit on this instance is discarded first.

        Raises:
        ValueError: x is not 2-D, x is empty, or x and y differ in length

        Returns:
        self
        '''
        x = self._add_bias(x)
        y = np.asarray(y, dtype=float).ravel()
        n_row, n_col = x.shape
        if n_row == 0:
            raise ValueError("Cannot fit on an empty dataset")
        if y.shape[0] != n_row:
            raise ValueError(f"x has {n_row} rows but y has {y.shape[0]} labels")

        self._initialise_weights(n_col=n_col) # random weight initialization
        # Start every fit from a clean slate so repeated fits do not mix histories.
        self.loss = 1
        self.loss_history = []

        for epoch in range(1, self.epochs + 1):
            self._sum_function(x=x) # prediction
            self._weight_update(x=x, y=y) # weight update
            self._loss_update(y) # update loss
            if self.verbose:
                print(f'Epoch {epoch} / {self.epochs}\tTraining Log loss: {np.round(self.loss, 5)}')
            self.loss_history.append(self.loss) # add loss to loss_history list

            # Convergence needs two real losses; the first epoch has nothing to compare with.
            if len(self.loss_history) > 1 and \
                    np.abs(self.loss_history[-1] - self.loss_history[-2]) < self.tol:
                if self.verbose:
                    print("Exiting as reduction in Log loss < tol")
                break
        return self


    def predict(self, x: np.array):
        '''
        Predict for the rows of x.

        Returns int8 labels (probability >= threshold) unless the
        `predict_proba` flag is set, in which case the probabilities
        themselves are returned.

        Raises:
        NotFittedError: predict is called before fit
        '''
        if self.weights is None:
            # NotFittedError derives from both ValueError and AttributeError.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError("This BatchGradientDescentClassifier instance is not fitted yet. "
                                 "Call 'fit' before using 'predict'.")
        x = self._add_bias(x)
        z = np.dot(x, self.weights.reshape(-1, 1)) # X.W
        pred_prob = self._sigmoid(z.ravel())
        if not self.predict_proba:
            return (pred_prob >= self.threshold).astype(np.int8) # convert probs to labels using threshold
        return pred_prob # prediction probs

    def __repr__(self):
        return f"BatchGradientDescentClassifier(eta={self.eta}, epochs={self.epochs}, tol={self.tol}, " + \
                f"threshold={self.threshold}, predict_proba={self.predict_proba}, verbose={self.verbose}, " + \
                f"random_state={self.random_state})"

<h3>Testing the algorithm on the breast cancer dataset</h3>

In [3]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.model_selection import GridSearchCV

In [4]:
# Load the breast cancer dataset that ships with scikit-learn
data = load_breast_cancer()

In [5]:
# Pull the feature matrix and the label vector out of the loaded dataset
x, y = data['data'], data['target']

In [6]:
# Sanity check: dimensions of the feature matrix and of the label vector
x.shape, y.shape

((569, 30), (569,))

In [7]:
# Number of samples per class, to see how imbalanced the labels are
pd.Series(y).value_counts()

1    357
0    212
dtype: int64

In [8]:
# Stratified 80/20 hold-out split; the fixed seed keeps the partition reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [9]:
# Oversample the minority class, scale features to [0, 1], then classify.
# Steps are (name, estimator) tuples, the form the Pipeline API documents.
# NOTE(review): SMOTE runs before the scaler, so it works on unscaled features;
# presumably its neighbour search is then dominated by the large-valued columns -
# consider scaling first and confirm the effect on the scores.
model_pipeline = imbpipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('minmax_scaler', MinMaxScaler()),
    ('classifier', BatchGradientDescentClassifier(verbose=False)),
])

In [10]:
# Search space for the classifier step of the pipeline: learning rate, stopping
# tolerance, epoch budget and ten evenly spaced decision thresholds in [0.3, 0.8].
param_grid = {
    'classifier__eta': [0.01, 0.1, 1, 10],
    'classifier__tol': [1e-5, 1e-6],
    'classifier__epochs': [1500, 2000, 2500, 3000],
    'classifier__threshold': list(np.linspace(0.3, 0.8, 10)),
}

In [11]:
# Exhaustive search over param_grid: 3-fold cross-validation, candidates ranked
# by balanced accuracy, fits spread over all available cores.
grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    cv=3,
    scoring="balanced_accuracy",
    n_jobs=-1,
    verbose=1,
)

In [12]:
# Run the search on the training split (every candidate is fitted on each CV fold)
grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 320 candidates, totalling 960 fits


In [13]:
# Parameter combination with the best mean cross-validated score
grid_search.best_params_

{'classifier__epochs': 2000,
 'classifier__eta': 1,
 'classifier__threshold': 0.5222222222222221,
 'classifier__tol': 1e-05}

In [14]:
# Per-candidate timings and fold scores of the search, as a table
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__epochs,param_classifier__eta,param_classifier__threshold,param_classifier__tol,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.307614,0.007364,0.005207,0.007364,1500,0.01,0.3,0.00001,"{'classifier__epochs': 1500, 'classifier__eta'...",0.780702,0.812500,0.784155,0.792452,0.014246,295
1,0.312823,0.012755,0.005207,0.007364,1500,0.01,0.3,0.000001,"{'classifier__epochs': 1500, 'classifier__eta'...",0.780702,0.812500,0.784155,0.792452,0.014246,295
2,0.281447,0.022279,0.000000,0.000000,1500,0.01,0.355556,0.00001,"{'classifier__epochs': 1500, 'classifier__eta'...",0.857895,0.866071,0.849011,0.857659,0.006967,283
3,0.239525,0.007364,0.000000,0.000000,1500,0.01,0.355556,0.000001,"{'classifier__epochs': 1500, 'classifier__eta'...",0.857895,0.866071,0.849011,0.857659,0.006967,283
4,0.249940,0.012755,0.005207,0.007364,1500,0.01,0.411111,0.00001,"{'classifier__epochs': 1500, 'classifier__eta'...",0.887719,0.896577,0.891004,0.891767,0.003656,267
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,0.514115,0.010693,0.001662,0.000470,3000,10,0.688889,0.000001,"{'classifier__epochs': 3000, 'classifier__eta'...",0.929825,0.968006,0.989362,0.962397,0.024627,99
316,0.199495,0.036459,0.006205,0.006671,3000,10,0.744444,0.00001,"{'classifier__epochs': 3000, 'classifier__eta'...",0.945614,0.952381,0.984043,0.960679,0.016750,123
317,0.503735,0.017893,0.001662,0.000470,3000,10,0.744444,0.000001,"{'classifier__epochs': 3000, 'classifier__eta'...",0.929825,0.962798,0.989362,0.960661,0.024353,127
318,0.211768,0.033827,0.001663,0.000471,3000,10,0.8,0.00001,"{'classifier__epochs': 3000, 'classifier__eta'...",0.940351,0.947173,0.968085,0.951870,0.011800,164


In [15]:
# Predict labels for the held-out test split with the fitted grid search
pred = grid_search.predict(x_test)
print(pred)

[0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 1 0 0 1 1 1 1 0 1 1 1 1 1 1 1 0 0 1
 1 1 0 1 1 1 0 0 1 1 1 1 0 1 1 1 1 1 1 1 0 0 1 1 1 1 1 0 0 0 1 1 1 1 1 1 0
 0 0 0 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 1 0 1 0 1
 0 1 1]


In [16]:
# True test labels, printed for a side-by-side look against the predictions
print(y_test)

[0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 1 1 1 1 0 0 1 1 1 1 0 1 1 1 1 1 1 1 0 0 1
 1 1 0 1 1 1 0 0 1 1 1 1 0 1 1 1 0 1 1 1 0 0 1 1 1 1 1 0 0 1 1 1 1 1 1 1 0
 0 0 0 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 1 0 1 0 1
 0 1 1]


In [17]:
# Balanced accuracy of the predictions on the held-out test split
balanced_accuracy_score(y_test, pred)

0.9672619047619048