# Batch, Stochastic and Minibatch Gradient Descent

In [1]:
%store -r X_train y_train X_test y_test

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
X_train.head(3)

Unnamed: 0,bmi,s5
381,0.004132,0.435853
392,0.231405,0.326986
155,0.578512,0.631167


In [4]:
y_train.head(3)

381    0.246106
392    0.289720
155    0.501558
Name: Y, dtype: float64

In [5]:
X_test.head(3)

Unnamed: 0,bmi,s5
203,0.46281,0.583337
14,0.247934,0.362369
112,0.42562,0.39451


In [6]:
y_test.head(3)

203    0.613707
14     0.289720
112    0.688474
Name: Y, dtype: float64

In [7]:
X_train["ones_feature"] = np.ones((len(X_train), 1))
X_test["ones_feature"] = np.ones((len(X_test), 1))
X_train.head(3)

Unnamed: 0,bmi,s5,ones_feature
381,0.004132,0.435853,1.0
392,0.231405,0.326986,1.0
155,0.578512,0.631167,1.0


In [8]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((353, 3), (353,), (89, 3), (89,))

In [9]:
X_train = np.array(X_train)
y_train = np.array(y_train).reshape((len(y_train), 1))
X_test = np.array(X_test)
y_test = np.array(y_test).reshape((len(y_test), 1))
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((353, 3), (353, 1), (89, 3), (89, 1))

## Defining the function for Gradient Descent

In [11]:
def gradient_descent(x, y, batch_size = 1, epochs = 1, learning_rate = 0.01, tolerance = None):
    """
    Calculates gradient vector of the cost function wrt each model parameter and also updates the parameter.
    Inputs: x (predictors' values), y (actual targets), batch_size (number of instances to feed at each step, default is 1),
    epochs (number of epochs, default is 1), learning_rate (default is 0.01) and tolerance (default is None).
    Outputs: epoch_list (a list with epoch numbers), parameters_list (list of all the model parameters at each epoch), cost_list (list of costs at each epoch).
    """
    parameters = np.random.randn(3, 1)
    epoch_list = []
    parameters_list = []
    cost_list = []
    for epoch in range(epochs):
        index_shuffled = np.random.permutation(len(x))
        x = x[index_shuffled]
        y = y[index_shuffled]
        epoch_list.append(epoch)
        parameters_list.append((parameters[0], parameters[1], parameters[2]))
        cost_list.append(mse(x, y, parameters))
        for i in range(0, len(x), batch_size):
            x_index = x[i: i + batch_size]
            y_index = y[i: i + batch_size]
            cost = mse(x, y, parameters)
            gradients = 2 / batch_size * x_index.T.dot(x_index.dot(parameters) - y_index)
            parameters = parameters - (learning_rate * gradients)
            if tolerance != None and cost <= tolerance:
                return epoch_list, parameters_list, cost_list 
    return epoch_list, parameters_list, cost_list

## Function for calculating MSE

In [12]:
def mse(x, y, parameters):
    """
    Calculates and returns Mean Squared Error.
    Inputs: x (predictors' values), y (target values), parameters (parameter values).
    Output: MSE.
    """
    error = (predict(x, parameters) - y)**2
    mse = 1 / len(x) * np.sum(error, axis = 0)
    return float(mse)

## Function for prediction

In [13]:
def predict(x, parameters):
    """
    Predicts targets.
    Inputs: x (predictors' values), parameters (parameters' values).
    Output: predicted target.
    """
    return x.dot(parameters)

## Batch Gradient Descent

Since I don't know how many epochs it'll take to converge to good solution, I will set epochs to 1000000 and say the algo to stop when cost <= a certain value and it is called tolerance. By setting large number of epochs, I will let the algo to iterate enough times to converge to the expected solution.

In [14]:
epochs_bgd, parameters_bgd, costs_bgd = gradient_descent(X_train, y_train, batch_size = len(X_train), epochs = 1000000, learning_rate = 0.01, tolerance = 0.03)

In [15]:
mse(X_train, y_train, parameters_bgd[-1]), mse(X_test, y_test, parameters_bgd[-1])

(0.02999881480082053, 0.04089507329830574)

In [16]:
len(epochs_bgd)

2255

In [17]:
%store epochs_bgd parameters_bgd costs_bgd

Stored 'epochs_bgd' (list)
Stored 'parameters_bgd' (list)
Stored 'costs_bgd' (list)


## Stochastic Gradient Descent

In [34]:
epochs_sgd, parameters_sgd, costs_sgd = gradient_descent(X_train, y_train, batch_size = 1, epochs = 1000000, learning_rate = 0.01, tolerance = 0.03)

In [35]:
mse(X_train, y_train, parameters_sgd[-1]), mse(X_test, y_test, parameters_sgd[-1])

(0.030134784567817922, 0.04194576586559235)

In [36]:
len(epochs_sgd)

11

In [37]:
%store epochs_sgd parameters_sgd costs_sgd

Stored 'epochs_sgd' (list)
Stored 'parameters_sgd' (list)
Stored 'costs_sgd' (list)


## Minibatch Stochastic Gradient Descent

In [38]:
epochs_mbsgd, parameters_mbsgd, costs_mbsgd = gradient_descent(X_train, y_train, batch_size = 8, epochs = 10000000, learning_rate = 0.01, tolerance = 0.03)

In [39]:
mse(X_train, y_train, parameters_mbsgd[-1]), mse(X_test, y_test, parameters_mbsgd[-1])

(0.030021958747444896, 0.04034297412385732)

In [40]:
len(epochs_mbsgd)

106

In [42]:
%store epochs_mbsgd parameters_mbsgd costs_mbsgd

Stored 'epochs_mbsgd' (list)
Stored 'parameters_mbsgd' (list)
Stored 'costs_mbsgd' (list)
