# Part 1.1 - Optimizing test accuracy as a function of learning rate and maximum iterations

In [1]:
import warnings
warnings.filterwarnings('ignore')

# Importing standard modules
import numpy as np
import pandas as pd
import lr
import lr_m        # momentum version

# SKlearn
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

In [2]:
# Load the three diabetes splits (train / test / validation) from CSV, in that order
train, test, valid = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/diabetes/diabetes_train.csv',
        '../data/diabetes/diabetes_test.csv',
        '../data/diabetes/diabetes_val.csv',
    )
)

# Row-wise concatenation of all three splits
all_data = pd.concat([train, test, valid], axis=0)

In [3]:
# Function to find convergent solution of gradient descent as a function of learning-rate and maximum iterations

def optimize(data, max_iter, learning_rate, pred_data):
    """
    Fit a logistic regression model with gradient descent for every
    combination of learning rate and maximum iteration count, and report the
    accuracy of each fit on `pred_data`.
    This function makes it easier to test different combinations of the
    maximum iterations and learning rate parameters.

    data:           [pd.DataFrame]       Training or Training+Validation data;
                                         must contain the label column 'Outcome'
    max_iter:       [int or iterable]    Maximum iterations of gradient descent
                                         (a single value or several to try)
    learning_rate:  [float or iterable]  Learning rate (a single value or several to try)
    pred_data:      [pd.DataFrame]       Data to score; must contain 'Outcome'

    Prints the accuracy of every combination followed by the best accuracy.
    Returns nothing.
    Raises ValueError if `max_iter` or `learning_rate` contains no values.
    """
    # Accept a bare scalar as well as a list of candidates
    learning_rate = [learning_rate] if np.isscalar(learning_rate) else list(learning_rate)
    max_iter = [max_iter] if np.isscalar(max_iter) else list(max_iter)
    if not learning_rate or not max_iter:
        raise ValueError('learning_rate and max_iter must each contain at least one value')

    # Input data, features and binary labels column
    Xin = data.drop('Outcome',axis=1).to_numpy()
    Yin = data['Outcome'].to_numpy()

    # Prediction data, features and true labels
    Xp = pred_data.drop('Outcome',axis=1).to_numpy()
    Yp = pred_data['Outcome'].to_numpy().ravel()

    accuracies = []

    # Iterate through the input parameters
    for rate in learning_rate:
        for iters in max_iter:

            print(f'LEARNING RATE: {rate} \n')

            model = lr.LogisticRegression(verbose=True, add_bias=True, learning_rate=rate, max_iters=iters)
            # assumes predict() returns P(y=1|x) for each row -- TODO confirm against lr.py
            yh = np.asarray(model.fit(Xin,Yin).predict(Xp)).ravel()

            # Decision Boundary: values below 0.5 become class 0, everything else class 1
            prediction = np.where(yh < 0.5, 0, 1)

            # Score once and reuse the value for both bookkeeping and printing
            score = accuracy_score(Yp, prediction)
            accuracies.append(score)
            print("Accuracy Score:", score)

    print(f'Maximum Accuracy achieved: {max(accuracies)}')



In [4]:
# Best set of parameters so far (each list is a grid that optimize() loops over):
# m holds the maximum-iteration candidates, l the learning-rate candidates
m = [1e6] # leftover alternative value: 3e5
l = [2e-4]

# Fit on the training split and report accuracy on the test split
optimize(data=train, max_iter=m, learning_rate=l, pred_data=test)

LEARNING RATE: 0.0002 

1000000 Iterations
Norm of gradient: 0.013429351710347736

Weights: [ 1.29152029e-01  2.50264215e-02 -1.86834650e-02 -2.52562870e-03
  4.12691159e-04  4.13164885e-02  5.23867127e-01  6.70382786e-04
 -4.46242101e+00]

Accuracy Score: 0.7794117647058824
Maximum Accuracy achieved: 0.7794117647058824


In [5]:
# This function determines the accuracy that scikit-learn's SGDClassifier (used here as the logistic-classifier baseline) can achieve on pred_data

def compareSK(pred_data):
    """
    Fit scikit-learn's SGDClassifier on the global `train` frame and print its
    accuracy on `pred_data`.

    pred_data:  [pd.DataFrame]  Data to score; must contain the 'Outcome' column

    Prints the predicted labels, the true labels and the accuracy score.
    Returns nothing.
    """
    # Input data, features and binary labels column
    Xin = train.drop('Outcome',axis=1).to_numpy()
    Yin = train['Outcome'].to_numpy()

    # Prediction data, features and true labels
    Xp = pred_data.drop('Outcome',axis=1).to_numpy()
    Yp = pred_data['Outcome'].to_numpy().ravel()

    # max_iter must be an integer: recent scikit-learn releases validate
    # constructor parameters and reject the float 1e6.
    # NOTE(review): with its default loss SGDClassifier is a hinge-loss linear
    # SVM rather than logistic regression, and `alpha` is the regularisation
    # strength, not a learning rate -- confirm this is the intended baseline.
    model = SGDClassifier(max_iter=1_000_000, alpha=0.0002)
    yh = model.fit(Xin,Yin).predict(Xp)

    # Printed wrapped in a list, as a one-element list holding the label array
    T = [yh]
    print(T)

    # Decision Boundary: predict() already returns class labels, so this
    # threshold leaves 0/1 labels unchanged
    prediction = np.where(np.asarray(yh).ravel() < 0.5, 0, 1)

    print(Yp)

    print("Accuracy Score:", accuracy_score(Yp, prediction))

In [6]:
# Score the scikit-learn baseline (fitted on `train` inside compareSK) on the test split
compareSK(pred_data=test)

[array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0])]
[0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0
 0 0 1 1 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 1 0 1 1 0 1]
Accuracy Score: 0.6911764705882353


## Results & Remarks

* Our base model performed better than the Sklearn SGD logistic classifier: 77.9% vs. 69.1% test accuracy, a margin of about 8.8 percentage points (greater than 10% in relative terms)

* Decision boundary is set at $P(y=1|X) = 0.5$: $$P(y=1|X) < 0.5 \rightarrow \hat{y}=0$$ $$P(y=1|X) \geq 0.5 \rightarrow \hat{y}= 1$$ 


* Best accuracy achieved on the test set was 77.9% with $\alpha = 2 \times 10^{-4}$ and $i_{max} = 1 \times 10^6$

* Output (including weights and magnitude of gradient vector):

```
                    LEARNING RATE: 2e-4
                    
                    1000000 Iterations
                    Norm of gradient: 0.013429351710347736

                    Weights: [ 1.29152029e-01  2.50264215e-02 -1.86834650e-02 -2.52562870e-03
                      4.12691159e-04  4.13164885e-02  5.23867127e-01  6.70382786e-04
                     -4.46242101e+00]

                    Accuracy Score: 0.7794117647058824
                    
                    -----------------------------------------------------------------------------
                    
                    LEARNING RATE: 9e-05 

                    5000000 Iterations
                    Norm of gradient: 0.005435090157130823

                    Weights: [ 1.36309194e-01  3.11128368e-02 -1.48346749e-02 -3.04513818e-03
                     -8.58120779e-05  6.64128026e-02  7.09122848e-01  6.22651319e-03
                     -6.61628338e+00]

                    Accuracy Score: 0.7794117647058824
```

# Part 1.2 - Implementing mini-batch stochastic gradient descent

- Mini-batch gradient descent relies on splitting the training data into batches and running the gradient descent algorithm on each set of data

In [41]:
def BatchData(data: pd.DataFrame, size: int, b=0, with_num=False, split=False):
    """
    Lazily splits the data into consecutive batches of `size` rows. Allows user
    to batch by features and labels or batch the entire dataset. Also, allows
    the user to have a certain number of batches and not batch the entire data set.
    The last batch may hold fewer than `size` rows; empty batches are never yielded.

    data:     [pd.DataFrame]  This is the training data as input
    size:     [int]           Size of batches in units of data points (must be > 0)
    b:        [int]           Indicates how many batches of the data will be returned;
                              only used when with_num is True
    with_num: [bool]          If True, stop after at most `b` batches instead of
                              batching the entire data set
    split:    [bool]          If True, yield (features, labels) pairs where labels is
                              the 'Outcome' column; otherwise yield whole row slices

    yields batched data
    raises ValueError (when iteration starts) if size is not positive
    """
    if size <= 0:
        raise ValueError(f'size must be a positive number of rows, got {size}')

    n_rows = len(data)
    # Never read past the end of the data: b*size may exceed the number of
    # rows, which would otherwise produce empty trailing batches.
    stop = min(b * size, n_rows) if with_num else n_rows

    for start in range(0, stop, size):
        # Positional row slice; the final batch is truncated at `stop`
        batch = data.iloc[start : min(start + size, stop)]
        if split:
            # Features are every column except the label column
            yield batch.drop('Outcome', axis=1), batch['Outcome']
        else:
            yield batch

In [None]:
# Test batching for entire data set: prints every 2-row batch of `train`
# (whole row slices, since split defaults to False)
for x in BatchData(train, 2):
    print(f'{x}\n\n')

In [None]:
# Test batching with a batch size of 32 rows. with_num is left at its default
# (False), so the whole of `train` is still batched rather than a fixed number
# of batches.
for batch in BatchData(train, 32,split=False):
    print(f'{batch}\n\n')

In [5]:
def miniBatchGD(data, BatchSize, epochs=1, All_data=True, NumBatches=0):
    """
    Run gradient descent optimization in mini-batches

    data:       [pd.DataFrame]  Input data
    BatchSize:  [int]           Size of mini-batch
    epochs:     [int]           Number of passes over the (reshuffled) batches
    All_data:   [bool]          If True, batch the entire dataset every epoch
    NumBatches: [int]           How many batches to generate per epoch; only used
                                when All_data is False

    Every batch is handed to `optimize`, which builds a new model per call and
    prints its accuracy on the global `test` frame; as far as this notebook
    shows, weights are not carried over from one batch to the next.

    Returns nothing; results are printed.
    """
    for epoch in range(1, epochs+1):
        # Reshuffle the rows so that the batches differ from epoch to epoch
        data = data.sample(frac=1)
        print(f'Epoch: {epoch}')

        if All_data:
            batches = BatchData(data, BatchSize, split=False)
        else:
            # Restrict the epoch to the first NumBatches batches of the shuffle
            batches = BatchData(data, BatchSize, b=NumBatches, with_num=True, split=False)

        for batch in batches:
            optimize(data=batch, max_iter=[1e6], learning_rate=[2e-4], pred_data=test)
            

### Note

* Mini-batch sizes < 75 rows usually yield lower accuracies than batch sizes > 75 rows
* Large mini-batch sizes outperform full gradient descent

In [53]:
# Two epochs over `train` in batches of 150 rows; each batch is fitted and scored separately
miniBatchGD(data=train, BatchSize=150, epochs=2)

Epoch: 1
LEARNING RATE: 0.0002 

1000000 Iterations
Norm of gradient: 0.015892992183576222

Weights: [ 1.20403970e-02  2.95064336e-02 -3.56167999e-02  2.06340238e-02
  1.25484151e-03  4.49286155e-02  4.90184967e-01  2.36078082e-02
 -5.09488582e+00]

Accuracy Score: 0.7205882352941176
Maximum Accuracy achieved: 0.7205882352941176
LEARNING RATE: 0.0002 

1000000 Iterations
Norm of gradient: 0.014758168905036687

Weights: [ 1.78509158e-01  1.75080633e-02 -1.82363998e-02 -1.79815919e-02
  1.13944485e-03  8.77708862e-02  4.35963845e-01 -2.97611228e-03
 -4.77279942e+00]

Accuracy Score: 0.7941176470588235
Maximum Accuracy achieved: 0.7941176470588235
LEARNING RATE: 0.0002 

1000000 Iterations
Norm of gradient: 0.01532299558079225

Weights: [ 2.17363378e-01  2.67336828e-02 -2.87669803e-03 -1.93292832e-02
  2.86537496e-03  1.62444479e-02  1.32195098e+00 -2.98533678e-02
 -4.46492917e+00]

Accuracy Score: 0.7352941176470589
Maximum Accuracy achieved: 0.7352941176470589
LEARNING RATE: 0.0002 

10

# Part 1.3: Momentum Gradient Descent

In [7]:
# Function to find convergent solution of momentum gradient descent as a function of learning-rate, maximum iterations and momentum

def optimize_Momentum(data, max_iter, learning_rate, pred_data, momentum):
    """
    Fit a logistic regression model with Momentum gradient descent for every
    combination of learning rate and maximum iteration count, and report the
    accuracy of each fit on `pred_data`.
    This function makes it easier to test different combinations of the
    maximum iterations and learning rate parameters.

    data:           [pd.DataFrame]       Training or Training+Validation data;
                                         must contain the label column 'Outcome'
    max_iter:       [int or iterable]    Maximum iterations of gradient descent
                                         (a single value or several to try)
    learning_rate:  [float or iterable]  Learning rate (a single value or several to try)
    pred_data:      [pd.DataFrame]       Data to score; must contain 'Outcome'
    momentum:       [float]              Momentum coefficient, passed unchanged to the model

    Prints the accuracy of every combination followed by the best accuracy.
    Returns nothing.
    Raises ValueError if `max_iter` or `learning_rate` contains no values.
    """
    # Accept a bare scalar as well as a list of candidates
    learning_rate = [learning_rate] if np.isscalar(learning_rate) else list(learning_rate)
    max_iter = [max_iter] if np.isscalar(max_iter) else list(max_iter)
    if not learning_rate or not max_iter:
        raise ValueError('learning_rate and max_iter must each contain at least one value')

    # Input data, features and binary labels column
    Xin = data.drop('Outcome',axis=1).to_numpy()
    Yin = data['Outcome'].to_numpy()

    # Prediction data, features and true labels
    Xp = pred_data.drop('Outcome',axis=1).to_numpy()
    Yp = pred_data['Outcome'].to_numpy().ravel()

    accuracies = []

    # Iterate through the input parameters
    for rate in learning_rate:
        for iters in max_iter:

            print(f'LEARNING RATE: {rate} \n')

            model = lr_m.LogisticRegression(verbose=True, add_bias=True, learning_rate=rate, max_iters=iters, momentum=momentum)
            # assumes predict() returns P(y=1|x) for each row -- TODO confirm against lr_m.py
            yh = np.asarray(model.fit(Xin,Yin).predict(Xp)).ravel()

            # Decision Boundary: values below 0.5 become class 0, everything else class 1
            prediction = np.where(yh < 0.5, 0, 1)

            # Score once and reuse the value for both bookkeeping and printing
            score = accuracy_score(Yp, prediction)
            accuracies.append(score)
            print("Accuracy Score:", score)

    print(f'Maximum Accuracy achieved: {max(accuracies)}')


In [12]:
# Best set of parameters so far:
# (m and l rebind the module-level grids: two maximum-iteration settings, one learning rate)
m = [1e6, 5e6]
l = [2e-3]

# Sweep the momentum coefficient over 0.0, 0.1, ..., 0.9 (np.arange excludes the stop value);
# every momentum value is run against each entry of m
for i in np.arange(0,1,0.1):
    print("Momentum value: " + str(i))
    print("-----------------------------------------------------------------------------------")
    optimize_Momentum(data=train, max_iter=m, learning_rate=l, pred_data=test, momentum = i)

Momentum value: 0.0
-----------------------------------------------------------------------------------
LEARNING RATE: 0.002 

1000000 Iterations
Norm of gradient: 57.63983246854505

Weights: [ 1.04850422e+00  2.54674993e-01 -7.98565076e-02 -2.87118401e-02
  5.14672817e-02  3.20682686e-01  4.66941524e+00  4.35855919e-02
 -4.01096719e+01]

Accuracy Score: 0.5882352941176471
LEARNING RATE: 0.002 

5000000 Iterations
Norm of gradient: 49.79002716281724

Weights: [ 9.60926472e-01  2.54100178e-01 -4.91331041e-02 -3.77763898e-02
  4.25735225e-02  5.12856204e-01  4.65508263e+00  7.35102628e-02
 -4.99702377e+01]

Accuracy Score: 0.6176470588235294
Maximum Accuracy achieved: 0.6176470588235294
Momentum value: 0.1
-----------------------------------------------------------------------------------
LEARNING RATE: 0.002 

1000000 Iterations
Norm of gradient: 44.769349655508634

Weights: [ 8.25065244e-01  1.47326359e-01 -8.94335489e-02 -3.63719547e-02
 -1.24739929e-02  2.94772782e-01  3.94536449e+00