# Part 1.1 - Optimizing test accuracy as a function of learning rate and maximum iterations

In [5]:
import warnings
warnings.filterwarnings('ignore')

# Importing standard modules
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
import lr

In [6]:
# Importing csv data
train = pd.read_csv('../data/diabetes/diabetes_train.csv')
test = pd.read_csv('../data/diabetes/diabetes_test.csv')
valid = pd.read_csv('../data/diabetes/diabetes_val.csv')
all_data = pd.concat([train, test, valid], axis=0)

In [7]:
# Function to find convergent solution of gradient descent as a function of learning-rate and maximum iterations

def optimize(data, max_iter, learning_rate, pred_data):
    """
    Optimize a logisitic regression model's weights using gradient descent.
    This function makes it easier to test different combinations of the
    maximum iterations and learning rate parameters. 
    data:           [pd.DataFrame]  Training or Training+Validation data
    max_iter:       [int]           Maximum Iterations of gradient descent
    learning_rate:  [float]
    pred_data:      [pd.DataFrame]
    returns array of class probabilities
    """
    A = []

    # Input data, features and binary labels column
    Xin = data.drop('Outcome',axis=1).to_numpy()
    Yin = data['Outcome'].to_numpy()
    
    # Prediction data, validation set
    Xp = pred_data.drop('Outcome',axis=1).to_numpy()
    Yp = pred_data['Outcome'].to_numpy().ravel()
    
    # Iterate through the input parameters
    for l in learning_rate:
        
        for m in max_iter:
            
            print(f'LEARNING RATE: {l} \n')
            
            model = lr.LogisticRegression(verbose=True, add_bias=True, learning_rate=l, max_iters=m)
            yh = model.fit(Xin,Yin).predict(Xp)

            T = []
            T.append(yh)
        
            # Decision Boundary
            prediction = []
            for x in np.array(T).ravel():
                if x < 0.5: prediction.append(0)
                else: prediction.append(1)

            print(T)
            #print(prediction)
            #print(Yp)
            A.append(accuracy_score(Yp,np.array(prediction).ravel()))
            print("Accuracy Score:", accuracy_score(Yp,np.array(prediction).ravel()))
    
    print(f'Maximum Accuracy achieved: {max(A)}')



In [8]:
# Best set of parameters so far:
m = [1e5, 5e5]
l = [2e-4, 5e-4]

optimize(data=train, max_iter=m, learning_rate=l, pred_data=test)

LEARNING RATE: 0.0002 

100000 Iterations
Norm of gradient: 0.032481313341255064

Weights: [ 0.13041944  0.01501208 -0.02859516 -0.00134487  0.00140752  0.0009135
  0.12273586 -0.01057542 -0.67602891]

[array([0.22659705, 0.72690719, 0.12583064, 0.3864539 , 0.34937549,
       0.21811135, 0.35798839, 0.47084378, 0.55396846, 0.61350552,
       0.53904454, 0.43609593, 0.23355464, 0.15264202, 0.2462541 ,
       0.53616467, 0.68464705, 0.24263728, 0.23450751, 0.37597622,
       0.30324803, 0.51924952, 0.41800686, 0.33649247, 0.54426409,
       0.3417733 , 0.17492912, 0.33897865, 0.22201489, 0.27160167,
       0.18310951, 0.32863758, 0.17256134, 0.24543173, 0.21742111,
       0.27112872, 0.45277014, 0.47662885, 0.30831409, 0.56214965,
       0.34283293, 0.52443952, 0.16164615, 0.39526411, 0.39721835,
       0.23469553, 0.66927167, 0.30174203, 0.53025539, 0.62687566,
       0.26725662, 0.44597562, 0.25110384, 0.33631461, 0.52795333,
       0.24076667, 0.30019527, 0.34907948, 0.35240309, 0.361

In [5]:
# This function determines the accuracy the Sklearn logisitic classifier can achieve

def compareSK(pred_data):
    
    # Input data, features and binary labels column
    Xin = train.drop('Outcome',axis=1).to_numpy()
    Yin = train['Outcome'].to_numpy()
    
    # Prediction data, validation set
    Xp = pred_data.drop('Outcome',axis=1).to_numpy()
    Yp = pred_data['Outcome'].to_numpy().ravel()
    
    model = SGDClassifier(max_iter=1e6, alpha=0.0002)
    yh = model.fit(Xin,Yin).predict(Xp)
    
    T = []
    T.append(yh)
    print(T)
    
    # Decision Boundary
    prediction = []
    for x in np.array(T).ravel():
        if x < 0.5: prediction.append(0)
        else: prediction.append(1)
            
    print(Yp)
    
    print("Accuracy Score:", accuracy_score(Yp,np.array(prediction).ravel()))

In [6]:
compareSK(pred_data=test)

[array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0])]
[0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0
 0 0 1 1 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 1 0 1 1 0 1]
Accuracy Score: 0.6911764705882353


## Results & Remarks

* Our base model preformed better than the Sklearn SGD logisitic classifier with a margin greater than 10%

* Decision Boundary is set at X=0.5: $$P(y=1|X) < 0.5 \rightarrow \hat{y}=0$$ $$P(y=1|X) \geq 0.5 \rightarrow \hat{y}= 1$$ 


* Best accuracy achieved on the test set was 77.9% with $\alpha = 2 * 10^{-4}$ and $i_{max} = 1 * 10^6$

* Output (including weights and magnitude of gradient vector):

```
                    LEARNING RATE: 2e-4
                    
                    1000000 Iterations
                    Norm of gradient: 0.013429351710347736

                    Weights: [ 1.29152029e-01  2.50264215e-02 -1.86834650e-02 -2.52562870e-03
                      4.12691159e-04  4.13164885e-02  5.23867127e-01  6.70382786e-04
                     -4.46242101e+00]

                    Accuracy Score: 0.7794117647058824
                    
                    -----------------------------------------------------------------------------
                    
                    LEARNING RATE: 9e-05 

                    5000000 Iterations
                    Norm of gradient: 0.005435090157130823

                    Weights: [ 1.36309194e-01  3.11128368e-02 -1.48346749e-02 -3.04513818e-03
                     -8.58120779e-05  6.64128026e-02  7.09122848e-01  6.22651319e-03
                     -6.61628338e+00]

                    Accuracy Score: 0.7794117647058824
```

# Part 1.2 - Implementing mini-batch stochastic gradient descent

- Mini-batch gradient descent relies on splitting the training data into batches and running the gradient descent algorithm on each set of data

In [4]:
def BatchData(data: pd.DataFrame, size: int, b=0, with_num=False, split=False):
    """
    Splits the data into specific batches. Allows user to batch by features and labels
    or batch the entire dataset. Also, allows the user to have a certain number of batches
    and not batch the entire data set.
    
    data:     [pd.DataFrame]  This is the training data as input
    size:     [int]           Size of batches in units of data points
    b:        [int]           Indicates how many batches of the data will be returned
    with_num: [bool]          Lets user decide how many batches it should generate
    
    returns batched data
    """
    if split:
        # Split the data into features and labels
        X = data.drop('Outcome', axis=1)
        Y = data['Outcome']

        # Iterate through data and split it based on batch size and number of batches needed
        # Function can handle iterating through the entire dataset or for certain number of batches
        if with_num:
            for x in range(0, b*size, size):
                yield X[x : min(x + size, b*size)], Y[x : min(x + size, b*size)]

        else:      
            for x in range(0, len(data), size):
                yield X[x : min(x + size, len(data))], Y[x : min(x + size, len(data))]
                           
    else:
        if with_num:
            for x in range(0, b*size, size):
                yield data[x : min(x + size, b*size)]

        else:      
            for x in range(0, len(data), size):
                yield data[x : min(x + size, len(data))]

In [None]:
# Test batching for entire data set
for x in BatchData(train, 2):
    print(f'{x}\n\n')

In [10]:
# Test batching for discrete batches
for batch in BatchData(train, 32,split=False):
    print(f'{batch}\n\n')

    Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0             5      144             82             26      285  32.0   
1             0      128             68             19      180  30.5   
2             9      156             86             28      155  34.3   
3             1      144             82             46      180  46.1   
4             0      179             90             27        0  44.1   
5             1      136             74             50      204  37.4   
6            13      104             72              0        0  31.2   
7             2      125             60             20      140  33.8   
8             1       95             82             25      180  35.0   
9             4      184             78             39      277  37.0   
10            7      103             66             32        0  39.1   
11            8      155             62             26      495  34.0   
12            2      122             60            

In [25]:
def miniBatchGD(data, BatchSize, epochs=1, All_data=True, NumBatches=0):
    """
    Run gradient descent optimization in mini-batches
    
    data:       [pd.DataFrame]  Input data
    BatchSize:  [int]           Size of mini-batch
    epoch:      [int]           Number of iterations over the batches
    NumBatches: [int]           How many batches it should generate
    All_data:   [bool]          If the user wants to batch the entire dataset
    
    returns weights and accuracy on testing set with mini-batch
    
    """
    if All_data:
        for epoch in range(1, epochs+1):
            print(f'Epoch: {epoch}')
            for batch in BatchData(data, BatchSize, split=False):
                optimize(data=batch, max_iter=[1e6], learning_rate=[2e-4], pred_data=test)
            
        

In [None]:
miniBatchGD(data=train, BatchSize=75)