# Part 1.1 - Optimizing test accuracy as a function of learning rate and maximum iterations

In [1]:
import warnings
warnings.filterwarnings('ignore')

# Importing standard modules
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
import lr

In [11]:
# Read the pre-split diabetes CSVs (train / test / validation) in one pass,
# then stack them row-wise into a single frame holding every sample.
train, test, valid = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/diabetes/diabetes_train.csv',
        '../data/diabetes/diabetes_test.csv',
        '../data/diabetes/diabetes_val.csv',
    )
)
all_data = pd.concat([train, test, valid], axis=0)

In [3]:
# Function to find convergent solution of gradient descent as a function of learning-rate and maximum iterations

def optimize(max_iter, learning_rate, pred_data):
    """
    Fit the custom logistic regression for every combination of learning rate
    and iteration cap, printing the accuracy obtained on `pred_data`.

    max_iter:      iterable of values forwarded as `max_iters` to lr.LogisticRegression
    learning_rate: iterable of values forwarded as `learning_rate`
    pred_data:     DataFrame with an 'Outcome' label column, used for scoring

    Training always uses the notebook-level `train` frame. Nothing is returned;
    results are only printed.
    """
    label_col = 'Outcome'

    # Training split: feature matrix and binary labels
    features_fit = train.drop(label_col, axis=1).to_numpy()
    labels_fit = train[label_col].to_numpy()

    # Evaluation split: feature matrix and flattened ground-truth labels
    features_eval = pred_data.drop(label_col, axis=1).to_numpy()
    labels_eval = pred_data[label_col].to_numpy().ravel()

    # Outer loop over learning rates, inner loop over iteration caps
    for rate in learning_rate:
        for iters in max_iter:
            print(f'LEARNING RATE: {rate} \n')
            classifier = lr.LogisticRegression(
                verbose=True, add_bias=True, learning_rate=rate, max_iters=iters
            )
            # presumably predict() returns P(y=1|x) per row -- confirm against lr.py
            raw_scores = classifier.fit(features_fit, labels_fit).predict(features_eval)
            flat_scores = np.array([raw_scores]).ravel()

            # Decision boundary: scores below 0.5 become class 0, everything else class 1
            hard_labels = np.array([0 if score < 0.5 else 1 for score in flat_scores]).ravel()

            print("Accuracy Score:", accuracy_score(labels_eval, hard_labels))

In [None]:
# Best set of parameters so far:
#   m -> candidate iteration caps, passed to optimize() as max_iter
#   l -> candidate learning rates, passed to optimize() as learning_rate
# NOTE(review): both lists hold floats (1e5 is a float, not an int) -- confirm that
# lr.LogisticRegression accepts a float for max_iters.
m = [1e5, 5e5]
l = [2e-4, 5e-4]

# Evaluate every (learning rate, iteration cap) combination on the test split
optimize(max_iter=m, learning_rate=l, pred_data=test)

In [5]:
# This function measures the accuracy that scikit-learn's SGDClassifier achieves when trained on the same training set

def compareSK(pred_data, loss="hinge", random_state=None):
    """
    Fit scikit-learn's SGDClassifier on the training set and report its accuracy.

    pred_data:    [pd.DataFrame] Evaluation data containing an 'Outcome' column
    loss:         [str]          Loss forwarded to SGDClassifier. The default,
                                 "hinge", is scikit-learn's own default and trains
                                 a linear SVM; pass "log_loss" ("log" on
                                 scikit-learn < 1.1) to get logistic regression.
    random_state: [int or None]  Seed forwarded to SGDClassifier. None (default)
                                 leaves the run unseeded, so results can vary.

    Prints the predicted labels, the true labels and the accuracy score.
    Returns nothing. Training always uses the notebook-level `train` frame.
    """
    # Input data, features and binary labels column
    Xin = train.drop('Outcome', axis=1).to_numpy()
    Yin = train['Outcome'].to_numpy()

    # Prediction data, features and ground-truth labels
    Xp = pred_data.drop('Outcome', axis=1).to_numpy()
    Yp = pred_data['Outcome'].to_numpy().ravel()

    # max_iter must be an integer: scikit-learn's parameter validation rejects
    # the float 1e6 on recent releases.
    model = SGDClassifier(max_iter=1_000_000, alpha=0.0002, loss=loss, random_state=random_state)
    yh = model.fit(Xin, Yin).predict(Xp)
    print(yh)

    # predict() already returns class labels; the 0.5 threshold is kept so the
    # scored labels are mapped onto {0, 1} exactly as before.
    prediction = np.where(np.asarray(yh).ravel() < 0.5, 0, 1)

    print(Yp)

    print("Accuracy Score:", accuracy_score(Yp, prediction))

In [6]:
# Baseline: score scikit-learn's SGDClassifier on the same test split
compareSK(pred_data=test)

[array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0])]
[0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0
 0 0 1 1 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 1 0 1 1 0 1]
Accuracy Score: 0.6911764705882353


## Results & Remarks

* Our base model performed better than the Sklearn `SGDClassifier` baseline on the test set: 77.9% vs. 69.1% accuracy, a gap of about 8.8 percentage points (roughly 13% in relative terms)

* The decision threshold is set at a predicted probability of 0.5: $$P(y=1|X) < 0.5 \rightarrow \hat{y}=0$$ $$P(y=1|X) \geq 0.5 \rightarrow \hat{y}= 1$$ 


* The best accuracy achieved on the test set was 77.9%, with learning rate $\alpha = 2 \times 10^{-4}$ and maximum iterations $i_{max} = 1 \times 10^{6}$

* Output (including weights and magnitude of gradient vector):

```
                    LEARNING RATE: 2e-4
                    
                    1000000 Iterations
                    Norm of gradient: 0.013429351710347736

                    Weights: [ 1.29152029e-01  2.50264215e-02 -1.86834650e-02 -2.52562870e-03
                      4.12691159e-04  4.13164885e-02  5.23867127e-01  6.70382786e-04
                     -4.46242101e+00]

                    Accuracy Score: 0.7794117647058824
                    
                    -----------------------------------------------------------------------------
                    
                    LEARNING RATE: 9e-05 

                    5000000 Iterations
                    Norm of gradient: 0.005435090157130823

                    Weights: [ 1.36309194e-01  3.11128368e-02 -1.48346749e-02 -3.04513818e-03
                     -8.58120779e-05  6.64128026e-02  7.09122848e-01  6.22651319e-03
                     -6.61628338e+00]

                    Accuracy Score: 0.7794117647058824
```

# Part 1.2 - Implementing mini-batch stochastic gradient descent

- Mini-batch gradient descent splits the training data into small batches and performs one gradient-descent update per batch, rather than computing each update from the full training set

In [51]:
def BatchData(data: pd.DataFrame, size: int, b=0, with_num=False):
    """
    Split the input data into consecutive batches of (features, labels).

    data:     [pd.DataFrame] Training data; must contain an 'Outcome' column
    size:     [int]          Size of batches in units of data points (must be > 0)
    b:        [int]          Number of batches to return; only used when
                             with_num is True
    with_num: [bool]         If True, stop after at most `b` batches; if False
                             (default), iterate through the entire dataset

    Yields (X, Y) tuples where X holds the feature columns and Y the 'Outcome'
    column for the same rows. The final batch may hold fewer than `size` rows.
    Raises ValueError if `size` is not positive.
    """
    if size <= 0:
        raise ValueError(f"size must be a positive integer, got {size!r}")

    # Split the data into features and labels
    X = data.drop('Outcome', axis=1)
    Y = data['Outcome']

    # Never read past the end of the data: asking for more batches than exist
    # would otherwise yield empty trailing batches.
    n_rows = len(data)
    stop = min(b * size, n_rows) if with_num else n_rows

    # .iloc makes the slicing explicitly positional, independent of the index labels
    for start in range(0, stop, size):
        end = min(start + size, stop)
        yield X.iloc[start:end], Y.iloc[start:end]

In [49]:
# Test batching for entire data set
# With size=2 and with_num left at False, BatchData walks all of `train` two rows
# at a time; each x is an (X, Y) tuple, so every batch in the dataset is printed.
for x in BatchData(train, 2):
    print(f'{x}\n\n')

(   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            5      144             82             26      285  32.0   
1            0      128             68             19      180  30.5   

   DiabetesPedigreeFunction  Age  
0                     0.452   58  
1                     1.391   25  , 0    1
1    1
Name: Outcome, dtype: int64)


(   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
2            9      156             86             28      155  34.3   
3            1      144             82             46      180  46.1   

   DiabetesPedigreeFunction  Age  
2                     1.189   42  
3                     0.335   46  , 2    1
3    1
Name: Outcome, dtype: int64)


(   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
4            0      179             90             27        0  44.1   
5            1      136             74             50      204  37.4   

   DiabetesPedigreeFunction  Age  
4        


(     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
218            6      151             62             31      120  35.5   
219            9      165             88              0        0  30.4   

     DiabetesPedigreeFunction  Age  
218                     0.692   28  
219                     0.302   49  , 218    0
219    1
Name: Outcome, dtype: int64)


(     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
220            2      109             92              0        0  42.7   
221            3      150             76              0        0  21.0   

     DiabetesPedigreeFunction  Age  
220                     0.845   54  
221                     0.207   37  , 220    0
221    0
Name: Outcome, dtype: int64)


(     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
222            4      154             62             31      284  32.8   
223            8      133             72              0        0  32.9   

     


(     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
388            3       99             62             19       74  21.8   
389            1      122             64             32      156  35.1   

     DiabetesPedigreeFunction  Age  
388                     0.279   26  
389                     0.692   30  , 388    0
389    1
Name: Outcome, dtype: int64)


(     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
390            1       79             80             25       37  25.4   
391            3      173             82             48      465  38.4   

     DiabetesPedigreeFunction  Age  
390                     0.583   22  
391                     2.137   25  , 390    0
391    1
Name: Outcome, dtype: int64)


(     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
392            5       96             74             18       67  33.6   
393            5      158             70              0        0  29.8   

     


(     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
576            0      101             65             28        0  24.6   
577            1       77             56             30       56  33.3   

     DiabetesPedigreeFunction  Age  
576                     0.237   22  
577                     1.251   24  , 576    0
577    0
Name: Outcome, dtype: int64)


(     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
578            1      117             88             24      145  34.5   
579            2       88             58             26       16  28.4   

     DiabetesPedigreeFunction  Age  
578                     0.403   40  
579                     0.766   22  , 578    1
579    0
Name: Outcome, dtype: int64)


(     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
580            3      111             58             31       44  29.5   
581            2      155             74             17       96  26.6   

     

In [50]:
# Test batching for discrete batches
# with_num=True limits the generator to b=2 batches of 3 rows each (rows 0-5 of `train`)
for x in BatchData(train, 3, b=2, with_num=True):
    print(f'{x}\n\n')

(   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            5      144             82             26      285  32.0   
1            0      128             68             19      180  30.5   
2            9      156             86             28      155  34.3   

   DiabetesPedigreeFunction  Age  
0                     0.452   58  
1                     1.391   25  
2                     1.189   42  , 0    1
1    1
2    1
Name: Outcome, dtype: int64)


(   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
3            1      144             82             46      180  46.1   
4            0      179             90             27        0  44.1   
5            1      136             74             50      204  37.4   

   DiabetesPedigreeFunction  Age  
3                     0.335   46  
4                     0.686   23  
5                     0.399   24  , 3    1
4    1
5    0
Name: Outcome, dtype: int64)


