# Part 1

## Todo v0.1

- [x] Organize data from diabetes dataset into training, validation, and testing sets
- [x] Set up logistic regression env
- [x] Initial model run

In [5]:
# Importing standard modules
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from typing import List

import logisticregression as logreg

In [2]:
train = pd.read_csv('../data/diabetes/diabetes_train.csv')
test = pd.read_csv('../data/diabetes/diabetes_test.csv')
valid = pd.read_csv('../data/diabetes/diabetes_val.csv')

train.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,5,144,82,26,285,32.0,0.452,58,1
1,0,128,68,19,180,30.5,1.391,25,1
2,9,156,86,28,155,34.3,1.189,42,1
3,1,144,82,46,180,46.1,0.335,46,1
4,0,179,90,27,0,44.1,0.686,23,1
5,1,136,74,50,204,37.4,0.399,24,0
6,13,104,72,0,0,31.2,0.465,38,1
7,2,125,60,20,140,33.8,0.088,31,0
8,1,95,82,25,180,35.0,0.233,43,1
9,4,184,78,39,277,37.0,0.264,31,1


In [3]:
frames = [train, test, valid]
all_data = pd.concat(frames, axis=0)

all_data.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,5,144,82,26,285,32.0,0.452,58,1
1,0,128,68,19,180,30.5,1.391,25,1
2,9,156,86,28,155,34.3,1.189,42,1
3,1,144,82,46,180,46.1,0.335,46,1
4,0,179,90,27,0,44.1,0.686,23,1
5,1,136,74,50,204,37.4,0.399,24,0
6,13,104,72,0,0,31.2,0.465,38,1
7,2,125,60,20,140,33.8,0.088,31,0
8,1,95,82,25,180,35.0,0.233,43,1
9,4,184,78,39,277,37.0,0.264,31,1


In [4]:
# Split all-data columns intro features (from first to before-last) and the prediction column (last column)
X = all_data.iloc[:,1:]
Y = all_data.iloc[:,0]

# Model data split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)

maxIters = [1e4, 9e4, 1e5]
learningR = [.1, .2, .3]

for l, i in zip(learningR, maxIters):
    # Define the model type
    model = logreg.LogisticRegression(verbose=True, add_bias=True,learning_rate=l,max_iters=i)
    # Target variable
    yh = model.fit(X_train,Y_train).predict(X_train)

terminated after 10000 iterations, with norm of the gradient equal to 483.9734762463032
the weight found: [354905.38645835 203902.79079864  52567.01180555 207254.08619795
  88732.84278644   1250.44168993 114465.20460069   1293.42005208
   2786.50833333]
terminated after 90000 iterations, with norm of the gradient equal to 483.9734762463032
the weight found: [6388199.6618088  3670194.47049114  946189.5791685  3730508.17240151
 1597165.68557488   22507.57782432 2060347.07587256   23281.28454857
   50156.34999995]
terminated after 100000 iterations, with norm of the gradient equal to 483.9734762463032
the weight found: [10646997.40938941  6116989.62240224  1576982.28541548  6217512.25858251
  2661942.27835531    37512.62194481  3433911.23879684    38802.13515625
    83593.9       ]


In [14]:
# Function to find convergent solution of gradient descent as a function of learning-rate and maximum iterations

def optimize(max_iter, learning_rate, data):
    
    # Input data, features and binary labels column
    Xin=data.iloc[:,1:]
    Yin=data.iloc[:,0]
    
    # Split data into training and validation data sets
    X_train, X_valid, Y_train, Y_valid = train_test_split(Xin, Yin, test_size=0.2, random_state=0)
    
    # Iterate through the input parameters
    for l in learning_rate:
        print("Learning Rate: ", l)
        for m in max_iter:
            print("Maximum Iterations: ", m)
            model = logreg.LogisticRegression(verbose=True, add_bias=True, learning_rate=l, max_iters=m)
            yh = model.fit(X_train,Y_train).predict(X_valid)


mi = [1e4,9e4,1e5]
lr = [.1,1,10,0.001]

optimize(max_iter=mi, learning_rate=lr, data=all_data)
    

Learning Rate:  0.1
Maximum Iterations:  10000.0
terminated after 10000 iterations, with norm of the gradient equal to 478.7096494282524
the weight found: [351709.65814331 201221.72475567  51518.94902281 204152.30016286
  88352.89893322   1264.98919886 113276.90325735   1285.03330619
   2768.77964169]
Maximum Iterations:  90000.0
terminated after 90000 iterations, with norm of the gradient equal to 478.7096494282524
the weight found: [3165338.3226423  1810967.65309788  463662.27149879 1837337.96791318
  795163.32238563   11384.71558325 1019478.85765578   11565.16359934
   24918.61677524]
Maximum Iterations:  100000.0
terminated after 100000 iterations, with norm of the gradient equal to 478.7096494282524
the weight found: [3517041.90570493 2012185.89414127  515180.18680831 2041486.17638187
  883514.62531718   12649.6813813  1132754.10195494   12850.17988598
   27687.34641692]
Learning Rate:  1
Maximum Iterations:  10000.0
terminated after 10000 iterations, with norm of the gradient equ