# Processing Diabetes DataSet with Logistic Regression

In [30]:
from costFunction import costFunction
from gradient import gradient
from featureNormalize import featureNormalize
from sigmoid import sigmoid

import numpy as np
import pandas as pd
import scipy.optimize as op
# used to split data in train and test sets
from sklearn.model_selection import train_test_split
# for accuracy metrics
from sklearn.metrics import accuracy_score

# precision, recall & f1 score
from sklearn.metrics import precision_recall_fscore_support

## Loading and normalizing features

In [7]:
# Load an example dataset that we will be using
file_path = '../../data/1_diabetes.csv'
df = pd.read_csv(file_path)

features = df.drop('Outcome', axis=1)
labels = df['Outcome']

X = features.as_matrix()
y = labels.as_matrix().reshape(-1, 1)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

m, n = X_train.shape

#normalise features
X_norm, mu, sigma = featureNormalize(X_train)

# add bias
X_norm = np.c_[np.ones((m, 1)), X_norm]

## Running Regularized Logistic Regression

In [41]:
# Initialize fitting parameters
initial_theta = np.zeros((n + 1, 1))

# Set regularization parameter lambda to 1
reg_lambda = 1

maxiter = 100

# Optimize
Result = op.minimize(fun = costFunction, x0 = initial_theta, args = (X_norm, y_train, reg_lambda, True), method = 'TNC', 
                     jac = gradient, options={'disp': True, 'maxiter' : maxiter})
optimal_theta = Result.x

threshold = 0.8

def predict(theta, X, threshold = 0.5):    
    return sigmoid(np.dot(X, theta)) >= threshold 

p = predict(optimal_theta, X_norm)
print('\nTraining Set Accuracy: ', accuracy_score(y_train, p) * 100, '\n')


Training Set Accuracy:  77.7611940299 



## Test

In [32]:
normalized_test_data = np.divide(X_test - mu, sigma)
p = predict(optimal_theta, np.c_[np.ones((normalized_test_data.shape[0], 1)), normalized_test_data], threshold)
print('\nTest Set Accuracy: ', accuracy_score(y_test, p) * 100, '\n')
precision,  recall,  f1_score,  support = precision_recall_fscore_support(y_test, p, average='macro')
print("F1 Score = ", f1_score)


Test Set Accuracy:  70.4545454545 

F1 Score =  0.559184680041
