# Processing the Diabetes Dataset with a Neural Network

In [57]:
from nnCostFunction import nnCostFunction
from sigmoid import sigmoid
from sigmoidGradient import sigmoidGradient
from randInitializeWeights import randInitializeWeights
from checkNNGradients import checkNNGradients
from featureNormalize import featureNormalize


import scipy.optimize as op
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# used to split data in train and test sets
from sklearn.model_selection import train_test_split
# for accuracy metrics
from sklearn.metrics import accuracy_score
# precision, recall & f1 score
from sklearn.metrics import precision_recall_fscore_support

## Loading and normalizing features

In [2]:
# Load the diabetes dataset: every column except 'Outcome' is a feature,
# and 'Outcome' is the label column.
file_path = '../../data/1_diabetes.csv'
df = pd.read_csv(file_path)

features = df.drop('Outcome', axis=1)
labels = df['Outcome']

# DataFrame.as_matrix()/Series.as_matrix() were removed in pandas 1.0;
# to_numpy() is the supported way to obtain the underlying ndarray.
X = features.to_numpy()
y = labels.to_numpy().reshape(-1, 1)  # column vector, one label per row

# Hold out 33% of the rows for testing. The fixed random_state makes the split
# (and therefore every metric reported below) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# m = number of training examples, n = number of features
m, n = X_train.shape

# Normalize using statistics of the training split only; mu and sigma are kept
# so the test split can later be scaled with the same values.
X_norm, mu, sigma = featureNormalize(X_train)

# Prepend a column of ones (bias term) to the normalized training features
X_norm = np.c_[np.ones((m, 1)), X_norm]

## Set up the neural network parameters

In [52]:
# Layer sizes of the network.
# X_norm already carries a leading column of ones, so the input layer is
# sized n + 1 (n = number of features) rather than n.
input_layer_size = n + 1
hidden_layer_size = 30    # number of hidden units
num_labels = 2            # two output units: class 0 and class 1

# Random starting weights, one matrix per layer transition.
# The training cell reshapes the learned parameters to
# (hidden_layer_size, input_layer_size + 1) and (num_labels, hidden_layer_size + 1),
# so randInitializeWeights(L_in, L_out) presumably returns an L_out x (L_in + 1)
# array -- TODO confirm against randInitializeWeights.
initial_Theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
initial_Theta2 = randInitializeWeights(hidden_layer_size, num_labels)

# Flatten both weight matrices into the single 1-D vector handed to the optimizer
initial_nn_params = np.concatenate((initial_Theta1.ravel(), initial_Theta2.ravel()))

##  Training NN

In [75]:
print('\nTraining Neural Network... \n')

# regularization rate forwarded to nnCostFunction
reg_lambda = 0

# limit handed to the optimizer through its 'maxiter' option
max_iter = 200


def costFunc(params):
    """Evaluate nnCostFunction on the unrolled weights, asking for the cost only."""
    return nnCostFunction(params, input_layer_size, hidden_layer_size, num_labels,
                          X_norm, y_train, reg_lambda, returnOnlyCost=True)


def gradFunc(params):
    """Evaluate nnCostFunction on the unrolled weights, asking for the flattened gradient only."""
    return nnCostFunction(params, input_layer_size, hidden_layer_size, num_labels,
                          X_norm, y_train, reg_lambda, returnOnlyGrad=True, flattenResult=True)


# Minimize the cost with SciPy's TNC method, supplying the gradient explicitly.
# NOTE(review): current SciPy documents 'maxfun' (not 'maxiter') as the TNC
# evaluation limit -- confirm 'maxiter' is honored by the installed version.
Result = op.minimize(costFunc, initial_nn_params, method='TNC', jac=gradFunc,
                     options={'maxiter': max_iter, 'disp': True})

optimal_nn_params = Result.x

# The flat vector stores Theta1 first, then Theta2; each matrix has one extra
# column beyond the size of the layer feeding it.
theta1_size = hidden_layer_size * (input_layer_size + 1)
Theta1 = optimal_nn_params[:theta1_size].reshape(hidden_layer_size, input_layer_size + 1)
Theta2 = optimal_nn_params[theta1_size:].reshape(num_labels, hidden_layer_size + 1)

print('Neural Network trained successfully... \n')


Training Neural Network... 

Neural Network trained successfully... 



## Predict


In [82]:
def predict(Theta1, Theta2, X, threshold):
    """Forward-propagate X through the two weight layers and threshold the class-1 output.

    Parameters
    ----------
    Theta1 : array whose transpose is multiplied by [1, X]; it therefore needs
        X.shape[1] + 1 columns.
    Theta2 : array whose transpose is multiplied by [1, hidden activations].
    X : 2-D array with one example per row. A column of ones is prepended here,
        in addition to any bias column the caller has already included.
    threshold : cut-off compared against column 1 of the output layer.

    Returns
    -------
    1-D boolean array, True where the output layer's column 1 is >= threshold.
    """
    bias = np.ones((X.shape[0], 1))
    hidden = sigmoid(np.dot(np.column_stack((bias, X)), Theta1.T))
    output = sigmoid(np.dot(np.column_stack((bias, hidden)), Theta2.T))
    # column 0 is treated as the score for class 0 and column 1 as the score
    # for class 1; only the class-1 score is compared with the threshold
    return output[:, 1] >= threshold

# Class-1 output at or above this value is labelled diabetic (1).
# NOTE(review): 0.99 is a very strict cut-off -- confirm it is intentional.
threshold = 0.99

# Training-set predictions as an (m, 1) integer column, matching y_train's shape
p = predict(Theta1, Theta2, X_norm, threshold).astype(int).reshape(-1, 1)
print('\nTraining Set Accuracy: ', accuracy_score(y_train, p) *100, '\n')

# Scale the test features with the training mu/sigma, then prepend the same
# column of ones that was added to X_norm so both inputs share one layout.
normalized_test_data = (X_test - mu) / sigma
test_data = np.column_stack((np.ones((len(normalized_test_data), 1)), normalized_test_data))
p = predict(Theta1, Theta2, test_data, threshold).astype(int).reshape(-1, 1)
print('\nTest Set Accuracy: ', accuracy_score(y_test, p) *100, '\n')

# Macro-averaged precision / recall / F1 on the test split
precision, recall, f1_score, support = precision_recall_fscore_support(y_test, p, average='macro')
print("F1 Score = ", f1_score)


Training Set Accuracy:  67.0895522388 


Test Set Accuracy:  68.1818181818 

F1 Score =  0.44714798564
