# Neural Network Classification
## Diabetes data

In [1]:
# Load the diabetes data set, discard its 'row' column, then preview it.
import pandas as pd

df = pd.read_csv('data/diabetes.csv').drop(columns=['row'])

print(df.head())
print('\nDimensions of data frame:', df.shape)

   pregnant  glucose  pressure  triceps  insulin  mass  pedigree  age diabetes
0         6    148.0      72.0     35.0      NaN  33.6     0.627   50      pos
1         1     85.0      66.0     29.0      NaN  26.6     0.351   31      neg
2         8    183.0      64.0      NaN      NaN  23.3     0.672   32      pos
3         1     89.0      66.0     23.0     94.0  28.1     0.167   21      neg
4         0    137.0      40.0     35.0    168.0  43.1     2.288   33      pos

Dimensions of data frame: (768, 9)


In [2]:
# fill NAs (same code as shown in 21_data cleaning)
import numpy as np

# Each column is filled by assigning the result back to `df[...]` rather than
# calling fillna(..., inplace=True) on the `df.triceps` accessor. The latter is
# chained assignment: it acts on a temporary Series and no longer updates `df`
# once pandas Copy-on-Write is active.
# Series.mean() skips NaN, so each mean is taken over the observed values only.
# NOTE(review): both means are computed on the full data set, i.e. before the
# train/test split performed later in this notebook — confirm that is acceptable.

# fill triceps NAs with mean
tri_mean = df['triceps'].mean()
df['triceps'] = df['triceps'].fillna(tri_mean)

# fill insulin NAs with mean
insulin_mean = df['insulin'].mean()
df['insulin'] = df['insulin'].fillna(insulin_mean)

# drop remaining rows with NAs (any column that was not imputed above)
df = df.dropna()
print('\nDimensions of data frame:', df.shape)



Dimensions of data frame: (724, 9)


In [3]:
# Replace the diabetes labels with their integer category codes.
# Codes follow the sorted category order, so with the 'neg'/'pos' labels seen in
# the preview of the loaded data this should give neg -> 0 and pos -> 1.
df['diabetes'] = df['diabetes'].astype('category').cat.codes

In [4]:
# Split predictors and target into training and test sets (80% / 20%).
from sklearn.model_selection import train_test_split

# Predictors are the columns at positions 0-6; the target is the column at position 8.
# NOTE(review): the slice stops before position 7, so that column ('age' in the
# preview printed after loading) is used neither as predictor nor as target —
# confirm the omission is intended.
X, y = df.iloc[:, :7], df.iloc[:, 8]

# A fixed random_state keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

print('train size:', X_train.shape)
print('test size:', X_test.shape)

train size: (579, 7)
test size: (145, 7)


### Baseline: logistic regression

In [5]:
from sklearn.linear_model import LogisticRegression

# Baseline model, trained on the unscaled training features.
# fit() returns the estimator itself, so the call can be chained.
clf = LogisticRegression(max_iter=500).fit(X_train, y_train)
clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [6]:
# Predict class labels for the test rows with the model currently bound to `clf`.
pred = clf.predict(X=X_test)

In [7]:
# Accuracy and confusion matrix of `pred` against the true test labels.
from sklearn.metrics import confusion_matrix, accuracy_score

print('accuracy = ', accuracy_score(y_true=y_test, y_pred=pred))

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=pred)

accuracy =  0.8620689655172413


array([[100,   6],
       [ 14,  25]])

### Compare to Neural Network

In [8]:
# Standardize the features. The scaler is fitted on the training split only and
# the same fitted transformation is then applied to both splits.
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Train a multi-layer perceptron with two hidden layers (5 and 2 units) on the
# scaled training data. This rebinds `clf`, replacing the previous model.
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(
    solver='lbfgs',
    hidden_layer_sizes=(5, 2),
    max_iter=500,
    random_state=1234,
)
clf.fit(X_train_scaled, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5, 2), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=500,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1234, shuffle=True, solver='lbfgs',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [10]:
# Predict class labels from the scaled test features with the model bound to `clf`.
pred = clf.predict(X=X_test_scaled)

In [11]:
# Accuracy and confusion matrix of `pred` against the true test labels.
print('accuracy = ', accuracy_score(y_true=y_test, y_pred=pred))

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=pred)

accuracy =  0.8482758620689655


array([[95, 11],
       [11, 28]])

In [12]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the current predictions.
print(classification_report(y_true=y_test, y_pred=pred))

              precision    recall  f1-score   support

           0       0.90      0.90      0.90       106
           1       0.72      0.72      0.72        39

    accuracy                           0.85       145
   macro avg       0.81      0.81      0.81       145
weighted avg       0.85      0.85      0.85       145



In [13]:
# Try different settings: SGD solver, one hidden layer of 3 units, and a higher
# iteration limit. This rebinds `clf` again.
clf = MLPClassifier(
    solver='sgd',
    hidden_layer_sizes=(3,),
    max_iter=1500,
    random_state=1234,
)
clf.fit(X_train_scaled, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(3,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=1500,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1234, shuffle=True, solver='sgd',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [14]:
# Predict on the scaled test features with the model bound to `clf`.
pred = clf.predict(X=X_test_scaled)

print('accuracy = ', accuracy_score(y_true=y_test, y_pred=pred))

# Confusion matrix: rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=pred)

accuracy =  0.8620689655172413


array([[98,  8],
       [12, 27]])

In [15]:


# Per-class precision, recall and F1 for the current predictions.
print(classification_report(y_true=y_test, y_pred=pred))

              precision    recall  f1-score   support

           0       0.89      0.92      0.91       106
           1       0.77      0.69      0.73        39

    accuracy                           0.86       145
   macro avg       0.83      0.81      0.82       145
weighted avg       0.86      0.86      0.86       145

