# Diabetes Project Using Regression, Decision Tree and Neural Network
### Author: Xiao, Hanlin @ NTU MSBA July 2020

In [27]:
# data processing
import numpy as np
import pandas as pd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('seaborn')

# regression and decision tree
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix

# neural network
from keras import backend as K
from keras.models import Input, Model
from keras.layers import Dense, Dropout
from keras.models import Model, Sequential

# ignore the warning
import warnings
warnings.filterwarnings("ignore")

import random
random.seed(233)

## 1. Data Cleaning

Now standardize the data - Age, Diastolic blood pressure

In [37]:
df_copy.dtypes

Number of times pregnant                     int64
Plasma glucose concentration a 2 hours       int64
Diastolic blood pressure                     int64
Triceps skin fold thickness                  int64
2-Hour serum insulin                         int64
Body mass index                            float64
Diabetes pedigree function                 float64
Age                                          int64
Class                                        uint8
dtype: object

In [2]:
df = pd.read_csv('/Users/cirean/Desktop/NTU/Diabetes_Project/Diabetes.csv')
df_copy = df.copy()
df_copy['Class'] = pd.get_dummies(df['Class'])['tested_positive']
df_copy.head()

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours,Diastolic blood pressure,Triceps skin fold thickness,2-Hour serum insulin,Body mass index,Diabetes pedigree function,Age,Class
0,6,148,72,35,0,33.6,0.627,30,1
1,1,85,66,29,0,26.6,0.351,45,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,58,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
errors = []
for i in df_copy['Age']:
    try:
        i = int(i)
    except:
        errors.append(i)
print(errors)

['22yo', 'twenty']


standardize 'Age'

In [5]:
df_copy['Age'] = df_copy['Age'].apply(lambda x: x.replace('22yo', '22').replace('twenty', '20')).astype(int)

In [6]:
errors = []
for i in df_copy['Diastolic blood pressure ']:
    try:
        i = int(i)
    except:
        errors.append(i)
print(errors)

['bin']


standardize Diastolic blood pressure

In [7]:
df_copy['Diastolic blood pressure '] = df['Diastolic blood pressure '].apply(lambda x: x.replace('bin', '0')).astype(int)

Now remove the outliers - Age, Diastolic blood pressure 

In [8]:
df_copy['Age'].value_counts().sort_index()

4       2
9       1
20      1
21     63
22     72
23     38
24     46
25     48
26     32
27     32
28     35
29     29
30     22
31     23
32     16
33     17
34     14
35     10
36     16
37     19
38     16
39     12
40     13
41     22
42     18
43     13
44      8
45     16
46     13
47      6
48      5
49      5
50      7
51      8
52      9
53      5
54      6
55      4
56      3
57      5
58      7
59      3
60      5
61      2
62      4
63      4
64      1
65      3
66      4
67      3
68      1
69      2
70      1
72      1
81      1
135     1
Name: Age, dtype: int64

to remove outliers 4, 9, 135

In [9]:
df_copy = df_copy[~df_copy.Age.isin([4,9,135])]

In [10]:
df_copy['Diastolic blood pressure '].astype(float).value_counts().sort_index()

0.0      36
38.0      1
40.0      1
44.0      4
46.0      2
48.0      5
50.0     13
52.0     11
54.0     11
55.0      2
56.0     12
58.0     22
60.0     37
61.0      1
62.0     34
64.0     43
65.0      7
66.0     29
67.0      1
68.0     44
70.0     57
72.0     44
74.0     52
75.0      9
76.0     39
78.0     45
80.0     39
82.0     29
84.0     23
85.0      6
86.0     21
88.0     25
90.0     22
92.0      8
94.0      6
95.0      1
96.0      4
98.0      3
100.0     3
102.0     1
104.0     2
106.0     3
108.0     2
110.0     3
114.0     1
122.0     1
321.0     1
432.0     1
623.0     1
730.0     1
Name: Diastolic blood pressure , dtype: int64

based on domain knowledge, Diastolic blood pressure is unlikely higher than 150 and lower than 30, so remove the outliers

In [11]:
df_copy = df_copy.drop(df_copy[(df_copy['Diastolic blood pressure ']>150) | (df_copy['Diastolic blood pressure ']<30)].index)

In [12]:
df_copy['Diastolic blood pressure '].astype(int).value_counts().sort_index()

38      1
40      1
44      4
46      2
48      5
50     13
52     11
54     11
55      2
56     12
58     22
60     37
61      1
62     34
64     43
65      7
66     29
67      1
68     44
70     57
72     44
74     52
75      9
76     39
78     45
80     39
82     29
84     23
85      6
86     21
88     25
90     22
92      8
94      6
95      1
96      4
98      3
100     3
102     1
104     2
106     3
108     2
110     3
114     1
122     1
Name: Diastolic blood pressure , dtype: int64

change bmi=0 observations into bmi_avg

In [13]:
bmi_avg = df_copy['Body mass index '][df_copy['Body mass index ']>0].mean()
df_copy['Body mass index '] = df_copy['Body mass index '].replace(0,bmi_avg)

In [14]:
X = df_copy.drop('Class', axis=1)
y = df_copy.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

In [15]:
scaler = preprocessing.MinMaxScaler().fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

## 2. Logistic Regression

In [16]:
m1 = LogisticRegression()
m1.fit(scaled_X_train, y_train)
y_pred = m1.predict(scaled_X_test)

In [17]:
print(m1.coef_, m1.intercept_)

[[1.93464559 4.44553065 0.11054553 0.80752159 0.50758589 2.32225265
  1.3272149  1.11119305]] [-5.06032355]


In [18]:
import statsmodels.api as sm
from scipy import stats
import statsmodels.formula.api as smf
import statsmodels.api as sm

m1_2 = sm.Logit(y_train.to_numpy(),sm.add_constant(scaled_X_train)).fit()
print(m1_2.summary())

Optimization terminated successfully.
         Current function value: inf
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  510
Model:                          Logit   Df Residuals:                      501
Method:                           MLE   Df Model:                            8
Date:                Fri, 31 Jul 2020   Pseudo R-squ.:                     inf
Time:                        02:10:31   Log-Likelihood:                   -inf
converged:                       True   LL-Null:                        0.0000
Covariance Type:            nonrobust   LLR p-value:                     1.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -7.5424      0.711    -10.611      0.000      -8.936      -6.149
x1             7.1250      1.894  

In [35]:
print('The accuracy rate of logistic regression is {}'.format(round(sum(y_pred==y_test)/len(y_test),3)))

The accuracy rate of logistic regression is 0.758


In [36]:
cm1 = confusion_matrix(y_test, y_pred)
cm1 = cm1.astype('float') / cm1.sum()
print(cm1)

[[0.60273973 0.05936073]
 [0.1826484  0.15525114]]


### Pros:
    1.The model is easy to train and interpret. The result is a intuitive linear equation
    2.Linear regression is prone to over-fitting but it can be easily avoided using some dimensionality reduction techniques, regularization techniques and cross-validation
### Cons:
    1.In the real world, the data is rarely linearly separable. It assumes that there is a straight-line relationship between the dependent and independent variables, and in this case, straight line is expected to separate the two groups, which seems doubtful
    2.Linear regression is very sensitive to outliers. Outliers should be analyzed and removed before applying linear regression to the dataset
    3.Multicollinearity should be considered because it assumes that there is no relationship among independent variables

## 3. Decision Tree

In [21]:
m2 = DecisionTreeClassifier(max_depth=2)
m2.fit(scaled_X_train, y_train)
y_pred2 = m2.predict(scaled_X_test)

In [34]:
print("The accuracy rate of decision tree is {}".format(round(sum(y_pred2==y_test)/len(y_test),3)))

The accuracy rate of decision tree is 0.721


In [23]:
cm2 = confusion_matrix(y_test, y_pred2)
cm2 = cm2.astype('float') / cm2.sum()
print(cm2)

[[0.57534247 0.08675799]
 [0.19178082 0.14611872]]


### Pros:
    1. A decision tree does not require normalization or scaling and we can directly apply the dataset to model for classification
    2. A decision tree model is very intuitive and easy to explain to audience
### Cons:
    1. For a decision tree sometimes calculation can go far more complex compared to other algorithms. Training a decision tree is not a difficult thing, but the following proning and parameter adjusting matters. On the other hand, more complex decision tree algorithms are worked out and generally we cannot just use one basic tree for work
    2. A decision tree cannot predict continuous variables
    

## 4. Neural Network

In [28]:
m3 = Sequential()
m3.add(Dense(10, input_dim = 8, activation = 'relu'))
m3.add(Dense(10, activation = 'relu'))
m3.add(Dense(1, activation = 'sigmoid'))
m3.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics= ['accuracy'])
m3.fit(scaled_X_train, y_train,epochs = 100, verbose = 1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fc04aea0410>

In [33]:
print('The accuracy rate of neural network is {}'.format(round(m3.evaluate(scaled_X_test, y_test, verbose=0)[1],3)))

The accuracy rate of neural network is 0.735


In [30]:
y_pred3 = m3.predict(scaled_X_test) > 0.5
cm3 = confusion_matrix(y_test, y_pred3.reshape(-1))
cm3 = cm3.astype('float') / cm3.sum()
print(cm3)

[[0.55707763 0.10502283]
 [0.15981735 0.17808219]]


### Pros:
    1. Neural networks are good to model with nonlinear data with large number of inputs. We can use neural network not only for regression and classification, but also for image recongnition and NLP
    2. We can dig into the hidder characteristics of the variables to achieve a more pertinent result
    3. Neural networks maybe the ML technique developed most in the past decades. Nowadays, lots of sophisticated and innovative networks are invented and can be applied to various fields
### Cons:
    1. Neural networks are black boxes, meaning we cannot know much each independent variable is influencing the dependent variables
    2. Neural networks take lots of memories when calculating and computationally very expensive (although not in this case)

# 5. Summary:
    1. Regression achieves the highest score, but I have to say that this result may not be external valid since out dataset is relatively small
    2. Decision tree doesn't have a satisfying result because most of our variables are continuous
    3. Neural network also achieves high score after hundreds of iterations. Although NN and LR both have a acceptable result, I would expect NN achieve a higher accuracy when prediction when we have more dataset and deal with more sophisticated problems