In [1]:
import pandas as pd
import numpy as np
import warnings   
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
data = pd.read_csv("Diabetes_data.csv")
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35,0,33.6,0.627,50.0,1
1,1,85.0,66.0,29,0,26.6,0.351,31.0,0
2,8,183.0,64.0,0,0,23.3,0.672,32.0,1
3,1,89.0,66.0,23,94,28.1,0.167,21.0,0
4,0,137.0,40.0,35,168,43.1,2.288,33.0,1
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48,180,32.9,0.171,63.0,0
764,2,122.0,70.0,27,0,36.8,0.340,27.0,0
765,5,121.0,72.0,23,112,26.2,0.245,30.0,0
766,1,126.0,60.0,0,0,30.1,0.349,47.0,1


In [3]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35,0,33.6,0.627,50.0,1
1,1,85.0,66.0,29,0,26.6,0.351,31.0,0
2,8,183.0,64.0,0,0,23.3,0.672,32.0,1
3,1,89.0,66.0,23,94,28.1,0.167,21.0,0
4,0,137.0,40.0,35,168,43.1,2.288,33.0,1


In [4]:
data.shape

(768, 9)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   766 non-null    float64
 2   BloodPressure             767 non-null    float64
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  767 non-null    float64
 7   Age                       767 non-null    float64
 8   Outcome                   768 non-null    int64  
dtypes: float64(5), int64(4)
memory usage: 54.1 KB


In [6]:
col_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'BMI', 'Age']


for col in col_names:
    
    print(data[col].value_counts())  

1     135
0     111
2     103
3      75
4      68
5      57
6      50
7      45
8      38
9      28
10     24
11     11
13     10
12      9
14      2
15      1
17      1
Name: Pregnancies, dtype: int64
99.0     17
100.0    17
111.0    14
129.0    14
125.0    14
         ..
191.0     1
177.0     1
44.0      1
62.0      1
190.0     1
Name: Glucose, Length: 136, dtype: int64
70.0     57
74.0     52
78.0     45
68.0     45
72.0     44
64.0     43
76.0     39
80.0     39
60.0     37
0.0      35
62.0     34
66.0     30
82.0     30
88.0     25
84.0     23
90.0     22
86.0     21
58.0     21
50.0     13
56.0     12
52.0     11
54.0     11
75.0      8
92.0      8
65.0      7
85.0      6
94.0      6
48.0      5
96.0      4
44.0      4
100.0     3
106.0     3
98.0      3
110.0     3
55.0      2
108.0     2
104.0     2
46.0      2
30.0      2
122.0     1
95.0      1
102.0     1
61.0      1
24.0      1
38.0      1
40.0      1
114.0     1
Name: BloodPressure, dtype: int64
32.0    13
31.6    12
31.2 

In [8]:
data['Pregnancies'].value_counts()

70.0     57
74.0     52
78.0     45
68.0     45
72.0     44
64.0     43
76.0     39
80.0     39
60.0     37
0.0      35
62.0     34
66.0     30
82.0     30
88.0     25
84.0     23
90.0     22
86.0     21
58.0     21
50.0     13
56.0     12
52.0     11
54.0     11
75.0      8
92.0      8
65.0      7
85.0      6
94.0      6
48.0      5
96.0      4
44.0      4
100.0     3
106.0     3
98.0      3
110.0     3
55.0      2
108.0     2
104.0     2
46.0      2
30.0      2
122.0     1
95.0      1
102.0     1
61.0      1
24.0      1
38.0      1
40.0      1
114.0     1
Name: BloodPressure, dtype: int64

In [16]:
# check missing values in variables
data.isnull().sum()

Pregnancies                 0
Glucose                     2
BloodPressure               1
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    1
Age                         1
Outcome                     0
dtype: int64

In [17]:
# Remove rows with missing values
data.dropna(inplace=True)
# Or impute missing values with the mean
data.fillna(data.mean(), inplace=True)
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35,0,33.6,0.627,50.0,1
1,1,85.0,66.0,29,0,26.6,0.351,31.0,0
2,8,183.0,64.0,0,0,23.3,0.672,32.0,1
3,1,89.0,66.0,23,94,28.1,0.167,21.0,0
4,0,137.0,40.0,35,168,43.1,2.288,33.0,1
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48,180,32.9,0.171,63.0,0
764,2,122.0,70.0,27,0,36.8,0.340,27.0,0
765,5,121.0,72.0,23,112,26.2,0.245,30.0,0
766,1,126.0,60.0,0,0,30.1,0.349,47.0,1


#### Declare feature vector and target variable

In [18]:
X = data.drop(['Pregnancies'], axis=1)

y = data['Pregnancies']

In [19]:
# split X and y into training and testing sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [20]:
# check the shape of X_train and X_test

X_train.shape, X_test.shape

((511, 8), (252, 8))

In [21]:
# check data types in X_train

X_train.dtypes

Glucose                     float64
BloodPressure               float64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                         float64
Outcome                       int64
dtype: object

In [34]:
X_train.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
546,187.0,76.0,27,207,43.6,1.034,53.0,1
612,168.0,88.0,42,321,38.2,0.787,40.0,1
636,104.0,74.0,0,0,28.8,0.153,48.0,0
210,81.0,60.0,22,0,27.7,0.29,25.0,0
413,143.0,74.0,22,61,26.2,0.256,21.0,0


In [40]:
# import category encoders

import category_encoders as ce
# encode variables with ordinal encoding

encoder = ce.OrdinalEncoder(cols=['Glucose', 'BloodPressure', 'BMI', 'Age'])

X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

X_train.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
546,1,1,27,207,1,1.034,1,1
612,2,2,42,321,2,0.787,2,1
636,3,3,0,0,3,0.153,3,0
210,4,4,22,0,4,0.29,4,0
413,5,3,22,61,5,0.256,5,0


In [41]:
X_test.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
358,74.0,3.0,40,54,205.0,0.378,3,0
260,-1.0,23.0,15,130,30.0,0.299,17,0
754,87.0,20.0,32,0,111.0,0.443,8,1
194,81.0,16.0,20,0,204.0,0.136,27,0
334,12.0,4.0,18,58,-1.0,0.26,12,0


#### Decision Tree Classifier with criterion gini index

In [43]:
# import DecisionTreeClassifier

from sklearn.tree import DecisionTreeClassifier
# instantiate the DecisionTreeClassifier model with criterion gini index

clf_gini = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=0)


# fit the model
clf_gini.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3, random_state=0)

Predict the Test set results with criterion gini index

In [45]:
y_pred_gini = clf_gini.predict(X_test)

Check accuracy score with criterion gini index

In [46]:
from sklearn.metrics import accuracy_score

print('Model accuracy score with criterion gini index: {0:0.4f}'. format(accuracy_score(y_test, y_pred_gini)))

Model accuracy score with criterion gini index: 0.1310


Here, y_test are the true class labels and y_pred_gini are the predicted class labels in the test-set.

In [47]:
#Compare the train-set and test-set accuracy
#Now, I will compare the train-set and test-set accuracy to check for overfitting.

y_pred_train_gini = clf_gini.predict(X_train)

y_pred_train_gini

array([9, 9, 2, 2, 1, 1, 9, 9, 1, 9, 1, 1, 1, 9, 9, 1, 1, 1, 1, 1, 1, 2,
       2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 9, 6, 1, 1, 1, 6, 1, 1, 1,
       2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 9, 1, 1, 9, 2, 1, 4, 1, 1, 1, 9,
       6, 6, 1, 1, 1, 1, 6, 1, 1, 1, 4, 1, 9, 2, 0, 1, 1, 1, 1, 6, 1, 1,
       1, 1, 1, 1, 1, 1, 6, 2, 1, 2, 1, 0, 2, 6, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 6, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 6, 0, 1, 1, 1, 1, 1, 9, 1, 0, 1, 6, 1, 9, 1,
       1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 8, 6, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 1, 1, 1, 1, 1, 4, 8, 1, 1, 1, 1, 1,
       2, 1, 0, 1, 2, 1, 6, 1, 0, 1, 0, 1, 1, 6, 0, 1, 1, 6, 2, 1, 1, 0,
       1, 1, 1, 6, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1,
       9, 1, 8, 1, 1, 1, 0, 1, 0, 0, 1, 9, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 6, 1, 1, 1, 1, 0, 1, 0,
       8, 1, 1, 8, 4, 1, 6, 0, 0, 1, 1, 8, 1, 1, 1,

In [48]:
print('Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train_gini)))

Training-set accuracy score: 0.2622


In [49]:
#Check for overfitting and underfitting
# print the scores on training and test set

print('Training set score: {:.4f}'.format(clf_gini.score(X_train, y_train)))

print('Test set score: {:.4f}'.format(clf_gini.score(X_test, y_test)))

Training set score: 0.2622
Test set score: 0.1310
