Machine learning model with classifiers

In [18]:
# importing libraries

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report


In [19]:
# prepare our independent and and dependent variable
df = pd.read_csv("diabetes_data_clean.csv")

x = df.drop('class', axis=1)# class is removed(only independent variables remain)
y = df['class'] # only class is identified(dependent variable)
print(x)

     age  ismale  polyuria  polydipsia  sudden weight loss  weakness  \
0     40       1         0           1                   0         1   
1     58       1         0           0                   0         1   
2     41       1         1           0                   0         1   
3     45       1         0           0                   1         1   
4     60       1         1           1                   1         1   
..   ...     ...       ...         ...                 ...       ...   
515   39       0         1           1                   1         0   
516   48       0         1           1                   1         1   
517   58       0         1           1                   1         1   
518   32       0         0           0                   0         1   
519   42       1         0           0                   0         0   

     polyphagia  genital thrush  visual blurring  itching  irritability  \
0             0               0                0        1   

In [20]:
print(y)

0      1
1      1
2      1
3      1
4      1
      ..
515    1
516    1
517    1
518    0
519    0
Name: class, Length: 520, dtype: int64


In [21]:
#split data into train and test
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.2, stratify=y) #we strstify by y to make sure the
# proportions are equal."""

In [22]:
#model training
#DummyClassifier to establish baseline - DummyClassifier randomly guess if someone has diabetes or not
dummy = DummyClassifier()
dummy.fit(x_train, y_train)
dummy_prediction = dummy.predict(x_test)

In [23]:
#access DummyClassifer model with confusion_matrix 
print(confusion_matrix(y_test, dummy_prediction))


[[ 0 40]
 [ 0 64]]


In [24]:
# access DummyClassifer model with classification report
print(classification_report(y_test, dummy_prediction))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        40
           1       0.62      1.00      0.76        64

    accuracy                           0.62       104
   macro avg       0.31      0.50      0.38       104
weighted avg       0.38      0.62      0.47       104



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training Models:

In [16]:
# 1. LogisticRegression model
logr = LogisticRegression(max_iter=1000) # max_iter is set to limit the total number of iterations
logr.fit(x_train, y_train)
logr_pred = logr.predict(x_test)

In [27]:
confusion_matrix(y_test,logr_pred)

array([[15, 25],
       [25, 39]])

In [28]:
print(classification_report(y_test,logr_pred))

              precision    recall  f1-score   support

           0       0.38      0.38      0.38        40
           1       0.61      0.61      0.61        64

    accuracy                           0.52       104
   macro avg       0.49      0.49      0.49       104
weighted avg       0.52      0.52      0.52       104



In [29]:
# 2. Decision Tree model
tree = DecisionTreeClassifier()
tree.fit(x_train,y_train)
tree_pred = tree.predict(x_test)

In [31]:
confusion_matrix(y_test, tree_pred)

array([[40,  0],
       [ 1, 63]])

In [32]:
print(classification_report(y_test, tree_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        40
           1       1.00      0.98      0.99        64

    accuracy                           0.99       104
   macro avg       0.99      0.99      0.99       104
weighted avg       0.99      0.99      0.99       104



In [33]:
#3. RandomForest model
forest = RandomForestClassifier()
forest.fit(x_train, y_train)
forest_pred = forest.predict(x_test)

In [34]:
confusion_matrix(y_test, forest_pred)

array([[40,  0],
       [ 1, 63]])

In [35]:
print(classification_report(y_test, forest_pred)) 

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        40
           1       1.00      0.98      0.99        64

    accuracy                           0.99       104
   macro avg       0.99      0.99      0.99       104
weighted avg       0.99      0.99      0.99       104



In conclusion we can see that RandomForest offers the best accoracy

In [36]:
# we check the level of importance of each variable
forest.feature_importances_

array([0.09338975, 0.07945713, 0.24866174, 0.17616501, 0.04819108,
       0.01758629, 0.03235525, 0.01652693, 0.03201854, 0.02743591,
       0.04399075, 0.03444369, 0.06415913, 0.02792631, 0.04003186,
       0.01766061])

In [37]:
x.columns

Index(['age', 'ismale', 'polyuria', 'polydipsia', 'sudden weight loss',
       'weakness', 'polyphagia', 'genital thrush', 'visual blurring',
       'itching', 'irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'alopecia', 'obesity'],
      dtype='object')

In [40]:
pd.DataFrame({'feature': x.columns,
                'importance': forest.feature_importances_}).sort_values('importance',ascending=False)

Unnamed: 0,feature,importance
2,polyuria,0.248662
3,polydipsia,0.176165
0,age,0.09339
1,ismale,0.079457
12,partial paresis,0.064159
4,sudden weight loss,0.048191
10,irritability,0.043991
14,alopecia,0.040032
11,delayed healing,0.034444
6,polyphagia,0.032355


Summary:
1. Trained a baseline model - DummyClassifier
2. Trained three different models - Logistic Regression, Decision Tree and Random Forest
3. Identified the important features in the best performing model