In [2]:
# import all libraries
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix,classification_report

In [3]:
# prepare our independent and dependent variable
df=pd.read_csv("diabetes_data_clean.csv")
X = df.drop('class',axis=1)
y = df['class']


In [4]:
X

Unnamed: 0,age,ismale,polyuria,polydipsia,sudden weight loss,weakness,polyphagia,genital thrush,visual blurring,itching,irritability,delayed healing,partial paresis,muscle stiffness,alopecia,obesity
0,40,1,0,1,0,1,0,0,0,1,0,1,0,1,1,1
1,58,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0
2,41,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0
3,45,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0
4,60,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,39,0,1,1,1,0,1,0,0,1,0,1,1,0,0,0
516,48,0,1,1,1,1,1,0,0,1,1,1,1,0,0,0
517,58,0,1,1,1,1,1,0,1,0,0,0,1,1,0,1
518,32,0,0,0,0,1,0,0,1,1,0,1,0,0,1,0


In [28]:
# split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, stratify=y)

In [29]:
# begin our model training
# start with DummyClassifier to establish baseline
dummy=DummyClassifier()
dummy.fit(X_train, y_train)
dummy_pred = dummy.predict(X_test)

In [30]:
# assess DummyClassifier model
confusion_matrix(y_test, dummy_pred)

array([[ 0, 40],
       [ 0, 64]], dtype=int64)

In [32]:
print(classification_report(y_test, dummy_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        40
           1       0.62      1.00      0.76        64

    accuracy                           0.62       104
   macro avg       0.31      0.50      0.38       104
weighted avg       0.38      0.62      0.47       104



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
# start with LogistricRegression
logr=LogisticRegression(max_iter=10000)
logr.fit(X_train,y_train)
logr_pred = logr.predict(X_test)

In [35]:
confusion_matrix(y_test, logr_pred)

array([[40,  0],
       [ 6, 58]], dtype=int64)

In [36]:
print(classification_report(y_test, logr_pred))

              precision    recall  f1-score   support

           0       0.87      1.00      0.93        40
           1       1.00      0.91      0.95        64

    accuracy                           0.94       104
   macro avg       0.93      0.95      0.94       104
weighted avg       0.95      0.94      0.94       104



In [37]:
# try DecisionTree
tree = DecisionTreeClassifier()
tree.fit(X_train,y_train)
tree_pred= tree.predict(X_test)

In [38]:
confusion_matrix(y_test, tree_pred)

array([[39,  1],
       [ 5, 59]], dtype=int64)

In [39]:
print(classification_report(y_test, tree_pred))

              precision    recall  f1-score   support

           0       0.89      0.97      0.93        40
           1       0.98      0.92      0.95        64

    accuracy                           0.94       104
   macro avg       0.93      0.95      0.94       104
weighted avg       0.95      0.94      0.94       104



In [65]:
# try RandomForest
forest = RandomForestClassifier()
forest.fit(X_train,y_train)
forest_pred = forest.predict(X_test)

In [66]:
confusion_matrix(y_test, forest_pred)

array([[40,  0],
       [ 0, 64]], dtype=int64)

In [67]:
print(classification_report(y_test, forest_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00        64

    accuracy                           1.00       104
   macro avg       1.00      1.00      1.00       104
weighted avg       1.00      1.00      1.00       104



In [68]:
forest.feature_importances_

array([0.09562589, 0.08886217, 0.23888169, 0.1750628 , 0.06503236,
       0.02165261, 0.03411929, 0.01916381, 0.0354066 , 0.02443043,
       0.03958433, 0.02890795, 0.05253721, 0.02745747, 0.03668883,
       0.01658656])

In [69]:
X.columns

Index(['age', 'ismale', 'polyuria', 'polydipsia', 'sudden weight loss',
       'weakness', 'polyphagia', 'genital thrush', 'visual blurring',
       'itching', 'irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'alopecia', 'obesity'],
      dtype='object')

In [70]:
pd.DataFrame({'feature':X.columns,
              'importance': forest.feature_importances_}).sort_values('importance',ascending=False)

Unnamed: 0,feature,importance
2,polyuria,0.238882
3,polydipsia,0.175063
0,age,0.095626
1,ismale,0.088862
4,sudden weight loss,0.065032
12,partial paresis,0.052537
10,irritability,0.039584
14,alopecia,0.036689
8,visual blurring,0.035407
6,polyphagia,0.034119


In [None]:
#+++summary:
#    1. trained baseline model
#    2. trained three different models - logostic regression, decision tree, random forest
#    3. identified important feature in the best performing model