### Part III: Machine learning model training

In [2]:
# import libraries
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report

In [8]:
# prepare our independent and dependent variables
df = pd.read_csv("diabetes_data_clean.csv")
x = df.drop('class', axis=1)
y = df['class']

In [11]:
#  split data into train and test
x_train,x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2,
                                                  stratify = y)

In [12]:
# begin model training
#  start with dummyclassifer to establish baseline
dummy = DummyClassifier()
dummy.fit(x_train, y_train)
dummy_pred = dummy.predict(x_test)

In [13]:
# assess DummyClassifier model
confusion_matrix(y_test, dummy_pred)

array([[ 0, 40],
       [ 0, 64]])

In [15]:
# use classification report
print(classification_report(y_test, dummy_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        40
           1       0.62      1.00      0.76        64

    accuracy                           0.62       104
   macro avg       0.31      0.50      0.38       104
weighted avg       0.38      0.62      0.47       104



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
#  logisticregression
logr = LogisticRegression()
logr.fit(x_train, y_train)
logr_pred = logr.predict(x_test)

In [18]:
confusion_matrix(y_test, logr_pred)

array([[36,  4],
       [ 2, 62]])

In [19]:
print(classification_report(y_test, logr_pred))

              precision    recall  f1-score   support

           0       0.95      0.90      0.92        40
           1       0.94      0.97      0.95        64

    accuracy                           0.94       104
   macro avg       0.94      0.93      0.94       104
weighted avg       0.94      0.94      0.94       104



In [20]:
# DecisionTree
tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)
tree_pred = tree.predict(x_test)

In [21]:
confusion_matrix(y_test, tree_pred)

array([[40,  0],
       [ 4, 60]])

In [22]:
print(classification_report(y_test, tree_pred))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95        40
           1       1.00      0.94      0.97        64

    accuracy                           0.96       104
   macro avg       0.95      0.97      0.96       104
weighted avg       0.97      0.96      0.96       104



In [23]:
# RandomForest
forest = RandomForestClassifier()
forest.fit(x_train, y_train)
forest_pred = forest.predict(x_test)

In [24]:
confusion_matrix(y_test, forest_pred)

array([[40,  0],
       [ 1, 63]])

In [25]:
print(classification_report(y_test, forest_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        40
           1       1.00      0.98      0.99        64

    accuracy                           0.99       104
   macro avg       0.99      0.99      0.99       104
weighted avg       0.99      0.99      0.99       104



In [26]:
forest.feature_importances_

array([0.10512345, 0.09908715, 0.19439235, 0.18533912, 0.06850162,
       0.02119875, 0.03784148, 0.02916979, 0.02959241, 0.02946119,
       0.03822619, 0.03383281, 0.04446499, 0.02658757, 0.04010038,
       0.01708073])

In [27]:
x.columns

Index(['age', 'ismale', 'polyuria', 'polydipsia', 'sudden weight loss',
       'weakness', 'polyphagia', 'genital thrush', 'visual blurring',
       'itching', 'irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'alopecia', 'obesity'],
      dtype='object')

In [29]:
pd.DataFrame({'feature': x.columns,
              'importance': forest.feature_importances_}).sort_values('importance',
                                                                      ascending=False)

Unnamed: 0,feature,importance
2,polyuria,0.194392
3,polydipsia,0.185339
0,age,0.105123
1,ismale,0.099087
4,sudden weight loss,0.068502
12,partial paresis,0.044465
14,alopecia,0.0401
10,irritability,0.038226
6,polyphagia,0.037841
11,delayed healing,0.033833
