In [1]:

# Source of data : https://www.kaggle.com/datasets/alexteboul/diabetes-health-indicators-dataset
        
## Diabetes-prediction-and-risk-factors

from IPython.display import clear_output
clear_output()

In [2]:
import pandas as pd
import numpy as np
import warnings
from pandas_profiling import ProfileReport
from collections import Counter
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv('/kaggle/input/diabetes-health-indicators-dataset/diabetes_012_health_indicators_BRFSS2015.csv')

profile = ProfileReport(df, title='Diabetes Health Indicators Dataset Profile Report')
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Diabetes_012          253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   HeartDiseaseorAttack  253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

Observations : 

All features are categorical except for : 

    Body Mass Index (BMI), Mental Health (MENTHLTH) and Physical Health (PHYSHLTH).
    
Based on correlation table, health care coverage (AnyHealthcare) is weak correlation to Diabetes. Will exclude. 

In [5]:
numerical_attribs = ['BMI', 'MentHlth', 'PhysHlth']

## Change all columns to category except for the following 
categorical_attribs = list(df.columns[~df.columns.isin(['BMI', 'MentHlth', 'AnyHealthcare','PhysHlth', 'Diabetes_012'])])
df[categorical_attribs] = df[categorical_attribs].astype('category')
## Change Diabetes_012 to integer
df.Diabetes_012 = df.Diabetes_012.astype('int')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype   
---  ------                --------------   -----   
 0   Diabetes_012          253680 non-null  int64   
 1   HighBP                253680 non-null  category
 2   HighChol              253680 non-null  category
 3   CholCheck             253680 non-null  category
 4   BMI                   253680 non-null  float64 
 5   Smoker                253680 non-null  category
 6   Stroke                253680 non-null  category
 7   HeartDiseaseorAttack  253680 non-null  category
 8   PhysActivity          253680 non-null  category
 9   Fruits                253680 non-null  category
 10  Veggies               253680 non-null  category
 11  HvyAlcoholConsump     253680 non-null  category
 12  AnyHealthcare         253680 non-null  float64 
 13  NoDocbcCost           253680 non-null  category
 14  GenHlth               253680 non-nul

In [6]:
#Normalize continuous variables
for num in numerical_attribs: 
    df[num] = (df[num] - df[num].min()) / (df[num].max() - df[num].min()) 

label_map = {0:0, 1:1, 2:1}

df['Diabetes_012'] = df['Diabetes_012'].map(label_map)

y = df.Diabetes_012

attribs = df[numerical_attribs + categorical_attribs]

X = pd.get_dummies(attribs, drop_first=True)

#Split to test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)


eval_set = [(X_test, y_test)]

# summarize the class distribution of the training dataset
counter = Counter(y_train)
print(counter)

Counter({0: 149592, 1: 27984})


Target class has an uneven distribution of observations (imbalanced data).

In [7]:
# transform the training dataset
oversample = SMOTE(random_state=42)
X_train, y_train = oversample.fit_resample(X_train, y_train)

# summarize the new class distribution of the training dataset
counter = Counter(y_train)
print(counter)

Counter({0: 149592, 1: 149592})


In [8]:
# Applying XGB Boost model

xgbc = XGBClassifier()
print(xgbc)


XGBClassifier( base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.8, gamma=0, learning_rate=0.2,
       max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
       n_estimators=400, n_jobs=1, nthread=None,
       random_state=0, reg_alpha=0.5,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=0.5, verbosity=1, eval_metric=['logloss'],use_label_encoder=False) 

xgbc.fit(X_train, y_train,eval_metric=['logloss'])
scores = cross_val_score(xgbc, X_train, y_train, cv=5)
print("Mean cross-validation score: %.2f" % scores.mean())
# Mean cross-validation score: 0.87
 
kfold = KFold(n_splits=10, shuffle=True)
kf_cv_scores = cross_val_score(xgbc, X_train, y_train, cv=kfold )
print("K-fold CV average score: %.2f" % kf_cv_scores.mean())
# K-fold CV average score: 0.90

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, gamma=None,
              gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, predictor=None, random_state=None,
              reg_alpha=None, reg_lambda=None, ...)
Mean cross-validation score: 0.87
K-fold CV average score: 0.90


In [9]:
print('Model name: XGBoost Classifier')
#print('Accuracy: ', '{}%'.format(round((accuracy_score(y_val, xgbc.predict(X_val)) * 100), 2)))
print('Accuracy: ', '{}%'.format(round((accuracy_score(y_test, xgbc.predict(X_test)) * 100), 2)))

# Accuracy:  84.68%


# predict the target on the test dataset
y_predict = xgbc.predict(X_test)
print('\nPrediction on test data', y_predict) 

RMSE = mean_squared_error(y_test, y_predict, squared=False)
print('\nRMSE on test dataset : %.4f' % RMSE)

Model name: XGBoost Classifier
Accuracy:  84.68%

Prediction on test data [1 1 0 ... 0 0 0]

RMSE on test dataset : 0.3915


In [10]:
#Classification Report
print(classification_report(y_test, xgbc.predict(X_test), target_names=['no_diabetes', 'prediabetes_or_diabetes']))

                         precision    recall  f1-score   support

            no_diabetes       0.88      0.95      0.91     64111
prediabetes_or_diabetes       0.53      0.27      0.36     11993

               accuracy                           0.85     76104
              macro avg       0.70      0.61      0.64     76104
           weighted avg       0.82      0.85      0.83     76104



In [11]:
#Feature Importances Analysis
table = pd.DataFrame(list(zip(X.columns,xgbc.feature_importances_)), 
                                      columns=['Feature', 'Importance (%)'])

table['Importance (%)'] = (table['Importance (%)']
                           .apply(lambda row: round((row * 100),2)))

def condense(row): return row.split('_')[0]

table['Feature'] = table['Feature'].apply(lambda row: condense(row))

table.groupby(['Feature'], as_index=False).sum().sort_values(by=['Importance (%)'], ascending=False)

Unnamed: 0,Feature,Importance (%)
0,Age,35.96
8,HighBP,10.31
1,BMI,10.3
6,GenHlth,9.55
11,Income,6.34
10,HvyAlcoholConsump,4.75
15,PhysHlth,4.46
2,CholCheck,3.93
4,Education,3.61
12,MentHlth,3.52
