In [1]:
import data_outcomes as data
import data_models

# Dependencies
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score, roc_auc_score
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import datetime
import numpy as np
from scipy.stats import zscore
import scipy.stats as stats

In [2]:
df = data.getDiabetesBehaviorDataframe()
display(df.head())

----> Retrieving information for Resources/diabetes_data.csv


Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes
4,8.0,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,1.0,18.0,0.0,0.0,1.0,1.0,1.0,0.0,2.0,7.0,0.0,0.0,0.0,0.0,0.0
8,3.0,0.0,0.0,1.0,32.0,0.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
10,12.0,0.0,1.0,1.0,24.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,4.0,0.0,0.0,1.0,0.0
14,10.0,0.0,1.0,1.0,29.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [3]:
models = [
        #LogisticRegression(random_state=42),
        #KNeighborsClassifier(n_neighbors=27),
        GradientBoostingClassifier(random_state=42, n_estimators=200),
        #AdaBoostClassifier(random_state=42, n_estimators=55, learning_rate=1.5),
        #DecisionTreeClassifier(random_state=42, max_features=10),
        RandomForestClassifier(random_state=1, n_estimators=100, max_depth=15)
    ]

dict = data_models.model_selector(df, models)

------------ Running predictions for GradientBoostingClassifier(n_estimators=200, random_state=42)  --------------------
Testing all features
Testing dropping features
Returning all features
------------ Running predictions for RandomForestClassifier(class_weight='balanced', max_depth=15, random_state=1)  --------------------
Testing all features
Testing dropping features
Returning all features


In [4]:
for mdl in dict:
    met = dict[mdl]
    print(f'----------------------------------------------------------------------------------------------')
    print(f'-----Model {met['model']}-------')
    print(f'----------------------------------------------------------------------------------------------')
    print(f'Training Score: {met['train_score']}')
    print(f'Testing Score:  {met['test_score']}')
    print(f'Accuracy:  {met['test_metrics']['accuracy']}')
    print(f'Confusion Matrix:  {met['test_metrics']['confusion_matrix']}')
    print(f'Balanced Accuracy Score:  {met['test_metrics']['balanced_accuracy_score']}')
    print(f'Classification Report:')

    print( '                 precision                     recall                      f1-score                   support')
    print(f'           0.0   {met['test_metrics']['classification_report']['1']['precision']}  \
          {met['test_metrics']['classification_report']['1']['recall']} \
        {met['test_metrics']['classification_report']['1']['f1-score']} \
        {met['test_metrics']['classification_report']['1']['support']}')
    
    print(f'           0.1   {met['test_metrics']['classification_report']['0']['precision']}  \
          {met['test_metrics']['classification_report']['0']['recall']} \
        {met['test_metrics']['classification_report']['0']['f1-score']} \
        {met['test_metrics']['classification_report']['0']['support']}')
    
    print(f'       macro avg   {met['test_metrics']['classification_report']['macro avg']['precision']}  \
          {met['test_metrics']['classification_report']['macro avg']['recall']} \
        {met['test_metrics']['classification_report']['macro avg']['f1-score']} \
        {met['test_metrics']['classification_report']['macro avg']['support']}')
    
    print(f'    weighted avg   {met['test_metrics']['classification_report']['weighted avg']['precision']}  \
          {met['test_metrics']['classification_report']['weighted avg']['recall']} \
        {met['test_metrics']['classification_report']['weighted avg']['f1-score']} \
        {met['test_metrics']['classification_report']['weighted avg']['support']}')

----------------------------------------------------------------------------------------------
-----Model <class 'sklearn.ensemble._gb.GradientBoostingClassifier'>-------
----------------------------------------------------------------------------------------------
Training Score: 0.7711625968251763
Testing Score:  0.7624257580493904
Accuracy:  0.7624257580493904
Confusion Matrix:  [[3604  999]
 [1281 3713]]
Balanced Accuracy Score:  0.7632299102177011
Classification Report:
                 precision                     recall                      f1-score                   support
           0.0   0.737768679631525            0.7829676298066478         0.7596964586846543         4603.0
           0.1   0.7879881154499151            0.7434921906287545         0.7650937564393159         4994.0
       macro avg   0.7628783975407201            0.7632299102177011         0.762395107561985         9597.0
    weighted avg   0.76390141511939            0.7624257580493904         0.7625050556