In [2]:
from sklearn import (
    linear_model, 
    preprocessing,
    model_selection,
    metrics, 
    tree, 
    neighbors, 
    naive_bayes)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import os

In [3]:
# Survey columns left out of the analysis to keep the models simpler.
DROPPED_COLUMNS = [
    'AnyHealthcare',
    'NoDocbcCost',
    'GenHlth',
    'MentHlth',
    'PhysHlth',
    'DiffWalk',
    'Education',
    'Income',
]

# `columns=` on its own selects the column axis; no `axis` argument is needed
# (pandas ignores `axis` whenever `columns=` is supplied).
data = pd.read_csv('diabetes_012_health_indicators_BRFSS2015.csv').drop(
    columns=DROPPED_COLUMNS)

data.head()

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,Sex,Age
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,7.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,9.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,11.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,11.0


In [4]:
# Response variable: the Diabetes_012 column as a NumPy array.
target = np.array(data['Diabetes_012'])
# Display names for the classes. Presumably listed in label order (0, 1, 2) --
# TODO confirm against the dataset documentation.
target_names = ['no diabetes', 'prediabetes', 'diabetes']

# Every remaining column is used as a predictor. `columns=` already names the
# axis, so no `axis` argument is passed.
features_df = data.drop(columns='Diabetes_012')
predictors = features_df.values
feature_names = features_df.columns


# Decision Tree Classification Model

In [5]:
def comparison_table(dfs, columns):
    """Summarise several score frames as one mean / standard-deviation table.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        One frame per model configuration. The callers in this notebook pass
        single-column frames of cross-validation scores.
    columns : iterable
        One label per frame in ``dfs``, used as the column names of the
        result. Any iterable works, including one-shot generators.

    Returns
    -------
    pandas.DataFrame
        Two rows, indexed ``'Mean'`` and ``'Stardard Dev.'``, and one column
        per entry of ``dfs``.
    """
    # Materialise the labels once: they are applied to two tables, and a
    # generator would already be exhausted on the second use.
    labels = list(columns)

    def _summarise(statistic):
        # One Series per frame, placed side by side -> one column per frame.
        table = pd.concat([statistic(frame) for frame in dfs], axis=1)
        table.columns = labels
        return table

    mean_scores = _summarise(lambda frame: frame.mean())
    std_scores = _summarise(lambda frame: frame.std())

    comparison = pd.concat([mean_scores, std_scores], ignore_index=True)
    # The misspelt row label is kept on purpose: callers index the table by
    # this exact string.
    comparison.index = ['Mean', 'Stardard Dev.']
    return comparison

In [33]:
## decision tree 
def decision_tree(predictors, target, criterion, max_depth, random_state=3):
    """Fit and score one decision-tree configuration.

    Parameters
    ----------
    predictors : array-like
        Feature matrix, one row per sample.
    target : array-like
        Class labels aligned with ``predictors``.
    criterion : str
        Split criterion forwarded to ``DecisionTreeClassifier``.
    max_depth : int or None
        Depth limit forwarded to ``DecisionTreeClassifier``.
    random_state : int, default 3
        Seed for both the hold-out split and the tree itself, so repeated
        runs give the same scores.

    Returns
    -------
    accuracy : dict
        Keys ``'Params'``, ``'Train acc'``, ``'Test acc'`` and ``'Prec'``
        (the per-class precision array rendered as a string).
    crossval : pandas.DataFrame
        Single-column frame with one accuracy score per fold of a 10-fold
        cross-validation over the full ``predictors`` / ``target``.
    """
    # Stratified 67/33 hold-out split.
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        predictors,
        target,
        test_size=0.33,
        stratify=target,
        random_state=random_state)

    # Seed the tree as well: without it, fitting is not reproducible.
    dtree_estimator = tree.DecisionTreeClassifier(
        criterion=criterion,
        max_depth=max_depth,
        random_state=random_state)
    dtree = dtree_estimator.fit(x_train, y_train)

    y_pred = dtree.predict(x_test)
    accuracy = {
        'Params': f'{criterion}, {max_depth}',
        'Train acc': metrics.accuracy_score(y_true=y_train, y_pred=dtree.predict(x_train)),
        'Test acc': metrics.accuracy_score(y_true=y_test, y_pred=y_pred),
        # Precision is undefined for a class that is never predicted.
        # zero_division=0 reports 0 for it (the value the default produces)
        # without emitting a warning on every call.
        'Prec': str(metrics.precision_score(y_test, y_pred, average=None, zero_division=0)),
    }

    crossval = model_selection.cross_val_score(
        dtree_estimator, predictors, target, cv=10, scoring='accuracy')

    return accuracy, pd.DataFrame(crossval)


In [37]:
# Sweep both split criteria over several depth limits.
res = []
crossvals = []
for criterion in ['entropy', 'gini']:
    for max_depth in [5, 8, 30, None]:
        # Named `scores`, not `dict`: binding `dict` at notebook level would
        # shadow the builtin for every cell that runs afterwards.
        scores, cross = decision_tree(predictors, target, criterion, max_depth)
        res.append(scores)
        crossvals.append(cross)

res_df = pd.DataFrame(res)

# Mean / standard deviation of the cross-validation folds, one column per
# configuration, in the same order as the rows of res_df.
cross_table = comparison_table(crossvals, res_df['Params'])

res_df['cross mean acc'] = list(cross_table.loc['Mean'])
res_df['std'] = list(cross_table.loc['Stardard Dev.'])

dtree_res_df = res_df
dtree_res_df

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Params,Train acc,Test acc,Prec,cross mean acc,std
0,"entropy, 5",0.845303,0.845249,[0.85744562 0. 0.52046205],0.844903,0.001525
1,"entropy, 8",0.846716,0.844998,[0.85232207 0. 0.53968254],0.845384,0.001443
2,"entropy, 30",0.896543,0.817464,[0.86141806 0.02086438 0.35249918],0.820392,0.003326
3,"entropy, None",0.896543,0.817906,[0.86152007 0.02098951 0.35412904],0.820396,0.003355
4,"gini, 5",0.845333,0.845165,[0.85773119 0. 0.5183871 ],0.844899,0.001599
5,"gini, 8",0.847098,0.844735,[0.85197183 0. 0.5364065 ],0.845396,0.001548
6,"gini, 30",0.896543,0.816926,[0.86138756 0.02058824 0.35047032],0.820088,0.003076
7,"gini, None",0.896543,0.816962,[0.86112845 0.02196193 0.34996728],0.819958,0.003021


# Logistic Regression Model 

In [6]:
## logistic regression model 
def logistic_regression(predictors, target, solver, penalty, random_state=3):
    """Fit and score one logistic-regression configuration.

    Parameters
    ----------
    predictors : array-like
        Feature matrix, one row per sample.
    target : array-like
        Class labels aligned with ``predictors``.
    solver : str
        Optimiser forwarded to ``LogisticRegression``.
    penalty : str or None
        Regularisation forwarded to ``LogisticRegression``.
    random_state : int, default 3
        Seed for both the hold-out split and the solver, so repeated runs
        give the same scores.

    Returns
    -------
    accuracy : dict
        Keys ``'Params'``, ``'Train acc'``, ``'Test acc'`` and ``'Prec'``
        (the per-class precision array rendered as a string).
    crossval : pandas.DataFrame
        Single-column frame with one accuracy score per fold of a 10-fold
        cross-validation over the full ``predictors`` / ``target``.
    """
    # Stratified 67/33 hold-out split.
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        predictors,
        target,
        test_size=0.33,
        stratify=target,
        random_state=random_state)

    # NOTE(review): the features are passed in unscaled. The scikit-learn docs
    # state that sag/saga converge quickly only on features of similar scale;
    # consider standardising the predictors first.
    logR_estimator = linear_model.LogisticRegression(
        solver=solver,
        penalty=penalty,
        max_iter=500,
        random_state=random_state)  # the stochastic solvers shuffle the data
    logR = logR_estimator.fit(x_train, y_train)

    y_pred = logR.predict(x_test)
    accuracy = {
        'Params': f'{solver}, {penalty}',
        'Train acc': metrics.accuracy_score(y_true=y_train, y_pred=logR.predict(x_train)),
        'Test acc': metrics.accuracy_score(y_true=y_test, y_pred=y_pred),
        # Precision is undefined for a class that is never predicted.
        # zero_division=0 reports 0 for it (the value the default produces)
        # without emitting a warning on every call.
        'Prec': str(metrics.precision_score(y_test, y_pred, average=None, zero_division=0)),
    }

    crossval = model_selection.cross_val_score(
        logR_estimator, predictors, target, cv=10, scoring='accuracy')

    return accuracy, pd.DataFrame(crossval)

In [7]:
res = []
crossvals = []
# Penalties to try for each solver.
solvers = {
    'sag': ['l2', None],
    'saga': ['l2', None]
}
for solver, penalties in solvers.items():
    for penalty in penalties:
        # Named `scores`, not `dict`: binding `dict` at notebook level would
        # shadow the builtin for every cell that runs afterwards.
        scores, cross = logistic_regression(predictors, target, solver, penalty)
        res.append(scores)
        crossvals.append(cross)

res_df = pd.DataFrame(res)

# Mean / standard deviation of the cross-validation folds, one column per
# configuration, in the same order as the rows of res_df.
cross_table = comparison_table(crossvals, res_df['Params'])

res_df['cross mean acc'] = list(cross_table.loc['Mean'])
res_df['std'] = list(cross_table.loc['Stardard Dev.'])

logr_res = res_df
logr_res

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Params,Train acc,Test acc,Prec,cross mean acc,std
0,"sag, l2",0.844133,0.843576,[0.85416077 0. 0.50198728],0.843626,0.00501
1,"sag, None",0.844138,0.843576,[0.85416077 0. 0.50198728],0.843626,0.00501
2,"saga, l2",0.844133,0.843576,[0.85416077 0. 0.50198728],0.843626,0.00501
3,"saga, None",0.844133,0.843564,[0.85415897 0. 0.50178784],0.843626,0.00501


## types of solvers explained 
https://medium.com/@arnavr/scikit-learn-solvers-explained-780a17bc322d

This is a multiclass dataset, and `scikit-learn` provides several solver algorithms for the optimization problem behind the logistic regression model. These include Newton's method, the Stochastic Average Gradient (SAG), a variant of SAG (SAGA), and the limited-memory Broyden–Fletcher–Goldfarb–Shanno algorithm (L-BFGS). For this application, only SAG and SAGA were investigated because of their ability to handle large, multiclass datasets.

chosen: SAGA, None

# K-nearest Neighbor

In [30]:
## KNN
def KNN(predictors, target, k, weight, distance:str=None):
    """Fit and score one k-nearest-neighbours configuration.

    Parameters
    ----------
    predictors : array-like
        Feature matrix, one row per sample.
    target : array-like
        Class labels aligned with ``predictors``.
    k : int
        Number of neighbours.
    weight : str
        Neighbour weighting forwarded as ``weights``.
    distance : str, optional
        Distance metric forwarded as ``metric``. When omitted, the
        classifier's own default metric is used.

    Returns
    -------
    accuracy : dict
        Keys ``'Params'``, ``'Train acc'``, ``'Test acc'`` and ``'Prec'``
        (the per-class precision array rendered as a string).
    crossval : pandas.DataFrame
        Single-column frame with one accuracy score per fold of a 10-fold
        cross-validation over the full ``predictors`` / ``target``.
    """
    # Stratified 67/33 hold-out split.
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        predictors,
        target,
        test_size=0.33,
        stratify=target,
        random_state=3)

    # Forward the metric whenever one is given, whatever the weighting, so the
    # 'Params' label below always matches the model that was fitted. Never
    # forward a missing metric.
    estimator_kwargs = {'n_neighbors': k, 'weights': weight}
    if distance:
        estimator_kwargs['metric'] = distance
    knn_estimator = neighbors.KNeighborsClassifier(**estimator_kwargs)

    knn = knn_estimator.fit(x_train, y_train)

    y_pred = knn.predict(x_test)
    accuracy = {
        'Params': '{}, {}{}'.format(
            k,
            weight,
            f', {distance}' if distance else ''
        ),
        'Train acc': metrics.accuracy_score(y_true=y_train, y_pred=knn.predict(x_train)),
        'Test acc': metrics.accuracy_score(y_true=y_test, y_pred=y_pred),
        # Precision is undefined for a class that is never predicted.
        # zero_division=0 reports 0 for it (the value the default produces)
        # without emitting a warning on every call.
        'Prec': str(metrics.precision_score(y_test, y_pred, average=None, zero_division=0)),
    }

    crossval = model_selection.cross_val_score(
        knn_estimator,
        predictors,
        target,
        cv=10,
        scoring='accuracy'
    )

    return accuracy, pd.DataFrame(crossval)

In [None]:
res = []
crossvals = []
kvals = [20, 25, 30]

# Uniform weighting is run without an explicit metric (distance=None);
# distance weighting is run once per metric.
weightings = [
    ('uniform', [None]),
    ('distance', ['euclidean', 'manhattan']),
]
for weight, distances in weightings:
    for distance in distances:
        for k in kvals:
            # Named `scores`, not `dict`: binding `dict` at notebook level
            # would shadow the builtin for every cell that runs afterwards.
            scores, cross = KNN(predictors, target, k, weight, distance)
            res.append(scores)
            crossvals.append(cross)

res_df = pd.DataFrame(res)

# Mean / standard deviation of the cross-validation folds, one column per
# configuration, in the same order as the rows of res_df.
cross_table = comparison_table(crossvals, res_df['Params'])

res_df['cross mean acc'] = list(cross_table.loc['Mean'])
res_df['std'] = list(cross_table.loc['Stardard Dev.'])

knn_res = res_df
knn_res

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                    Params  Train acc  Test acc  \
0              20, uniform   0.849951  0.843266   
1              25, uniform   0.849192  0.844974   
2              30, uniform   0.848669  0.844771   
3  20, distance, euclidean   0.896532  0.829851   
4  25, distance, euclidean   0.896538  0.830652   
5  30, distance, euclidean   0.896543  0.830652   
6  20, distance, manhattan   0.896532  0.829899   
7  25, distance, manhattan   0.896538  0.830496   
8  30, distance, manhattan   0.896543  0.830592   

                                 Prec  cross mean acc       std  
0  [0.85418771 0.         0.49548843]        0.842593  0.001758  
1  [0.85388829 0.         0.52815364]        0.843307  0.001782  
2  [0.85294154 0.         0.528463  ]        0.843795  0.001340  
3  [0.85869372 0.02238806 0.3915547 ]        0.830546  0.002644  
4  [0.85866285 0.02238806 0.39690834]        0.830877  0.002553  
5  [0.85852644 0.02238806 0.39623055]        0.830980  0.002423  
6  [0.85866805 0.02238806 0

chosen -> k=25, euclidean

# Naive Bayes

In [None]:
## naive bayes (shared evaluation helper for any estimator)
def naiveBayes_results(predictors, target, estimator, name):
    """Fit and score the given estimator on a stratified hold-out split.

    Parameters
    ----------
    predictors : array-like
        Feature matrix, one row per sample.
    target : array-like
        Class labels aligned with ``predictors``.
    estimator : estimator object
        Must provide ``fit`` and ``predict``. The instance passed in is the
        one that gets fitted on the training split.
    name : str
        Label stored under ``'Params'`` in the result.

    Returns
    -------
    accuracy : dict
        Keys ``'Params'``, ``'Train acc'``, ``'Test acc'`` and ``'Prec'``
        (the per-class precision array rendered as a string).
    crossval : pandas.DataFrame
        Single-column frame with one accuracy score per fold of a 10-fold
        cross-validation over the full ``predictors`` / ``target``.
    """
    # Stratified 67/33 hold-out split.
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        predictors,
        target,
        test_size=0.33,
        stratify=target,
        random_state=3)

    model = estimator.fit(x_train, y_train)

    y_pred = model.predict(x_test)
    accuracy = {
        'Params': name,
        'Train acc': metrics.accuracy_score(y_true=y_train, y_pred=model.predict(x_train)),
        'Test acc': metrics.accuracy_score(y_true=y_test, y_pred=y_pred),
        # Precision is undefined for a class that is never predicted.
        # zero_division=0 reports 0 for it (the value the default produces)
        # without emitting a warning on every call.
        'Prec': str(metrics.precision_score(y_test, y_pred, average=None, zero_division=0)),
    }

    crossval = model_selection.cross_val_score(
        estimator, predictors, target, cv=10, scoring='accuracy')

    return accuracy, pd.DataFrame(crossval)

In [None]:
res = []
cross_vals = []

gauss = naive_bayes.GaussianNB()
# NOTE(review): BernoulliNB binarises its inputs; the non-binary columns
# (e.g. BMI, Age) presumably collapse to a constant under the default
# threshold -- confirm this is intended.
bern = naive_bayes.BernoulliNB()

for estimator, label in [(gauss, 'Gaussian'), (bern, 'Bernoulli')]:
    # Named `scores`, not `dict`: binding `dict` at notebook level would
    # shadow the builtin for every cell that runs afterwards.
    scores, cross = naiveBayes_results(predictors, target, estimator, label)
    res.append(scores)
    cross_vals.append(cross)

res_df = pd.DataFrame(res)

# Mean / standard deviation of the cross-validation folds, one column per
# estimator, in the same order as the rows of res_df.
cross_table = comparison_table(cross_vals, res_df['Params'])

res_df['cross mean acc'] = list(cross_table.loc['Mean'])
res_df['std'] = list(cross_table.loc['Stardard Dev.'])
nb_res = res_df
nb_res

  _warn_prf(average, modifier, msg_start, len(result))


      Params  Train acc  Test acc                                Prec  \
0   Gaussian   0.777154  0.776348  [0.90220312 0.02521008 0.33714317]   
1  Bernoulli   0.824305  0.822600  [0.86692389 0.         0.37337058]   

   cross mean acc       std  
0        0.775682  0.009348  
1        0.824168  0.002867  
