### Logistic Regression appears to be the most accurate model for the Data
#### Now to Tune the Hyperparameters

In [6]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler # Add StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score # Add classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression # Add LogisticRegression
from sklearn.ensemble import AdaBoostClassifier # Add AdaBoostClassifier
from xgboost import XGBClassifier # Add XGBClassifier
from GRANDE import GRANDE # Add GRANDE
import warnings # Add warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")

In [2]:
# Import the data
df = pd.read_csv("datasets/diabetes_binary_health_indicators_BRFSS2015.csv")
diabetes_binary_2015 = df.copy()
print(diabetes_binary_2015.head())
diabetes_binary_2015.shape

   Diabetes_binary  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0              0.0     1.0       1.0        1.0  40.0     1.0     0.0   
1              0.0     0.0       0.0        0.0  25.0     1.0     0.0   
2              0.0     1.0       1.0        1.0  28.0     0.0     0.0   
3              0.0     1.0       0.0        1.0  27.0     0.0     0.0   
4              0.0     1.0       1.0        1.0  24.0     0.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  ...  AnyHealthcare  \
0                   0.0           0.0     0.0  ...            1.0   
1                   0.0           1.0     0.0  ...            0.0   
2                   0.0           0.0     1.0  ...            1.0   
3                   0.0           1.0     1.0  ...            1.0   
4                   0.0           1.0     1.0  ...            1.0   

   NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  \
0          0.0      5.0      18.0      15.0       1.0  0.0   9.0   

(253680, 22)

In [3]:
# Create X and y variables
X = diabetes_binary_2015.drop(columns=["Diabetes_binary"])
y = diabetes_binary_2015["Diabetes_binary"]

In [4]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
# Scale the data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Logistic Regression Model Hypertuning

In [12]:
# Create a Logistic Regression model
# with max iterations = 400 and random_state=42
lr = LogisticRegression(random_state=42)
lr.fit(X_train_scaled, y_train)

In [13]:
# Validate the model by checking the model accuracy with model.score
print(f"Train Accuracy: {lr.score(X_train_scaled, y_train)}")
print(f"Test Accuracy: {lr.score(X_test_scaled, y_test)}")

Train Accuracy: 0.8629980027331021
Test Accuracy: 0.865121412803532


In [14]:
target_names = ["negative", "positive"]

In [15]:
## Train a model without tuning
untuned_y_pred = lr.predict(X_test_scaled)
print(classification_report(y_test, untuned_y_pred,
                            target_names=target_names))

              precision    recall  f1-score   support

    negative       0.88      0.98      0.93     54657
    positive       0.54      0.16      0.25      8763

    accuracy                           0.87     63420
   macro avg       0.71      0.57      0.59     63420
weighted avg       0.83      0.87      0.83     63420



In [16]:
# Tuning the model for iterations parameter
# iterations = [100, 200, 300, 400, 500]
# for i in iterations:
#     lr = LogisticRegression(max_iter=i, random_state=42)
#     lr.fit(X_train_scaled, y_train)
#     print(f"Iteration = {i}")
#     print(f"Train Accuracy: {lr.score(X_train_scaled, y_train)}")
#     print(f"Test Accuracy: {lr.score(X_test_scaled, y_test)}")

In [17]:
param_grid = {
    'max_iter': [100, 200, 300, 400, 500],
    'solver': ['lbfgs', 'liblinear', 'sag', 'saga'],
    'C': np.arange(1, 500)
}
param_grid

{'max_iter': [100, 200, 300, 400, 500],
 'solver': ['lbfgs', 'liblinear', 'sag', 'saga'],
 'C': array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
         92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
        105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
        118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
        131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
        157, 158, 159, 160, 1

In [18]:
# Create the randomized search estimator
from sklearn.model_selection import RandomizedSearchCV
random_clf = RandomizedSearchCV(lr, param_grid, random_state=42, verbose=3)

In [19]:
# Fit the model by using the randomized search estimator.
random_clf.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END ...C=364, max_iter=300, solver=sag;, score=0.863 total time=   1.8s
[CV 2/5] END ...C=364, max_iter=300, solver=sag;, score=0.863 total time=   1.5s
[CV 3/5] END ...C=364, max_iter=300, solver=sag;, score=0.863 total time=   1.5s
[CV 4/5] END ...C=364, max_iter=300, solver=sag;, score=0.864 total time=   1.6s
[CV 5/5] END ...C=364, max_iter=300, solver=sag;, score=0.863 total time=   1.8s
[CV 1/5] END ..C=44, max_iter=100, solver=lbfgs;, score=0.863 total time=   0.1s
[CV 2/5] END ..C=44, max_iter=100, solver=lbfgs;, score=0.863 total time=   0.1s
[CV 3/5] END ..C=44, max_iter=100, solver=lbfgs;, score=0.863 total time=   0.1s
[CV 4/5] END ..C=44, max_iter=100, solver=lbfgs;, score=0.864 total time=   0.1s
[CV 5/5] END ..C=44, max_iter=100, solver=lbfgs;, score=0.863 total time=   0.1s
[CV 1/5] END ...C=270, max_iter=300, solver=sag;, score=0.863 total time=   1.8s
[CV 2/5] END ...C=270, max_iter=300, solver=sag;

In [18]:
# List the best parameters for this dataset
print(random_clf.best_params_)

{'solver': 'saga', 'max_iter': 300, 'C': 260}


In [19]:
# Make predictions with the hypertuned model
random_tuned_pred = random_clf.predict(X_test_scaled)

In [20]:
# Calculate the classification report
print(classification_report(y_test, random_tuned_pred,
                            target_names=target_names))

              precision    recall  f1-score   support

    negative       0.88      0.98      0.93     54657
    positive       0.54      0.16      0.25      8763

    accuracy                           0.87     63420
   macro avg       0.71      0.57      0.59     63420
weighted avg       0.83      0.87      0.83     63420



In [21]:
random_clf.best_estimator_

In [22]:
print(f"Train Accuracy: {random_clf.score(X_train_scaled, y_train)}")
print(f"Test Accuracy: {random_clf.score(X_test_scaled, y_test)}")

Train Accuracy: 0.8630085146641439
Test Accuracy: 0.8650898770104068


In [23]:
lr2 = LogisticRegression(C=260, max_iter=300, random_state=42, solver='saga')
lr2.fit(X_train_scaled, y_train)
print(f"Train Accuracy: {lr2.score(X_train_scaled, y_train)}")
print(f"Test Accuracy: {lr2.score(X_test_scaled, y_test)}")

Train Accuracy: 0.8630085146641439
Test Accuracy: 0.8650898770104068


Train Accuracy: 0.8630085146641439
Test Accuracy: 0.8650898770104068

In [24]:
lr3 = LogisticRegression(penalty='l2', solver='sag', max_iter=10, random_state=42)
lr3.fit(X_train_scaled, y_train)
print(f"Train Accuracy: {lr3.score(X_train_scaled, y_train)}")
print(f"Test Accuracy: {lr3.score(X_test_scaled, y_test)}")

Train Accuracy: 0.8627404604225797
Test Accuracy: 0.865042573320719


#### The identical accuracy tells is that the Model has reached a global point of convergence.

## GRANDE model

In [27]:
#  Try GRANDE model to attempt an accuracy above 86%
# dataset = openml.datasets.get_dataset(40536)
# X, y, categorical_indicator, attribute_names = dataset.get_data(target=dataset.default_target_attribute)
# categorical_feature_indices = [idx for idx, idx_bool in enumerate(categorical_indicator) if idx_bool]

# X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# X_train, X_valid, y_train, y_valid = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

params = {
        'depth': 5, # tree depth
        'n_estimators': 2048, # number of estimators / trees

        'learning_rate_weights': 0.005, # learning rate for leaf weights
        'learning_rate_index': 0.01, # learning rate for split indices
        'learning_rate_values': 0.01, # learning rate for split values
        'learning_rate_leaf': 0.01, # learning rate for leafs (logits)

        'optimizer': 'adam', # optimizer
        'cosine_decay_steps': 0, # decay steps for lr schedule (CosineDecayRestarts)

        'loss': 'crossentropy', # loss function (default 'crossentropy' for binary & multi-class classification and 'mse' for regression)
        'focal_loss': False, # use focal loss {True, False}
        'temperature': 0.0, # temperature for stochastic re-weighted GD (0.0, 1.0)

        'from_logits': True, # use logits for weighting {True, False}
        'use_class_weights': True, # use class weights for training {True, False}

        'dropout': 0.0, # dropout rate (here, dropout randomly disables individual estimators of the ensemble during training)

        'selected_variables': 0.8, # feature subset percentage (0.0, 1.0)
        'data_subset_fraction': 1.0, # data subset percentage (0.0, 1.0)
}

args = {
    'epochs': 1_000, # number of epochs for training
    'early_stopping_epochs': 25, # patience for early stopping (best weights are restored)
    'batch_size': 64,  # batch size for training

    'cat_idx': [], # put list of categorical indices
    'objective': 'binary', # objective / task {'binary', 'classification', 'regression'}
    
    'random_seed': 42,
    'verbose': 1,       
}

### This Panel Takes Awhile

In [28]:
model_grande = GRANDE(params=params, args=args)

model_grande.fit(X_train=X_train_scaled,
          y_train=y_train.values,
          X_val=X_test_scaled,
          y_val=y_test.values
          )

preds_grande = model_grande.predict(X_test_scaled)

Epoch 1/1000
[1m2973/2973[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2182s[0m 733ms/step - loss: 0.3760 - val_loss: 0.3105
Epoch 2/1000
[1m2973/2973[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2192s[0m 737ms/step - loss: 0.3207 - val_loss: 0.3097
Epoch 3/1000
[1m2973/2973[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2180s[0m 733ms/step - loss: 0.3198 - val_loss: 0.3094
Epoch 4/1000
[1m2973/2973[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2182s[0m 734ms/step - loss: 0.3195 - val_loss: 0.3092
Epoch 5/1000
[1m2973/2973[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2195s[0m 738ms/step - loss: 0.3191 - val_loss: 0.3091
Epoch 6/1000
[1m2973/2973[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2198s[0m 739ms/step - loss: 0.3188 - val_loss: 0.3091
Epoch 7/1000
[1m2973/2973[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2215s[0m 745ms/step - loss: 0.3186 - val_loss: 0.3090
Epoch 8/1000
[1m2973/2973[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2215s[0m 745ms/step - 

### Unable to get this to run

In [31]:
accuracy = sklearn.metrics.accuracy_score(y_test, np.round(preds_grande[:,1]))
f1_score = sklearn.metrics.f1_score(y_test, np.round(preds_grande[:,1]), average='macro')
roc_auc = sklearn.metrics.roc_auc_score(y_test, preds_grande[:,1], average='macro')
    
print('Accuracy:', accuracy)
print('F1 Score:', f1_score)
print('ROC AUC:', roc_auc)

NameError: name 'sklearn' is not defined

In [32]:
print(f"Train Accuracy: {model_grande.score(X_train_scaled, y_train)}")
print(f"Test Accuracy: {model_grande.score(X_test_scaled, y_test)}")

AttributeError: 'GRANDE' object has no attribute 'score'

### AdaBoostClassifier Model Hypertuning

In [31]:
# Create untuned AdaBoost model and show accuracy
adaboost = AdaBoostClassifier(random_state=42)
adaboost.fit(X_train_scaled, y_train)

print(f"Train Accuracy: {adaboost.score(X_train_scaled, y_train)}")
print(f"Test Accuracy: {adaboost.score(X_test_scaled, y_test)}")

Train Accuracy: 0.8646956795963419
Test Accuracy: 0.8663986124251025


In [36]:
## Train a model without tuning
untuned_y_pred = adaboost.predict(X_test_scaled)
print(classification_report(y_test, untuned_y_pred,
                            target_names=target_names))

              precision    recall  f1-score   support

    negative       0.88      0.97      0.93     54657
    positive       0.55      0.20      0.29      8763

    accuracy                           0.87     63420
   macro avg       0.71      0.58      0.61     63420
weighted avg       0.84      0.87      0.84     63420



In [32]:
#  Hypertune AdaBoost model
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500],
    'learning_rate': np.arange(1, 500)
}
param_grid

{'n_estimators': [50, 100, 200, 300, 400, 500],
 'learning_rate': array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
         92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
        105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
        118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
        131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
        157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 1

In [33]:
random_clf = RandomizedSearchCV(adaboost, param_grid, random_state=42, verbose=3)

In [34]:
# Fit the model by using the randomized search estimator.
random_clf.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END learning_rate=144, n_estimators=200;, score=0.360 total time=   0.2s
[CV 2/5] END learning_rate=144, n_estimators=200;, score=0.355 total time=   0.2s
[CV 3/5] END learning_rate=144, n_estimators=200;, score=0.359 total time=   0.2s
[CV 4/5] END learning_rate=144, n_estimators=200;, score=0.356 total time=   0.2s
[CV 5/5] END learning_rate=144, n_estimators=200;, score=0.361 total time=   0.2s
[CV 1/5] END learning_rate=216, n_estimators=400;, score=0.360 total time=   0.1s
[CV 2/5] END learning_rate=216, n_estimators=400;, score=0.355 total time=   0.1s
[CV 3/5] END learning_rate=216, n_estimators=400;, score=0.359 total time=   0.1s
[CV 4/5] END learning_rate=216, n_estimators=400;, score=0.356 total time=   0.1s
[CV 5/5] END learning_rate=216, n_estimators=400;, score=0.361 total time=   0.1s
[CV 1/5] END learning_rate=189, n_estimators=200;, score=0.360 total time=   0.1s
[CV 2/5] END learning_rate=189, n_est

In [24]:
# List the best parameters for this dataset
print(random_clf.best_params_)

{'n_estimators': 50, 'learning_rate': 56}


In [26]:
# Make predictions with the hypertuned model
random_tuned_pred = random_clf.predict(X_test_scaled)

In [27]:
# Calculate the classification report
print(classification_report(y_test, random_tuned_pred,
                            target_names=target_names))

              precision    recall  f1-score   support

    negative       0.86      1.00      0.93     54657
    positive       0.00      0.00      0.00      8763

    accuracy                           0.86     63420
   macro avg       0.43      0.50      0.46     63420
weighted avg       0.74      0.86      0.80     63420



In [30]:
print(f"Train Accuracy: {random_clf.score(X_train_scaled, y_train)}")
print(f"Test Accuracy: {random_clf.score(X_test_scaled, y_test)}")

Train Accuracy: 0.8602806685588142
Test Accuracy: 0.8618259224219489


Untuned AdaBoost Model:

Train Accuracy: 0.8646956795963419

Test Accuracy: 0.8663986124251025

### XGBoost Model Hypertuning

In [37]:
#  Try XG Boost model to attempt an accuracy above 86%
xg = XGBClassifier(random_state=42)
xg.fit(X_train_scaled, y_train)
print(f"Train Accuracy: {xg.score(X_train_scaled, y_train)}")
print(f"Test Accuracy: {xg.score(X_test_scaled, y_test)}")

Train Accuracy: 0.8764480185009986
Test Accuracy: 0.8657205928729107


In [46]:
## Train a model without tuning
untuned_y_pred = xg.predict(X_test_scaled)
print(classification_report(y_test, untuned_y_pred,
                            target_names=target_names))

              precision    recall  f1-score   support

    negative       0.88      0.98      0.93     54657
    positive       0.55      0.17      0.26      8763

    accuracy                           0.87     63420
   macro avg       0.71      0.57      0.59     63420
weighted avg       0.83      0.87      0.83     63420



In [47]:
# Hypertune the XGBoost model
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500],
    'max_depth': np.arange(1, 5),
    'max_leaves': np.arange(1, 5),
    'min_child_weight': np.arange(1, 5),
    'subsample': np.arange(0.1, 1, 0.1),
}
param_grid

{'n_estimators': [50, 100, 200, 300, 400, 500],
 'max_depth': array([1, 2, 3, 4]),
 'max_leaves': array([1, 2, 3, 4]),
 'min_child_weight': array([1, 2, 3, 4]),
 'subsample': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])}

In [48]:
random_clf = RandomizedSearchCV(xg, param_grid, random_state=42, verbose=3)

In [49]:
# Fit the model by using the randomized search estimator.
random_clf.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END max_depth=4, max_leaves=3, min_child_weight=3, n_estimators=400, subsample=0.7000000000000001;, score=0.866 total time=   2.4s
[CV 2/5] END max_depth=4, max_leaves=3, min_child_weight=3, n_estimators=400, subsample=0.7000000000000001;, score=0.867 total time=   2.0s
[CV 3/5] END max_depth=4, max_leaves=3, min_child_weight=3, n_estimators=400, subsample=0.7000000000000001;, score=0.867 total time=   2.0s
[CV 4/5] END max_depth=4, max_leaves=3, min_child_weight=3, n_estimators=400, subsample=0.7000000000000001;, score=0.867 total time=   2.0s
[CV 5/5] END max_depth=4, max_leaves=3, min_child_weight=3, n_estimators=400, subsample=0.7000000000000001;, score=0.866 total time=   2.0s
[CV 1/5] END max_depth=1, max_leaves=4, min_child_weight=4, n_estimators=500, subsample=0.6;, score=0.866 total time=   2.4s
[CV 2/5] END max_depth=1, max_leaves=4, min_child_weight=4, n_estimators=500, subsample=0.6;, score=0.865 total ti

In [42]:
# List the best parameters for this dataset
print(random_clf.best_params_)

{'n_estimators': 300, 'max_leaves': 4, 'max_depth': 3}


In [43]:
# Make predictions with the hypertuned model
random_tuned_pred = random_clf.predict(X_test_scaled)

In [44]:
# Calculate the classification report
print(classification_report(y_test, random_tuned_pred,
                            target_names=target_names))

              precision    recall  f1-score   support

    negative       0.88      0.98      0.93     54657
    positive       0.57      0.17      0.26      8763

    accuracy                           0.87     63420
   macro avg       0.73      0.57      0.59     63420
weighted avg       0.84      0.87      0.84     63420



In [50]:
confusion_matrix(y_test, random_tuned_pred)

array([[53557,  1100],
       [ 7286,  1477]], dtype=int64)

In [45]:
print(f"Train Accuracy: {random_clf.score(X_train_scaled, y_train)}")
print(f"Test Accuracy: {random_clf.score(X_test_scaled, y_test)}")

Train Accuracy: 0.8676127404604226
Test Accuracy: 0.8677704194260486


Train Accuracy: 0.8764480185009986

Test Accuracy: 0.8657205928729107