253,680 survey responses from cleaned BRFSS 2015 + balanced dataset

Input:
<table border="1" cellspacing="0" cellpadding="5">
  <tr>
    <th>Column</th>
    <th>Type / Range</th>
    <th>Description</th>
  </tr>
  <tr><td>HighBP</td><td>int [0,1]</td><td>Have high blood pressure</td></tr>
  <tr><td>HighChol</td><td>int [0,1]</td><td>Have high cholesterol</td></tr>
  <tr><td>CholCheck</td><td>int [0,1]</td><td>Cholesterol checked in the past 5 years</td></tr>
  <tr><td>BMI</td><td>int [0,1]</td><td>Body mass index</td></tr>
  <tr><td>Smoker</td><td>int [0,1]</td><td>Have you smoked at least 100 cigarettes in your entire life? (5 packs = 100)</td></tr>
  <tr><td>Stroke</td><td>int [0,1]</td><td>You had a stroke</td></tr>
  <tr><td>HeartDiseaseorAttack</td><td>int [0,1]</td><td>Coronary heart disease (CHD) or myocardial infarction (MI)</td></tr>
  <tr><td>PhysActivity</td><td>int [0,1]</td><td>Physical activity in past 30 days</td></tr>
  <tr><td>Fruits</td><td>int [0,1]</td><td>Consume at least 1 fruit per day</td></tr>
  <tr><td>Veggies</td><td>int [0,1]</td><td>Consume at least 1 vegetable per day</td></tr>
  <tr><td>HvyAlcoholConsump</td><td>int [0,1]</td><td>Men: &gt;14 drinks/week | Women: &gt;7 drinks/week</td></tr>
  <tr><td>AnyHealthCare</td><td>int [0,1]</td><td>Have any kind of health care coverage (insurance, HMO, etc.)</td></tr>
  <tr><td>NoDocbcCost</td><td>int [0,1]</td><td>Couldn't see a doctor in the past 12 months because of cost</td></tr>
  <tr><td>GenHlth</td><td>int [1-5]</td><td>General health (1=Excellent, 2=Very good, 3=Good, 4=Fair, 5=Poor)</td></tr>
  <tr><td>DiffWalk</td><td>int [0,1]</td><td>Serious difficulty walking or climbing stairs</td></tr>
  <tr><td>Sex</td><td>int [0,1]</td><td>Male or female</td></tr>
  <tr><td>Age</td><td>int [1-13]</td><td>13-level age category</td></tr>
</table>

Output:
<table border="1" cellspacing="0" cellpadding="5">
  <tr>
    <th>Column</th>
    <th>Type / Range</th>
    <th>Description</th>
  </tr>
  <tr><td>hasDiabetes</td><td>int [0,1,2]</td><td>target (0 = no diabetes, 1 = prediabetes, 2 = diabetes)</td></tr>
</table>

Age category:
<table border="1" cellspacing="0" cellpadding="5">
    <tr><th>Value</th><th>Age Range</th></tr>
    <tr><td>1</td><td>18-24</td></tr>       
    <tr><td>2</td><td>25-29</td></tr>
    <tr><td>3</td><td>30-34</td></tr>
    <tr><td>4</td><td>35-39</td></tr>
    <tr><td>5</td><td>40-44</td></tr>
    <tr><td>6</td><td>45-49</td></tr>
    <tr><td>7</td><td>50-54</td></tr>
    <tr><td>8</td><td>55-64</td></tr>
    <tr><td>9</td><td>60-64</td></tr>
    <tr><td>10</td><td>65-69</td></tr>
    <tr><td>11</td><td>70-74</td></tr>
    <tr><td>12</td><td>75-79</td></tr>
    <tr><td>13</td><td>80+</td></tr>
</table>

In [48]:
import pandas as pd
from joblib import dump
from datetime import datetime as dt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

In [52]:
df = pd.read_csv('../dataset/diabetes_binary_5050split_health_indicators_BRFSS2015.csv')
df.head(5)

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0
1,0.0,1.0,1.0,1.0,26.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,8.0
2,0.0,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0,6.0,8.0
3,0.0,1.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,0.0,0.0,0.0,1.0,29.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0


In [53]:
input, output = [
    'HighBP', 
    'HighChol', 
    'CholCheck', 
    'BMI', 
    'Smoker',
    'Stroke', 
    'HeartDiseaseorAttack', 
    'PhysActivity', 
    'Fruits', 
    'Veggies',
    'HvyAlcoholConsump', 
    'AnyHealthcare', 
    'NoDocbcCost', 
    'GenHlth',
    # 'MentHlth',
    # 'PhysHlth',
    'DiffWalk',
    'Sex',
    'Age',
    # 'Education',
    # 'Income',
], [
    'Diabetes_binary'
]

X, y = df[input], df[output]

In [56]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [57]:
models = {
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=1000),
        'params': {
            'model__C': [0.01, 0.1, 1, 10],
            'model__solver': ['lbfgs', 'liblinear']
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': {
            'model__n_estimators': [100, 200],
            'model__max_depth': [5, 10, 20, None]
        }
    },
    'SVM': {
        'model': SVC(),
        'params': {
            'model__C': [0.1, 1, 10],
            'model__kernel': ['linear', 'rbf']
        }
    }
}

In [67]:
results = []
grids = {}

for name, cfg in models.items():
    pl = Pipeline([
        ('scaler', StandardScaler()),
        ('model', cfg['model'])
    ])
    
    grid = GridSearchCV(pl, cfg['params'], cv=5, n_jobs=-1, scoring='accuracy')
    grid.fit(X_train, y_train)
    grids[name] = grid
    
    y_pred = grid.predict(X_test)
    acc = accuracy_score(y_test, y_pred) * 100
    best_cv = grid.best_score_ * 100

    results.append({
        'Model': name,
        'Best Params': grid.best_params_,
        'Best CV Score (%)': round(best_cv, 2),
        'Test Accuracy (%)': round(acc, 2)
    })
    
    print(f"\n{name}")
    print("Best params:", grid.best_params_)
    print(f"Accuracy: {acc:.2f}%")
    
    # Get classification report as dict
    report = classification_report(y_test, y_pred, output_dict=True)
    
    print("class | precision | recall | f1-score")
    for label, metrics in report.items():
        if isinstance(metrics, dict):
            p = metrics['precision'] * 100
            r = metrics['recall'] * 100
            f = metrics['f1-score'] * 100
            print(f"{label} | {p:.2f}% | {r:.2f}% | {f:.2f}%")

results_df = pd.DataFrame(results)
print("\nSummary")
print(results_df)

  y = column_or_1d(y, warn=True)



LogisticRegression
Best params: {'model__C': 0.1, 'model__solver': 'liblinear'}
Accuracy: 74.74%
class | precision | recall | f1-score
0.0 | 76.40% | 72.74% | 74.53%
1.0 | 73.18% | 76.80% | 74.94%
macro avg | 74.79% | 74.77% | 74.74%
weighted avg | 74.82% | 74.74% | 74.73%


  return fit_method(estimator, *args, **kwargs)



RandomForest
Best params: {'model__max_depth': 10, 'model__n_estimators': 200}
Accuracy: 75.05%
class | precision | recall | f1-score
0.0 | 78.26% | 70.46% | 74.16%
1.0 | 72.34% | 79.79% | 75.88%
macro avg | 75.30% | 75.12% | 75.02%
weighted avg | 75.35% | 75.05% | 75.01%


  y = column_or_1d(y, warn=True)



SVM
Best params: {'model__C': 0.1, 'model__kernel': 'rbf'}
Accuracy: 74.74%
class | precision | recall | f1-score
0.0 | 79.07% | 68.37% | 73.34%
1.0 | 71.34% | 81.31% | 76.00%
macro avg | 75.21% | 74.84% | 74.67%
weighted avg | 75.27% | 74.74% | 74.65%

Summary
                Model                                        Best Params  \
0  LogisticRegression    {'model__C': 0.1, 'model__solver': 'liblinear'}   
1        RandomForest  {'model__max_depth': 10, 'model__n_estimators'...   
2                 SVM          {'model__C': 0.1, 'model__kernel': 'rbf'}   

   Best CV Score (%)  Test Accuracy (%)  
0              74.71              74.74  
1              75.07              75.05  
2              74.93              74.74  


In [68]:
best_model_info = max(results, key=lambda x: x['Test Accuracy (%)'])
best_model_name = best_model_info['Model']
filename = f"trainedModels/diabetes_indicator_{best_model_name}_" + dt.now().strftime('%d-%m-%Y_%H-%M-%S') + ".pkl"

joblib.dump(grids[best_model_name].best_estimator_, filename)
print(f"The model '{best_model_name}' was saved as {filename}")

The model 'RandomForest' was saved as diabete_indicator_RandomForest_01-11-2025_20-18-05.pkl
