<a href="https://colab.research.google.com/github/its-snowy/Comparative-Analysis-of-Machine-Learning-Models-for-Diabetes-Prediction/blob/main/Diabetes_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

In [2]:
data = pd.read_csv('diabetes_012_health_indicators_BRFSS2015.csv')
data = data.drop(columns=[
    'CholCheck',
    'AnyHealthcare',
    'NoDocbcCost',
    'DiffWalk',
    'MentHlth',
    'PhysHlth',
    'Education',
    'Income'
])

X = data.drop('Diabetes_012', axis=1)
y = data['Diabetes_012']

In [None]:
unique_ages = data['Age'].unique()
print(unique_ages)


[ 9.  7. 11. 10.  8. 13.  4.  6.  2. 12.  5.  1.  3.]


In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
X

Unnamed: 0,HighBP,HighChol,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,Sex,Age
0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,5.0,0.0,9.0
1,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,7.0
2,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0,0.0,9.0
3,1.0,0.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,11.0
4,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,1.0,1.0,45.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,3.0,1.0,5.0
253676,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,11.0
253677,0.0,0.0,28.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,2.0
253678,1.0,0.0,23.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,3.0,1.0,7.0


In [5]:
# Define models
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': LinearSVC(max_iter=10000)
}

In [6]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    print(f'Training {name}...')

    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=kf, scoring='accuracy')

    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)

    accuracy = accuracy_score(y_test, y_pred)

    print(f'{name} Cross-Validation Accuracy (mean): {cv_scores.mean():.4f}')
    print(f'{name} Accuracy on Test Set: {accuracy:.4f}')
    print(classification_report(y_test, y_pred))
    print('-----------------------------------')



Training Logistic Regression...
Logistic Regression Cross-Validation Accuracy (mean): 0.8463
Logistic Regression Accuracy on Test Set: 0.8454


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

         0.0       0.86      0.98      0.91     42741
         1.0       0.00      0.00      0.00       926
         2.0       0.52      0.17      0.25      7069

    accuracy                           0.85     50736
   macro avg       0.46      0.38      0.39     50736
weighted avg       0.80      0.85      0.81     50736

-----------------------------------
Training Decision Tree...
Decision Tree Cross-Validation Accuracy (mean): 0.8037
Decision Tree Accuracy on Test Set: 0.8056
              precision    recall  f1-score   support

         0.0       0.87      0.92      0.89     42741
         1.0       0.03      0.02      0.02       926
         2.0       0.34      0.24      0.28      7069

    accuracy                           0.81     50736
   macro avg       0.41      0.39      0.40     50736
weighted avg       0.78      0.81      0.79     50736

-----------------------------------
Training Random Forest...
Random Forest Cr

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
import joblib
bestModel = max(models, key=lambda x: cross_val_score(models[x], X_train_scaled, y_train, cv=kf, scoring='accuracy').mean())
print(f'Best model: {bestModel}')

finalModel = models[bestModel]

joblib.dump(finalModel, 'best_diabetes_model.joblib')
joblib.dump(scaler, 'scaler.joblib')

Best model: Logistic Regression


['scaler.joblib']