In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [3]:
df = pd.read_csv("diabetes.csv")
df.head()


Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0,1,1,1,40,1,0,0,0,0,...,1,0,5,18,15,1,0,9,4,3
1,0,0,0,0,25,1,0,0,1,0,...,0,1,3,0,0,0,0,7,6,1
2,0,1,1,1,28,0,0,0,0,1,...,1,1,5,30,30,1,0,9,4,8
3,0,1,0,1,27,0,0,0,1,1,...,1,0,2,0,0,0,0,11,3,6
4,0,1,1,1,24,0,0,0,1,1,...,1,0,2,3,0,0,0,11,5,4


In [4]:
# Drop weak features
df_cleaned = df.drop(columns=['HvyAlcoholConsump', 'Sex', 'MentHlth'])

X = df_cleaned.drop(columns=['Diabetes_012'])
y = df_cleaned['Diabetes_012']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
models = {
    "Random Forest": (RandomForestClassifier(random_state=42), {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None]
    }),
    "SVM": (SVC(), {
        'C': [0.1, 1, 10],
        'kernel': ['rbf', 'linear']
    }),
    "Logistic Regression": (LogisticRegression(max_iter=1000), {
        'C': [0.1, 1, 10],
        'penalty': ['l2']
    }),
    "Naive Bayes": (GaussianNB(), {}),
    "KNN": (KNeighborsClassifier(), {
        'n_neighbors': [3, 5, 7, 9]
    })
}

accuracies = {}
conf_matrices = {}

In [7]:
for name, (model, params) in models.items():
    print(f"\n🔍 Training {name}")
    
    if name in ['Naive Bayes']:
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        best_model = model
    else:
        grid = GridSearchCV(model, params, cv=3, n_jobs=-1)
        data_train = X_train_scaled if name in ['SVM', 'Logistic Regression', 'KNN'] else X_train
        data_test = X_test_scaled if name in ['SVM', 'Logistic Regression', 'KNN'] else X_test
        grid.fit(data_train, y_train)
        preds = grid.predict(data_test)
        best_model = grid.best_estimator_
        print("  Best Parameters:", grid.best_params_)
    
    acc = accuracy_score(y_test, preds)
    print("  Accuracy:", acc)
    accuracies[name] = acc
    conf_matrices[name] = confusion_matrix(y_test, preds)


🔍 Training Random Forest
  Best Parameters: {'max_depth': 10, 'n_estimators': 100}
  Accuracy: 0.8490223904131189

🔍 Training SVM


KeyboardInterrupt: 