In [1]:
import pandas as pd
import numpy as np
df=pd.read_csv("C:\\Users\\KANISHKA SHARMA\\Downloads\\archive (3)\\diabetes.csv")
zero_cols=["Glucose","BloodPressure","SkinThickness","Insulin","BMI"]

df[zero_cols]=df[zero_cols].replace(0,np.nan)
df[zero_cols]=df[zero_cols].fillna(df[zero_cols].median())

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
df[["Glucose_scaled","BMI_Scaled"]]=scaler.fit_transform(df[["Glucose","BMI"]])
df["Metabolic_risk"]=(0.6*df["Glucose"]+0.4*df["BMI"])
df.drop(["Glucose_scaled", "BMI_Scaled"], axis=1, inplace=True)

X = df.drop("Outcome", axis=1)
y = df["Outcome"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)






In [2]:
#Using Logistic Regression as base model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score,recall_score,precision_score,roc_auc_score)
log_clf=LogisticRegression(max_iter=1000,class_weight="balanced")#In this place we used class_weight = "balanced", as in this way the wieght of minoity class is increased,leading to High recall for the positive outcome
log_clf.fit(X_train_scaled,y_train)
y_pred_log = log_clf.predict(X_test_scaled)
y_prob_log = log_clf.predict_proba(X_test_scaled)[:, 1]


In [3]:
#now let us review scores for our base model
print("Accuracy : ",accuracy_score(y_test,y_pred_log))
print("Precision : ",precision_score(y_test,y_pred_log))
print("Recall ",recall_score(y_test,y_pred_log))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_log))

Accuracy :  0.7337662337662337
Precision :  0.6031746031746031
Recall  0.7037037037037037
ROC-AUC: 0.812037037037037


In [4]:
#now our svm model
svm_clf=SVC(kernel="rbf",probability=True,class_weight="balanced")
svm_clf.fit(X_train_scaled,y_train)
y_pred_svm=svm_clf.predict(X_test_scaled)
y_prob_svm=svm_clf.predict_proba(X_test_scaled)[:,1]


In [5]:
print("Accuracy : ",accuracy_score(y_test,y_pred_svm))
print("Precision : ",precision_score(y_test,y_pred_svm))
print("Recall ",recall_score(y_test,y_pred_svm))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_svm))

Accuracy :  0.7402597402597403
Precision :  0.6060606060606061
Recall  0.7407407407407407
ROC-AUC: 0.8131481481481481


In [6]:
#now let us use random forest for this dataset
from sklearn.ensemble import RandomForestClassifier
rf_clf=RandomForestClassifier(n_estimators=200,max_depth=5,class_weight="balanced",random_state=42)
rf_clf.fit(X_train,y_train)

y_pred_rf=rf_clf.predict(X_test)
y_prob_rf=rf_clf.predict_proba(X_test)[:,1]

In [7]:
print("Accuracy : ",accuracy_score(y_test,y_pred_rf))
print("Precision : ",precision_score(y_test,y_pred_rf))
print("Recall ",recall_score(y_test,y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_rf))

Accuracy :  0.7727272727272727
Precision :  0.6461538461538462
Recall  0.7777777777777778
ROC-AUC: 0.8203703703703703


In [8]:


importances = pd.Series(
    rf_clf.feature_importances_,
    index=X.columns
).sort_values(ascending=False)

importances.head(5)


Metabolic_risk              0.279523
Glucose                     0.235981
BMI                         0.132728
Age                         0.097607
DiabetesPedigreeFunction    0.073704
dtype: float64

In [9]:
from sklearn.ensemble import VotingClassifier
voting_clf=VotingClassifier(estimators=[("svc",svm_clf),("lr",log_clf),("rf",rf_clf)],voting="soft"
                           )
voting_clf.fit(X_train, y_train)
y_pred_voting=voting_clf.predict(X_test)
y_prob_voting=voting_clf.predict_proba(X_test)[:,1]


In [10]:
print("Accuracy : ",accuracy_score(y_test,y_pred_voting))
print("Precision : ",precision_score(y_test,y_pred_voting))
print("Recall ",recall_score(y_test,y_pred_voting))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_voting))

Accuracy :  0.7337662337662337
Precision :  0.6140350877192983
Recall  0.6481481481481481
ROC-AUC: 0.8077777777777778


In [11]:
#Although a soft voting ensemble was constructed, it resulted in reduced recall compared to Random Forest alone. Therefore, Random Forest was retained as the final model due to its superior ability to minimize false negatives.