In [1]:
#Import Libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC



from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score



In [2]:
pip install xgboost


Note: you may need to restart the kernel to use updated packages.


In [3]:
#Load the Dataset
df = pd.read_csv("diabetes.csv")
#Display the first few rows of the dataset
df.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
#Understand the Dataset
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
#Handle Missing / Invalid Values
# Check for zeros in columns where they are not valid
columns_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for column in columns_with_zeros:
    df[column].replace(0, np.nan, inplace=True)
# Impute missing values with the mean of the column
for column in columns_with_zeros:
    df[column].fillna(df[column].median(), inplace=True)
#Verify no missing values remain
df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].replace(0, np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [6]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,125.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,125.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,125.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [7]:
#Separate Features (X) and Target (y)
X = df.drop('Outcome', axis=1)
y = df['Outcome']

In [8]:
# split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


MODEL TRAINING

In [9]:
#logistic Regression

logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_scaled, y_train)

logreg_pred = logreg.predict(X_test_scaled)
logreg_prob = logreg.predict_proba(X_test_scaled)[:, 1]

In [10]:
#svm Classifier
svm_clf = SVC(probability=True)
svm_clf.fit(X_train_scaled, y_train)
svm_pred = svm_clf.predict(X_test_scaled)
svm_prob = svm_clf.predict_proba(X_test_scaled)[:, 1]

In [11]:
#decision Tree Classifier
dt_clf = DecisionTreeClassifier(max_depth=6,min_samples_split=10,random_state=42)
dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)
dt_prob = dt_clf.predict_proba(X_test)[:, 1]

In [12]:
#random Forest Classifier
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
rf_prob = rf_clf.predict_proba(X_test)[:, 1]

In [13]:
#xgboost Classifier
from xgboost import XGBClassifier
import xgboost as xgb
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_clf.fit(X_train, y_train)
xgb_pred = xgb_clf.predict(X_test)
xgb_prob = xgb_clf.predict_proba(X_test)[:, 1]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [14]:
def evaluate_model(name, y_test, y_pred, y_prob):
    print(f"\n{name}")
    print("Accuracy :", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall   :", recall_score(y_test, y_pred))
    print("F1 Score :", f1_score(y_test, y_pred))
    print("ROC-AUC  :", roc_auc_score(y_test, y_prob))

In [15]:
#Evaluate All Models
evaluate_model("Logistic Regression", y_test, logreg_pred, logreg_prob)
evaluate_model("SVM Classifier", y_test, svm_pred, svm_prob)
evaluate_model("Decision Tree Classifier", y_test, dt_pred, dt_prob)
evaluate_model("Random Forest Classifier", y_test, rf_pred, rf_prob)
evaluate_model("XGBoost Classifier", y_test, xgb_pred, xgb_prob)


Logistic Regression
Accuracy : 0.7532467532467533
Precision: 0.6666666666666666
Recall   : 0.6181818181818182
F1 Score : 0.6415094339622641
ROC-AUC  : 0.82277318640955

SVM Classifier
Accuracy : 0.7467532467532467
Precision: 0.6666666666666666
Recall   : 0.5818181818181818
F1 Score : 0.6213592233009708
ROC-AUC  : 0.8086317722681359

Decision Tree Classifier
Accuracy : 0.7272727272727273
Precision: 0.5942028985507246
Recall   : 0.7454545454545455
F1 Score : 0.6612903225806451
ROC-AUC  : 0.7907254361799816

Random Forest Classifier
Accuracy : 0.7337662337662337
Precision: 0.6206896551724138
Recall   : 0.6545454545454545
F1 Score : 0.6371681415929203
ROC-AUC  : 0.82910927456382

XGBoost Classifier
Accuracy : 0.7142857142857143
Precision: 0.5873015873015873
Recall   : 0.6727272727272727
F1 Score : 0.6271186440677966
ROC-AUC  : 0.7774104683195593


In [16]:
#compare the models based on their performance metrics and choose the best one for deployment or further tuning.
results = pd.DataFrame({
    "Model": ["Logistic Regression","SVM", "Decision Tree Classifier", "Random Forest Classifier", "XGBoost Classifier"],
    "Accuracy": [
        accuracy_score(y_test, logreg_pred),
        accuracy_score(y_test, dt_pred),
        accuracy_score(y_test, rf_pred),
        accuracy_score(y_test, svm_pred),
        accuracy_score(y_test, xgb_pred)
    ],
    "ROC-AUC": [
        roc_auc_score(y_test, logreg_prob),
        roc_auc_score(y_test, dt_prob),
        roc_auc_score(y_test, rf_prob),
        roc_auc_score(y_test, svm_prob),
        roc_auc_score(y_test, xgb_prob)
    ]
})

results



Unnamed: 0,Model,Accuracy,ROC-AUC
0,Logistic Regression,0.753247,0.822773
1,SVM,0.727273,0.790725
2,Decision Tree Classifier,0.733766,0.829109
3,Random Forest Classifier,0.746753,0.808632
4,XGBoost Classifier,0.714286,0.77741


In [17]:
#combine all methods to get a good result 
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(
    estimators=[
        ('logreg', logreg),
        ('svm', svm_clf),
        ('dt', dt_clf),
        ('rf', rf_clf),
        #('xgb', xgb_clf)

    ],
    voting='soft'
)
voting_clf.fit(X_train_scaled, y_train)
voting_pred = voting_clf.predict(X_test_scaled)
voting_prob = voting_clf.predict_proba(X_test_scaled)[:, 1]
evaluate_model("Voting Classifier", y_test, voting_pred, voting_prob)




Voting Classifier
Accuracy : 0.7597402597402597
Precision: 0.6666666666666666
Recall   : 0.6545454545454545
F1 Score : 0.6605504587155964
ROC-AUC  : 0.8426078971533517
