In [13]:
pip install pandas numpy matplotlib seaborn scikit-learn xgboost lightgbm catboost

Note: you may need to restart the kernel to use updated packages.


In [14]:
# Diabetes Model Comparison

# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [15]:
# Load your own dataset
df = pd.read_csv('diabetes dataset pbl.csv')  # Change the filename if needed
print("Dataset Loaded. Shape:", df.shape)
print("Columns:", df.columns.tolist())


Dataset Loaded. Shape: (768, 9)
Columns: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']


In [16]:
# Step 3: Feature and Target
X = df.drop('Outcome', axis=1)
y = df['Outcome']

In [17]:
# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# Step 5: Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [19]:
# Step 6: Model Definitions
models = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'LightGBM': LGBMClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0)
}

In [20]:
# Step 7: Training and Evaluating Models
results = []
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    cv_score = cross_val_score(model, X, y, cv=5).mean()
    results.append({
        'Model': name,
        'Accuracy (Test Set)': round(accuracy * 100, 2),
        'Cross-Val Accuracy (CV=5)': round(cv_score * 100, 2)
    })

[LightGBM] [Info] Number of positive: 213, number of negative: 401
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000089 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 668
[LightGBM] [Info] Number of data points in the train set: 614, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.346906 -> initscore=-0.632669
[LightGBM] [Info] Start training from score -0.632669
[LightGBM] [Info] Number of positive: 214, number of negative: 400
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000122 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 667
[LightGBM] [Info] Number of data points in the train set: 614, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.348534 -> initscore=-0.625489
[LightGBM] [Info] Start training from score -0.625489
[LightGBM] [Info] Number of 

In [21]:
# Step 8: Results as DataFrame
results_df = pd.DataFrame(results).sort_values(by='Cross-Val Accuracy (CV=5)', ascending=False)
print(results_df)

                 Model  Accuracy (Test Set)  Cross-Val Accuracy (CV=5)
5             CatBoost                75.32                      77.09
0  Logistic Regression                75.32                      76.96
2        Random Forest                74.68                      76.44
1                  SVM                73.38                      75.91
4             LightGBM                70.78                      74.74
3              XGBoost                72.08                      74.10
