# Import Relevant Libraries

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import xgboost as xgb



# Import UCI Dataset

In [3]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
cdc_diabetes_health_indicators = fetch_ucirepo(id=891) 
  
# data (as pandas dataframes) 
X = cdc_diabetes_health_indicators.data.features 
y = cdc_diabetes_health_indicators.data.targets 
  
# metadata 
print(cdc_diabetes_health_indicators.metadata) 
  
# variable information 
print(cdc_diabetes_health_indicators.variables) 


{'uci_id': 891, 'name': 'CDC Diabetes Health Indicators', 'repository_url': 'https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators', 'data_url': 'https://archive.ics.uci.edu/static/public/891/data.csv', 'abstract': 'The Diabetes Health Indicators Dataset contains healthcare statistics and lifestyle survey information about people in general along with their diagnosis of diabetes. The 35 features consist of some demographics, lab test results, and answers to survey questions for each patient. The target variable for classification is whether a patient has diabetes, is pre-diabetic, or healthy. ', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Tabular', 'Multivariate'], 'num_instances': 253680, 'num_features': 21, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Sex', 'Age', 'Education Level', 'Income'], 'target_col': ['Diabetes_binary'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_

# Split into Training and Testing + Scale Data

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

results = []


In [5]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score

def train_evaluate_model(name, model, X_test, y_test):
    model.fit(X_train, y_train.values.ravel())
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "ROC AUC Score": roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]),
        "F1 Score": f1_score(y_test, y_pred),
    })


# Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()

train_evaluate_model("Logistic Regression", log_reg, X_test, y_test)

Accuracy: 0.8658940397350994
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.98      0.93     43739
           1       0.55      0.17      0.25      6997

    accuracy                           0.87     50736
   macro avg       0.71      0.57      0.59     50736
weighted avg       0.83      0.87      0.83     50736

Confusion Matrix:
 [[42775   964]
 [ 5840  1157]]
ROC AUC Score: 0.8263596542959626


# Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)

train_evaluate_model("Decision Tree", dt, X_test, y_test)

Accuracy: 0.7985848312835068
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.87      0.88     43739
           1       0.30      0.34      0.31      6997

    accuracy                           0.80     50736
   macro avg       0.59      0.60      0.60     50736
weighted avg       0.81      0.80      0.80     50736

Confusion Matrix:
 [[38172  5567]
 [ 4652  2345]]
ROC AUC Score: 0.6028002898545393


# Random Forest 

In [8]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)
train_evaluate_model("Random Forest", rf, X_test, y_test)

Accuracy: 0.8597642699463891
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.97      0.92     43739
           1       0.48      0.17      0.26      6997

    accuracy                           0.86     50736
   macro avg       0.68      0.57      0.59     50736
weighted avg       0.82      0.86      0.83     50736

Confusion Matrix:
 [[42402  1337]
 [ 5778  1219]]
ROC AUC Score: 0.8015463561718956


# K-Nearest Neighbors

In [9]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors=5)
train_evaluate_model("K-Nearest Neighbors", knn_clf, X_test, y_test)


Accuracy: 0.8468345947650583
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.95      0.91     43739
           1       0.40      0.21      0.27      6997

    accuracy                           0.85     50736
   macro avg       0.64      0.58      0.59     50736
weighted avg       0.82      0.85      0.83     50736

Confusion Matrix:
 [[41498  2241]
 [ 5530  1467]]
ROC AUC Score: 0.7225754416023644


In [10]:
from sklearn.naive_bayes import GaussianNB

nb_clf = GaussianNB()
train_evaluate_model("Naive Bayes", nb_clf, X_test, y_test)

Accuracy: 0.771996215704825
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.81      0.86     43739
           1       0.32      0.57      0.41      6997

    accuracy                           0.77     50736
   macro avg       0.62      0.69      0.63     50736
weighted avg       0.84      0.77      0.80     50736

Confusion Matrix:
 [[35211  8528]
 [ 3040  3957]]
ROC AUC Score: 0.7860311936556714


In [None]:
from xgboost import XGBClassifier

xgb_clf = XGBClassifier(random_state=42, eval_metric='logloss')
train_evaluate_model("XGBoost", xgb_clf, X_test, y_test)

ImportError: cannot import name 'XGBClassifier' from 'xgboost' (unknown location)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(random_state=42)
train_evaluate_model("AdaBoost", ada_clf, X_test, y_test)

Accuracy: 0.8653815830968149
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.97      0.93     43739
           1       0.53      0.19      0.28      6997

    accuracy                           0.87     50736
   macro avg       0.71      0.58      0.60     50736
weighted avg       0.83      0.87      0.84     50736

Confusion Matrix:
 [[42587  1152]
 [ 5678  1319]]
ROC AUC Score: 0.8269608940946472


In [None]:
# Create a DataFrame to display the results
results_df = pd.DataFrame(results)
print(results_df.sort_values(by="Accuracy", ascending=False))

                 Model  Accuracy  ROC AUC Score  F1 Score
5              XGBoost  0.866761       0.832147  0.264098
0  Logistic Regression  0.865894       0.826360  0.253784
6             AdaBoost  0.865382       0.826961  0.278623
2        Random Forest  0.859764       0.801546  0.255208
3  K-Nearest Neighbors  0.846835       0.722575  0.274078
1        Decision Tree  0.798585       0.602800  0.314575
4          Naive Bayes  0.771996       0.786031  0.406221


In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

xgb_clf = XGBClassifier(random_state=42, eval_metric='logloss')

grid = GridSearchCV(
    estimator=xgb_clf, 
    param_grid=param_grid, 
    scoring='roc_auc', 
    cv=3, 
    verbose=1, 
    n_jobs=-1
)

grid.fit(X_train, y_train.values.ravel())

print("Best Parameters:", grid.best_params_)
print("Best ROC AUC Score:", grid.best_score_)

best_xgb_clf = grid.best_estimator_

train_evaluate_model("Tuned XGBoost", best_xgb_clf, X_test, y_test)


NameError: name 'XGBClassifier' is not defined