In [1]:
# source: https://archive.ics.uci.edu/dataset/544/estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition

In [None]:
# =============================================
# STEP 1: IMPORTS
# =============================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, label_binarize
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_auc_score, roc_curve
)
from sklearn.metrics import ConfusionMatrixDisplay

np.random.seed(42)

obesity = pd.read_csv("obesity.csv")
# change the target name from "NObeyesdad" to "target"
obesity.rename(columns={"NObeyesdad": "Target"}, inplace=True)

target_map = {
    "Insufficient_Weight": 0,
    "Normal_Weight": 1,
    "Overweight_Level_I": 2,
    "Overweight_Level_II": 3,
    "Obesity_Type_I": 4,
    "Obesity_Type_II": 5,
    "Obesity_Type_III": 6
}

obesity["Target"] = obesity["Target"].map(target_map)

# =============================================
# STEP 2: DATA SPLIT (your setup)
# =============================================
X = obesity.drop(columns=["Target"])
y = obesity["Target"]

num_features = X.select_dtypes(include=['int64', 'float64']).columns
cat_features = X.select_dtypes(include=['object', 'category']).columns

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# =============================================
# STEP 3: PREPROCESSING
# =============================================
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
    ]
)

# =============================================
# STEP 4: MODEL PIPELINE
# =============================================
xgb_clf = XGBClassifier(
    objective='multi:softprob',
    eval_metric='mlogloss',
    random_state=42
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb_clf)
])

# =============================================
# STEP 5: HYPERPARAMETER TUNING
# =============================================
param_grid = {
    'classifier__n_estimators': [50, 75, 100, 125, 150, 175, 200],
    'classifier__max_depth': [3, 4, 5, 6, 7, 8, 9],
    'classifier__learning_rate': [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2],
    'classifier__subsample': [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1],
    'classifier__colsample_bytree': [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)

print("üîç Running Grid Search for XGBoost hyperparameters...")
grid_search.fit(X_train, y_train)

print("\n‚úÖ Best Parameters:")
print(grid_search.best_params_)
best_model = grid_search.best_estimator_

# =============================================
# STEP 6: PREDICTIONS USING BEST MODEL
# =============================================
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)

# =============================================
# STEP 7: METRICS
# =============================================
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='macro')
rec = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

print("\nüìä Performance Metrics:")
print(f"Accuracy:  {acc:.4f}")
print(f"Precision (macro): {prec:.4f}")
print(f"Recall (macro):    {rec:.4f}")
print(f"F1-score (macro):  {f1:.4f}")

# =============================================
# STEP 8: CONFUSION MATRIX (Publication-Grade)
# =============================================
plt.figure(figsize=(6, 5))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title("Confusion Matrix", fontsize=14, weight='bold')
plt.xlabel("Predicted Label", fontsize=12)
plt.ylabel("True Label", fontsize=12)
plt.tight_layout()
plt.show()

# =============================================
# STEP 9: ROC AUC (One-vs-Rest, Publication-Grade)
# =============================================
n_classes = len(np.unique(y))
y_test_bin = label_binarize(y_test, classes=np.unique(y))

plt.figure(figsize=(7, 6))
for i in range(n_classes):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_proba[:, i])
    auc_score = roc_auc_score(y_test_bin[:, i], y_proba[:, i])
    plt.plot(fpr, tpr, lw=2, label=f'Class {i} (AUC = {auc_score:.3f})')

plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Chance')
plt.title('ROC Curves (One-vs-Rest)', fontsize=16, weight='bold')
plt.xlabel('False Positive Rate', fontsize=13)
plt.ylabel('True Positive Rate', fontsize=13)
plt.legend(loc='lower right', fontsize=10, frameon=True)
plt.grid(alpha=0.3)
plt.tight_layout()

save_path = f"ROC_AUC_XGBoost.png"
plt.savefig(save_path, dpi=300, bbox_inches="tight")
plt.show()

print("\n‚úÖ Model evaluation completed with best-tuned XGBoost classifier.")


üîç Running Grid Search for XGBoost hyperparameters...
Fitting 5 folds for each of 48020 candidates, totalling 240100 fits


In [None]:
# Check train vs testing accuracy
# Example using your pipeline (best_model)
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

from sklearn.metrics import accuracy_score
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy:  {test_acc:.4f}")


In [None]:
# =============================================
# STEP 1: IMPORTS
# =============================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_auc_score, roc_curve
)

# =============================================
# STEP 2: DATA SPLIT (your setup)
# =============================================
# X = obesity.drop(columns=["Target"])
# y = obesity["Target"]

num_features = X.select_dtypes(include=['int64', 'float64']).columns
cat_features = X.select_dtypes(include=['object', 'category']).columns

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# =============================================
# STEP 3: PREPROCESSING
# =============================================
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
    ]
)

# =============================================
# STEP 4: MODEL PIPELINE
# =============================================
xgb_clf = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb_clf)
])

# =============================================
# STEP 5: TRAIN MODEL
# =============================================
pipeline.fit(X_train, y_train)

# =============================================
# STEP 6: PREDICTIONS
# =============================================
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]  # for ROC

# =============================================
# STEP 7: METRICS (Multiclass version)
# =============================================
from sklearn.preprocessing import label_binarize

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='macro')
rec = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

print(f"Accuracy:  {acc:.4f}")
print(f"Precision (macro): {prec:.4f}")
print(f"Recall (macro):    {rec:.4f}")
print(f"F1-score (macro):  {f1:.4f}")

# =============================================
# STEP 8: CONFUSION MATRIX
# =============================================
from sklearn.metrics import ConfusionMatrixDisplay

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title("Confusion Matrix", fontsize=14, weight='bold')
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.tight_layout()
plt.show()

# =============================================
# STEP 9: ROC AUC (One-vs-Rest, Publication-grade)
# =============================================
# Only if you want ROC for multiclass
n_classes = len(np.unique(y))
y_test_bin = label_binarize(y_test, classes=np.unique(y))
y_proba = pipeline.predict_proba(X_test)

plt.figure(figsize=(7, 6))
for i in range(n_classes):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_proba[:, i])
    auc_score = roc_auc_score(y_test_bin[:, i], y_proba[:, i])
    plt.plot(fpr, tpr, lw=2, label=f'Class {i} (AUC = {auc_score:.3f})')

plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Chance')
plt.title('ROC Curves (One-vs-Rest)', fontsize=16, weight='bold')
plt.xlabel('False Positive Rate', fontsize=13)
plt.ylabel('True Positive Rate', fontsize=13)
plt.legend(loc='lower right', fontsize=10, frameon=True)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
y_pred = best_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# =============================================
# STEP 8: SHAP FEATURE IMPORTANCE (Multiclass)
# =============================================
import shap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings

warnings.filterwarnings("ignore", category=FutureWarning, module="shap")

# Create output folder for plots 
os.makedirs("shap_plots", exist_ok=True)

# Transform test data
X_test_t = preprocessor.transform(X_test)

# Get readable feature names 
raw_feature_names = preprocessor.get_feature_names_out()
feature_names = [name.split("__")[-1] for name in raw_feature_names]

# Create SHAP Explainer
explainer = shap.Explainer(classifier, X_test_t, feature_names=feature_names)
shap_values = explainer(X_test_t)

# Helper to group features 
def get_base_name(name):
    return name.split("_")[0] if "_" in name else name

group_map = {name: get_base_name(name) for name in feature_names}

def collapse_shap_values(shap_values, feature_names, group_map):
    df = pd.DataFrame(np.abs(shap_values.values), columns=feature_names)
    df_grouped = df.T.groupby(df.columns.map(group_map)).mean().T  # future-proof
    return df_grouped

# Style setup (publication-grade) 
plt.rcParams.update({
    "figure.dpi": 300,
    "savefig.dpi": 300,
    "font.size": 12,
    "axes.labelsize": 12,
    "axes.titlesize": 13,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
    "figure.figsize": (8, 5),
    "axes.spines.top": False,
    "axes.spines.right": False,
    "font.family": "sans-serif",
    "font.sans-serif": "Arial"
})

# Loop through each class 
class_names = np.unique(y_train)
n_classes = shap_values.values.shape[2]

print(f" Detected multiclass target with {n_classes} classes.")

for i, class_name in enumerate(class_names):
    df_grouped = collapse_shap_values(shap_values[:, :, i], feature_names, group_map)
    mean_shap = df_grouped.mean().sort_values(ascending=False)

    fig, ax = plt.subplots(figsize=(8, 5))
    mean_shap[:15].plot(kind="barh", ax=ax, color="#4472C4", edgecolor="black")
    ax.invert_yaxis()

    ax.set_title(f"Mean |SHAP| Values ‚Äî Class {class_name}", pad=12)
    ax.set_xlabel("Mean Absolute SHAP Value", labelpad=8)
    ax.set_ylabel("Feature", labelpad=8)

    plt.tight_layout()

    # Save image 
    save_path = f"shap_plots/SHAP_importance_class_{class_name}.png"
    plt.savefig(save_path, dpi=300, bbox_inches="tight")

    # Show image interactively 
    plt.show()