
# Diabetes Risk Model — Clean, Progressive Notebook

This notebook is a **cleaned, step-by-step** version that rebuilds the final model from scratch with a clear flow:

1. **Setup & Data Load**
2. **Quick EDA**
3. **Train/Test Split**
4. **Preprocessing Pipeline**
5. **Baseline Model (Logistic Regression)**
6. **XGBoost (Core Model)**
7. **Threshold Tuning (zoom 0.20–0.40)**
8. **Final Evaluation @ Tuned Threshold (0.301)**
9. **Save Pipeline for GUI**
10. **Example Inference**

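> Tip: Run the cells top-to-bottom from a fresh kernel (Restart Kernel → Run All); each section relies on variables defined in earlier sections. Cells where training may take longer are labeled.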


## 1) Setup & Data Load

In [None]:

import pandas as pd

# Location of the raw CSV, relative to this notebook; edit if your layout differs.
DATA_PATH = "../data/diabetes_prediction_dataset.csv"  # <- update if your path differs

# Load the full dataset into memory once; every later cell reads from `df`.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# (rows, columns) first, then a preview of the first five records.
print(df.shape)
df.head(n=5)


## 2) Quick EDA

In [None]:

# Basic info: column dtypes, non-null counts and memory usage.
# Useful here to spot missing values before any preprocessing is defined.
df.info()


In [None]:

# Class balance of the target, expressed as a percentage of all rows.
class_share = df['diabetes'].value_counts(normalize=True)
# Relabel the 0/1 index so the displayed table reads 'No' / 'Yes'.
class_share.rename({0:'No',1:'Yes'}) * 100


In [None]:

# Summary stats of numerics (count, mean, std, min, quartiles, max).
# describe() only covers numeric columns by default, so the text columns
# (presumably 'gender' and 'smoking_history' — confirm against df.info()) are omitted.
df.describe()


In [None]:

# One small histogram per continuous feature (matplotlib only).
import matplotlib.pyplot as plt

numeric_cols = ['age','bmi','HbA1c_level','blood_glucose_level']
for col in numeric_cols:
    # Explicit figure/axes pair so each feature is drawn on its own canvas.
    fig, ax = plt.subplots(figsize=(5,3))
    df[col].hist(bins=30, ax=ax)
    ax.set_title(f"Distribution: {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    fig.tight_layout()
    plt.show()


## 3) Train/Test Split

In [None]:

from sklearn.model_selection import train_test_split

# Column roles: model inputs and the binary label.
TARGET = 'diabetes'
FEATURES = [
    'age', 'hypertension', 'heart_disease', 'bmi',
    'HbA1c_level', 'blood_glucose_level', 'gender', 'smoking_history',
]

# Work on copies so later cells never modify `df` through X or y.
X, y = df[FEATURES].copy(), df[TARGET].copy()

# 80/20 hold-out split; stratifying on y keeps the class ratio equal in both parts,
# and the fixed seed makes the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

X_train.head()


## 4) Preprocessing Pipeline

In [None]:

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Every column in FEATURES must be routed to exactly one transformer below.
# ColumnTransformer defaults to remainder='drop', so any column that is not listed
# is silently discarded and never reaches the model.
numeric_features = ['age','bmi','HbA1c_level','blood_glucose_level']
categorical_features = ['gender','smoking_history']
# Presumably already 0/1 indicator columns (verify with df.describe()); forwarded unchanged.
binary_features = ['hypertension','heart_disease']

# Guard against a feature being added to FEATURES but forgotten here (or vice versa).
routed_features = numeric_features + categorical_features + binary_features
assert sorted(routed_features) == sorted(FEATURES), "every feature must be routed exactly once"

preprocessor = ColumnTransformer(transformers=[
    # Standardise continuous inputs (matters for the logistic-regression baseline).
    ('num', StandardScaler(), numeric_features),
    # One-hot encode categories; unseen categories at inference become all-zero rows.
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    # Keep the binary flags as-is instead of letting the default remainder drop them.
    ('bin', 'passthrough', binary_features),
])
preprocessor


## 5) Baseline Model (Logistic Regression)

In [None]:

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Baseline classifier: class_weight='balanced' reweights the classes inversely to
# their frequency; max_iter is raised so the solver has room to converge.
baseline_clf = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)

logreg_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', baseline_clf),
])

# Fit on the training split, then score the held-out split at the default 0.5 cut-off.
y_pred_lr = logreg_pipe.fit(X_train, y_train).predict(X_test)

lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_confusion = confusion_matrix(y_test, y_pred_lr)
lr_report = classification_report(y_test, y_pred_lr, digits=2)

print("Accuracy:", lr_accuracy)
print("Confusion Matrix:\n", lr_confusion)
print("Classification Report:\n", lr_report)


## 6) XGBoost (Core Model)

In [None]:

# NOTE: This uses default-ish params; you can paste in your tuned params if you prefer.
import numpy as np
from sklearn.base import clone
from xgboost import XGBClassifier

xgb_pipe = Pipeline(steps=[
    # Pipeline.fit fits its steps in place. Cloning gives this pipeline its own
    # unfitted copy of the preprocessor, so refitting one pipeline can never
    # silently change the other (or the pipeline that gets pickled later).
    ('preprocessor', clone(preprocessor)),
    ('model', XGBClassifier(
        eval_metric='logloss',
        random_state=42
    ))
])

xgb_pipe.fit(X_train, y_train)
# Probability for class 1 (second column of predict_proba) on the held-out split
y_proba_xgb = xgb_pipe.predict_proba(X_test)[:,1]

# Default 0.5 threshold prediction for reference
y_pred_xgb_default = (y_proba_xgb >= 0.5).astype(int)

print("Accuracy (thr=0.5):", accuracy_score(y_test, y_pred_xgb_default))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb_default))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb_default, digits=2))


## 7) Threshold Tuning (zoom 0.20–0.40)

In [None]:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict

# Sweep thresholds on OUT-OF-FOLD predictions of the training data, not on the test set.
# Picking the threshold from test-set scores and then reporting metrics on that same
# test set would make the final evaluation optimistic.
# NOTE: this refits the pipeline once per fold (5 extra fits), so it takes a bit longer.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_proba_oof = cross_val_predict(
    xgb_pipe, X_train, y_train, cv=cv, method='predict_proba'
)[:, 1]

# 0.401 as the stop value keeps 0.40 inside the grid despite float rounding in arange.
thresholds = np.arange(0.20, 0.401, 0.01)
prec, rec, f1s = [], [], []

for t in thresholds:
    preds = (y_proba_oof >= t).astype(int)
    prec.append(precision_score(y_train, preds))
    rec.append(recall_score(y_train, preds))
    f1s.append(f1_score(y_train, preds))

plt.figure(figsize=(8,5))
plt.plot(thresholds, prec, label='Precision')
plt.plot(thresholds, rec, label='Recall')
plt.plot(thresholds, f1s, label='F1')
plt.xlabel("Threshold")
plt.ylabel("Score")
plt.title("Threshold vs Precision/Recall/F1 (XGBoost)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# For reference only: the grid point with the highest out-of-fold F1.
best_idx = int(np.argmax(f1s))
print(f"Best out-of-fold F1: {f1s[best_idx]:.3f} at threshold {thresholds[best_idx]:.2f}")

# Choose your tuned threshold here (update if desired)
TUNED_THRESHOLD = 0.301
TUNED_THRESHOLD


## 8) Final Evaluation @ Tuned Threshold

In [None]:

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Turn the held-out class-1 probabilities into hard labels at the tuned cut-off.
y_pred_xgb_tuned = (y_proba_xgb >= TUNED_THRESHOLD).astype(int)

# Compute each metric once, then report them in the same order as the earlier sections.
tuned_accuracy = accuracy_score(y_test, y_pred_xgb_tuned)
tuned_confusion = confusion_matrix(y_test, y_pred_xgb_tuned)
tuned_report = classification_report(y_test, y_pred_xgb_tuned, digits=2)

print(f"Accuracy (thr={TUNED_THRESHOLD}):", tuned_accuracy)
print("Confusion Matrix:\n", tuned_confusion)
print("Classification Report:\n", tuned_report)


## 9) Save Pipeline for GUI

In [None]:

from pathlib import Path

import joblib

OUTPUT_MODEL_PATH = "../models/diabetes_pipeline.pkl"  # adjust if needed

# joblib.dump does not create missing folders; make sure the target directory exists
# so this cell also works on a fresh checkout.
Path(OUTPUT_MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

# Only the fitted pipeline (preprocessing + model) is stored. TUNED_THRESHOLD is NOT
# part of the pickle, so whatever loads this file must apply the threshold itself.
joblib.dump(xgb_pipe, OUTPUT_MODEL_PATH)

print("Saved pipeline to:", OUTPUT_MODEL_PATH)


## 10) Example Inference

In [None]:

# A single hand-written patient record (tweak the values freely).
# The keys must match the feature column names the pipeline was trained on.
example_record = {
    'age': 52,
    'hypertension': 0,
    'heart_disease': 0,
    'bmi': 28.7,
    'HbA1c_level': 6.1,
    'blood_glucose_level': 145,
    'gender': "Male",
    'smoking_history': "never",
}
example = pd.DataFrame([example_record])

# Row 0, column 1 of predict_proba is the positive-class probability for this record.
proba = xgb_pipe.predict_proba(example)[0,1]
# Apply the tuned cut-off rather than the pipeline's built-in 0.5 default.
pred = int(proba >= TUNED_THRESHOLD)

if pred == 1:
    label = "Diabetic"
else:
    label = "Not Diabetic"

print(f"Probability: {proba:.3f}")
print("Prediction:", label)
