In [None]:
# -------------------------------
# 1. Import Libraries
# -------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shap
import joblib

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    classification_report, confusion_matrix
)
from imblearn.over_sampling import SMOTE

# -------------------------------
# 2. Load Data
# -------------------------------
data = pd.read_csv("diabetic_data.csv")

# -------------------------------
# 3. Cleaning
# -------------------------------
data = data.drop(['weight', 'payer_code', 'medical_specialty'], axis=1)
data = data.replace("?", np.nan)
data = data.drop(['encounter_id', 'patient_nbr'], axis=1)

# -------------------------------
# 4. Target Variable
# -------------------------------
data['readmitted'] = data['readmitted'].replace({'>30': 0, 'NO': 0, '<30': 1})

# -------------------------------
# 5. Encode Categorical Variables
# -------------------------------
categorical_cols = data.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in categorical_cols:
    data[col] = le.fit_transform(data[col].astype(str))

# -------------------------------
# 6. Features & Labels
# -------------------------------
X = data.drop('readmitted', axis=1)
y = data['readmitted']

# -------------------------------
# 7. Handle Class Imbalance with SMOTE
# -------------------------------
sm = SMOTE(sampling_strategy='minority', random_state=42)
X_res, y_res = sm.fit_resample(X, y)

# -------------------------------
# 8. Train-Test Split
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.2, random_state=42, stratify=y_res
)

# -------------------------------
# 9. Feature Scaling
# -------------------------------
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# -------------------------------
# 10. Train Random Forest
# -------------------------------
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

# Save model & scaler for deployment
joblib.dump(rf, "rf_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(X.columns.tolist(), "feature_names.pkl")

# -------------------------------
# 11. Evaluation
# -------------------------------
y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]

print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

print(f"\n✅ Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"✅ Precision: {precision_score(y_test, y_pred):.4f}")
print(f"✅ Recall: {recall_score(y_test, y_pred):.4f}")
print(f"✅ F1 Score: {f1_score(y_test, y_pred):.4f}")
print(f"✅ ROC-AUC: {roc_auc_score(y_test, y_prob):.4f}")

# -------------------------------
# 12. SHAP Explainability (Optimized for Speed)
# -------------------------------
# -------------------------------
# 12. SHAP Explainability (Optimized)
# -------------------------------
print("\n🔎 Running SHAP explainability (optimized)...")

# Convert scaled test/train back to DataFrames
X_train_df = pd.DataFrame(X_train, columns=X.columns)
X_test_df = pd.DataFrame(X_test, columns=X.columns)

# Background dataset (small sample from train)
background = X_train_df.sample(100, random_state=42)

# Test sample (smaller for speed)
X_sample = X_test_df.sample(50, random_state=42)

# Create explainer with background dataset
explainer = shap.TreeExplainer(rf, data=background)

# Compute SHAP values (raw output is default and fastest)
shap_values = explainer.shap_values(X_sample)

# --- SHAP Summary Plot (Global Importance) ---
shap.summary_plot(shap_values[1], X_sample, show=False)  # [1] for readmission=1
plt.title("SHAP Summary Plot - Random Forest")
plt.savefig("shap_summary_rf.png", bbox_inches="tight")
plt.close()

# --- SHAP Waterfall Plot (Local Explanation for 1st sample) ---
shap.plots._waterfall.waterfall_legacy(
    explainer.expected_value[1], shap_values[1][0], X_sample.iloc[0], show=False
)
plt.savefig("shap_waterfall_rf.png", bbox_inches="tight")
plt.close()

print("✅ SHAP plots saved: shap_summary_rf.png & shap_waterfall_rf.png (fast mode)")

# -------------------------------
# 13. Streamlit Dashboard
# -------------------------------
# Save this code separately in `app.py` and run: streamlit run app.py
"""
import streamlit as st
import pandas as pd
import joblib

# Load model, scaler, and features
model = joblib.load("rf_model.pkl")
scaler = joblib.load("scaler.pkl")
feature_names = joblib.load("feature_names.pkl")

st.title("🏥 Diabetes Readmission Prediction Dashboard")

st.write("Enter patient details to predict hospital readmission:")

# Input fields
inputs = {}
for col in feature_names:
    inputs[col] = st.number_input(f"{col}", value=0.0)

# Convert to DataFrame
input_df = pd.DataFrame([inputs])

# Scale
input_scaled = scaler.transform(input_df)

# Prediction
pred_prob = model.predict_proba(input_scaled)[0][1]
prediction = model.predict(input_scaled)[0]

st.subheader("Prediction Result")
if prediction == 1:
    st.error(f"⚠️ Patient likely to be readmitted (probability: {pred_prob:.2f})")
else:
    st.success(f"✅ Patient unlikely to be readmitted (probability: {pred_prob:.2f})")
"""


  data['readmitted'] = data['readmitted'].replace({'>30': 0, 'NO': 0, '<30': 1})



Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.95      0.91     18082
           1       0.95      0.86      0.90     18082

    accuracy                           0.91     36164
   macro avg       0.91      0.91      0.91     36164
weighted avg       0.91      0.91      0.91     36164


Confusion Matrix:
 [[17196   886]
 [ 2453 15629]]

✅ Accuracy: 0.9077
✅ Precision: 0.9464
✅ Recall: 0.8643
✅ F1 Score: 0.9035
✅ ROC-AUC: 0.9548

🔎 Running SHAP explainability (optimized)...
