# In [None]:  (looks like a leftover notebook cell prompt; commented out so the file parses as Python)
#!/usr/bin/env python
# Heart‑Disease Prediction – Training script
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import joblib
import matplotlib.pyplot as plt

# ---------- 1. LOAD DATA ----------
# Read the CSV and separate the label column from the feature columns.
# NOTE(review): the path is absolute and looks Colab-specific; the file is
# presumably the UCI Heart Disease dataset (303 rows) - confirm before reuse.
target = "target"                         # label column: 1=disease, 0=no disease
df = pd.read_csv("/content/dataset.csv")
y = df[target]
X = df.drop(columns=target)

# Column typing: the names listed here are treated as categorical; every
# other feature column in X is treated as numerical.
cat_cols = ["cp", "restecg", "slope", "thal", "ca"]
num_cols = X.columns[~X.columns.isin(cat_cols)].tolist()

# ---------- 2. PRE-PROCESSING PIPELINE ----------
# Numerical columns are standardised; categorical columns are one-hot encoded.
# handle_unknown="ignore" lets transform() accept categories that were not
# present when the encoder was fitted instead of raising.
numeric_pipeline = Pipeline(steps=[("scaler", StandardScaler())])
categorical_pipeline = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

# Route each column group to its own sub-pipeline.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols),
    ]
)

# ---------- 3. MODEL PIPELINE ----------
# Hold out 20% of the rows for the final evaluation. The split is stratified
# on the label and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Pre-processing and classifier are chained into one estimator so the
# transformers are re-fitted inside every cross-validation fold.
clf = RandomForestClassifier(random_state=42)
pipe = Pipeline(steps=[("prep", preprocess), ("model", clf)])

# Hyper-parameter grid for the "model" step: 2 x 3 x 2 = 12 candidates.
param_grid = dict(
    model__n_estimators=[200, 500],
    model__max_depth=[None, 10, 20],
    model__min_samples_split=[2, 5],
)

# Exhaustive 5-fold search on the training portion only, ranked by ROC-AUC.
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
search.fit(X_train, y_train)

print(f"Best params: {search.best_params_}")

# ---------- 4. EVALUATION ----------
from sklearn.metrics import RocCurveDisplay

# Score the best pipeline found by the grid search on the held-out test rows.
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
# Second predict_proba column; with 0/1 labels this is presumably the
# probability of class 1 (disease) - confirm against best_model.classes_.
y_proba = best_model.predict_proba(X_test)[:, 1]

# Per-class precision / recall / F1 computed from the hard predictions.
print(
    "\nClassification Report",
    classification_report(y_true=y_test, y_pred=y_pred),
    sep="\n",
)

# Threshold-independent ranking quality computed from the probabilities.
roc = roc_auc_score(y_true=y_test, y_score=y_proba)
print(f"ROC‑AUC: {roc:.3f}")

# ROC curve for the same model on the same test rows.
RocCurveDisplay.from_estimator(estimator=best_model, X=X_test, y=y_test)
plt.title("ROC – Heart Disease Classifier")
plt.show()

# ---------- 5. EXPORT ----------
# Persist the fitted pipeline (pre-processing + classifier) as one artifact.
# The relative path resolves against the current working directory.
# NOTE: joblib files are pickle-based - only load them from trusted sources.
joblib.dump(value=best_model, filename="heart_model.joblib")
print("✅  Model saved to heart_model.joblib")
