# Pipeline workflow: EDA → preprocessing → DecisionTree & RandomForest training → Threshold tuning

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import f1_score, precision_score, recall_score, precision_recall_curve, average_precision_score, classification_report, confusion_matrix
import joblib
%matplotlib inline


In [None]:
# Read the raw patient table from disk and take a first look at it.
data_path = '../data/US_Heart_Patients.csv'
df = pd.read_csv(data_path)
print('Shape:', df.shape)
# Last expression of the cell: the notebook renders the first 10 rows.
df.head(n=10)


In [None]:
# Descriptive summary statistics (output of DataFrame.describe())
display(df.describe())

# Column dtypes / non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Missing values per column
print(df.isnull().sum())

# Outlier counts (IQR method): a value is counted as an outlier when it lies
# more than 1.5 * IQR below the first quartile or above the third quartile.
num_cols = df.select_dtypes(include=['number']).columns.tolist()
outlier_counts = {}
for c in num_cols:
    q1 = df[c].quantile(0.25)
    q3 = df[c].quantile(0.75)
    iqr = q3 - q1
    low = q1 - 1.5 * iqr
    high = q3 + 1.5 * iqr
    # Comparisons against NaN are False, so missing values are never counted.
    outlier_counts[c] = int(((df[c] < low) | (df[c] > high)).sum())
# Columns with the most outliers first
print(pd.Series(outlier_counts).sort_values(ascending=False))


In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated
# heatmap with a diverging palette centred on zero.
numeric_df = df.select_dtypes(include=['number'])
corr = numeric_df.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='RdBu', center=0)
plt.title('Correlation matrix (numeric features)')
plt.show()

# One histogram per numeric column
num_cols = numeric_df.columns.tolist()
df[num_cols].hist(bins=20, figsize=(16, 12))
plt.suptitle('Numeric feature distributions')
plt.show()


In [None]:
# --- Feature engineering ---
# Record where glucose is missing as a 0/1 indicator column.
if 'glucose' in df.columns:
    df['glucose_missing'] = df['glucose'].isna().astype(int)

# --- Winsorization ---
# Cap each listed column at its own 1st and 99th percentile. Columns that are
# absent from the frame are skipped.
# NOTE(review): the percentiles are computed on the full frame, i.e. before the
# train/test split performed later in this file, so test rows influence the
# caps. Consider fitting the caps on the training split only.
winsor_cols = ['tot cholesterol','Systolic BP','Diastolic BP','BMI','glucose']
for col in winsor_cols:
    if col not in df.columns:
        continue
    lower_cap = df[col].quantile(0.01)
    upper_cap = df[col].quantile(0.99)
    df[col] = df[col].clip(lower_cap, upper_cap)

# --- Build feature matrix X and target y ---
TARGET = 'Heart-Att'
y = df[TARGET]
X = df.drop(columns=[TARGET])
# A string / categorical target is mapped to integer codes.
target_is_textual = y.dtype == 'object' or y.dtype.name == 'category'
if target_is_textual:
    y = LabelEncoder().fit_transform(y.astype(str))

# Split the feature columns by dtype; everything non-numeric is treated as
# categorical. Column order of X is preserved in both lists.
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()
cat_cols = [name for name in X.columns if name not in numeric_cols]
print('Numeric cols:', numeric_cols)
print('Categorical cols:', cat_cols)


In [None]:
# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=13,
)

# Numeric columns: fill missing values with the median.
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform.
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own sub-pipeline.
preproc = ColumnTransformer(transformers=[
    ('num', num_pipe, numeric_cols),
    ('cat', cat_pipe, cat_cols),
])


In [None]:
# Decision Tree baseline + hyperparameter tuning
# (StratifiedKFold is already imported with the other sklearn names at the top
# of the file, so it is not re-imported here.)

# Preprocessing and classifier live in one pipeline so that imputation and
# encoding are re-fitted inside every cross-validation fold.
dt_pipe = Pipeline([('preprocessor', preproc), ('clf', DecisionTreeClassifier(random_state=13))])

dt_param_grid = {
    'clf__max_depth': [3,5,7,None],
    'clf__min_samples_split': [2,5,10],
    'clf__min_samples_leaf': [1,2,5]
}

# 5-fold stratified CV, selecting the parameter combination with the best F1.
dt_grid = GridSearchCV(
    dt_pipe,
    dt_param_grid,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=13),
    scoring='f1',
    n_jobs=-1,
    verbose=1,
)
print('Starting Decision Tree GridSearch...')
dt_grid.fit(X_train, y_train)
dt_best = dt_grid.best_estimator_
print('Best DT params:', dt_grid.best_params_)

# Evaluate on both splits; a large train/test F1 gap indicates overfitting.
y_train_pred = dt_best.predict(X_train)
y_test_pred = dt_best.predict(X_test)
print('Train F1:', f1_score(y_train, y_train_pred))
print('Test F1:', f1_score(y_test, y_test_pred))
print('Confusion matrix (test):')
print(confusion_matrix(y_test, y_test_pred))
print('Classification report (test):')
print(classification_report(y_test, y_test_pred))

# Save DT model
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs('../models', exist_ok=True)
joblib.dump(dt_best, '../models//decision_tree_model.pkl')
print('Saved Decision Tree to ../models//decision_tree_model.pkl')


In [None]:
# RandomForest with GridSearchCV
# class_weight='balanced' is set on the classifier; the search uses the same
# 5-fold stratified CV and F1 scoring as the Decision Tree search above.
rf_pipe = Pipeline([('preprocessor', preproc), ('clf', RandomForestClassifier(class_weight='balanced', random_state=13))])
rf_param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [5, 10, None],
    'clf__min_samples_split': [5, 10],
    'clf__min_samples_leaf': [2, 5],
}
rf_grid = GridSearchCV(
    rf_pipe,
    rf_param_grid,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=13),
    scoring='f1',
    n_jobs=-1,
    verbose=1,
)

print('Starting RandomForest GridSearch...')
rf_grid.fit(X_train, y_train)
rf_best = rf_grid.best_estimator_
print('Best RF params:', rf_grid.best_params_)

# Default evaluation @0.5
y_test_pred = rf_best.predict(X_test)
print('F1 Test @0.5:', f1_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

# Probabilities and PR curve
proba = rf_best.predict_proba(X_test)[:, 1]  # second probability column (index 1)
precision, recall, _ = precision_recall_curve(y_test, proba)
ap = average_precision_score(y_test, proba)
plt.figure(figsize=(7, 6))
plt.plot(recall, precision, label=f'AP={ap:.3f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.grid(True)
plt.show()

# Threshold sweep
# NOTE(review): the thresholds are evaluated (and later selected) on the test
# split, so metrics reported at the chosen threshold are optimistically biased.
# Consider sweeping on out-of-fold training predictions or a validation split.
threshs = np.linspace(0.01, 0.99, 99)
rows = []
for t in threshs:
    preds = (proba >= t).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
    # zero_division=0: report 0 instead of warning when a metric is undefined
    prec = precision_score(y_test, preds, zero_division=0)
    rec = recall_score(y_test, preds, zero_division=0)
    f1 = f1_score(y_test, preds, zero_division=0)
    # Asymmetric cost: a false negative is weighted 10x a false positive.
    cost = fp * 1 + fn * 10
    rows.append((t, tp, tn, fp, fn, prec, rec, f1, cost))
th_df = pd.DataFrame(rows, columns=['threshold','tp','tn','fp','fn','precision','recall','f1','cost'])

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs('../models', exist_ok=True)
th_df.to_csv('../models/threshold_metrics.csv', index=False)

def pick_thresholds(df, target_recall=0.8):
    """Select three candidate operating points from a threshold-metrics table.

    Parameters
    ----------
    df : DataFrame with at least the columns ``f1``, ``recall``,
        ``precision`` and ``cost`` (one row per threshold).
    target_recall : minimum recall a row must reach to be considered for the
        recall-constrained pick.

    Returns
    -------
    tuple ``(best_f1, best_recall, best_cost)`` of rows from ``df``:
      * the row with the highest ``f1``;
      * among rows with ``recall >= target_recall``, the row with the highest
        ``precision`` -- or ``None`` when no row reaches the target;
      * the row with the lowest ``cost``.
    Ties are resolved by ``idxmax`` / ``idxmin`` (first occurrence).
    """
    f1_row = df.loc[df['f1'].idxmax()]

    meets_target = df[df['recall'] >= target_recall]
    if meets_target.empty:
        recall_row = None
    else:
        recall_row = meets_target.loc[meets_target['precision'].idxmax()]

    cost_row = df.loc[df['cost'].idxmin()]
    return f1_row, recall_row, cost_row

# Pick the operating points from the sweep table and report them.
best_f1, best_recall, best_cost = pick_thresholds(th_df)
print('Best F1 threshold:', best_f1.threshold, 'F1=', best_f1.f1)
if best_recall is None:
    print('No threshold meets recall target')
else:
    print('Recall-target threshold:', best_recall.threshold, 'recall=', best_recall.recall, 'precision=', best_recall.precision)
print('Cost-min threshold:', best_cost.threshold, 'cost=', best_cost.cost)

# Save final RF model and selection
# Thresholds are converted to plain Python floats; the recall-target entry is
# None when no threshold reached the target.
recall_target_threshold = None if best_recall is None else float(best_recall.threshold)
selection = {
    'threshold_optimal_f1': float(best_f1.threshold),
    'threshold_recall_target': recall_target_threshold,
    'threshold_cost_min': float(best_cost.threshold),
}
bundle = {'model': rf_best, 'threshold_selection': selection}
joblib.dump(bundle, '../models/final_randomforest_model.pkl')
print('Saved RandomForest model and threshold selection to ../models/final_randomforest_model.pkl')


In [None]:
# Visualize a single member tree of the fitted RandomForest for interpretability.
# NOTE: this is not a pruned model. plot_tree(max_depth=3) only limits how many
# levels are drawn; the underlying estimator keeps its full depth. The output
# file name is kept as "tree_pruned.png".
# rf_best is the fitted pipeline; its 'clf' step is the RandomForestClassifier.
clf = rf_best.named_steps['clf'] if hasattr(rf_best, 'named_steps') else rf_best

# plot tree 0 from the ensemble
# Best-effort cell: any failure is reported and the notebook continues.
try:
    estimator = clf.estimators_[0]
    plt.figure(figsize=(18, 10))
    plot_tree(estimator, max_depth=3, filled=True, fontsize=10)
    plt.tight_layout()
    # Write next to the other artifacts of this notebook, which all go to
    # ../models/ (previously this went to a separate ./models/ folder).
    os.makedirs("../models", exist_ok=True)   # <-- ensure folder exists
    plt.savefig("../models/tree_pruned.png", dpi=200)
    plt.show()
except Exception as e:
    print("Could not plot tree from ensemble:", e)
