In [2]:
# =========================
# Task 1 – Optimized Pipeline for Bank Marketing (Term Deposit)
# Works with: bank-additional-full.csv (semicolon-delimited)
# =========================

# ---- Imports
import os, json, textwrap, warnings, joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay,
    classification_report, f1_score, roc_auc_score, roc_curve,
    precision_recall_curve, accuracy_score, precision_score, recall_score
)

# ---- Global notebook settings
# Silence all warnings for the rest of the session and render sharper figures.
warnings.filterwarnings("ignore")
plt.rcParams.update({"figure.dpi": 120})

# ---- Input file and output locations
DATA_CSV = "bank-additional-full.csv"  # path to your CSV
PLOTS_DIR, ARTIFACTS_DIR = "task1_plots_opt", "task1_artifacts"

# Create both output folders up front; existing folders are left untouched.
for _out_dir in (PLOTS_DIR, ARTIFACTS_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# =========================
# 1) Load & Prepare
# =========================
df = pd.read_csv(DATA_CSV, sep=';')

# Encode the target as 1/0. Series.map() converts every label that is missing
# from the mapping into NaN, so reject unexpected labels up front instead of
# letting NaN leak silently into the target.
_label_map = {'yes': 1, 'no': 0}
_bad_labels = [v for v in df['y'].unique() if v not in _label_map]
if _bad_labels:
    raise ValueError(f"Unexpected values in target column 'y': {_bad_labels}")
df['y'] = df['y'].map(_label_map)

# NOTE(review): 'duration' is kept as a feature. The dataset documentation
# reportedly advises dropping it for realistic models because it is only known
# after the call has ended -- confirm whether it should be excluded here.
X = df.drop(columns=['y'])
y = df['y']

# Every numeric-dtype column is treated as numeric; all others as categorical.
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = [c for c in X.columns if c not in numeric_cols]

print(f"Rows: {len(df)} | Positives: {int(y.sum())} ({y.mean():.2%})")
print("Numeric cols:", numeric_cols)
print("Categorical cols:", categorical_cols)

# Split 70/15/15
# Step 1 holds out 15% of all rows as the test set.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)
# Step 2 takes 0.1765 of the remaining 85% (~15% of all rows) as validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.1765, random_state=42, stratify=y_train_full
)

# =========================
# 2) Preprocessing (fixed OneHotEncoder)
# =========================
# The dense-output keyword was renamed between scikit-learn releases:
# try the newer `sparse_output` first, then fall back to the older `sparse`.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Logistic regression: standardize numeric columns, one-hot encode the rest.
pre_lr = ColumnTransformer(
    [
        ('num', Pipeline(steps=[('scaler', StandardScaler())]), numeric_cols),
        ('cat', ohe, categorical_cols),
    ],
    verbose_feature_names_out=False,
)

# Random forest: numeric columns pass through unchanged, same one-hot encoding.
pre_rf = ColumnTransformer(
    [
        ('num', 'passthrough', numeric_cols),
        ('cat', ohe, categorical_cols),
    ],
    verbose_feature_names_out=False,
)

# =========================
# 3) Build Models
# =========================
# Candidate 1: class-weighted logistic regression on scaled/encoded features.
pipe_lr = Pipeline(steps=[
    ('preprocess', pre_lr),
    ('model', LogisticRegression(
        solver='lbfgs',
        C=2.0,
        max_iter=4000,
        class_weight='balanced',
        random_state=42,
    )),
])

# Candidate 2: class-weighted random forest on unscaled/encoded features.
pipe_rf = Pipeline(steps=[
    ('preprocess', pre_rf),
    ('model', RandomForestClassifier(
        n_estimators=500,
        max_depth=None,
        min_samples_split=4,
        min_samples_leaf=2,
        max_features='sqrt',
        class_weight='balanced_subsample',
        n_jobs=-1,
        random_state=42,
    )),
])

# =========================
# 4) Train & Select Best Model (Validation AUC)
# =========================
# Fit both candidates on the training split only.
for _candidate in (pipe_lr, pipe_rf):
    _candidate.fit(X_train, y_train)

# Score each candidate by ROC AUC of its positive-class probability on validation.
val_auc_lr = roc_auc_score(y_val, pipe_lr.predict_proba(X_val)[:, 1])
val_auc_rf = roc_auc_score(y_val, pipe_rf.predict_proba(X_val)[:, 1])

# Ties go to logistic regression.
if val_auc_lr >= val_auc_rf:
    best_estimator, best_name, best_val_auc = pipe_lr, "Logistic Regression", val_auc_lr
else:
    best_estimator, best_name, best_val_auc = pipe_rf, "Random Forest", val_auc_rf

print(f"Validation AUC — LR: {val_auc_lr:.4f} | RF: {val_auc_rf:.4f}")
print(f"Selected: {best_name} (Val AUC={best_val_auc:.4f})")

# =========================
# 5) Threshold Tuning
# =========================
# Positive-class probabilities of the selected model on the validation split.
y_val_proba = best_estimator.predict_proba(X_val)[:, 1]
prec, rec, thr = precision_recall_curve(y_val, y_val_proba)

# F1 at every curve point; the tiny epsilon guards against division by zero.
f1s = 2*(prec*rec) / (prec+rec + 1e-12)
best_thr_idx = int(np.nanargmax(f1s))

# `prec`/`rec` hold one more point than `thr`; fall back to 0.5 if the best
# F1 lands on that extra point, which has no threshold attached.
if best_thr_idx < len(thr):
    best_threshold = float(thr[best_thr_idx])
else:
    best_threshold = 0.5
print(f"Best threshold (F1 on val): {best_threshold:.3f}")

# =========================
# 6) Refit on Train+Val, Evaluate on Test
# =========================
# The selected pipeline is refit in place on the combined train and validation rows.
_X_refit = pd.concat([X_train, X_val])
_y_refit = pd.concat([y_train, y_val])
best_estimator.fit(_X_refit, _y_refit)

# Positive-class probabilities on the held-out test split, turned into labels
# at the default cut-off and at the validation-tuned cut-off.
y_test_proba = best_estimator.predict_proba(X_test)[:, 1]
y_test_pred_default = (y_test_proba >= 0.5).astype(int)
y_test_pred_tuned = (y_test_proba >= best_threshold).astype(int)

def eval_pack(y_true, y_pred, y_prob, label):
    """Collect the standard binary-classification metrics into one dict.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted hard labels used for the threshold-dependent metrics.
        y_prob: Positive-class scores used for ROC AUC.
        label: Free-text name stored with the metrics under "label".

    Returns:
        A JSON-serializable dict with the keys "label", "accuracy",
        "precision", "recall", "f1", "auc", "confusion_matrix" (nested list)
        and "classification_report" (formatted text).
    """
    # zero_division=0 is applied to every metric that can be undefined (not
    # only precision), so a degenerate prediction set is scored as 0 uniformly.
    return {
        "label": label,
        "accuracy": float(accuracy_score(y_true, y_pred)),
        "precision": float(precision_score(y_true, y_pred, zero_division=0)),
        "recall": float(recall_score(y_true, y_pred, zero_division=0)),
        "f1": float(f1_score(y_true, y_pred, zero_division=0)),
        "auc": float(roc_auc_score(y_true, y_prob)),
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist(),
        "classification_report": classification_report(
            y_true, y_pred, digits=4, zero_division=0
        ),
    }

# Metrics on the test split at the default and at the tuned threshold.
m_default = eval_pack(y_test, y_test_pred_default, y_test_proba, "Default 0.5")
m_tuned = eval_pack(y_test, y_test_pred_tuned, y_test_proba, f"Tuned {best_threshold:.3f}")

# Print both per-class reports, default threshold first.
for _banner, _metrics in (
    ("==== Test (Default) ====", m_default),
    ("==== Test (Tuned) ====", m_tuned),
):
    print(_banner)
    print(_metrics["classification_report"])

# =========================
# 7) Plots
# =========================
# Confusion matrices: one PNG per threshold setting, named after its label.
for m in [m_default, m_tuned]:
    fig, ax = plt.subplots(figsize=(4, 4))
    _display = ConfusionMatrixDisplay(
        confusion_matrix=np.array(m["confusion_matrix"]),
        display_labels=["No", "Yes"],
    )
    _display.plot(ax=ax, values_format="d", colorbar=False)
    ax.set_title(f"{best_name} – {m['label']}")
    _cm_file = f"confusion_{m['label'].replace(' ','_')}.png"
    fig.savefig(os.path.join(PLOTS_DIR, _cm_file))
    plt.close(fig)

# ROC curve on the test split, with the chance diagonal for reference.
fpr, tpr, _ = roc_curve(y_test, y_test_proba)
fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(fpr, tpr, label=f"{best_name} (AUC={roc_auc_score(y_test, y_test_proba):.3f})")
ax.plot([0, 1], [0, 1], '--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve (Test)")
ax.legend()
fig.savefig(os.path.join(PLOTS_DIR, "roc_curve.png"))
plt.close(fig)

# Precision-recall curve computed earlier on the validation split.
fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(rec, prec, label="PR (val)")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall (Validation)")
ax.legend()
fig.savefig(os.path.join(PLOTS_DIR, "precision_recall_val.png"))
plt.close(fig)

# =========================
# 8) Feature Importances
# =========================
# Best-effort step: any failure is reported and the plot is simply skipped.
try:
    feat_names = best_estimator.named_steps['preprocess'].get_feature_names_out()
    model = best_estimator.named_steps['model']

    # Use the model's own importances when it exposes them; otherwise rank
    # features by the absolute value of the fitted coefficients.
    importances = (
        model.feature_importances_
        if hasattr(model, "feature_importances_")
        else np.abs(model.coef_.ravel())
    )

    # Indices of the 20 largest scores, largest first.
    top_idx = np.argsort(importances)[::-1][:20]
    top_feats = np.array(feat_names)[top_idx]
    top_vals = importances[top_idx]

    fig, ax = plt.subplots(figsize=(8, 6))
    _bar_positions = range(len(top_feats))
    ax.barh(_bar_positions, top_vals)
    ax.set_yticks(_bar_positions)
    ax.set_yticklabels(top_feats)
    ax.invert_yaxis()  # put the highest-scoring feature at the top
    ax.set_title("Top Features")
    fig.savefig(os.path.join(PLOTS_DIR, "top_features.png"))
    plt.close(fig)
except Exception as e:
    print("Feature importance plot skipped:", e)

# =========================
# 9) Save Artifacts
# =========================
# Persist the fitted pipeline (preprocessing + model) as a single joblib file.
best_model_path = os.path.join(ARTIFACTS_DIR, "best_model_pipeline.joblib")
joblib.dump(best_estimator, best_model_path)

# Selection summary and test metrics, stored next to the model.
meta = {
    "best_model": best_name,
    "validation_auc": float(best_val_auc),
    "validation_best_threshold": float(best_threshold),
    "metrics_test_default": m_default,
    "metrics_test_tuned": m_tuned,
}
metrics_json_path = os.path.join(ARTIFACTS_DIR, "metrics_and_config.json")
with open(metrics_json_path, "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

# Simple CLI inference
# The generated script resolves the model file relative to its own location,
# so it works from any working directory. The threshold is embedded with
# repr() so the exact tuned float is preserved: rounding it could exclude a
# score that sits exactly on the threshold.
inference_py_path = os.path.join(ARTIFACTS_DIR, "inference.py")
with open(inference_py_path, "w", encoding="utf-8") as f:
    f.write(f"""# Usage: python inference.py <input.csv>   (semicolon-delimited)
import os, sys
import joblib, pandas as pd
if len(sys.argv) < 2:
    sys.exit('Usage: python inference.py <input.csv>')
THRESHOLD = {float(best_threshold)!r}
MODEL_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), {os.path.basename(best_model_path)!r})
pipe = joblib.load(MODEL_PATH)
df = pd.read_csv(sys.argv[1], sep=';')
if 'y' in df.columns: df = df.drop(columns=['y'])
p = pipe.predict_proba(df)[:,1]
out = pd.DataFrame({{'proba':p, 'pred':(p>=THRESHOLD).astype(int)}})
out.to_csv('predictions.csv', index=False)
print('Saved predictions.csv')
""")

print("\n=== Done! Check folders ===")
print("Plots ->", PLOTS_DIR)
print("Artifacts ->", ARTIFACTS_DIR)


Rows: 41188 | Positives: 4640 (11.27%)
Numeric cols: ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
Categorical cols: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
Validation AUC — LR: 0.9367 | RF: 0.9439
Selected: Random Forest (Val AUC=0.9439)
Best threshold (F1 on val): 0.398
==== Test (Default) ====
              precision    recall  f1-score   support

           0     0.9671    0.9371    0.9518      5483
           1     0.6016    0.7486    0.6671       696

    accuracy                         0.9158      6179
   macro avg     0.7843    0.8428    0.8095      6179
weighted avg     0.9259    0.9158    0.9198      6179

==== Test (Tuned) ====
              precision    recall  f1-score   support

           0     0.9820    0.9042    0.9415      5483
           1     0.5354    0.8693    0.6627       696

    accuracy                     