# Bank Marketing - SMOTE + F1 Pipeline

This notebook rebuilds the training workflow in three stages: SMOTE oversampling applied inside the modelling pipeline, cross-validated model selection (logistic regression, random forest, and XGBoost when it is installed), and decision-threshold tuning to maximize F1. The fitted model and a metrics JSON file are saved under `artifacts/` for the Streamlit app.


In [None]:
import json
import time
import warnings
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import (
    RandomizedSearchCV,
    StratifiedKFold,
    cross_val_predict,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides warnings raised by the estimators during the
# searches below; confirm that is intended.
warnings.filterwarnings("ignore")

# All inputs and outputs are resolved relative to the current working directory.
BASE_DIR = Path.cwd()
DATA_PATH = BASE_DIR / "bank-additional.csv"
ARTIFACT_DIR = BASE_DIR / "artifacts"
MODEL_PATH = ARTIFACT_DIR / "final_model.pkl"
METRICS_PATH = ARTIFACT_DIR / "metrics.json"
RANDOM_STATE = 42  # single seed reused wherever randomness is involved

# Create the output folder up front; a pre-existing folder is not an error.
ARTIFACT_DIR.mkdir(exist_ok=True)

# XGBoost is an optional dependency. The broad `except Exception` is kept on
# purpose (presumably the import can fail for reasons other than a missing
# package, such as a native library that cannot be loaded), but the failure is
# now recorded and reported instead of being swallowed silently.
XGB_IMPORT_ERROR = None
try:
    from xgboost import XGBClassifier
except Exception as exc:
    HAS_XGB = False
    XGB_IMPORT_ERROR = exc
else:
    HAS_XGB = True

print(f"Using base dir: {BASE_DIR}")
print(f"XGBoost available: {HAS_XGB}")
if XGB_IMPORT_ERROR is not None:
    print(
        "XGBoost import skipped: "
        f"{type(XGB_IMPORT_ERROR).__name__}: {XGB_IMPORT_ERROR}"
    )


In [None]:
# Load the semicolon-delimited CSV; keep the untouched frame and clean a copy.
df_raw = pd.read_csv(DATA_PATH, sep=";")
df = df_raw.copy()

# Basic cleaning
# Treat the literal string "unknown" as a missing value in every column.
df = df.replace("unknown", np.nan)
# pdays == 999 is handled as a sentinel: it is exposed as the binary
# `was_contacted` flag and then recoded to -1 in the numeric column.
df["was_contacted"] = (df["pdays"] != 999).astype(int)
df["pdays"] = df["pdays"].replace(999, -1)

# Encode the target. Series.map turns any label outside the mapping into NaN,
# which would only surface later as an obscure failure, so fail loudly here.
encoded_target = df["y"].map({"no": 0, "yes": 1})
unmapped = encoded_target.isna()
if unmapped.any():
    unexpected = df.loc[unmapped, "y"].astype(str).unique().tolist()
    raise ValueError(
        f"Target column 'y' has {int(unmapped.sum())} value(s) outside "
        f"{{'no', 'yes'}}: {unexpected}"
    )
df["y"] = encoded_target

# NOTE(review): if this file is the UCI bank-marketing data, check whether a
# call-duration column is present and should be excluded from the features;
# confirm against the dataset description.

# Object-typed columns are treated as categorical; every other column except
# the target is treated as numeric.
cat_cols = df.select_dtypes(include=["object"]).columns.tolist()
num_cols = [c for c in df.columns if c not in cat_cols + ["y"]]

print(df.head())
print("\nRows:", len(df))
print("Positive rate:", df["y"].mean())
print("Categorical columns:", len(cat_cols))
print("Numeric columns:", len(num_cols))


In [None]:
# Separate the target from the feature matrix.
y = df["y"]
X = df.drop(columns=["y"])

# Hold out 20% of the rows, stratified on the target, with the shared seed.
split_options = {"test_size": 0.2, "stratify": y, "random_state": RANDOM_STATE}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

train_positive_rate = y_train.mean()
print(
    f"Train size: {X_train.shape}, "
    f"Test size: {X_test.shape}, "
    f"Train positive rate: {train_positive_rate:.3f}"
)


In [None]:
# Numeric branch: fill gaps with the column median, then standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode; levels not seen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column list to its branch.
column_branches = [
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
]
preprocess = ColumnTransformer(transformers=column_branches)

# Shuffled, stratified 5-fold splitter shared by model search and threshold tuning.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
print("Preprocess and CV ready")


In [None]:
# Hyper-parameter spaces; keys carry the "clf__" prefix because the estimator
# is the pipeline step named "clf".
logreg_space = {
    "clf__C": [0.1, 0.5, 1.0, 2.0, 5.0],
    "clf__penalty": ["l2"],
    "clf__class_weight": [None, "balanced"],
}
rf_space = {
    "clf__n_estimators": [150, 250, 400],
    "clf__max_depth": [None, 10, 20],
    "clf__min_samples_leaf": [1, 2, 4],
    "clf__max_features": ["sqrt", "log2", None],
}

# Each entry is (short name, unfitted estimator, search space).
model_spaces = [
    ("logreg", LogisticRegression(max_iter=600, solver="liblinear"), logreg_space),
    ("rf", RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1), rf_space),
]

# XGBoost joins the comparison only when its import succeeded.
if HAS_XGB:
    xgb_space = {
        "clf__n_estimators": [200, 400],
        "clf__max_depth": [3, 5, 7],
        "clf__learning_rate": [0.03, 0.1, 0.2],
        "clf__subsample": [0.8, 1.0],
        "clf__colsample_bytree": [0.8, 1.0],
        "clf__reg_lambda": [1.0, 3.0, 5.0],
    }
    xgb_estimator = XGBClassifier(
        eval_metric="logloss",
        random_state=RANDOM_STATE,
        n_jobs=-1,
        tree_method="hist",
    )
    model_spaces.append(("xgb", xgb_estimator, xgb_space))

search_results = []

for model_name, estimator, space in model_spaces:
    # SMOTE sits inside the imblearn pipeline, between preprocessing and the
    # classifier, so it is part of what gets cross-validated.
    pipeline_steps = [
        ("preprocess", preprocess),
        ("smote", SMOTE(random_state=RANDOM_STATE)),
        ("clf", estimator),
    ]
    candidate = ImbPipeline(steps=pipeline_steps)

    # Never request more iterations than there are distinct combinations.
    n_combinations = np.prod([len(options) for options in space.values()])
    search = RandomizedSearchCV(
        estimator=candidate,
        param_distributions=space,
        n_iter=min(12, n_combinations),
        scoring="f1",
        cv=cv,
        n_jobs=-1,
        random_state=RANDOM_STATE,
        verbose=1,
    )

    print(f"\nRunning search for {model_name} ...")
    started_at = time.time()
    search.fit(X_train, y_train)
    elapsed = time.time() - started_at
    print(f"Best F1 for {model_name}: {search.best_score_:.4f} in {elapsed/60:.1f} min")

    search_results.append(
        dict(
            name=model_name,
            search=search,
            cv_f1=search.best_score_,
            best_params=search.best_params_,
            elapsed_min=elapsed / 60,
        )
    )

# The winner is the entry with the highest cross-validated F1 (first one on ties).
best_entry = max(search_results, key=lambda entry: entry["cv_f1"])
best_search = best_entry["search"]
best_estimator = best_search.best_estimator_

print("\nWinner:", best_entry["name"])
print("Best params:", best_entry["best_params"])
print("CV F1:", round(best_entry["cv_f1"], 4))


In [None]:
def tune_threshold(model, X, y, cv_obj, grid=None):
    """Choose the probability cut-off that gives the highest F1.

    Probabilities come from ``cross_val_predict`` with
    ``method="predict_proba"``; the second probability column is compared
    against each candidate threshold.

    Args:
        model: Estimator handed to ``cross_val_predict``.
        X: Feature data.
        y: Labels, also used as the ground truth for scoring.
        cv_obj: Cross-validation splitter passed as ``cv``.
        grid: Iterable of candidate thresholds. Defaults to 19 evenly
            spaced values from 0.05 to 0.95.

    Returns:
        A ``(best_threshold, best_f1, table)`` tuple, where ``table`` is a
        DataFrame with columns ``threshold`` and ``f1`` holding one row per
        candidate. On ties the first candidate in ``grid`` order wins.
    """
    candidates = np.linspace(0.05, 0.95, 19) if grid is None else grid

    fold_proba = cross_val_predict(
        model, X, y, cv=cv_obj, method="predict_proba", n_jobs=-1
    )
    positive_proba = fold_proba[:, 1]

    scored = [
        (cutoff, f1_score(y, (positive_proba >= cutoff).astype(int)))
        for cutoff in candidates
    ]
    winner = max(scored, key=lambda pair: pair[1])
    table = pd.DataFrame(scored, columns=["threshold", "f1"])
    return winner[0], winner[1], table

# Tune the decision threshold on the training split only; the test split stays
# untouched until the final evaluation.
threshold_outcome = tune_threshold(best_estimator, X_train, y_train, cv)
best_threshold, best_thr_f1, threshold_df = threshold_outcome
print(f"Best threshold on CV preds: {best_threshold:.2f} with F1 {best_thr_f1:.4f}")
threshold_df.head()


In [None]:
# Fit the winning pipeline on the whole training split.
# NOTE(review): the search was built without an explicit `refit=` argument, so
# best_estimator_ has presumably already been refit on X_train; this second fit
# looks redundant. Confirm before removing.
final_model = best_estimator.fit(X_train, y_train)

# Score the hold-out rows and apply the tuned cut-off instead of the default.
test_proba_matrix = final_model.predict_proba(X_test)
test_proba = test_proba_matrix[:, 1]
test_pred = (test_proba >= best_threshold).astype(int)

# Threshold-dependent metrics use the hard predictions; ROC-AUC uses the scores.
test_f1, test_precision, test_recall = (
    metric_fn(y_test, test_pred)
    for metric_fn in (f1_score, precision_score, recall_score)
)
test_roc_auc = roc_auc_score(y_test, test_proba)
test_cm = confusion_matrix(y_test, test_pred).tolist()

print(
    f"Test F1: {test_f1:.4f}, "
    f"Precision: {test_precision:.4f}, "
    f"Recall: {test_recall:.4f}, "
    f"ROC-AUC: {test_roc_auc:.4f}"
)
print("Confusion matrix (rows=true, cols=pred):", test_cm)



In [None]:
# Hold-out results, cast to plain Python floats so they serialize to JSON.
test_section = {
    "f1": float(test_f1),
    "precision": float(test_precision),
    "recall": float(test_recall),
    "roc_auc": float(test_roc_auc),
    "confusion_matrix": test_cm,
}

# Share of positive labels in each split.
balance_section = {
    "train_yes_rate": float(y_train.mean()),
    "test_yes_rate": float(y_test.mean()),
}

# Where the artifacts are written.
artifact_section = {
    "model_path": str(MODEL_PATH),
    "metrics_path": str(METRICS_PATH),
}

# Everything recorded about the run: winning model, tuned threshold and scores.
metrics = {
    "model": best_entry["name"],
    "best_params": best_entry["best_params"],
    "cv_f1": float(best_entry["cv_f1"]),
    "threshold": float(best_threshold),
    "threshold_cv_f1": float(best_thr_f1),
    "test": test_section,
    "class_balance": balance_section,
    "artifacts": artifact_section,
}

# Persist the fitted pipeline first, then the metrics JSON next to it.
joblib.dump(final_model, MODEL_PATH)
metrics_json = json.dumps(metrics, indent=2)
METRICS_PATH.write_text(metrics_json)

print(f"Saved model to {MODEL_PATH}")
print(f"Saved metrics to {METRICS_PATH}")
metrics
