# Bank Marketing - SMOTE + F1 Pipeline

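This notebook rebuilds the training workflow in three stages: SMOTE oversampling of the minority class, cross-validated model selection (logistic regression, random forest, and XGBoost when it is installed), and decision-threshold tuning to maximize F1. The fitted model (`final_model.pkl`) and its metrics (`metrics.json`) are saved under `artifacts/` for the Streamlit app.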


In [17]:
import json
import time
import warnings
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import (
    RandomizedSearchCV,
    StratifiedKFold,
    cross_val_predict,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Silence every warning category for the rest of the notebook.
warnings.filterwarnings("ignore")

# Seed shared by the split, SMOTE, the CV splitter, the searches and the models.
RANDOM_STATE = 42

# All paths are resolved relative to the directory the kernel was started in.
BASE_DIR = Path.cwd()
DATA_PATH = BASE_DIR / "bank-additional.csv"
ARTIFACT_DIR = BASE_DIR / "artifacts"
MODEL_PATH = ARTIFACT_DIR / "final_model.pkl"
METRICS_PATH = ARTIFACT_DIR / "metrics.json"

# Create the output folder up front; a pre-existing folder is not an error.
ARTIFACT_DIR.mkdir(exist_ok=True)

# XGBoost is optional. Any failure while importing it (not only a missing
# package) simply disables that candidate model instead of stopping the run.
try:
    from xgboost import XGBClassifier
except Exception:
    HAS_XGB = False
else:
    HAS_XGB = True

print(f"Using base dir: {BASE_DIR}")
print(f"XGBoost available: {HAS_XGB}")


Using base dir: /Users/kaancakir/ADA442Project
XGBoost available: False


In [20]:
# Load the untouched source data; the file uses ";" as its field separator.
df_raw = pd.read_csv(DATA_PATH, sep=";")

def engineer_features(df_in: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df_in`` with three derived columns appended.

    The input frame is never modified. Steps, in order:

    * every cell equal to the string ``"unknown"`` becomes ``NaN``;
    * ``was_contacted`` is 1 where ``pdays`` differs from 999, else 0
      (999 looks like the dataset's "no previous contact" sentinel);
    * ``pdays`` itself has 999 recoded to -1;
    * ``is_retired`` is 1 where ``job`` equals ``"retired"``, else 0;
    * ``eco_index`` is the product ``euribor3m * cons.conf.idx``.

    Requires the columns ``pdays``, ``job``, ``euribor3m`` and
    ``cons.conf.idx`` to be present.
    """
    cleaned = df_in.replace("unknown", np.nan)
    # Keyword order matters: `was_contacted` must read `pdays` before the
    # 999 -> -1 recode is applied by the next keyword.
    return cleaned.assign(
        was_contacted=lambda d: (d["pdays"] != 999).astype(int),
        pdays=lambda d: d["pdays"].replace(999, -1),
        is_retired=lambda d: (d["job"] == "retired").astype(int),
        eco_index=lambda d: d["euribor3m"] * d["cons.conf.idx"],
    )

# Clean the raw frame, add the engineered columns, and encode the target
# as 0/1 ("no" -> 0, "yes" -> 1).
df = engineer_features(df_raw).assign(y=lambda d: d["y"].map({"no": 0, "yes": 1}))

# Object-typed columns are treated as categorical; everything else except the
# target is treated as numeric. These lists drive the ColumnTransformer later.
cat_cols = list(df.select_dtypes(include=["object"]).columns)
num_cols = [col for col in df.columns if col != "y" and col not in cat_cols]

print(df.head())
print("\nRows:", len(df))
print("Positive rate:", df["y"].mean())
print("Categorical columns:", len(cat_cols))
print("Numeric columns:", len(num_cols))


   age          job  marital          education default housing loan  \
0   30  blue-collar  married           basic.9y      no     yes   no   
1   39     services   single        high.school      no      no   no   
2   25     services  married        high.school      no     yes   no   
3   38     services  married           basic.9y      no     NaN  NaN   
4   47       admin.  married  university.degree      no     yes   no   

     contact month day_of_week  ...     poutcome  emp.var.rate  \
0   cellular   may         fri  ...  nonexistent          -1.8   
1  telephone   may         fri  ...  nonexistent           1.1   
2  telephone   jun         wed  ...  nonexistent           1.4   
3  telephone   jun         fri  ...  nonexistent           1.4   
4   cellular   nov         mon  ...  nonexistent          -0.1   

   cons.price.idx  cons.conf.idx euribor3m  nr.employed  y  was_contacted  \
0          92.893          -46.2     1.313       5099.1  0              0   
1          93.99

In [21]:
# Target vector and feature matrix (every column except the target).
# NOTE(review): if the CSV contains the UCI `duration` column, it is kept as a
# feature here. The dataset documentation warns that call duration is only
# known after the call and should be dropped for a realistic model - confirm
# that keeping it is intentional.
y = df["y"]
X = df.drop(columns="y")

# Stratified 80/20 hold-out split so both parts keep the same positive rate.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)

print(
    f"Train size: {X_train.shape}, Test size: {X_test.shape}, Train positive rate: {y_train.mean():.3f}"
)


Train size: (3295, 23), Test size: (824, 23), Train positive rate: 0.110


In [22]:
# Numeric columns: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen while fitting are ignored rather than raising.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column list to its transformer.
preprocess = ColumnTransformer(
    [
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)

# One shuffled, seeded, stratified 5-fold splitter reused by every search and
# by the threshold tuning so all of them see the same folds.
cv = StratifiedKFold(n_splits=5, random_state=RANDOM_STATE, shuffle=True)
print("Preprocess and CV ready")


Preprocess and CV ready


In [23]:
# Hyper-parameter spaces. Keys carry the "clf__" prefix so they address the
# pipeline step named "clf".
# NOTE(review): the pipeline already oversamples with SMOTE, so
# class_weight="balanced" on both models is likely redundant - confirm.
logreg_space = {
    "clf__C": list(np.logspace(-3, 2, 8)),
    "clf__penalty": ["l2"],
    "clf__class_weight": ["balanced"],
}
rf_space = {
    "clf__n_estimators": [200, 300, 450],
    "clf__max_depth": [None, 12, 20],
    "clf__min_samples_leaf": [1, 2, 4],
    "clf__max_features": ["sqrt", "log2", None],
}

# Each candidate is a (name, base estimator, search space) triple.
model_spaces = [
    ("logreg", LogisticRegression(max_iter=800, solver="liblinear"), logreg_space),
    (
        "rf",
        RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1, class_weight="balanced"),
        rf_space,
    ),
]

# XGBoost joins the candidates only when its import succeeded.
if HAS_XGB:
    xgb_space = {
        "clf__n_estimators": [200, 400],
        "clf__max_depth": [3, 5, 7],
        "clf__learning_rate": [0.03, 0.1, 0.2],
        "clf__subsample": [0.8, 1.0],
        "clf__colsample_bytree": [0.8, 1.0],
        "clf__reg_lambda": [1.0, 3.0, 5.0],
    }
    xgb_clf = XGBClassifier(
        eval_metric="logloss",
        random_state=RANDOM_STATE,
        n_jobs=-1,
        tree_method="hist",
    )
    model_spaces.append(("xgb", xgb_clf, xgb_space))

# One result record per candidate model, filled in by the loop below.
search_results = []

for name, base_clf, space in model_spaces:
    # preprocess -> SMOTE -> classifier. imblearn's Pipeline applies the
    # sampler only while fitting, so validation folds are scored un-resampled.
    pipeline = ImbPipeline(
        steps=[
            ("preprocess", preprocess),
            ("smote", SMOTE(random_state=RANDOM_STATE)),
            ("clf", base_clf),
        ]
    )

    # Draw at most 12 configurations, or the whole grid when it is smaller.
    n_combinations = np.prod([len(values) for values in space.values()])
    search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=space,
        n_iter=min(12, n_combinations),
        scoring="f1",
        cv=cv,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        verbose=1,
    )

    print(f"\nRunning search for {name} ...")
    t0 = time.time()
    search.fit(X_train, y_train)
    elapsed = time.time() - t0
    print(f"Best F1 for {name}: {search.best_score_:.4f} in {elapsed/60:.1f} min")

    # Keep the fitted search object so the winner's estimator can be reused.
    record = {
        "name": name,
        "search": search,
        "cv_f1": search.best_score_,
        "best_params": search.best_params_,
        "elapsed_min": elapsed / 60,
    }
    search_results.append(record)

# The winner is the candidate with the highest mean cross-validated F1
# (on a tie, the one that was searched first).
best_entry = max(search_results, key=lambda entry: entry["cv_f1"])
best_search = best_entry["search"]
best_estimator = best_search.best_estimator_

for label, value in (
    ("\nWinner:", best_entry["name"]),
    ("Best params:", best_entry["best_params"]),
    ("CV F1:", round(best_entry["cv_f1"], 4)),
):
    print(label, value)



Running search for logreg ...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best F1 for logreg: 0.5823 in 0.1 min

Running search for rf ...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best F1 for rf: 0.5895 in 0.5 min

Winner: rf
Best params: {'clf__n_estimators': 300, 'clf__min_samples_leaf': 2, 'clf__max_features': None, 'clf__max_depth': 12}
CV F1: 0.5895


In [24]:
def tune_threshold(model, X, y, cv_obj, grid=None):
    """Pick the probability cut-off that maximizes F1 on cross-validated predictions.

    Parameters
    ----------
    model : estimator exposing ``predict_proba``; passed to ``cross_val_predict``.
    X, y : features and binary labels used for the cross-validated predictions.
    cv_obj : cross-validation splitter forwarded as ``cv``.
    grid : iterable of thresholds to try; defaults to 19 evenly spaced values
        from 0.05 to 0.95.

    Returns
    -------
    (best_threshold, best_f1, table) where ``table`` is a DataFrame with
    columns ``threshold`` and ``f1`` holding one row per grid value. If several
    thresholds tie for the best F1, the first one in grid order is returned.
    """
    thresholds = np.linspace(0.05, 0.95, 19) if grid is None else grid

    # Cross-validated class probabilities; the second column is used as the
    # score that gets thresholded.
    cv_proba = cross_val_predict(
        model, X, y, cv=cv_obj, method="predict_proba", n_jobs=-1
    )
    positive_proba = cv_proba[:, 1]

    scores = [
        (thr, f1_score(y, (positive_proba >= thr).astype(int)))
        for thr in thresholds
    ]
    best_thr, best_f1 = max(scores, key=lambda pair: pair[1])
    return best_thr, best_f1, pd.DataFrame(scores, columns=["threshold", "f1"])

# Tune the decision threshold for the winning pipeline using only the
# training split, so the test split stays untouched until the final evaluation.
best_threshold, best_thr_f1, threshold_df = tune_threshold(
    model=best_estimator, X=X_train, y=y_train, cv_obj=cv
)
print(f"Best threshold on CV preds: {best_threshold:.2f} with F1 {best_thr_f1:.4f}")
threshold_df.head()


Best threshold on CV preds: 0.45 with F1 0.6014


Unnamed: 0,threshold,f1
0,0.05,0.460817
1,0.1,0.493071
2,0.15,0.520031
3,0.2,0.539043
4,0.25,0.555752


In [25]:
# Fit the selected pipeline on the full training split. fit() returns the
# estimator itself, so final_model and best_estimator are the same object.
final_model = best_estimator.fit(X_train, y_train)

# Score the hold-out set, then apply the tuned cut-off instead of the default 0.5.
test_proba = final_model.predict_proba(X_test)[:, 1]
test_pred = (test_proba >= best_threshold).astype(int)

# Threshold-dependent metrics use the hard predictions...
test_f1, test_precision, test_recall = (
    scorer(y_test, test_pred) for scorer in (f1_score, precision_score, recall_score)
)
# ...while ROC-AUC is computed from the raw probabilities.
test_roc_auc = roc_auc_score(y_test, test_proba)
# Stored as nested lists so it can be written to JSON later.
test_cm = confusion_matrix(y_test, test_pred).tolist()

print(
    f"Test F1: {test_f1:.4f}, Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, ROC-AUC: {test_roc_auc:.4f}"
)
print("Confusion matrix (rows=true, cols=pred):", test_cm)



Test F1: 0.5905, Precision: 0.5167, Recall: 0.6889, ROC-AUC: 0.9394
Confusion matrix (rows=true, cols=pred): [[676, 58], [28, 62]]


In [26]:
# Hold-out scores, cast to built-in floats before they are serialized.
test_scores = {
    "f1": float(test_f1),
    "precision": float(test_precision),
    "recall": float(test_recall),
    "roc_auc": float(test_roc_auc),
    "confusion_matrix": test_cm,
}

# Share of positive labels in each split.
class_balance = {
    "train_yes_rate": float(y_train.mean()),
    "test_yes_rate": float(y_test.mean()),
}

# Where the artifacts are written (absolute paths derived from BASE_DIR).
artifact_paths = {
    "model_path": str(MODEL_PATH),
    "metrics_path": str(METRICS_PATH),
}

# Everything needed alongside the pickled model: which model won, its
# parameters, the tuned threshold, and the scores.
metrics = {
    "model": best_entry["name"],
    "best_params": best_entry["best_params"],
    "cv_f1": float(best_entry["cv_f1"]),
    "threshold": float(best_threshold),
    "threshold_cv_f1": float(best_thr_f1),
    "test": test_scores,
    "class_balance": class_balance,
    "artifacts": artifact_paths,
}

# Persist the fitted pipeline and the metrics. The tuned threshold is stored
# only in the JSON file, not inside the pickled model.
joblib.dump(final_model, MODEL_PATH)
metrics_json = json.dumps(metrics, indent=2)
METRICS_PATH.write_text(metrics_json)

print(f"Saved model to {MODEL_PATH}")
print(f"Saved metrics to {METRICS_PATH}")
metrics


Saved model to /Users/kaancakir/ADA442Project/artifacts/final_model.pkl
Saved metrics to /Users/kaancakir/ADA442Project/artifacts/metrics.json


{'model': 'rf',
 'best_params': {'clf__n_estimators': 300,
  'clf__min_samples_leaf': 2,
  'clf__max_features': None,
  'clf__max_depth': 12},
 'cv_f1': 0.5894748038229133,
 'threshold': 0.44999999999999996,
 'threshold_cv_f1': 0.6013667425968109,
 'test': {'f1': 0.5904761904761905,
  'precision': 0.5166666666666667,
  'recall': 0.6888888888888889,
  'roc_auc': 0.9394187102633969,
  'confusion_matrix': [[676, 58], [28, 62]]},
 'class_balance': {'train_yes_rate': 0.10955993930197269,
  'test_yes_rate': 0.10922330097087378},
 'artifacts': {'model_path': '/Users/kaancakir/ADA442Project/artifacts/final_model.pkl',
  'metrics_path': '/Users/kaancakir/ADA442Project/artifacts/metrics.json'}}