In [None]:
import os
import copy
import mlflow
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tabulate import tabulate
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble  import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, FunctionTransformer, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score,
    classification_report, confusion_matrix,
    roc_curve, precision_recall_curve
)
from imblearn.over_sampling import SMOTE

In [2]:
class ReduceMemUsage:
    """Down-casts numeric cols in-place to save RAM."""
    def __call__(self, df: pd.DataFrame):
        for col in df:
            if df[col].dtype == 'float64':
                df[col] = pd.to_numeric(df[col], downcast='float')
            elif df[col].dtype == 'int64':
                df[col] = pd.to_numeric(df[col], downcast='integer')
        return df

class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """
    Replaces categories by their train-set frequency (0-1 float).
    If `cols` is None we auto-select every *non-numeric* column.
    """
    def __init__(self, cols=None): self.cols = cols
    def fit(self, X, y=None):
        if self.cols is None:
            self.cols = X.select_dtypes(exclude=np.number).columns.tolist()
        self.maps_ = {c: X[c].value_counts(normalize=True) for c in self.cols}
        return self
    def transform(self, X):
        X = X.copy()
        for c, m in self.maps_.items():
            X[c] = X[c].map(m).astype('float32')
        return X[self.cols]          # keep only encoded cols

class PurgedKFold(StratifiedKFold):
    """Time-based CV that removes an embargo window around the val period."""
    def __init__(self, n_splits=5, embargo_pct=0.01, **kw):
        super().__init__(n_splits=n_splits, shuffle=False, **kw)
        self.embargo_pct = embargo_pct

    def split(self, X, y=None, groups=None):
        # X must be sorted by time already
        n = len(X); embargo = int(n * self.embargo_pct)
        fold_sizes = np.full(self.n_splits, n // self.n_splits, dtype=int)
        fold_sizes[: n % self.n_splits] += 1
        idx = 0
        for sz in fold_sizes:
            start, stop = idx, idx + sz
            test_idx = np.arange(start, stop)
            train_idx = np.r_[0:max(0,start-embargo), stop+embargo:n]
            idx = stop
            yield train_idx, test_idx


In [3]:
df_id  = pd.read_csv('../datasets/train_identity.csv')
df_trx = pd.read_csv('../datasets/train_transaction.csv')
df     = pd.merge(df_trx, df_id, on='TransactionID', how='left')
df     = ReduceMemUsage()(df)           # saves ≈40 % RAM

# Chronological split (80 / 20)
cutoff  = df.TransactionDT.quantile(.8)
train   = df[df.TransactionDT <= cutoff].reset_index(drop=True)
valid   = df[df.TransactionDT  > cutoff].reset_index(drop=True)

In [4]:
X_tr, y_tr = train.drop('isFraud',axis=1), train.isFraud.values
X_va, y_va = valid.drop('isFraud',axis=1), valid.isFraud.values

In [5]:
# ── feature_blocks.py ─────────────────────────────────────────────────────────
import re, numpy as np, pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

EMAIL_MAP = {  # collapses tiny domains
    'gmail.com':'google', 'gmail':'google',
    'yahoo.com':'yahoo', 'yahoo.com.mx':'yahoo', 'ymail.com':'yahoo',
    'outlook.com':'microsoft', 'hotmail.com':'microsoft', 'live.com':'microsoft'
}
START_DATE = pd.to_datetime('2017-12-01')

class BasicTimeAmt(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): 
        return self
    
    def transform(self, X):
        X = X.copy()
        X['log_amt'] = np.log1p(X['TransactionAmt'])
        dt           = START_DATE + pd.to_timedelta(X['TransactionDT'], unit='s')
        X['dow']     = dt.dt.dayofweek.astype('int8')
        X['hour']    = dt.dt.hour.astype('int8')
        X['is_we']   = (X['dow']>=5).astype('int8')
        return X[['log_amt','dow','hour','is_we']]

class EmailDomain(BaseEstimator, TransformerMixin):
    def __init__(self, cols=('P_emaildomain', 'R_emaildomain')):  # noqa
        self.cols = cols

    def fit(self, X, y=None):  # nothing to learn
        return self

    def transform(self, X):
        X_new = pd.DataFrame(index=X.index)
        for c in self.cols:
            if c not in X.columns:                 # ← guard against KeyError
                continue
            
            tmp = (
                X[c].str.lower()
                    .str.extract(r'([A-Za-z0-9.-]+\.[A-Za-z]{2,4})', expand=False)
                    .map(EMAIL_MAP)
                    .fillna('other')
                    .astype('category')
            )
            X_new[c] = tmp
        return X_new

class CardFreqAmt(BaseEstimator, TransformerMixin):
    """Ratio of amount to historical mean per card1 + overall freq of card1."""
    def fit(self, X, y=None):
        self.card_mean_ = X.groupby('card1')['TransactionAmt'].mean()
        self.card_freq_ = X['card1'].value_counts(normalize=True)
        return self
    
    def transform(self, X):
        X = X.copy()
        X['card1_mean'] = X['card1'].map(self.card_mean_)
        X['amt_over_mean'] = X['TransactionAmt'] / X['card1_mean']
        X['card1_freq']  = X['card1'].map(self.card_freq_)
        return X[['amt_over_mean','card1_freq']]

# Assembling all blocks
class FeatureAssembler(BaseEstimator, TransformerMixin):
    """
    Add engineered features *on top of* the raw dataframe so downstream
    transformers still see the original columns.
    """
    def __init__(self):
        self.blocks = [BasicTimeAmt(), EmailDomain(), CardFreqAmt()]

    def fit(self, X, y=None):
        for b in self.blocks:
            b.fit(X, y)
        return self

    def transform(self, X):
        X_aug = X.copy()
        for b in self.blocks:
            X_aug = pd.concat([X_aug, b.transform(X)], axis=1)
        # optional: drop duplicate columns that might appear
        X_aug = X_aug.loc[:, ~X_aug.columns.duplicated()]
        return X_aug
    
class SparseDropper(BaseEstimator, TransformerMixin):
    def __init__(self, thresh=0.75): 
        self.thresh = thresh

    def fit(self, X, y=None):
        self.keep_ = [c for c in X.columns if X[c].isna().mean() < self.thresh]
        return self
    
    def transform(self, X):
        return X[self.keep_]
    
class DropColumns(BaseEstimator, TransformerMixin):
    """Remove column if it is present."""
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):      # nothing to learn
        return self

    def transform(self, X):
        return X.drop(columns=self.columns, errors="ignore")


In [6]:
# Remove columns with >75 % NaN
nan_thresh = 0.75
keep_cols  = [c for c,p in (train.isna().mean()).items() if p < nan_thresh]

# Remove highly correlated numeric (>0.95)
corr = train[keep_cols].select_dtypes(include=['number']).corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
cor_cols = [c for c in upper.columns if any(upper[c] > .95)]
keep_cols = list(set(keep_cols) - set(cor_cols))

num_cols  = train[keep_cols].select_dtypes(include=['number']).columns.tolist()
cat_cols  = [c for c in keep_cols if c not in num_cols and c!='isFraud']

# Exclude target
target = "isFraud"
drops = ["TransactionID"] + [target, ]
nan_thresh = 0.75
num_cols  = [c for c in num_cols  if c not in drops]
cat_cols  = [c for c in cat_cols if c not in drops]

In [10]:
# Base parameters
base_params = {
    # preprocessing
    "missing_threshold"     : 0.75,
    "imputer_num_strategy"  : "median",
    "imputer_num_value"     : -999,
    "imputer_cat_strategy"  : "most_frequent",
    "scaler"                : "robust",          # robust / standard / none
    "use_freq_encoder"      : True,
    "feat_eng"              : True,
    "sparse_drop"           : True,

    # model family to run
    "model" : "xgb",                            # lgb / xgb

    # model-specific defaults
    "lgb_params": dict(
        n_estimators=600, num_leaves=64, max_depth=-1,
        subsample=0.9, colsample_bytree=0.7,
        learning_rate=0.05, objective="binary",
        class_weight="balanced", random_state=42, verbose=-1,
    ),
    "xgb_params": dict(
        n_estimators=2000, max_depth=12, learning_rate=0.02,
        subsample=0.8, colsample_bytree=0.5,
        eval_metric="auc", tree_method="hist", random_state=42,
        # scale_pos_weight will be filled from data
    ),
}

# Trials
trials = [
    {'name': 'trial-0', 'delta': {}},  # ➊ baseline – exactly base_params
    {'name': 'trial-1', 'delta': {"imputer_num_strategy": "constant", "imputer_num_value": -999}},
    {'name': 'trial-2', 'delta': {"feat_eng": False}},
    {'name': 'trial-3', 'delta': {"scaler": "standard"}},
    {'name': 'trial-4', 'delta': {"xgb_params": {"learning_rate": 0.05, "n_estimators": 1200}}},
    {'name': 'trial-5', 'delta': {"model": "lgb"}},                                   # switch model family
]


# Helper functions
def deep_update(orig: dict, updates: dict) -> dict:
    """Recursively merge *updates* into *orig* (returns a new dict)."""
    out = copy.deepcopy(orig)
    for k, v in updates.items():
        if isinstance(v, dict):
            out[k] = deep_update(out.get(k, {}), v)
        else:
            out[k] = v
    return out

def build_preproc(p: dict):
    # numeric
    imputer_num = SimpleImputer(
        strategy=p["imputer_num_strategy"], fill_value=p["imputer_num_value"]
    )
    scaler = {
        "robust":   RobustScaler(),
        "standard": StandardScaler(),
        "none":     "passthrough",
    }[p["scaler"]]
    num_pipe = Pipeline([("impute", imputer_num), ("scale", scaler)])

    # categorical
    cat_steps = []
    if p["use_freq_encoder"]:
        cat_steps.append(("freq", FrequencyEncoder()))
    cat_steps.append(("impute", SimpleImputer(strategy=p["imputer_cat_strategy"])))
    cat_pipe = Pipeline(cat_steps)

    return ColumnTransformer([
        ("num", num_pipe, selector(dtype_include=np.number)),
        ("cat", cat_pipe, selector(dtype_exclude=np.number)),
    ])

def build_model_pipe(p: dict):
    model_name = p["model"]
    if model_name == "lgb":
        clf = LGBMClassifier(**p["lgb_params"])
    elif model_name == "xgb":
        clf = XGBClassifier(**p["xgb_params"])
    else:
        raise ValueError("model must be 'lgb' or 'xgb'")

    steps = [
        ("drop_cols", DropColumns(["isFraud", "TransactionID"])),
    ]
    if p["feat_eng"]:
        steps.append(("feat", FeatureAssembler()))
    if p["sparse_drop"]:
        steps.append(("sparse", SparseDropper(p["missing_threshold"])))
    steps += [
        ("union", build_preproc(p)),
        ("vt",    VarianceThreshold(0.0)),
        ("clf",   clf),
    ]
    return Pipeline(steps)

def flatten_dict(d, parent_key="", sep="/"):
    """Turn nested dicts into a flat {key1/key2: value} form for MLflow."""
    items = {}
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.update(flatten_dict(v, new_key, sep=sep))
        else:
            items[new_key] = v
    return items

# class-imbalance factor for *all* xgb trials
base_spw = (len(train) - train.isFraud.sum()) / train.isFraud.sum()

# Experiment Tracking
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("IEEE-CIS Trials")
mlflow.sklearn.autolog(log_input_examples=False, silent=True)

# Run trials
for trial in trials:
    run_name = trial['name']
    delta = trial['delta']

    print(f'Trial: {run_name}...')
    print(f'\tdelta: {delta}')
    p = deep_update(base_params, delta)

    if p["model"] == "xgb":
        p["xgb_params"]["scale_pos_weight"] = base_spw

    # Model training
    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(flatten_dict(p))   # helper below

        print(f'\tBuilding pipeline...')
        pipe = build_model_pipe(p)

        print(f'\tFitting...')
        pipe.fit(X_tr, y_tr)

        print(f'\tEvaluating...')
        proba = pipe.predict_proba(X_va)[:, 1]
        auc   = roc_auc_score(y_va, proba)
        ap    = average_precision_score(y_va, proba)
        mlflow.log_metric("valid_auc", auc)
        mlflow.log_metric("valid_pr_auc", ap)

        print(f'\tLogging...')
        fpr, tpr, _ = roc_curve(y_va, proba)
        plt.figure(); plt.plot(fpr, tpr); plt.title("ROC"); mlflow.log_figure(plt.gcf(),"roc.png"); plt.close()
        prec, rec, _ = precision_recall_curve(y_va, proba)
        plt.figure(); plt.plot(rec, prec); plt.title("PR"); mlflow.log_figure(plt.gcf(),"pr.png"); plt.close()

        mlflow.sklearn.log_model(pipe, artifact_path="model")

        print(f"\tTrial {run_name}: AUC={auc:.4f} | PR-AUC={ap:.4f}")

2025/04/24 22:21:07 INFO mlflow.tracking.fluent: Experiment with name 'IEEE-CIS Trials' does not exist. Creating a new experiment.


Trial: trial-0...
	delta: {}
	Building pipeline...
	Fitting...
	Evaluating...
	Logging...




	Trial trial-0: AUC=0.9230 | PR-AUC=0.5826
🏃 View run trial-0 at: http://127.0.0.1:5000/#/experiments/377797843707944316/runs/d081a3c81932419587069b25c627e222
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/377797843707944316
Trial: trial-1...
	delta: {'imputer_num_strategy': 'constant', 'imputer_num_value': -999}
	Building pipeline...
	Fitting...
	Evaluating...
	Logging...




	Trial trial-1: AUC=0.9229 | PR-AUC=0.5823
🏃 View run trial-1 at: http://127.0.0.1:5000/#/experiments/377797843707944316/runs/76441eae75a34c3c895162434da5b47c
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/377797843707944316
Trial: trial-2...
	delta: {'feat_eng': False}
	Building pipeline...
	Fitting...
	Evaluating...
	Logging...




	Trial trial-2: AUC=0.9186 | PR-AUC=0.5741
🏃 View run trial-2 at: http://127.0.0.1:5000/#/experiments/377797843707944316/runs/771451b8c00e43c0bc40800429ca091d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/377797843707944316
Trial: trial-3...
	delta: {'scaler': 'standard'}
	Building pipeline...
	Fitting...
	Evaluating...
	Logging...




	Trial trial-3: AUC=0.9230 | PR-AUC=0.5826
🏃 View run trial-3 at: http://127.0.0.1:5000/#/experiments/377797843707944316/runs/c3b2567256a94186bbe938dd1292b7f7
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/377797843707944316
Trial: trial-4...
	delta: {'xgb_params': {'learning_rate': 0.05, 'n_estimators': 1200}}
	Building pipeline...
	Fitting...
	Evaluating...
	Logging...




	Trial trial-4: AUC=0.9217 | PR-AUC=0.5897
🏃 View run trial-4 at: http://127.0.0.1:5000/#/experiments/377797843707944316/runs/1982c21d732c41dd9c0adbc669b7f706
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/377797843707944316
Trial: trial-5...
	delta: {'model': 'lgb'}
	Building pipeline...
	Fitting...
	Evaluating...
	Logging...




	Trial trial-5: AUC=0.9160 | PR-AUC=0.5312
🏃 View run trial-5 at: http://127.0.0.1:5000/#/experiments/377797843707944316/runs/f11b648953ca48578863ef34da13b2e4
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/377797843707944316
