xgb

In [None]:
import configparser

import numpy as np, xgboost as xgb, pandas as pd, modin.pandas as md
import snowflake.snowpark.modin.plugin
from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args

from xgboost_helpers import (
    load_data,
    split_numeric_categorical,
    build_dmatrix,
    OrdinalEncoder,
    NumericImputerArbitrary,
    RichProgressBarCallback,
)

from snowflake.snowpark.context import get_active_session
# Point `np.int` at the builtin int so code that still looks up that alias keeps working.
# NOTE(review): presumably needed because the installed scikit-optimize still
# references `np.int` — confirm, and drop once the dependency is upgraded.
np.int = int
# Use the Snowpark session that is already active in this environment and
# switch it to the ML_AUTOMATION schema.
session = get_active_session()
session.use_schema("ML_AUTOMATION")

In [None]:
# INI file that drives this notebook; its sections are read via `config[...]`.
# NOTE(review): "confin.ini" looks like a typo for "config.ini" — confirm the real file name.
config_file = r"confin.ini"
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means nothing was loaded.
# Fail here with a clear message instead of a KeyError on the first section lookup.
if not config.read(config_file):
    raise FileNotFoundError(f"Could not read configuration file: {config_file!r}")

In [None]:
# Load the train and test frames from Snowflake (train first, then test).
# The table name or query for each comes from the [PATHS] section; a missing
# option falls back to an empty string.
df_train, df_test = (
    md.read_snowflake(name_or_query=config["PATHS"].get(path_key, ""))
    for path_key in ("train", "test")
)

In [None]:
# Name of the target column (empty string when the option is missing).
y_col = config["VARIABLES"].get("y_col", "")

# Comma-separated identifier column names -> list of stripped, non-empty names.
identifier_str = config["VARIABLES"].get("identifier_cols", "")
identifier_cols = list(filter(None, map(str.strip, identifier_str.split(","))))

# Identifier columns that are present in the training frame, in frame order.
df_identifiers = [name for name in df_train.columns if name in identifier_cols]

# Copy of the identifier columns of the test frame (in frame order).
df_test_identifiers = df_test[
    [name for name in df_test.columns if name in identifier_cols]
].copy()

In [None]:
# Candidate feature columns for the model.
modelling_features = ["a", "b", "c"]

preprocessing

In [None]:
# Keep only the requested features that exist as columns of the training
# frame, preserving the order of modelling_features.
valid = [
    name
    for name in modelling_features
    if name in df_train.columns
]

In [None]:
# Pull the target out BEFORE narrowing df_train: after the second line the
# frame holds only the columns listed in `valid`.
# NOTE: this rebinds y_col from the target column's name to a copy of the
# target column itself.
y_col = df_train[y_col].copy()
df_train = df_train.loc[:, valid]

In [None]:
# Partition the selected features into two groups via the project helper.
# NOTE(review): presumably lists of numeric and categorical column names,
# judging by the helper's name — confirm in xgboost_helpers.
num_feats, cat_feats = split_numeric_categorical(df_train, valid)

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
# Column-wise preprocessing: the "cat" step handles cat_feats, the "num" step
# handles num_feats. ColumnTransformer concatenates its outputs in the order
# the transformers are listed, i.e. the "cat" columns first, then "num".
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OrdinalEncoder(columns=cat_feats, method="freq"), cat_feats),
        ("num", NumericImputerArbitrary(), num_feats),
    ],
)

In [None]:
# Materialise both frames as native pandas DataFrames before they are handed
# to the scikit-learn preprocessor.
# to_pandas() is the public Snowpark pandas conversion API; the previous
# _to_pandas() is a private modin hook that may change without notice.
# NOTE(review): y_col was extracted earlier and is not converted here —
# confirm build_dmatrix accepts it in that form.
df_train = df_train.to_pandas()
df_test = df_test.to_pandas()

In [None]:
# Fit the preprocessing steps on the training frame only.
preprocessor.fit(df_train)

In [None]:
# Apply the fitted preprocessor to the training frame first, then to the test frame.
X_train_transformed, X_test_transformed = map(
    preprocessor.transform, (df_train, df_test)
)

In [None]:
# Label the transformed training matrix.
# ColumnTransformer stacks its outputs in the order the transformers were
# declared ("cat" first, then "num"), so the names must follow that same
# order; num_feats + cat_feats attached the names to the wrong columns.
# Assumes each step emits exactly one output column per input column — TODO confirm
# for the custom OrdinalEncoder / NumericImputerArbitrary.
X_train_transformed_df = pd.DataFrame(
    X_train_transformed, columns=cat_feats + num_feats
)

In [None]:
# Rebuild train and test as labelled DataFrames from the transformed arrays,
# keeping each frame's original index and sharing one set of column names.
df_train = pd.DataFrame(
    data=X_train_transformed,
    index=df_train.index,
    columns=X_train_transformed_df.columns,
)
df_test = pd.DataFrame(
    data=X_test_transformed,
    index=df_test.index,
    columns=X_train_transformed_df.columns,
)

## modelling

In [None]:
# [PARAMS] section of the config: XGBoost settings stored as strings.
params = config["PARAMS"]

# Hyperparameter search space for the optimiser; each dimension's `name`
# becomes a keyword argument of the objective function.
search_space = [
    Integer(4, 7, name="max_depth"),
    Real(0.1, 0.2, prior="log-uniform", name="eta"),  # eta == learning_rate
    Real(0.5, 1.0, prior="uniform", name="subsample"),
    Real(0.0, 10.0, prior="uniform", name="reg_alpha"),
    Real(0.0, 10.0, prior="uniform", name="reg_lambda"),
    Integer(500, 3000, name="n_estimators"),
    # NOTE(review): XGBoost documents colsample_bytree as (0, 1]; the lower
    # bound 0.0 is outside that range — consider a small positive bound.
    Real(0.0, 1.0, prior="uniform", name="colsample_bytree"),
    # NOTE(review): "gpu_hist" is deprecated since XGBoost 2.0 (use
    # tree_method="hist" with device="cuda") and needs a GPU-enabled build.
    Categorical(["hist", "approx", "gpu_hist"], name="tree_method"),
]

# Alternatively, take the parameters straight from the config. Values made
# only of digits with at most one "." become int/float; everything else
# (negative numbers, scientific notation, text) stays a string.
# int()/float() replace the former eval(): same result for plain numbers,
# without executing config text as Python code.
default_params = {
    k: (float(v) if "." in v else int(v)) if v.replace(".", "", 1).isdigit() else v
    for k, v in params.items()
}

print("default params loaded:")
for k, v in default_params.items():
    print(f"{k}: {v}")

print("\nsearch space:")
for p in search_space:
    print(p)

In [None]:
@use_named_args(search_space)
def objective(**params_opt):
    """Score one hyperparameter candidate with 3-fold cross-validated RMSE.

    ``use_named_args`` converts the positional point proposed by the optimiser
    into keyword arguments named after the dimensions of ``search_space``.

    Args:
        **params_opt: One value per search-space dimension (``max_depth``,
            ``eta``, ``subsample``, ``reg_alpha``, ``reg_lambda``,
            ``n_estimators``, ``colsample_bytree``, ``tree_method``).

    Returns:
        The lowest mean test RMSE across the boosting rounds that were run
        (lower is better, which is what ``gp_minimize`` expects).
    """
    xgb_params = {
        "objective": config["PARAMS"].get("objective"),
        # The dimension is named "eta"; looking up "learning_rate" raised
        # KeyError on every call.
        "eta": params_opt["eta"],
        "max_depth": params_opt["max_depth"],
        # Forward the remaining tuned dimensions too; previously they were
        # sampled by the optimiser but never reached XGBoost.
        "subsample": params_opt["subsample"],
        "colsample_bytree": params_opt["colsample_bytree"],
        # Native-API names for the L1 / L2 regularisation terms.
        "alpha": params_opt["reg_alpha"],
        "lambda": params_opt["reg_lambda"],
        # NOTE(review): a "gpu_hist" candidate requires a GPU-enabled XGBoost build.
        "tree_method": params_opt["tree_method"],
    }

    dtrain_cv = build_dmatrix(df_train, y_col)
    cv_results = xgb.cv(
        params=xgb_params,
        dtrain=dtrain_cv,
        # The optimiser hands back a numpy integer; pass a plain int.
        num_boost_round=int(params_opt["n_estimators"]),
        nfold=3,
        metrics="rmse",
        early_stopping_rounds=20,
        verbose_eval=True,
        seed=42,
    )
    return cv_results["test-rmse-mean"].min()

In [None]:
# Run scikit-optimize's gp_minimize: evaluate `objective` 30 times over
# `search_space`, with a fixed random_state so runs are repeatable.
result = gp_minimize(objective, search_space, n_calls=30, random_state=42)

In [None]:
# Report the best point found, pairing each search-space dimension with the
# value at the same position in result.x.
print("best hyperparameters:")
for dim, val in zip(search_space, result.x):
    print(f"{dim.name}: {val}")

In [None]:
# Booster parameters for the final model, taken from the [PARAMS] section.
# This used to be an empty dict, so the final model was trained with
# XGBoost's defaults and ignored the configured objective.
# NOTE(review): to train with the tuned values instead, copy the best
# hyperparameters into [PARAMS] (or merge them in here).
params_section = config["PARAMS"]
params = {
    key: value
    for key, value in params_section.items()
    # n_estimators is the number of boosting rounds (read separately and
    # passed as num_boost_round), not a booster parameter.
    if key != "n_estimators"
}

In [None]:
# Number of boosting rounds for the final model; 2254 is used when the
# option is absent from the config section.
n_estimators = params_section.getint("n_estimators", fallback=2254)

In [None]:
# Training matrix for the final model, built from the preprocessed features and the target.
dtrain = build_dmatrix(df_train, y_col)
# Progress callback from the project helpers, told how many boosting rounds to expect.
rich_callback = RichProgressBarCallback(total_rounds=n_estimators)

## final model training

In [None]:
# Train the final booster for n_estimators rounds, reporting progress through
# the callback.
print("training...")
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=n_estimators,
    callbacks=[rich_callback],
)
print("train complete")

## score

In [None]:
# Scoring matrix built from the preprocessed test features; None is passed
# in the target position.
dtest = build_dmatrix(df_test, None)

In [None]:
# Score the test matrix with the trained booster.
preds = model.predict(dtest)

In [None]:
# Destination of the CSV export; defaults to a file in the working directory.
output_path = config["PATHS"].get(
    "predictions_csv", fallback="xgb_test_predictions.csv"
)

# Attach the predictions to a copy of the test identifiers and write them out
# without the index column.
results = df_test_identifiers.copy()
results["prediction"] = preds
results.to_csv(output_path, index=False)

In [None]:
# Write the predictions to the Snowflake table "snowpark_jc" through the
# session, asking for the table to be auto-created as a transient table.
output = session.write_pandas(results, "snowpark_jc", auto_create_table=True, table_type="transient")