In [30]:
from pathlib import Path
from rich import print

import numpy as np
import polars as pl
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score
import wandb
from wandb.lightgbm import wandb_callback, log_summary

In [42]:
class Config:
    """Experiment-wide settings: naming, data locations, debug switch and LightGBM parameters."""

    # Experiment and version tags; both are interpolated into the project name.
    exp = "001"
    ver = "001"
    project_name = f"homecredit-{exp}-{ver}"

    # Directory that the training cell creates and writes the per-fold models into.
    model_dir = Path(f"/kaggle/input/{project_name}")

    # Locations of the parquet tables for the two data splits.
    train_dir = Path("/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train")
    test_dir = Path("/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test")

    # Tables joined onto the base table, in join order. The trailing digit of each
    # name is parsed by preprocessing() and passed to read_file() as the table depth.
    file_names = [
        # depth 0
        "static_cb_0",
        "static_0",
        # depth 1
        "debitcard_1",
        "other_1",
        "applprev_1",
        "person_1",
        "deposit_1",
        "credit_bureau_a_1",
        "credit_bureau_b_1",
        "tax_registry_a_1",
        "tax_registry_b_1",
        "tax_registry_c_1",
        # depth 2
        "applprev_2",
        "person_2",
        "credit_bureau_a_2",
        "credit_bureau_b_2",
    ]

    # When is_debug is set, read_file() reads only debug_size rows from each parquet file.
    is_debug = True
    debug_size = 1000

    # Parameter dict handed unchanged to lgb.train().
    params = {
        "boosting_type": "gbdt",
        "objective": "binary",
        "metric": "auc",
        "max_depth": 10,
        "learning_rate": 0.05,
        "n_estimators": 2000,
        "colsample_bytree": 0.8,
        "colsample_bynode": 0.8,
        "verbose": -1,
        "random_state": 42,
        "reg_alpha": 0.1,
        "reg_lambda": 10,
        "extra_trees": True,
        "num_leaves": 64,
        "device": "cpu",
        "is_training_metric": True,
    }

In [16]:
class Aggregator:
    """Builds the aggregation expressions used in ``group_by("case_id").agg(...)``.

    Columns are picked by the last character of their name (or, for
    ``count_expr``, by containing ``"num_group"``). Every generated expression
    is aliased ``"<agg>_<column>"``. All methods are static: the class is only
    a namespace and holds no state.
    """

    @staticmethod
    def _build(cols, agg_names):
        """Return ``pl.<agg>(col).alias("<agg>_<col>")`` for each agg, then each column.

        The outer loop runs over the aggregations so the result lists all
        ``max_*`` expressions first, then all ``min_*``, and so on; this fixes
        the column order of the aggregated frame.
        """
        return [
            # getattr(pl, "max") is pl.max, etc.; resolved at call time.
            getattr(pl, agg)(col).alias(f"{agg}_{col}")
            for agg in agg_names
            for col in cols
        ]

    @staticmethod
    def num_expr(df: pl.DataFrame):
        """max / min / last / mean for columns whose name ends in ``P`` or ``A``."""
        cols = [col for col in df.columns if col.endswith(("P", "A"))]
        return Aggregator._build(cols, ("max", "min", "last", "mean"))

    @staticmethod
    def date_expr(df: pl.DataFrame):
        """max / min / last / mean for columns whose name ends in ``D``."""
        cols = [col for col in df.columns if col.endswith("D")]
        return Aggregator._build(cols, ("max", "min", "last", "mean"))

    @staticmethod
    def str_expr(df: pl.DataFrame):
        """max / last for columns whose name ends in ``M``."""
        cols = [col for col in df.columns if col.endswith("M")]
        return Aggregator._build(cols, ("max", "last"))

    @staticmethod
    def other_expr(df: pl.DataFrame):
        """max / last for columns whose name ends in ``T`` or ``L``."""
        cols = [col for col in df.columns if col.endswith(("T", "L"))]
        return Aggregator._build(cols, ("max", "last"))

    @staticmethod
    def count_expr(df: pl.DataFrame):
        """max / last for columns whose name contains ``num_group``."""
        cols = [col for col in df.columns if "num_group" in col]
        return Aggregator._build(cols, ("max", "last"))

    @staticmethod
    def all_exprs(df: pl.DataFrame) -> list[pl.Expr]:
        """Concatenate every expression family: numeric, date, string, other, count."""
        return (
            Aggregator.num_expr(df)
            + Aggregator.date_expr(df)
            + Aggregator.str_expr(df)
            + Aggregator.other_expr(df)
            + Aggregator.count_expr(df)
        )

In [17]:
# Type alias: maps a string key to a list of polars DataFrames.
# NOTE(review): not referenced by any cell visible in this notebook export.
DataStore = dict[str, list[pl.DataFrame]]


class Pipeline:
    """Stateless frame-level preprocessing steps, usable with ``DataFrame.pipe``.

    All methods are static: the class is only a namespace and holds no state.
    """

    @staticmethod
    def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
        """Cast columns to a fixed dtype chosen from the column name.

        * ``case_id``, ``WEEK_NUM``, ``num_group1``, ``num_group2`` -> ``Int64``
        * ``date_decision`` and names ending in ``D`` -> parsed from string to ``Date``
        * names ending in ``P`` or ``A`` -> ``Float64``
        * names ending in ``M`` -> ``Utf8``

        Any other column is left as it is.
        """
        casts = []
        for col in df.columns:
            if col in ("case_id", "WEEK_NUM", "num_group1", "num_group2"):
                casts.append(pl.col(col).cast(pl.Int64))
            elif col == "date_decision":
                casts.append(pl.col(col).str.strptime(pl.Date))
            elif col.endswith(("P", "A")):
                casts.append(pl.col(col).cast(pl.Float64))
            elif col.endswith("M"):
                casts.append(pl.col(col).cast(pl.Utf8))
            elif col.endswith("D"):
                casts.append(pl.col(col).str.strptime(pl.Date))
        # Each expression rewrites only its own column, so one batched call is
        # equivalent to one with_columns() call per column.
        return df.with_columns(casts) if casts else df

    @staticmethod
    def _duration_to_days(delta):
        """Convert a duration expression to whole days.

        Newer polars releases name this accessor ``total_days``; older ones
        only have ``days``. Use whichever the installed version provides.
        """
        accessor = delta.dt
        if hasattr(accessor, "total_days"):
            return accessor.total_days()
        return accessor.days()

    @staticmethod
    def handle_dates(df: pl.DataFrame):
        """Replace every ``*D`` column by its day offset from ``date_decision``.

        Afterwards the ``date_decision`` and ``MONTH`` columns are dropped.
        """
        offsets = [
            # The result keeps the name of the left operand, i.e. `col`.
            Pipeline._duration_to_days(pl.col(col) - pl.col("date_decision"))
            for col in df.columns
            if col.endswith("D")
        ]
        if offsets:
            df = df.with_columns(offsets)
        return df.drop("date_decision", "MONTH")

    @staticmethod
    def filter_cols(df: pl.DataFrame):
        """Drop sparse columns and uninformative string columns.

        ``target``, ``case_id`` and ``WEEK_NUM`` are never dropped. Of the
        rest, a column is removed when more than 70% of its values are null,
        or when it is a ``Utf8`` column with exactly 1 or more than 200
        distinct values.
        """
        protected = ("target", "case_id", "WEEK_NUM")

        too_sparse = []
        for col in df.columns:
            if col in protected:
                continue
            null_ratio = df[col].is_null().mean()
            # The mean of an empty column is None; treat that as "not sparse".
            if null_ratio is not None and null_ratio > 0.7:
                too_sparse.append(col)
        if too_sparse:
            df = df.drop(too_sparse)

        bad_cardinality = []
        for col in df.columns:
            if col in protected or df[col].dtype != pl.Utf8:
                continue
            freq = df[col].n_unique()
            if freq == 1 or freq > 200:
                bad_cardinality.append(col)
        if bad_cardinality:
            df = df.drop(bad_cardinality)
        return df


def feature_eng_base(df_base: pl.DataFrame):
    """Add calendar features derived from ``date_decision``.

    Two columns are appended: ``month_decision`` (month of the decision date)
    and ``weekday_decision`` (its weekday). The input columns are kept.
    """
    decision_date = pl.col("date_decision")
    return df_base.with_columns(
        decision_date.dt.month().alias("month_decision"),
        decision_date.dt.weekday().alias("weekday_decision"),
    )


def read_file(file_name: str, depth: int | None = None, is_train: bool = True):
    dir = Config.train_dir if is_train else Config.test_dir
    n_rows = Config.debug_size if Config.is_debug else None
    chunks = []
    for path in dir.glob("*_" + file_name + "*.parquet"):
        df = pl.read_parquet(path, n_rows=n_rows)
        df: pl.DataFrame = df.pipe(Pipeline.set_table_dtypes)
        if depth in [1, 2]:
            df = df.group_by("case_id").agg(Aggregator.all_exprs(df))
        chunks.append(df)
    df = pl.concat(chunks, how="vertical_relaxed")
    df = df.unique(subset=["case_id"])
    return df


def preprocessing(is_train: bool = True):
    """Build the modelling frame: base table plus every table in ``Config.file_names``.

    The base table is loaded and extended by ``feature_eng_base``; each further
    table is loaded with the depth given by the last character of its name and
    left-joined on ``case_id``. Clashing column names from the i-th table get
    the suffix ``_<i>``. Finally ``Pipeline.handle_dates`` is applied.

    Args:
        is_train: Forwarded to every ``read_file`` call, so it selects whether
            the train or the test tables are read.

    Returns:
        The joined frame.
    """
    df_base = feature_eng_base(read_file("base", is_train=is_train))

    for i, file_name in enumerate(Config.file_names):
        depth = int(file_name[-1])
        df = read_file(file_name, depth, is_train=is_train)
        df_base = df_base.join(df, how="left", on="case_id", suffix=f"_{i}")

    return df_base.pipe(Pipeline.handle_dates)


In [18]:
def to_pandas(df_data: pl.DataFrame, cat_cols=None):
    """Convert to pandas and turn the categorical columns into ``category`` dtype.

    Args:
        df_data: Frame exposing ``to_pandas()``.
        cat_cols: Columns to convert. When ``None``, every ``object`` column
            of the converted frame is used.

    Returns:
        ``(frame, cat_cols)``: the pandas frame and the list of converted columns.
    """
    frame = df_data.to_pandas()
    if cat_cols is None:
        cat_cols = frame.select_dtypes("object").columns.tolist()
    frame[cat_cols] = frame[cat_cols].astype("category")
    return frame, cat_cols


def gini_stability(base: pd.DataFrame):
    """Stability score computed from the per-week Gini of ``score`` against ``target``.

    For every ``WEEK_NUM`` (ascending) the Gini ``2 * AUC - 1`` is computed. A
    straight line is fitted through the weekly values, indexed 0..n-1, and the
    result is::

        mean(gini) + 88.0 * min(0, slope) - 0.5 * std(residuals)

    Args:
        base: Frame with ``WEEK_NUM``, ``target`` and ``score`` columns.

    Returns:
        The stability score as a float.
    """
    falling_rate_weight = 88.0
    residual_std_weight = -0.5

    def _weekly_gini(week):
        return 2 * roc_auc_score(week["target"], week["score"]) - 1

    ordered = base.loc[:, ["WEEK_NUM", "target", "score"]].sort_values("WEEK_NUM")
    weekly_gini = ordered.groupby("WEEK_NUM")[["target", "score"]].apply(_weekly_gini).tolist()

    week_idx = np.arange(len(weekly_gini))
    slope, intercept = np.polyfit(week_idx, weekly_gini, 1)
    fitted = slope * week_idx + intercept
    residual_std = np.std(weekly_gini - fitted)
    mean_gini = np.mean(weekly_gini)
    return mean_gini + falling_rate_weight * min(0, slope) + residual_std_weight * residual_std

In [None]:
# Train one LightGBM model per cross-validation fold, collect out-of-fold
# predictions in `base["score"]`, and report the stability metric per fold and overall.
Config.model_dir.mkdir(parents=True, exist_ok=True)

df, _ = to_pandas(preprocessing())
kf = StratifiedGroupKFold(n_splits=2)

# .copy(): `base` gets a new column and is written to below; without the copy it
# is a slice of `df` and the assignments trigger SettingWithCopy behaviour.
base = df[["case_id", "WEEK_NUM", "target"]].copy()
# Float from the start so the predicted probabilities are stored without a dtype change.
base["score"] = 0.0
score_pos = base.columns.get_loc("score")

X = df.drop(["case_id", "WEEK_NUM", "target"], axis=1)
y = df["target"]

# Rows are grouped by WEEK_NUM, so a given week never appears in both train and validation.
for i, (train_index, valid_index) in enumerate(kf.split(X, y, groups=base["WEEK_NUM"])):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_valid, y_valid = X.iloc[valid_index], y.iloc[valid_index]

    train_ds = lgb.Dataset(X_train, y_train)
    valid_ds = lgb.Dataset(X_valid, y_valid)

    # NOTE(review): no early-stopping callback is passed, yet `best_iteration` is
    # read below. Confirm whether early stopping was intended here.
    gbm = lgb.train(
        Config.params,
        train_ds,
        valid_sets=[train_ds, valid_ds],
    )

    y_pred = gbm.predict(X_valid, num_iteration=gbm.best_iteration)
    # The split indices are positional, so write with .iloc (matching the
    # .iloc read on the next line) rather than label-based .loc.
    base.iloc[valid_index, score_pos] = y_pred
    print(f"stability score fold {i}: {gini_stability(base.iloc[valid_index])}")

    gbm.save_model(str(Config.model_dir / f"gbm_{i}.txt"))
print(f"stability score: {gini_stability(base)}")
