In [1]:
import json
from typing import Dict

import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.metrics import log_loss, auc, roc_auc_score

# Opt in to pandas' future behaviour flag so dtype downcasting is never applied silently.
pd.set_option('future.no_silent_downcasting', True)


class XGBoost:
    """Pairs an ``xgb.Booster`` with the per-feature preprocessing it expects.

    ``transformations_by_feature`` maps a column name to a dict with two keys:

    * ``'type'`` -- ``'onehot'``, ``'target_encoding'``, ``'binning'`` or
      ``'standardization'``; any other value leaves the column untouched.
    * ``'properties'`` -- the parameters of that transformation (``vocab``;
      ``value``/``encoded``; ``boundaries``; ``mean``/``stddev``).

    Frames are handed to xgboost as ``df.values`` (no feature names), so the column
    *order* produced by preprocessing is part of the model contract.
    """

    def __init__(self,
                 transformations_by_feature: Dict[str, object] = None):
        self.transformations_by_feature = transformations_by_feature
        self.model = None

    @staticmethod
    def _onehot(values: pd.Series, vocab) -> pd.DataFrame:
        """Return a 0/1 float frame with one positional column (0..n-1) per vocab entry.

        The result carries ``values.index`` so it lines up row by row with the source
        frame even when that frame does not use a default RangeIndex.
        """
        encoded = np.zeros((len(values), len(vocab)))
        for position, token in enumerate(vocab):
            hits = np.asarray((values == token).fillna(False), dtype=bool)
            encoded[hits, position] = 1
        return pd.DataFrame(encoded, index=values.index)

    @staticmethod
    def _bin(values: pd.Series, boundaries) -> pd.Series:
        """Replace each value by the index of the half-open bin ``[edge_i, edge_i+1)``.

        Bin 0 is everything below the first boundary; the last bin is everything at or
        above the last boundary. Missing values stay missing.
        """
        raw = values.astype(float).to_numpy()
        edges = np.asarray(boundaries, dtype=float)
        # np.digitize(right=False) implements exactly edges[i-1] <= x < edges[i].
        binned = np.digitize(raw, edges).astype(float)
        binned[np.isnan(raw)] = np.nan
        return pd.Series(binned, index=values.index, name=values.name)

    @staticmethod
    def _store(df: pd.DataFrame, col: str, values: pd.Series, move_to_end: bool) -> pd.DataFrame:
        """Write ``values`` as column ``col``, either in place or re-appended as the last column."""
        if move_to_end:
            df = df.drop(columns=[col])
        df[col] = values
        return df

    def _apply_transformations(self, df: pd.DataFrame, move_to_end: bool) -> pd.DataFrame:
        """Apply every configured transformation and return a new frame.

        ``move_to_end`` selects the column layout: when False, target-encoded and
        standardized columns keep their position; when True they are dropped and
        re-appended as the last column. One-hot columns are always appended at the end
        and binned columns always stay in place.
        """
        # Shallow copy: all writes below replace whole columns, so the caller's frame
        # is never modified (repeated fit() calls must not double-transform it).
        df = df.copy(deep=False)
        for col, transformation in (self.transformations_by_feature or {}).items():
            kind = transformation['type']
            prop = transformation['properties']

            if kind == 'onehot':
                onehot = self._onehot(df[col], prop['vocab'])
                df = pd.concat([df.drop(columns=[col]), onehot], axis=1)
            elif kind == 'target_encoding':
                encoding_dict = dict(zip(prop['value'], prop['encoded']))
                # Values absent from the mapping become missing.
                df = self._store(df, col, df[col].map(encoding_dict.get), move_to_end)
            elif kind == 'binning':
                # NOTE(review): this used to wrap the edges in a nested list, so the
                # binning loop never executed and the column was passed through raw.
                # A booster trained against that no-op must be retrained (or the
                # 'binning' entries removed from its transformation file).
                df[col] = self._bin(df[col], prop['boundaries'])
            elif kind == 'standardization':
                standardized = (df[col] - prop['mean']) / prop['stddev']
                df = self._store(df, col, standardized, move_to_end)
            # Any other transformation type intentionally leaves the column as is.
        return df

    def _preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        """Preprocess a training/validation frame (transformed columns keep their position)."""
        return self._apply_transformations(df, move_to_end=False)

    def fit(self, df: pd.DataFrame, label_array: np.ndarray,
            val_df: pd.DataFrame, val_label_array: np.ndarray):
        """Train a new booster, or refresh the leaves of the existing one.

        With no model yet, trains up to 400 rounds with early stopping. With a model
        already present, re-runs its existing number of rounds using the ``refresh``
        updater (``process_type='update'``) on the new data.
        """
        print('Preprocessing...')
        df = self._preprocess(df)
        val_df = self._preprocess(val_df)

        print('Fitting...')
        train_mat = xgb.DMatrix(df.values, label_array)
        val_mat = xgb.DMatrix(val_df.values, val_label_array)
        evals = [(train_mat, 'train'), (val_mat, 'eval')]

        # negative : positive = 30 : 1
        base_param = {
            'learning_rate': 0.1,
            'tree_method': 'exact',
            'refresh_leaf': True,
            'max_depth': 5,
            'gamma': 0.6,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'objective': 'binary:logistic',
            'eval_metric': ['logloss', 'auc'],
            'scale_pos_weight': 30,
            'reg_lambda': 3
        }
        update_param = base_param | {'updater': 'refresh', 'process_type': 'update'}
        params = base_param if self.model is None else update_param

        boosting_rounds = 400 if self.model is None else self.model.num_boosted_rounds()
        self.model = xgb.train(
            params,
            dtrain=train_mat,
            evals=evals,
            num_boost_round=boosting_rounds,
            early_stopping_rounds=100,
            xgb_model=self.model,
        )

    def _preprocess_predict(self, df: pd.DataFrame):
        """Preprocess an inference frame (target-encoded/standardized columns move to the end)."""
        return self._apply_transformations(df, move_to_end=True)

    def predict(self, df_without_label: pd.DataFrame, label_array: np.ndarray = None):
        """Score ``df_without_label`` in batches of 4096 rows.

        Returns ``(pred, loss, auroc)``; ``loss`` (log loss) and ``auroc`` are ``None``
        unless ``label_array`` is given.
        """
        batch_size = 4096

        preds = []
        # Slice, preprocess and score one batch at a time so only a single batch copy
        # is alive at any moment.
        for start in range(0, len(df_without_label), batch_size):
            chunk = df_without_label[start:start + batch_size].reset_index(drop=True)
            chunk = self._preprocess_predict(chunk)
            preds.append(self.model.predict(xgb.DMatrix(chunk.values)))

        # An empty input yields no batches; np.concatenate rejects an empty list.
        pred = np.concatenate(preds) if preds else np.array([])

        loss, auroc = None, None
        if label_array is not None:
            loss = log_loss(label_array, pred)
            auroc = roc_auc_score(label_array, pred)

        return pred, loss, auroc

    def save(self, output_model_path: str, output_transformation_path: str):
        """Write the booster and the transformation spec (as JSON) to the given paths."""
        self.model.save_model(output_model_path)
        with open(output_transformation_path, 'w') as fd:
            json.dump(self.transformations_by_feature, fd)

    def load(self, input_model_path: str, input_transformation_path: str):
        """Load a booster and its JSON transformation spec, replacing any current state."""
        self.model = xgb.Booster()
        self.model.load_model(input_model_path)

        with open(input_transformation_path, 'r') as fd:
            self.transformations_by_feature = json.load(fd)



In [2]:
import polars as pl
import pyarrow.parquet as pq
from glob import glob

def read_file(path, depth=None, columns=None):
    """Read one parquet file, normalise its dtypes and optionally aggregate it.

    For ``depth`` 1 or 2 the table is grouped by ``case_id`` and reduced with the
    expressions from ``Aggregator.get_exprs(table, columns)``; for any other depth the
    typed table is returned as is.
    """
    table = Pipeline.set_table_dtypes(pl.read_parquet(path))
    if depth not in (1, 2):
        return table
    return table.group_by("case_id").agg(Aggregator.get_exprs(table, columns))


def read_files(regex_path, depth=None, columns=None):
    """Read every parquet file matching a glob pattern and stack them into one table.

    Despite its name, ``regex_path`` is a glob pattern (it is passed to ``glob``).
    Each file is loaded through ``read_file`` (same dtype handling and optional
    per-``case_id`` aggregation), the pieces are concatenated with
    ``how="vertical_relaxed"`` and one row per ``case_id`` is kept.

    Raises:
        ValueError: if no file matches the pattern.
    """
    # Sort so the concatenation order does not depend on filesystem listing order.
    paths = sorted(glob(str(regex_path)))
    if not paths:
        # pl.concat would also raise ValueError here, but without naming the pattern.
        raise ValueError(f"no parquet files match pattern: {regex_path}")

    chunks = [read_file(path, depth, columns) for path in paths]
    df = pl.concat(chunks, how="vertical_relaxed")
    return df.unique(subset=["case_id"])

In [3]:
def to_pandas(df_data):
    """Return the result of calling ``df_data.to_pandas()``."""
    converted = df_data.to_pandas()
    return converted

In [4]:
def feature_eng(df_base, depth_0, depth_1, depth_2):
    """Build the modelling table from the base table and its side tables.

    Adds ``month_decision`` and ``weekday_decision`` derived from ``date_decision``,
    left-joins every table in ``depth_0 + depth_1 + depth_2`` on ``case_id`` (name
    clashes get the suffix ``_<position>``), then hands the result to
    ``Pipeline.handle_dates``.
    """
    decision_date = pl.col("date_decision")
    joined = df_base.with_columns(
        month_decision=decision_date.dt.month(),
        weekday_decision=decision_date.dt.weekday(),
    )

    side_tables = depth_0 + depth_1 + depth_2
    for position, side in enumerate(side_tables):
        joined = joined.join(side, how="left", on="case_id", suffix=f"_{position}")

    return Pipeline.handle_dates(joined)

In [5]:
def reduce_mem_usage(df):
    """Downcast the columns of a pandas DataFrame in place to reduce memory usage.

    * ``category`` and ``object`` columns are skipped.
    * Signed integer columns get the smallest signed integer type that holds their
      min/max; unsigned integer columns get the smallest unsigned type. Integer values
      are therefore never changed.
    * Every other column goes through the float path: float16 if its range fits
      strictly inside float16, else float32, else float64. float16/float32 may round
      values.

    Prints the memory usage before and after, and returns the same (mutated) frame.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        type_name = str(col_type)
        if type_name == "category" or col_type == object:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if type_name.startswith(('int', 'uint')):
            # Unsigned columns used to fall through to the float path, where float16
            # silently rounds integers above 2048; keep them integral instead.
            candidates = unsigned_types if type_name.startswith('uint') else signed_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value exactly at a type's limit still fits.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
            df[col] = df[col].astype(np.float16)
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
            df[col] = df[col].astype(np.float32)
        else:
            df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [6]:
import numpy as np

class Pipeline:
    """Stateless polars helpers for typing, date handling and column filtering.

    All methods are static: they take a polars DataFrame and return a new one.
    """

    @staticmethod
    def set_table_dtypes(df):
        """Cast columns to a fixed dtype chosen from the column name.

        ``case_id``/``WEEK_NUM``/``num_group1``/``num_group2`` -> Int64,
        ``date_decision`` and names ending in ``D`` -> Date, names ending in ``P`` or
        ``A`` -> Float64, names ending in ``M`` -> String. Other columns are untouched.
        """
        casts = []
        for col in df.columns:
            if col in ("case_id", "WEEK_NUM", "num_group1", "num_group2"):
                casts.append(pl.col(col).cast(pl.Int64))
            elif col == "date_decision":
                casts.append(pl.col(col).cast(pl.Date))
            elif col.endswith(("P", "A")):
                casts.append(pl.col(col).cast(pl.Float64))
            elif col.endswith("M"):
                casts.append(pl.col(col).cast(pl.String))
            elif col.endswith("D"):
                casts.append(pl.col(col).cast(pl.Date))
        # The casts are independent of each other, so apply them in a single pass.
        return df.with_columns(casts) if casts else df

    @staticmethod
    def handle_dates(df):
        """Turn every ``*D`` column into a day offset from ``date_decision``.

        Each column whose name ends in ``D`` is replaced by
        ``(column - date_decision)`` expressed in whole days; ``date_decision`` and
        ``MONTH`` are then dropped.
        """
        date_cols = [col for col in df.columns if col.endswith("D")]
        if date_cols:
            df = df.with_columns(
                [(pl.col(col) - pl.col("date_decision")).dt.total_days() for col in date_cols]
            )
        return df.drop("date_decision", "MONTH")

    @staticmethod
    def filter_cols(df):
        """Drop sparse columns and uninformative string columns.

        ``target``, ``case_id`` and ``WEEK_NUM`` are always kept. Among the rest, a
        column is dropped when more than 70% of its values are null; afterwards a
        String column is dropped when it has exactly 1 or more than 200 distinct
        values.
        """
        protected = ("target", "case_id", "WEEK_NUM")

        too_sparse = [
            col for col in df.columns
            if col not in protected and df[col].is_null().mean() > 0.7
        ]
        if too_sparse:
            df = df.drop(too_sparse)

        uninformative = []
        for col in df.columns:
            if col not in protected and df[col].dtype == pl.String:
                freq = df[col].n_unique()
                if freq == 1 or freq > 200:
                    uninformative.append(col)
        if uninformative:
            df = df.drop(uninformative)

        return df


class Aggregator:
    """Builds the polars aggregation expressions applied per ``case_id`` group.

    Columns are picked by the last character of their name (or by containing
    ``num_group``); each picked column yields aliased expressions such as
    ``max_<col>``. When ``columns`` is given, only expressions whose output name is a
    prefix of (or equal to) one of those names are kept.
    """
    # Please add or subtract features yourself, be aware that too many features will take up too much space.

    @staticmethod
    def _cols_ending_with(df, suffixes):
        """Return the column names of ``df`` that end with one of ``suffixes``."""
        return [col for col in df.columns if col.endswith(suffixes)]

    @staticmethod
    def _aggregate(agg, prefix, target_cols, cols):
        """Build ``agg(col)`` aliased ``<prefix>_<col>`` per target column, then filter by ``cols``."""
        exprs = [agg(col).alias(f"{prefix}_{col}") for col in target_cols]
        return Aggregator.filter_columns(exprs, cols)

    @staticmethod
    def num_expr(df, columns=None):
        """max/min/last/mean expressions for columns ending in ``P`` or ``A``."""
        cols = Aggregator._cols_ending_with(df, ("P", "A"))
        return (
            Aggregator.get_max_cols(df, cols, columns)
            + Aggregator.get_min_cols(df, cols, columns)
            + Aggregator.get_last_cols(df, cols, columns)
            + Aggregator.get_mean_cols(df, cols, columns)
        )

    @staticmethod
    def date_expr(df, columns=None):
        """max/min/last/mean expressions for columns ending in ``D``."""
        cols = Aggregator._cols_ending_with(df, ("D",))
        return (
            Aggregator.get_max_cols(df, cols, columns)
            + Aggregator.get_min_cols(df, cols, columns)
            + Aggregator.get_last_cols(df, cols, columns)
            + Aggregator.get_mean_cols(df, cols, columns)
        )

    @staticmethod
    def str_expr(df, columns=None):
        """max/min/last/count expressions for columns ending in ``M``."""
        cols = Aggregator._cols_ending_with(df, ("M",))
        return (
            Aggregator.get_max_cols(df, cols, columns)
            + Aggregator.get_min_cols(df, cols, columns)
            + Aggregator.get_last_cols(df, cols, columns)
            + Aggregator.get_count_cols(df, cols, columns)
        )

    @staticmethod
    def other_expr(df, columns=None):
        """max/min/last expressions for columns ending in ``T`` or ``L``."""
        cols = Aggregator._cols_ending_with(df, ("T", "L"))
        return (
            Aggregator.get_max_cols(df, cols, columns)
            + Aggregator.get_min_cols(df, cols, columns)
            + Aggregator.get_last_cols(df, cols, columns)
        )

    @staticmethod
    def count_expr(df, columns=None):
        """max/min/last expressions for columns whose name contains ``num_group``."""
        cols = [col for col in df.columns if "num_group" in col]
        return (
            Aggregator.get_max_cols(df, cols, columns)
            + Aggregator.get_min_cols(df, cols, columns)
            + Aggregator.get_last_cols(df, cols, columns)
        )

    @staticmethod
    def filter_columns(dfs, columns):
        """Keep the expressions in ``dfs`` that are needed to produce ``columns``.

        ``dfs`` is a list of polars expressions (not DataFrames, despite the name).
        With ``columns`` empty or ``None`` everything is kept. Otherwise an expression
        is kept when its output name equals, or is a prefix of, a requested name.
        """
        if columns is None or len(columns) == 0:
            return dfs

        kept = []
        for expr in dfs:
            name = expr.meta.output_name()
            if name in columns or any(col.startswith(name) for col in columns):
                kept.append(expr)
        return kept

    @staticmethod
    def get_exprs(df, columns):
        """All aggregation expressions for ``df``: numeric, date, string, other, then count."""
        return (
            Aggregator.num_expr(df, columns)
            + Aggregator.date_expr(df, columns)
            + Aggregator.str_expr(df, columns)
            + Aggregator.other_expr(df, columns)
            + Aggregator.count_expr(df, columns)
        )

    # The ``df`` parameter of the builders below is unused; it is kept so existing
    # positional calls keep working.

    @staticmethod
    def get_max_cols(df, target_cols, cols):
        """``pl.max`` per target column, aliased ``max_<col>``."""
        return Aggregator._aggregate(pl.max, "max", target_cols, cols)

    @staticmethod
    def get_min_cols(df, target_cols, cols):
        """``pl.min`` per target column, aliased ``min_<col>``."""
        return Aggregator._aggregate(pl.min, "min", target_cols, cols)

    @staticmethod
    def get_last_cols(df, target_cols, cols):
        """``pl.last`` per target column, aliased ``last_<col>``."""
        return Aggregator._aggregate(pl.last, "last", target_cols, cols)

    @staticmethod
    def get_count_cols(df, target_cols, cols):
        """``pl.count`` per target column, aliased ``count_<col>``."""
        return Aggregator._aggregate(pl.count, "count", target_cols, cols)

    @staticmethod
    def get_first_cols(df, target_cols, cols):
        """``pl.first`` per target column, aliased ``first_<col>``."""
        return Aggregator._aggregate(pl.first, "first", target_cols, cols)

    @staticmethod
    def get_mean_cols(df, target_cols, cols):
        """``pl.mean`` per target column, aliased ``mean_<col>``."""
        return Aggregator._aggregate(pl.mean, "mean", target_cols, cols)

In [10]:
import gc
from pathlib import Path

import os
import kagglehub

# Attach the published model to this notebook. The files are then read from MODEL_DIR,
# which pins version 5 of the model explicitly.
path = kagglehub.model_download("josh9191/homecredit_xgboost/other/xgboost")
MODEL_DIR = Path("/kaggle/input/homecredit_xgboost/other/xgboost/5")
model_path = str(MODEL_DIR / "best_model")
preprocess_json_path = str(MODEL_DIR / "preprocess.json")

xgboost_model = XGBoost()
xgboost_model.load(model_path, preprocess_json_path)

# The features the model was saved with; also used to limit which aggregates are built.
columns = list(xgboost_model.transformations_by_feature.keys())

ROOT            = Path("/kaggle/input/home-credit-credit-risk-model-stability")
TRAIN_DIR       = ROOT / "parquet_files" / "train"
TEST_DIR        = ROOT / "parquet_files" / "test"

# Keys match the parameter names of feature_eng so the dict can be splatted into it.
# To load the training split instead, point the same calls at TRAIN_DIR with the
# "train_" file prefix.
data_store = {
    "df_base": read_file(TEST_DIR / "test_base.parquet", None, columns),
    "depth_0": [
        read_file(TEST_DIR / "test_static_cb_0.parquet", None, columns),
        read_files(TEST_DIR / "test_static_0_*.parquet", None, columns),
    ],
    "depth_1": [
        read_files(TEST_DIR / "test_applprev_1_*.parquet", 1, columns),
        read_file(TEST_DIR / "test_tax_registry_a_1.parquet", 1, columns),
        read_file(TEST_DIR / "test_tax_registry_b_1.parquet", 1, columns),
        read_file(TEST_DIR / "test_tax_registry_c_1.parquet", 1, columns),
        read_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1, columns),
        read_file(TEST_DIR / "test_credit_bureau_b_1.parquet", 1, columns),
        read_file(TEST_DIR / "test_other_1.parquet", 1, columns),
        read_file(TEST_DIR / "test_person_1.parquet", 1, columns),
        read_file(TEST_DIR / "test_deposit_1.parquet", 1, columns),
        read_file(TEST_DIR / "test_debitcard_1.parquet", 1, columns),
    ],
    "depth_2": [
        read_file(TEST_DIR / "test_credit_bureau_b_2.parquet", 2, columns),
        read_files(TEST_DIR / "test_credit_bureau_a_2_*.parquet", 2, columns),
        read_file(TEST_DIR / "test_applprev_2.parquet", 2, columns),
        read_file(TEST_DIR / "test_person_2.parquet", 2, columns)
    ]
}

df_test = feature_eng(**data_store)

print("test data shape:\t", df_test.shape)
# The per-table frames are no longer needed once joined; release them before converting.
del data_store
gc.collect()

# drop=True: keeping the old index would only add a meaningless "index" column.
df_test = to_pandas(df_test).reset_index(drop=True)
df_test = reduce_mem_usage(df_test)

Attaching model 'josh9191/homecredit_xgboost/other/xgboost' to your Kaggle notebook...


test data shape:	 (10, 334)
Memory usage of dataframe is 0.03 MB
Memory usage after optimization is: 0.01 MB
Decreased by 48.4%


In [11]:
# Keep only the id plus the model's features, then release the wide frame.
df_test_selected = df_test.loc[:, ['case_id', *columns]].copy()
del df_test
gc.collect()

case_id = df_test_selected['case_id']
features_only = df_test_selected.drop(columns=['case_id'])
preds, _, _ = xgboost_model.predict(features_only)

# NaN predictions fall back to 0.3; every score is clipped into [0, 1].
score = np.clip(np.nan_to_num(preds, nan=0.3), 0, 1)
submission = (
    pd.DataFrame({"case_id": case_id, "score": score})
    .groupby('case_id')
    .mean()
)
submission.to_csv("./submission.csv")