In [None]:
! pip install tensorflow keras --upgrade --no-index --find-links /kaggle/input/package

In [8]:
import json
from typing import Dict

import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.metrics import log_loss, auc, roc_auc_score

pd.set_option('future.no_silent_downcasting', True)


class XGBoost:
    """XGBoost binary classifier bundled with its per-feature preprocessing.

    ``transformations_by_feature`` maps a column name to a dict with two keys,
    ``'type'`` and ``'properties'``. Supported types and the properties read:

    * ``'onehot'`` -- ``vocab``: one 0/1 column per vocabulary entry; the
      source column is dropped and the new columns are appended at the end.
    * ``'target_encoding'`` -- ``value`` / ``encoded``: parallel lists mapping
      raw values to their encodings (values not listed map to None/NaN).
    * ``'binning'`` -- ``boundaries``: ascending inner bin edges; each value
      is replaced by the index of the bin it falls in.
    * ``'standardization'`` -- ``mean`` / ``stddev``.

    Any other type leaves the column untouched. ``None`` (the default) means
    "no transformations".
    """

    def __init__(self,
                 transformations_by_feature: Dict[str, object] = None):
        self.transformations_by_feature = transformations_by_feature
        self.model = None

    @staticmethod
    def _apply_onehot(df: pd.DataFrame, col: str, vocab) -> pd.DataFrame:
        """Replace ``col`` with one float 0/1 column per entry of ``vocab``."""
        flags = np.zeros((len(df), len(vocab)))
        for i, token in enumerate(vocab):
            # Positional mask: independent of whatever index labels df carries.
            flags[:, i] = (df[col] == token).to_numpy(dtype=bool, na_value=False)
        # Share df's index so concat aligns row by row, and use unique labels
        # so several one-hot features do not produce duplicate column names.
        onehot = pd.DataFrame(
            flags,
            index=df.index,
            columns=[f'{col}_onehot_{i}' for i in range(len(vocab))],
        )
        return pd.concat([df.drop(columns=[col]), onehot], axis=1)

    @staticmethod
    def _bin_indices(values: pd.Series, boundaries) -> np.ndarray:
        """Return the bin index of every value; missing values stay NaN.

        With inner edges ``b0 < b1 < ...`` bin 0 is ``(-inf, b0)``, bin ``k``
        is ``[b(k-1), bk)`` and the last bin is ``[b(last), +inf)``.
        """
        raw = values.to_numpy(dtype=float, na_value=np.nan)
        edges = np.asarray(boundaries, dtype=float)
        # side='right' counts the edges <= value, i.e. lower-inclusive bins.
        # All indices come from the original values in one pass, so a freshly
        # written bin index can never be binned a second time.
        binned = np.searchsorted(edges, raw, side='right').astype(float)
        binned[np.isnan(raw)] = np.nan
        return binned

    def _preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply every configured transformation and return a new frame.

        The input frame is left unmodified. Column order is preserved except
        for one-hot features, whose source column is removed and whose
        indicator columns are appended at the end (the booster consumes
        columns positionally via ``.values``).
        """
        # Copy first: the column assignments below would otherwise write into
        # the caller's frame, so preprocessing the same frame twice would
        # transform it twice.
        df = df.copy()
        for col, transformation in (self.transformations_by_feature or {}).items():
            kind = transformation['type']
            prop = transformation['properties']

            if kind == 'onehot':
                df = self._apply_onehot(df, col, prop['vocab'])
            elif kind == 'target_encoding':
                encoding_dict = dict(zip(prop['value'], prop['encoded']))
                df[col] = df[col].map(encoding_dict.get)
            elif kind == 'binning':
                # NOTE(review): a booster fitted on un-binned values for this
                # column is not compatible with binned inputs -- refit it or
                # remove the 'binning' entry from its transformation file.
                df[col] = self._bin_indices(df[col], prop['boundaries'])
            elif kind == 'standardization':
                df[col] = (df[col] - prop['mean']) / prop['stddev']
            # Any other type: the column is passed through unchanged.

        return df

    def fit(self, df: pd.DataFrame, label_array: np.ndarray,
        val_df: pd.DataFrame, val_label_array: np.ndarray):
        """Train the booster, or update the one already held in ``self.model``.

        Args:
            df: training features (raw; preprocessed here).
            label_array: training labels.
            val_df: validation features (raw; preprocessed here).
            val_label_array: validation labels.

        Both sets are passed as ``evals``; training runs for up to 400 rounds
        with ``early_stopping_rounds=100``. When ``self.model`` is already
        set, it is handed to ``xgb.train`` as ``xgb_model`` together with
        ``process_type='update'`` / ``updater='refresh'``.
        """
        print('Preprocessing...')
        df = self._preprocess(df)
        val_df = self._preprocess(val_df)

        print(df.head(50))

        print('Fitting...')
        train_mat = xgb.DMatrix(df.values, label_array)
        val_mat = xgb.DMatrix(val_df.values, val_label_array)
        evals = [(train_mat, 'train'), (val_mat, 'eval')]

        # negative : positive = 30 : 1
        base_param = {
            'learning_rate': 0.1,
            'tree_method': 'exact',
            'refresh_leaf': True,
            'max_depth': 5,
            'gamma': 0.6,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'objective': 'binary:logistic',
            'eval_metric': ['logloss', 'auc'],
            'scale_pos_weight': 30,
            'reg_lambda': 3
        }
        update_param = base_param | {'updater': 'refresh', 'process_type': 'update'}
        params = base_param if self.model is None else update_param
        self.model = xgb.train(params, dtrain=train_mat, evals=evals, num_boost_round=400, early_stopping_rounds=100, xgb_model=self.model)

    def _preprocess_predict(self, df: pd.DataFrame):
        """Preprocess inference data; identical to the training-time path."""
        # Single implementation on purpose: training and inference must not
        # drift apart.
        return self._preprocess(df)

    def predict(self, df_without_label: pd.DataFrame, label_array: np.ndarray = None):
        """Score ``df_without_label`` with the current booster.

        Returns:
            ``(pred, loss, auroc)``. ``loss`` (log loss) and ``auroc``
            (ROC AUC) are computed only when ``label_array`` is given and are
            ``None`` otherwise.
        """
        df_without_label = self._preprocess_predict(df_without_label)
        test_mat = xgb.DMatrix(df_without_label.values)
        pred = self.model.predict(test_mat)

        loss, auroc = None, None
        if label_array is not None:
            loss = log_loss(label_array, pred)
            auroc = roc_auc_score(label_array, pred)

        return pred, loss, auroc

    def save(self, output_model_path: str, output_transformation_path: str):
        """Write the booster and the transformation spec (as JSON) to disk."""
        self.model.save_model(output_model_path)
        with open(output_transformation_path, 'w') as fd:
            json.dump(self.transformations_by_feature, fd)


    def load(self, input_model_path: str, input_transformation_path: str):
        """Restore the booster and the transformation spec written by ``save``."""
        self.model = xgb.Booster()
        self.model.load_model(input_model_path)

        with open(input_transformation_path, 'r') as fd:
            self.transformations_by_feature = json.load(fd)



In [9]:
import polars as pl
from glob import glob

def read_file(path, depth=None):
    """Load a single parquet file and normalise its column dtypes.

    The frame is read with polars and passed through
    ``Pipeline.set_table_dtypes``. ``depth`` is accepted but currently unused:
    the per-``case_id`` aggregation (``Aggregator.get_exprs``) that would
    apply to depths 1 and 2 is disabled.
    """
    return pl.read_parquet(path).pipe(Pipeline.set_table_dtypes)

def read_files(regex_path, depth=None):
    """Load every parquet file matching a glob pattern as one de-duplicated frame.

    Each matching file is read with polars, passed through
    ``Pipeline.set_table_dtypes``, and the chunks are stacked with
    ``how="vertical_relaxed"``. Exactly one row per ``case_id`` is kept.

    Args:
        regex_path: glob pattern (str or path-like) selecting the files.
        depth: accepted but currently unused; the per-``case_id`` aggregation
            (``Aggregator.get_exprs``) it would control is disabled.

    Raises:
        ValueError: if no file matches ``regex_path``.
    """
    # glob() returns matches in arbitrary order; sort so the stacked row order
    # (and therefore the row kept per case_id) is the same on every run.
    paths = sorted(glob(str(regex_path)))
    if not paths:
        raise ValueError(f"no parquet files match pattern: {regex_path}")

    chunks = [pl.read_parquet(path).pipe(Pipeline.set_table_dtypes) for path in paths]
    df = pl.concat(chunks, how="vertical_relaxed")

    # The default keep="any" may retain a different row per case_id from run
    # to run; pin it to the first occurrence in file order.
    return df.unique(subset=["case_id"], keep="first", maintain_order=True)

In [10]:
def to_pandas(df_data):
    """Return whatever ``df_data.to_pandas()`` produces."""
    converted = df_data.to_pandas()
    return converted

In [31]:
def feature_eng(df_base, depth_0, depth_1, depth_2):
    """Assemble the feature table from the base frame and the side tables.

    Adds ``month_decision`` and ``weekday_decision`` (derived from
    ``date_decision``), left-joins every table of ``depth_0``, ``depth_1`` and
    ``depth_2`` (in that order) on ``case_id`` -- clashing column names get
    the suffix ``_<position>`` -- and finally applies
    ``Pipeline.handle_dates``.
    """
    decision_date = pl.col("date_decision")
    enriched = df_base.with_columns(
        month_decision=decision_date.dt.month(),
        weekday_decision=decision_date.dt.weekday(),
    )

    side_tables = depth_0 + depth_1 + depth_2
    for position, table in enumerate(side_tables):
        enriched = enriched.join(table, how="left", on="case_id", suffix=f"_{position}")

    return enriched.pipe(Pipeline.handle_dates)

In [35]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast numeric columns of ``df`` in place to reduce memory usage.

    * Integer columns (signed or unsigned) are cast to the smallest of
      int8/int16/int32/int64 that holds their min and max, but only when that
      type is narrower than the current one.
    * Float columns are cast to float16 or float32 when their range fits and
      the type is narrower. float16 keeps only about three significant
      decimal digits; pass ``use_float16=False`` to stop at float32.
    * Boolean columns are cast to floats (False/True become 0.0/1.0).
    * Everything else (object, category, datetime/timedelta, extension
      dtypes) and empty or all-NaN columns are left untouched.

    Args:
        df: pandas DataFrame; its columns are replaced in place.
        use_float16: allow float16 as a target type (default True).

    Returns:
        The same DataFrame object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain numpy integer / unsigned / float / bool columns are
        # candidates; comparing e.g. timestamps with numeric limits would raise.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iufb':
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            # Empty or all-NaN column: nothing to measure, keep the dtype.
            continue

        if col_type.kind in 'iu':
            # Unsigned columns stay integers; sending them through the float
            # branch could round their values.
            lo, hi = int(c_min), int(c_max)
            fitting = [t for t in int_types
                       if np.iinfo(t).min <= lo and hi <= np.iinfo(t).max]
        else:
            fitting = [t for t in float_types
                       if c_min > np.finfo(t).min and c_max < np.finfo(t).max]

        if not fitting:
            continue

        target = np.dtype(fitting[0])
        # Never widen a column; bools are the exception because they are
        # converted to floats regardless of size.
        if col_type.kind == 'b' or target.itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [36]:
import numpy as np

class Pipeline:
    """Stateless polars DataFrame helpers, usable as ``df.pipe(Pipeline.<fn>)``.

    All methods are static so they behave the same whether they are reached
    through the class or through an instance.
    """

    @staticmethod
    def set_table_dtypes(df):
        """Cast columns to fixed dtypes chosen from their names.

        ``case_id``, ``WEEK_NUM``, ``num_group1``, ``num_group2`` -> Int64;
        ``date_decision`` -> Date; names ending in ``P``/``A`` -> Float64,
        in ``M`` -> String, in ``D`` -> Date. Other columns are left as is.
        """
        casts = []
        for col in df.columns:
            if col in ("case_id", "WEEK_NUM", "num_group1", "num_group2"):
                casts.append(pl.col(col).cast(pl.Int64))
            elif col == "date_decision":
                casts.append(pl.col(col).cast(pl.Date))
            elif col.endswith(("P", "A")):
                casts.append(pl.col(col).cast(pl.Float64))
            elif col.endswith("M"):
                casts.append(pl.col(col).cast(pl.String))
            elif col.endswith("D"):
                casts.append(pl.col(col).cast(pl.Date))
        # The casts are independent of each other, so one with_columns call
        # replaces one call per column.
        return df.with_columns(casts)

    @staticmethod
    def handle_dates(df):
        """Turn ``*D`` date columns into day offsets and drop ``date_decision``.

        Every column whose name ends in ``D`` becomes the whole number of days
        between that date and ``date_decision``.
        """
        day_offsets = [
            (pl.col(col) - pl.col("date_decision")).dt.total_days()
            for col in df.columns
            if col.endswith("D")
        ]
        return df.with_columns(day_offsets).drop("date_decision")

    @staticmethod
    def filter_cols(df):
        """Drop sparse columns and uninformative string columns.

        ``target``, ``case_id`` and ``WEEK_NUM`` are always kept. Any other
        column is dropped when more than 70% of its values are null; any
        remaining String column is dropped when it has exactly one distinct
        value or more than 200.
        """
        protected = ("target", "case_id", "WEEK_NUM")

        for col in df.columns:
            if col not in protected and df[col].is_null().mean() > 0.7:
                df = df.drop(col)

        # df.columns is re-read here, so columns dropped above are not revisited.
        for col in df.columns:
            if col not in protected and df[col].dtype == pl.String:
                freq = df[col].n_unique()
                if freq == 1 or freq > 200:
                    df = df.drop(col)

        return df
    
class Aggregator:
    """Builders for lists of polars aggregation expressions, grouped by column kind.

    Every builder returns expressions aliased ``<aggregation>_<column>``,
    ordered aggregation by aggregation (all ``max_*`` first, then ``last_*``,
    ...). Presumably meant for ``df.group_by("case_id").agg(...)`` -- confirm
    against the caller.

    Add or remove aggregations as needed; be aware that too many features
    will take up too much space.
    """

    @staticmethod
    def _columns_ending_with(df, suffixes):
        """Names of the columns of ``df`` whose last character is in ``suffixes``."""
        return [col for col in df.columns if col.endswith(suffixes)]

    @staticmethod
    def _build(cols, aggregations):
        """Apply each ``(prefix, function)`` pair to every column, pair by pair."""
        return [
            func(col).alias(f"{prefix}_{col}")
            for prefix, func in aggregations
            for col in cols
        ]

    @staticmethod
    def num_expr(df):
        """max / last / mean for columns ending in ``P`` or ``A``."""
        cols = Aggregator._columns_ending_with(df, ("P", "A"))
        return Aggregator._build(cols, (("max", pl.max), ("last", pl.last), ("mean", pl.mean)))

    @staticmethod
    def date_expr(df):
        """max / last / mean for columns ending in ``D``."""
        cols = Aggregator._columns_ending_with(df, ("D",))
        return Aggregator._build(cols, (("max", pl.max), ("last", pl.last), ("mean", pl.mean)))

    @staticmethod
    def str_expr(df):
        """max / last for columns ending in ``M``."""
        cols = Aggregator._columns_ending_with(df, ("M",))
        return Aggregator._build(cols, (("max", pl.max), ("last", pl.last)))

    @staticmethod
    def other_expr(df):
        """max / last for columns ending in ``T`` or ``L``."""
        cols = Aggregator._columns_ending_with(df, ("T", "L"))
        return Aggregator._build(cols, (("max", pl.max), ("last", pl.last)))

    @staticmethod
    def count_expr(df):
        """max / last for columns whose name contains ``num_group``."""
        cols = [col for col in df.columns if "num_group" in col]
        return Aggregator._build(cols, (("max", pl.max), ("last", pl.last)))

    @staticmethod
    def get_exprs(df):
        """All expressions: numeric, date, string, other, then count columns."""
        exprs = Aggregator.num_expr(df) + \
                Aggregator.date_expr(df) + \
                Aggregator.str_expr(df) + \
                Aggregator.other_expr(df) + \
                Aggregator.count_expr(df)

        return exprs

In [37]:
import gc
from pathlib import Path

# Location of the competition data; only the test parquet files are read here.
ROOT            = Path("/kaggle/input/home-credit-credit-risk-model-stability")

TEST_DIR        = ROOT / "parquet_files" / "test"

# Keys match the parameters of feature_eng(), which receives this dict via **.
# read_file() loads one parquet file; read_files() loads every file matching a
# glob pattern and keeps one row per case_id. The second positional argument
# (1 or 2) is the `depth` parameter of those helpers.
data_store = {
    "df_base": read_file(TEST_DIR / "test_base.parquet"),
    "depth_0": [
        read_file(TEST_DIR / "test_static_cb_0.parquet"),
        read_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_a_1.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_b_1.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_c_1.parquet", 1),
        read_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1),
        read_file(TEST_DIR / "test_credit_bureau_b_1.parquet", 1),
        read_file(TEST_DIR / "test_other_1.parquet", 1),
        read_file(TEST_DIR / "test_person_1.parquet", 1),
        read_file(TEST_DIR / "test_deposit_1.parquet", 1),
        read_file(TEST_DIR / "test_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        read_file(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
        read_files(TEST_DIR / "test_credit_bureau_a_2_*.parquet", 2),
        read_file(TEST_DIR / "test_applprev_2.parquet", 2),
        read_file(TEST_DIR / "test_person_2.parquet", 2)
    ]
}

# Join all tables onto the base frame (left joins on case_id).
df_test = feature_eng(**data_store)

gc.collect()
print("test data shape:\t", df_test.shape)
# The source tables are no longer needed once the joined frame exists.
del data_store

# Convert to pandas and collapse to a single row per case_id; case_id is
# restored as a regular column by reset_index(drop=False).
# NOTE(review): tables loaded with read_file() are not de-duplicated, so the
# joins can presumably yield several rows per case_id -- confirm.
df_test = to_pandas(df_test).groupby('case_id').first().reset_index(drop=False)
# Downcast numeric columns to smaller dtypes.
df_test = reduce_mem_usage(df_test)
# print(df_test.head())

gc.collect()

test data shape:	 (149, 487)
Memory usage of dataframe is 0.04 MB
Memory usage after optimization is: 0.02 MB
Decreased by 38.9%


0

In [39]:
import os
import kagglehub

# Attach the published model to the notebook. The returned `path` is not
# referenced again in this cell: the files are read from the hard-coded
# /kaggle/input locations below.
# NOTE(review): the "3" path component is presumably the model version -- confirm.
path = kagglehub.model_download("josh9191/homecredit_xgboost/other/xgboost")
model_path = "/kaggle/input/homecredit_xgboost/other/xgboost/3/xgboost.json"
preprocess_json_path = "/kaggle/input/homecredit_xgboost/other/xgboost/3/preprocess.json"

# Created without transformations; load() restores both the booster and the
# per-feature transformation spec from the two JSON files.
xgboost_model = XGBoost()
xgboost_model.load(model_path, preprocess_json_path)

# Columns selected from df_test for prediction. case_id is kept only to index
# the submission and is dropped before the frame reaches the model. The model
# receives the remaining columns positionally (via .values), so the order here
# matters.
# NOTE(review): presumably this must match the training-time column order -- confirm.
columns=["case_id", "pctinstlsallpaidlate1d_3546856L", "pmts_dpd_1073P", "numberofoverdueinstlmax_1151L", "overdueamountmax2_398A", "pmts_dpd_303P", "pmts_overdue_1140A", "maxdpdtolerance_577P", "daysoverduetolerancedd_3976961L", "dpdmax_139P", "pctinstlsallpaidlate6d_3546844L", "pmts_overdue_1152A", "overdueamountmax_155A", "overdueamountmax2_14A", "numinstlswithdpd10_728L", "maxdpdlast12m_727P", "numinstlswithdpd5_4187116L", "numrejects9m_859L", "maxdbddpdtollast12m_3658940P", "dpdmaxdateyear_596T", "numberofoverdueinstlmax_1039L", "overdueamountmaxdateyear_2T", "maxdpdlast3m_392P", "maxdpdfrom6mto36m_3546853P", "debtoverdue_47A", "pctinstlsallpaidearl3d_427L", "maxdbddpdtollast6m_4187119P", "numinstpaidearly_338L", "days180_256L", "numberofoverdueinstls_725L", "avgdbddpdlast3m_4187120P", "numcontrs3months_479L", "days30_165L", "mobilephncnt_593L", "cntpmts24_3658933L", "totaldebtoverduevalue_178A", "overdueamount_659A", "pmtnum_254L", "avgmaxdpdlast9m_3716943P", "pmts_year_507T", "residualamount_856A", "numinstregularpaidest_4493210L", "numinstpaidlate1d_3546852L", "MONTH", "collater_valueofguarantee_1124L", "numberofinstls_320L", "totalsettled_863A", "amtinstpaidbefduel24m_4187115A", "mindbddpdlast24m_3658935P", "disbursedcredamount_1113A", "totalamount_996A", "applicationscnt_867L", "maxdebt4_972A", "tenor_203L", "revolvingaccount_394A", "num_group2", "cntincpaycont9m_3716944L", "numinsttopaygrest_4493213L", "inittransactionamount_650A", "numinstpaidlastcontr_4325080L", "maxdpdinstlnum_3546846P", "childnum_21L", "clientscnt_946L", "maxpmtlast3m_4525190A", "nominalrate_281L", "interesteffectiverate_369L", "secondquarter_766L", "for3years_584L", "dpdmaxdateyear_742T", "forweek_528L", "numactiverelcontr_750L", "posfstqpd30lastmonth_3976962P", "debtoutstand_525A", "instlamount_892A", "collater_valueofguarantee_876L", "monthlyinstlamount_332A", "clientscnt_360L", "outstandingamount_354A", "avgoutstandbalancel6m_4187114A", "equalityempfrom_62L", 
"numberofinstls_229L", "forquarter_634L", "maxannuity_4075009A", "avglnamtstart24m_4525187A", "maxoutstandbalancel12m_4187113A", "posfpd30lastmonth_3976960P", "installmentamount_833A", "currdebtcredtyperange_828A", "thirdquarter_1082L", "totalamount_503A", "lastrejectreasonclient_4145040M", "birth_259D", "lastcancelreason_561M", "lastrejectreason_759M", "incometype_1044T", "overdueamountmax2date_1002D", "subjectrole_93M", "employedfrom_700D", "sex_738L", "collaterals_typeofguarante_359M", "lastst_736L", "responsedate_4917613D", "education_1138M", "birthdate_574D", "assignmentdate_4527235D", "firstdatedue_489D", "validfrom_1069D", "datelastinstal40dpd_247D", "subjectroles_name_838M", "firstclxcampaign_1125D", "maritalst_385M", "lastapprcommoditycat_1041M", "collaterals_typeofguarante_669M", "language1_981M", "numberofoverdueinstlmaxdat_641D", "education_927M", "lastdelinqdate_224D", "dateactivated_425D", "dtlastpmt_581D", "empladdr_zipcode_114M", "dtlastpmtallstes_4499206D", "description_351M", "classificationofcontr_1114M", "conts_type_509L", "purposeofcred_722M", "credtype_587L", "name_4527232M", "postype_4733339M", "contractenddate_991D", "responsedate_1012D", "openingdate_313D", "dateofcredend_353D", "dateofcredend_289D", "dateofrealrepmt_138D", "payvacationpostpone_4187118D", "district_544M", "subjectrole_43M", "lastrejectcommoditycat_161M", "lastrepayingdate_696D", "credacc_status_367L", "assignmentdate_238D", "gender_992L", "contractmaturitydate_151D", "financialinstitution_382M", "financialinstitution_591M", "registaddr_zipcode_184M", "dtlastpmtallstes_3545839D", "cacccardblochreas_147M", "cancelreason_3545846M", "collater_typofvalofguarant_298M", "maritalst_893M"]

# Restrict the test frame to the selected columns; a missing column raises
# KeyError here.
df_test_selected = df_test[columns]
# Kept aside to index the submission.
case_id = df_test_selected['case_id']

# predict() returns (pred, loss, auroc); no labels are passed, so only the
# predictions are used.
preds, _, _ = xgboost_model.predict(df_test_selected.drop(columns=['case_id']))
# One row per case_id with its predicted score, written with case_id as index.
submission = pd.DataFrame({
    "case_id": case_id,
    "score": preds
}).set_index('case_id')
submission.to_csv("./submission.csv")

Attaching model 'josh9191/homecredit_xgboost/other/xgboost' to your Kaggle notebook...
