In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import polars as pl
from argparse import Namespace
from pathlib import Path
import json
import sys
import gc
LIB_PATH = '/kaggle/input/hc-hero'
sys.path.append(LIB_PATH)

from dataset.feature.preprocessor import Preprocessor
from dataset.feature.feature_loader import FeatureLoader
from dataset.const import TOPICS
from dataset.datainfo import RawInfo, RawReader
from dataset.feature.util import optimize_dataframe
import lightgbm as lgb


In [3]:
def read_json(path: str):
    """Load and return the JSON document stored at ``path``.

    Args:
        path: Location of the JSON file. The value is handed straight to
            ``open``, so ``pathlib.Path`` objects work as well as strings.

    Returns:
        The decoded JSON content, exactly as produced by ``json.load``.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# load model
MODEL_NAME = 'small_feature'
OUTPUT_PATH = Path(LIB_PATH) / 'output'
MODEL_PATH = OUTPUT_PATH / 'model' / MODEL_NAME
FEATURE_DEF_PATH = OUTPUT_PATH / 'feature_definition'

# The trained model is restored directly as a LightGBM Booster from the saved
# model file (note: the file is read via `model_file=` even though it carries
# a .pkl extension). No estimator wrapper is instantiated beforehand because
# it would be overwritten immediately.
model = lgb.Booster(model_file=MODEL_PATH / 'model.pkl')
# artifacts.json is read for its 'features' and 'cat_indicis' entries later on.
artifacts = read_json(MODEL_PATH / 'artifacts.json')

#configs
type_ = 'test'
prep_data_path = '/kaggle/working/preps'
# Settings object handed to Preprocessor, RawInfo and FeatureLoader.
conf = Namespace(
    data_path='/kaggle/input/home-credit-credit-risk-model-stability',
    prep_data_path=prep_data_path,
    raw_format="parquet",
    output_path=OUTPUT_PATH,
    feature_def_path=FEATURE_DEF_PATH,
)

In [4]:
%%time
# Run the preprocessing stage for the configured split. The Preprocessor is
# used as a throwaway object, so no reference to it survives this cell.
Preprocessor(type_, conf=conf).preprocess()

[+] Preprocessing applprev, depth=2
[+] Preprocessing credit_bureau_a, depth=2
[+] Preprocessing credit_bureau_b, depth=2
[+] Preprocessing person, depth=2
[+] Memory optimization debitcard
[+] Memory optimization deposit
[+] Memory optimization other
[+] Memory optimization tax_registry_a
[+] Memory optimization tax_registry_b
[+] Memory optimization tax_registry_c
[+] Memory optimization static
[+] Memory optimization static_cb
CPU times: user 4.1 s, sys: 295 ms, total: 4.4 s
Wall time: 4.79 s


In [5]:
# Extra fix-up applied only to the test split: the preprocessed
# tax_registry_c file is re-read, `pmtamount_36A` is cast to Int16, and the
# file is written back to the same location.
if type_ == 'test':
    tax_c_file = f'{prep_data_path}/{type_}/{type_}_tax_registry_c_1.parquet'
    (
        pl.read_parquet(tax_c_file)
        .with_columns(pl.col('pmtamount_36A').cast(pl.Int16))
        .write_parquet(tax_c_file)
    )
    # The intermediate frame is unreferenced once the chain finishes.
    gc.collect()

In [6]:
%%time
# load features

raw_info = RawInfo(conf)
selected = artifacts['features']

# Start from the base table, keeping only the join key (as Int32) and the
# decision date; every other table is left-joined onto it by case_id.
base = raw_info.read_raw('base', reader=RawReader('polars'), type_=type_)
base = base.select([pl.col('case_id').cast(pl.Int32), 'date_decision'])

# Columns worth keeping from the depth-0 tables. Built once as a set so the
# per-column membership test below does not rebuild and scan a list each time.
depth0_keep = set(selected) | {'case_id', 'date_decision'}

# Depth-0 topics: read the raw table, shrink dtypes, keep selected columns.
depth0_topics = [topic for topic in TOPICS if topic.depth == 0]
for topic in depth0_topics:
    print(f'[*] Processing {topic.name}...')
    data = raw_info.read_raw(topic.name, reader=RawReader('polars'), type_=type_)
    data = optimize_dataframe(data)
    data = data.select([c for c in data.columns if c in depth0_keep])
    base = base.join(data, on='case_id', how='left')
    # Released inside the loop so an empty topic list cannot hit an unbound name.
    del data
gc.collect()

# Feature columns containing this marker get the topic name appended, so the
# same feature name coming from different topics does not collide on join.
dup_keyword = '_if_1_eq_1_then_num_group1_'

# Depth-1 topics: features come from FeatureLoader and are joined batch by
# batch (the literal 32 is presumably the batch size -- confirm in FeatureLoader).
depth1_topics = [topic for topic in TOPICS if topic.depth == 1]
for topic in depth1_topics:
    print(f'[*] Processing {topic.name}...')
    fl = FeatureLoader(topic, type=type_, conf=conf)
    features = fl.load_features(selected)
    for data in fl.load_feature_data_batch(features, 32):
        dupable_col = [c for c in data.columns if dup_keyword in c]
        data = data.rename({col: f'{col}_{topic.name}' for col in dupable_col})
        data = optimize_dataframe(data)
        base = base.join(data, on='case_id', how='left')
        del data
    del fl
    gc.collect()

[*] Processing static...
[*] Processing static_cb...
[*] Processing applprev...


100%|██████████| 2/2 [00:00<00:00, 108.75it/s]


[*] Elapsed time: 0.0218 sec
[*] Processing credit_bureau_a...


100%|██████████| 5/5 [00:00<00:00, 126.52it/s]


[*] Elapsed time: 0.0427 sec
[*] Processing credit_bureau_b...


100%|██████████| 1/1 [00:00<00:00, 327.35it/s]


[*] Elapsed time: 0.0057 sec
[*] Processing person...


100%|██████████| 3/3 [00:00<00:00, 148.02it/s]

[*] Elapsed time: 0.0231 sec





[*] Processing debitcard...


0it [00:00, ?it/s]

[*] Elapsed time: 0.0026 sec





[*] Processing deposit...


100%|██████████| 1/1 [00:00<00:00, 259.60it/s]

[*] Elapsed time: 0.0066 sec





[*] Processing other...


0it [00:00, ?it/s]

[*] Elapsed time: 0.0050 sec
[*] Processing tax_registry_a...



100%|██████████| 2/2 [00:00<00:00, 129.83it/s]

[*] Elapsed time: 0.0179 sec
[*] Processing tax_registry_b...



100%|██████████| 1/1 [00:00<00:00, 197.56it/s]


[*] Elapsed time: 0.0076 sec
[*] Processing tax_registry_c...


100%|██████████| 2/2 [00:00<00:00, 116.53it/s]

[*] Elapsed time: 0.0209 sec
CPU times: user 1.6 s, sys: 33.9 ms, total: 1.63 s
Wall time: 1.74 s





In [7]:
# date(string) to period(number)
# A column counts as a date feature when its name ends with 'D', or when it
# starts with 'max__if' and ends with 'd__'.
date_cols = [
    c for c in base.columns
    if c.endswith('D') or (c.startswith('max__if') and c.endswith('d__'))
]

# Each date column is replaced by (date_decision - column); nulls become 0
# before the integer cast. 86400000 is the number of milliseconds in a day,
# so the result is presumably a day count (assumes the difference is stored
# in milliseconds -- confirm against the polars version in use).
decision_date = pl.col('date_decision').cast(pl.Date)
base = base.with_columns([
    ((decision_date - pl.col(c).cast(pl.Date)).fill_null(0).cast(pl.Int64) / 86400000).alias(c)
    for c in date_cols
])

In [8]:
# Keep the ids aside; they form the first column of the submission file.
submission_df = base.select('case_id').to_pandas()

# Switch to pandas and arrange the columns in the order listed in the artifacts.
feature_names = artifacts['features']
cat_positions = artifacts['cat_indicis']
base = base.drop(['case_id']).to_pandas()
base = base[feature_names]

# Columns whose position is listed in 'cat_indicis' become pandas categoricals.
category_casts = {}
for position, column in enumerate(base.columns):
    if position in cat_positions:
        category_casts[column] = 'category'
base = base.astype(category_casts)

# Any column still typed as object is converted to float.
object_columns = base.dtypes[base.dtypes == 'O'].index
base = base.astype({column: 'float' for column in object_columns})

# Score every row and write the submission file.
submission_df["score"] = model.predict(base)
submission_df.to_csv("/kaggle/working/submission.csv", index=False)

In [9]:
# Display the submission frame (case_id and score columns) as a final check.
submission_df

Unnamed: 0,case_id,score
0,57543,0.213962
1,57549,0.274439
2,57551,0.143161
3,57552,0.366452
4,57569,0.463389
5,57630,0.145593
6,57631,0.430424
7,57632,0.301999
8,57633,0.489817
9,57634,0.40575
