In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import polars as pl
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score 

In [None]:
# Root directory of the competition data. Later cells build file paths with
# plain string concatenation (DATA_PATH + "csv_files/..."), so the trailing
# slash is required.
DATA_PATH = "/kaggle/input/home-credit-credit-risk-model-stability/"

In [None]:
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    """Cast every column whose name ends in "P" or "A" to Float64.

    Columns are selected purely by the last character of their name; all
    other columns are left untouched. Returns the frame with the casts applied.
    """
    float_casts = [
        pl.col(name).cast(pl.Float64).alias(name)
        for name in df.columns
        if name[-1] in ("P", "A")
    ]
    # Each expression only touches its own column, so they can be applied together.
    return df.with_columns(float_casts)


def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Convert text columns of ``df`` to ordered categoricals, in place.

    Every object- or string-typed column is replaced by an ordered categorical
    whose categories are the column's observed values plus a trailing
    "Unknown" category (used later as a bucket for values not seen in training).
    Missing values stay missing. Other columns are left untouched.

    Args:
        df: Frame to convert. It is mutated in place.

    Returns:
        The same ``df`` object, so the call can also be used in an assignment.
    """
    for col in df.columns:
        dtype = df[col].dtype
        # Check the dtype itself rather than its name, so string dtypes that are
        # not literally named "string" (e.g. the "str" dtype of newer pandas
        # releases) are converted as well.
        is_text = pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)
        if not is_text:
            continue

        as_category = df[col].astype("string").astype("category")
        categories = as_category.cat.categories.to_list()
        # CategoricalDtype rejects duplicate categories, so only append the
        # placeholder when the data does not already contain that literal value.
        if "Unknown" not in categories:
            categories.append("Unknown")
        df[col] = as_category.astype(
            pd.CategoricalDtype(categories=categories, ordered=True)
        )
    return df

**train dataset**

In [None]:
# Base table of the training split.
tbt = pl.read_csv(DATA_PATH + "csv_files/train/train_base.csv")

# Credit bureau A (depth 2).
# NOTE(review): the variable name and the original comment point at shard
# a_2_0, but the path reads a_2_1, and `tcba20` is not referenced by any of the
# cells shown here -- confirm which shard is intended and whether it is needed.
tcba20 = set_table_dtypes(
    pl.read_csv(DATA_PATH + "csv_files/train/train_credit_bureau_a_2_1.csv")
)

# Static tables, shards 0 and 1.
ts00 = set_table_dtypes(
    pl.read_csv(DATA_PATH + "csv_files/train/train_static_0_0.csv")
)
ts01 = set_table_dtypes(
    pl.read_csv(DATA_PATH  + "csv_files/train/train_static_0_1.csv")
)

# Static credit-bureau table.
tscb0 = set_table_dtypes(
    pl.read_csv(DATA_PATH + "csv_files/train/train_static_cb_0.csv")
)

# Person table (train_person_1).
tsp1 = set_table_dtypes(
    pl.read_csv(DATA_PATH + "csv_files/train/train_person_1.csv")
)

# Credit bureau B (depth 2).
tcbb2 = set_table_dtypes(
    pl.read_csv(DATA_PATH + "csv_files/train/train_credit_bureau_b_2.csv")
)

# Stack the two static shards into a single static table for training.
tr_st = pl.concat([ts00, ts01], how="vertical_relaxed")

**test dataset**

In [None]:
# Base table of the test split.
testbt = pl.read_csv(DATA_PATH + "csv_files/test/test_base.csv")

# Static tables, shards 0, 1 and 2.
tests00 = set_table_dtypes(
    pl.read_csv(DATA_PATH + "csv_files/test/test_static_0_0.csv")
)
tests01 = set_table_dtypes(
    pl.read_csv(DATA_PATH + "csv_files/test/test_static_0_1.csv")
)
tests02 = set_table_dtypes(
    pl.read_csv(DATA_PATH + "csv_files/test/test_static_0_2.csv")
)

# Static credit-bureau table.
testscb0 = set_table_dtypes(
    pl.read_csv(DATA_PATH + "csv_files/test/test_static_cb_0.csv")
)

# Person table (test_person_1).
testsp1 = set_table_dtypes(
    pl.read_csv(DATA_PATH + "csv_files/test/test_person_1.csv")
)

# Credit bureau B (depth 2).
testcbb2 = set_table_dtypes(
    pl.read_csv(DATA_PATH + "csv_files/test/test_credit_bureau_b_2.csv")
)

# Stack the three static shards into a single static table for the test split.
test_st = pl.concat([tests00, tests01, tests02], how="vertical_relaxed")

**Feature engineering and joins — Menyala abangkuuuu 🔥🔥🔥🔥🔥**

In [None]:
# Per-case aggregates over the person table: the maximum declared income and a
# flag built from the max of the `incometype_1044T == "SELFEMPLOYED"` comparison.
# NOTE(review): the flag's alias is prefixed with `mainoccupationinc_384A`
# although it is derived from `incometype_1044T`.
tsp1_feats_1 = tsp1.group_by("case_id").agg(
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
    (pl.col("incometype_1044T") == "SELFEMPLOYED")
    .max()
    .alias("mainoccupationinc_384A_any_selfemployed"),
)

# House type taken from the rows with num_group1 == 0.
tsp1_feats_2 = (
    tsp1.select(["case_id", "num_group1", "housetype_905L"])
    .filter(pl.col("num_group1") == 0)
    .drop("num_group1")
    .rename({"housetype_905L" : "personal_housetype"})
)

# Per the original note, this table carries both num_group1 and num_group2,
# so it is collapsed to one row per case_id here.
tcb_b_2_feats = tcbb2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31"),
)

# Keep only the static columns whose name ends in "A" or "M".
selected_static_cols = [col for col in tr_st.columns if col[-1] in ("A", "M")]
selected_static_cb_cols = [col for col in tscb0.columns if col[-1] in ("A", "M")]

# Left-join everything onto the base table so every base case_id is kept.
data_train = (
    tbt.join(tr_st.select(["case_id"] + selected_static_cols), how="left", on="case_id")
    .join(tscb0.select(["case_id"] + selected_static_cb_cols), how="left", on="case_id")
    .join(tsp1_feats_1, how="left", on="case_id")
    .join(tsp1_feats_2, how="left", on="case_id")
    .join(tcb_b_2_feats, how="left", on="case_id")
)


In [None]:
# Same feature construction as for the training tables, applied to the test tables.
# Per-case aggregates over the person table.
testp1_feats_1 = testsp1.group_by("case_id").agg(
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
    (pl.col("incometype_1044T") == "SELFEMPLOYED")
    .max()
    .alias("mainoccupationinc_384A_any_selfemployed"),
)

# House type taken from the rows with num_group1 == 0.
testp1_feats_2 = (
    testsp1.select(["case_id", "num_group1", "housetype_905L"])
    .filter(pl.col("num_group1") == 0)
    .drop("num_group1")
    .rename({"housetype_905L" : "personal_housetype"})
)

# Per the original note, this table carries both num_group1 and num_group2,
# so it is collapsed to one row per case_id here.
testcb_b_2_feats = testcbb2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31"),
)

# The static column lists come from the training tables so that the test
# frame is built with the same columns as the training frame.
data_test = (
    testbt.join(test_st.select(["case_id"] + selected_static_cols), how="left", on="case_id")
    .join(testscb0.select(["case_id"] + selected_static_cb_cols), how="left", on="case_id")
    .join(testp1_feats_1, how="left", on="case_id")
    .join(testp1_feats_2, how="left", on="case_id")
    .join(testcb_b_2_feats, how="left", on="case_id")
)

print(data_test.columns)

In [None]:
# Show the test and train column lists, separated by a marker line, for a visual comparison.
print(data_test.columns, "###############", data_train.columns, sep="\n")

In [None]:
# Split the unique case ids 60 / 20 / 20 into train / valid / test:
# first 60% vs. the rest, then the rest is halved.
case_ids = data_train["case_id"].unique().shuffle(seed=1)
case_ids_train, case_ids_test = train_test_split(
    case_ids, train_size=0.6, random_state=1
)
case_ids_valid, case_ids_test = train_test_split(
    case_ids_test, train_size=0.5, random_state=1
)

# Predictor columns: names whose last character is upper-case while everything
# before it is lower-case.
# NOTE(review): the aggregated columns built earlier (e.g.
# `mainoccupationinc_384A_max`, `personal_housetype`) end in a lower-case
# letter, so they do not pass this filter and are not fed to the model --
# confirm whether that is intended.
cols_pred = [
    col
    for col in data_train.columns
    if col[-1].isupper() and col[:-1].islower()
]
print(cols_pred)

def from_polars_to_pandas(case_ids) -> "tuple[pd.DataFrame, pd.DataFrame, pd.Series]":
    """Materialise the rows of ``data_train`` for the given case ids as pandas objects.

    Reads the notebook-level ``data_train`` frame and ``cols_pred`` list.

    Args:
        case_ids: Collection of ``case_id`` values to keep (anything accepted
            by the polars ``is_in`` expression).

    Returns:
        A 3-tuple ``(base, X, y)``:
        ``base`` -- DataFrame with ``case_id``, ``WEEK_NUM`` and ``target``;
        ``X`` -- DataFrame with the predictor columns ``cols_pred``;
        ``y`` -- Series with ``target``.
    """
    # Filter once and slice the result three times instead of re-filtering the
    # full frame for every output.
    subset = data_train.filter(pl.col("case_id").is_in(case_ids))
    return (
        subset[["case_id", "WEEK_NUM", "target"]].to_pandas(),
        subset[cols_pred].to_pandas(),
        subset["target"].to_pandas(),
    )

# Build the pandas views (base columns, predictors, target) for each split.
base_train, X_train, y_train = from_polars_to_pandas(case_ids_train)
base_valid, X_valid, y_valid = from_polars_to_pandas(case_ids_valid)
base_test, X_test, y_test = from_polars_to_pandas(case_ids_test)

# convert_strings mutates the frame it is given (and returns that same frame),
# so calling it is enough; the text columns of each split become categoricals.
for df in (X_train, X_valid, X_test):
    convert_strings(df)

In [None]:
# Report the (rows, columns) shape of each predictor frame.
print(
    f"Train: {X_train.shape}",
    f"Valid: {X_valid.shape}",
    f"Test: {X_test.shape}",
    sep="\n",
)

**lightgbm**

In [None]:
# Wrap the train / validation splits; the validation set references the
# training Dataset.
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

# Booster configuration: binary objective evaluated with AUC.
# NOTE(review): max_depth=3 together with num_leaves=31, and passing
# n_estimators through lgb.train params -- confirm against the LightGBM
# parameter docs that these combine as intended.
params = dict(
    boosting_type="gbdt",
    objective="binary",
    metric="auc",
    max_depth=3,
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    n_estimators=1000,
    verbose=-1,
)

# Train with periodic evaluation logging (50) and early stopping (10) on the
# validation set.
gbm = lgb.train(
    params,
    lgb_train,
    valid_sets=lgb_valid,
    callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10)],
)

**evaluation: AUC**

In [None]:
# Score every split with the best iteration found during training and store
# the predictions next to the targets in the corresponding base frame.
for base, X in zip(
    (base_train, base_valid, base_test),
    (X_train, X_valid, X_test),
):
    y_pred = gbm.predict(X, num_iteration=gbm.best_iteration)
    base["score"] = y_pred

# AUC per split, computed from the stored scores.
print(f'The AUC score on the train set is: {roc_auc_score(base_train["target"], base_train["score"])}')
print(f'The AUC score on the valid set is: {roc_auc_score(base_valid["target"], base_valid["score"])}')
print(f'The AUC score on the test set is: {roc_auc_score(base_test["target"], base_test["score"])}')

In [None]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    """Compute the gini stability score of ``base``.

    For every ``WEEK_NUM`` the gini (``2 * AUC - 1``) of ``score`` against
    ``target`` is computed. A straight line is then fitted through the weekly
    ginis (indexed 0..n-1 in week order), and the result is

        mean(gini) + w_fallingrate * min(0, slope) + w_resstd * std(residuals)

    Args:
        base: pandas DataFrame with ``WEEK_NUM``, ``target`` and ``score`` columns.
        w_fallingrate: Weight applied to the slope when the slope is negative.
        w_resstd: Weight applied to the standard deviation of the fit residuals.

    Returns:
        The stability score as a float.
    """
    scored = base[["WEEK_NUM", "target", "score"]].sort_values("WEEK_NUM")
    # groupby iterates the weeks in ascending order, one gini per week.
    weekly_gini = [
        2 * roc_auc_score(week["target"], week["score"]) - 1
        for _, week in scored.groupby("WEEK_NUM")
    ]

    week_index = np.arange(len(weekly_gini))
    slope, intercept = np.polyfit(week_index, weekly_gini, 1)
    fitted = slope * week_index + intercept
    residual_std = np.std(np.asarray(weekly_gini) - fitted)
    mean_gini = np.mean(weekly_gini)
    # Only a falling trend (negative slope) contributes to the penalty.
    return mean_gini + w_fallingrate * min(0, slope) + w_resstd * residual_std
# Stability score per split, computed from the scores stored in the base frames.
stability_score_train, stability_score_valid, stability_score_test = (
    gini_stability(split) for split in (base_train, base_valid, base_test)
)
print(f'The stability score on the train set is: {stability_score_train}')
print(f'The stability score on the valid set is: {stability_score_valid}')
print(f'The stability score on the test set is: {stability_score_test}')

**submission** 

In [None]:
# Predictor frame for the real test data, with text columns turned into
# categoricals (each gets an extra "Unknown" category from convert_strings).
X_submission = data_test[cols_pred].to_pandas()
X_submission = convert_strings(X_submission)

# Align every categorical column with the categories the model was trained on.
categorical_cols = X_train.select_dtypes(include=['category']).columns
for col in categorical_cols:
    # Keep the training categories in their original order: building an ordered
    # dtype from a set would give an arbitrary category order on every run.
    train_categories = X_train[col].cat.categories
    new_categories = set(X_submission[col].cat.categories) - set(train_categories)
    # Values never seen during training are folded into the "Unknown" bucket.
    if new_categories:
        X_submission.loc[X_submission[col].isin(new_categories), col] = "Unknown"
    new_dtype = pd.CategoricalDtype(categories=train_categories, ordered=True)
    # X_train is left untouched: with the categories kept in training order,
    # re-casting the ordered categoricals produced by convert_strings would be a no-op.
    X_submission[col] = X_submission[col].astype(new_dtype)

y_submission_pred = gbm.predict(X_submission, num_iteration=gbm.best_iteration)

In [None]:
# One score per test case, indexed by case_id, written as ./submission.csv
# (columns: case_id, score).
submission = pd.DataFrame(
    {"score": y_submission_pred},
    index=pd.Index(data_test["case_id"].to_numpy(), name="case_id"),
)
submission.to_csv("./submission.csv")