In [1]:
from __future__ import annotations

import datetime
import logging

import catboost
import numpy as np
import optuna
import pandas as pd
import plotly.express as px
from lightgbm import LGBMRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from xgboost import XGBRegressor

logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level="INFO")
log = logging.getLogger("notebook")

In [2]:
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")
test_id_list = list(test_df["id"])

train_df = train_df.drop(columns=["id"])
test_df = test_df.drop(columns=["id"])

In [3]:
train_df["person_age_1"] = (
    train_df["person_age"].clip(lower=0, upper=99).map(lambda x: x // 10).value_counts()
)
test_df["person_age_1"] = test_df["person_age"].clip(lower=0, upper=99).map(lambda x: x // 10).value_counts()

In [4]:
cat_features = ["person_home_ownership", "loan_intent", "loan_grade", "cb_person_default_on_file"]
num_features = [
    'person_age',
    'person_age_1',
    'person_income',
    'person_emp_length',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
]

## Hyper parameter tunning with Optuna

In [None]:
# def objective(trial):
#     cv = StratifiedKFold(5, shuffle=True, random_state=9999)
#     param = {
#         "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
#         "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
#         "depth": trial.suggest_int("depth", 1, 12),
#         "eval_metric": "AUC",
#         "cat_features": cat_features,
#         "random_state": 9999,
#         "iterations": 1000,
#         "learning_rate": trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
#         'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-1, 1.0, log=True),
#     }
#     score_list = []
#     for train_idx, val_idx in cv.split(train_df, y=train_df["loan_status"]):
#         train_x = train_df.iloc[train_idx][cat_features + num_features]
#         train_y = train_df.iloc[train_idx]["loan_status"]
#         valid_x = train_df.iloc[val_idx][cat_features + num_features]
#         valid_y = train_df.iloc[val_idx]["loan_status"]
#         cb_clf = catboost.CatBoostClassifier(**param)
#         cb_clf.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], verbose=0, early_stopping_rounds=200)
#         valid_y_pred = cb_clf.predict_proba(valid_x)[:, 1]
#         score = roc_auc_score(valid_y, valid_y_pred)
#         score_list.append(score)
#     return np.mean(score_list)


# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=200)
# cb_clf_params = study.best_params

# # Best is trial 36 with value: 0.9553711593693648.
# # cb_clf_params = {
# #     'objective': 'Logloss',
# #     'colsample_bylevel': 0.0993013825516781,
# #     'depth': 5,
# #     'learning_rate': 0.29432288832602616,
# #     'l2_leaf_reg': 0.8531657930570534,
# # }

### Vanilla version

In [None]:
# cb_clf = catboost.CatBoostClassifier(
#     random_state=9999,
#     cat_features=cat_features,
#     eval_metric="AUC",
#     iterations=1000,
#     verbose=0,
#     **cb_clf_params,
# )
# cb_clf.fit(train_df[cat_features + num_features], train_df["loan_status"])
# test_y_pred = cb_clf.predict_proba(test_df[cat_features + num_features])[:, 1]
# pd.DataFrame({"id": test_id_list, "loan_status": test_y_pred}).to_csv(
#     "./data/catboost_vanilla.csv", index=False
# )

In [20]:
cv_pred_dict = {}
cv = StratifiedKFold(5, shuffle=True, random_state=9999)
# param = {
#     "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
#     "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
#     "depth": trial.suggest_int("depth", 1, 12),
#     "eval_metric": "AUC",
#     "cat_features": cat_features,
#     "random_state": 9999,
#     "iterations": 1000,
#     "learning_rate": trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
#     'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-1, 1.0, log=True),
# }

params = {
    'objective': 'Logloss',
    'colsample_bylevel': 0.0993013825516781,
    'depth': 5,
    'learning_rate': 0.29432288832602616,
    'l2_leaf_reg': 0.8531657930570534,
    "eval_metric": "AUC",
    "random_state": 9999,
    "iterations": 1000,
    "cat_features": cat_features,
}
score_list = []
for cv_idx, (train_idx, val_idx) in zip(range(5), cv.split(train_df, y=train_df["loan_status"])):
    train_x = train_df.iloc[train_idx][cat_features + num_features]
    train_y = train_df.iloc[train_idx]["loan_status"]
    valid_x = train_df.iloc[val_idx][cat_features + num_features]
    valid_y = train_df.iloc[val_idx]["loan_status"]
    cb_clf = catboost.CatBoostClassifier(**params)
    cb_clf.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], verbose=0, early_stopping_rounds=200)

    valid_y_pred = cb_clf.predict_proba(valid_x[cat_features + num_features])[:, 1]
    cv_score = roc_auc_score(valid_y, valid_y_pred)
    log.info("cv_score: %s", cv_score)

    test_y_pred = cb_clf.predict_proba(test_df[cat_features + num_features])[:, 1]
    cv_key = f"cv{cv_idx}"
    if cv_key not in cv_pred_dict:
        cv_pred_dict[cv_key] = test_y_pred
cv_pred_df = pd.DataFrame(cv_pred_dict)
cv_pred_df

2024-10-10 08:55:31,094 INFO cv_score: 0.9468490695316792
2024-10-10 08:55:38,840 INFO cv_score: 0.9529636819412175
2024-10-10 08:55:44,145 INFO cv_score: 0.9449751257996979
2024-10-10 08:55:52,146 INFO cv_score: 0.9484012589196794
2024-10-10 08:56:00,205 INFO cv_score: 0.9452502689223402


Unnamed: 0,cv0,cv1,cv2,cv3,cv4
0,0.999956,0.999493,0.999383,0.996087,0.998046
1,0.036983,0.031808,0.023977,0.021656,0.036119
2,0.043167,0.029881,0.043879,0.283586,0.146053
3,0.000620,0.000590,0.000902,0.002655,0.000600
4,0.004417,0.000580,0.002961,0.014345,0.002389
...,...,...,...,...,...
39093,0.037968,0.048182,0.074997,0.034330,0.039072
39094,0.004810,0.004582,0.006945,0.007070,0.004355
39095,0.003610,0.003306,0.006017,0.001420,0.005508
39096,0.086567,0.086195,0.057432,0.186221,0.116294


In [23]:
test_y_pred = cv_pred_df.mean(axis=1)
pd.DataFrame({"id": test_id_list, "loan_status": test_y_pred}).to_csv(
    "./data/catboost_cv_mean.csv", index=False
)