In [1]:
from __future__ import annotations

import datetime
import logging

import numpy as np
import pandas as pd
import plotly.express as px
from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from xgboost import XGBRegressor

logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level="INFO")
log = logging.getLogger("notebook")

In [6]:
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")
test_id_list = list(test_df["id"])

In [30]:
train_df["person_age_1"] = (
    train_df["person_age"].clip(lower=0, upper=99).map(lambda x: x // 10).value_counts()
)
test_df["person_age_1"] = test_df["person_age"].clip(lower=0, upper=99).map(lambda x: x // 10).value_counts()

In [32]:
cat_features = ["person_home_ownership", "loan_intent", "loan_grade", "cb_person_default_on_file"]
num_features = [
    'person_age',
    'person_age_1',
    'person_income',
    'person_emp_length',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
]
catboost_clf_params = {
    "iterations": 1000,
    "learning_rate": 0.03,
    "depth": 10,
    "l2_leaf_reg": 17,
    "random_strength": 11,
    # "subsample": 0.95,
    "verbose": 1,
    "cat_features": cat_features,
    "random_seed": 9999,
}
catboost_clf = CatBoostClassifier(**catboost_clf_params)
catboost_clf.fit(train_df[cat_features + num_features], train_df["loan_status"])
test_yhat = catboost_clf.predict(test_df[cat_features + num_features])
# pd.DataFrame({"id": test_id_list, "loan_status": test_yhat}).to_csv("./data/submission.csv", index=False)

0:	learn: 0.6555470	total: 45.8ms	remaining: 45.7s
1:	learn: 0.6202857	total: 89.3ms	remaining: 44.5s
2:	learn: 0.5991854	total: 104ms	remaining: 34.5s
3:	learn: 0.5663733	total: 152ms	remaining: 37.8s
4:	learn: 0.5486024	total: 170ms	remaining: 33.9s
5:	learn: 0.5321358	total: 181ms	remaining: 30.1s
6:	learn: 0.5084453	total: 199ms	remaining: 28.3s
7:	learn: 0.4849004	total: 241ms	remaining: 29.9s
8:	learn: 0.4708464	total: 266ms	remaining: 29.3s
9:	learn: 0.4494196	total: 325ms	remaining: 32.1s
10:	learn: 0.4362708	total: 343ms	remaining: 30.8s
11:	learn: 0.4181908	total: 394ms	remaining: 32.5s
12:	learn: 0.4070185	total: 416ms	remaining: 31.6s
13:	learn: 0.3966637	total: 441ms	remaining: 31s
14:	learn: 0.3833193	total: 500ms	remaining: 32.8s
15:	learn: 0.3730774	total: 532ms	remaining: 32.7s
16:	learn: 0.3629519	total: 560ms	remaining: 32.4s
17:	learn: 0.3512333	total: 614ms	remaining: 33.5s
18:	learn: 0.3451689	total: 641ms	remaining: 33.1s
19:	learn: 0.3404255	total: 656ms	remaini

In [19]:
pd.DataFrame(
    {
        "ratio": train_df.groupby("person_age").apply(
            lambda subset: subset["loan_status"].sum() / subset.shape[0], include_groups=False
        ),
        "amount": train_df["person_age"].value_counts(),
    }
)

Unnamed: 0_level_0,ratio,amount
person_age,Unnamed: 1_level_1,Unnamed: 2_level_1
20,0.25,12
21,0.171031,1795
22,0.160261,7051
23,0.138105,7726
24,0.140422,6395
25,0.148017,5067
26,0.147909,3874
27,0.119101,4450
28,0.140815,3707
29,0.131498,3270


In [7]:
def bin_price(df):
    q1 = np.percentile(df["price"], 25)
    q3 = np.percentile(df["price"], 75)
    upper_bound = q3 + 1.5 * (q3 - q1)
    df["price_bin"] = (df["price"] < upper_bound).astype(int)
    return df


log.info("Bin Price: train_df")
train_df = bin_price(train_df)

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.90,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.10,N,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
58640,58640,34,120000,MORTGAGE,5.0,EDUCATION,D,25000,15.95,0.21,Y,10,0
58641,58641,28,28800,RENT,0.0,MEDICAL,C,10000,12.73,0.35,N,8,1
58642,58642,23,44000,RENT,7.0,EDUCATION,D,6800,16.00,0.15,N,2,1
58643,58643,22,30000,RENT,2.0,EDUCATION,A,5000,8.90,0.17,N,3,0
