In [90]:
import pandas as pd

train_df = pd.read_csv("./data/train.csv")
val_df = pd.read_csv("./data/val.csv")

In [16]:
train_df.columns

Index(['loan_amnt', 'term', 'int_rate', 'installment', 'emp_length',
       'annual_inc', 'dti', 'inq_last_6mths', 'delinq_2yrs', 'open_acc',
       'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'loan_risk',
       'installment_to_income', 'fico_score', 'credit_age', 'issue_month',
       'fed_funds_rate', 'unemployment_rate', 'cpi', 'real_gdp',
       'debt_service_ratio', 'car', 'credit_card', 'debt_consolidation',
       'educational', 'home_improvement', 'house', 'major_purchase', 'medical',
       'moving', 'other', 'renewable_energy', 'small_business', 'vacation',
       'wedding', 'ANY', 'MORTGAGE', 'NONE', 'OTHER', 'OWN', 'RENT'],
      dtype='object')

In [57]:
#train_df["int_rate"].max() # 30.99
#train_df["int_rate"].min() # 5.31
#train_df["emp_length"].min() # -1
#train_df["emp_length"].max() # 10
#train_df["dti"].min() # -1
#train_df["dti"].max() # 999
#train_df["inq_last_6mths"].agg(['min', 'max'])
#train_df["delinq_2yrs"].agg(['min', 'max'])
#train_df["open_acc"].agg(['min', 'max'])
#train_df["pub_rec"].agg(['min', 'max'])
#train_df["revol_bal"].agg(['min', 'max'])
#train_df["revol_util"].agg(['min', 'max'])
#train_df["total_acc"].agg(['min', 'max'])
#train_df["installment_to_income"].agg(['min', 'max'])
#train_df["fico_score"].agg(['min', 'max'])
#train_df["credit_age"].agg(['min', 'max'])
#train_df["fed_funds_rate"].agg(['min', 'max'])
#train_df["unemployment_rate"].agg(['min', 'max'])
#train_df["cpi"].agg(['min', 'max'])
#train_df["real_gdp"].agg(['min', 'max'])
#train_df["debt_service_ratio"].agg(['min', 'max'])

'''
import matplotlib.pyplot as plt

plt.hist(train_df['revol_util'], bins=50)
plt.title('Revolving Utilization Distribution')
plt.xlabel('Revol Util')
plt.ylabel('Frequency')
plt.show()
'''

"\nimport matplotlib.pyplot as plt\n\nplt.hist(train_df['revol_util'], bins=50)\nplt.title('Revolving Utilization Distribution')\nplt.xlabel('Revol Util')\nplt.ylabel('Frequency')\nplt.show()\n"

In [91]:
y_train = train_df["loan_risk"]
X_train = train_df.drop("loan_risk", axis=1)
y_val = val_df["loan_risk"]
X_val = val_df.drop("loan_risk", axis=1)

In [92]:
from dataclasses import dataclass, field
from typing import Dict, Tuple

@dataclass(frozen=True)
class DataPolicy:
    percent: Tuple[str, ...] = ("int_rate", "fed_funds_rate", "unemployment_rate", "debt_service_ratio", "revol_util") # 0 to 100
    non_negative: Tuple[str, ...] = ("loan_amnt", "installment", "annual_inc", "delinq_2yrs", "open_acc", "pub_rec", "revol_bal", "total_acc", "installment_to_income", "credit_age") # Will be winsorized
    skip: Tuple[str, ...] = ("term", "loan_rism", 'car', 'credit_card', 'debt_consolidation',
       'educational', 'home_improvement', 'house', 'major_purchase', 'medical',
       'moving', 'other', 'renewable_energy', 'small_business', 'vacation',
       'wedding', 'ANY', 'MORTGAGE', 'NONE', 'OTHER', 'OWN', 'RENT', "real_gdp") # Doesnt do anything, safety

    # Allowable ranges
    bounds: Dict[str, Tuple[int, int]] = field(default_factory=lambda: {
        "emp_length": (-1, 10),
        "dti": (-1, 100),
        "inq_last_6mths": (0, 10),
        "fico_score": (350, 850),
        "cpi": (100, 500)
    })

    upper_cap = 0.99

In [93]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from pandas.api.types import is_numeric_dtype
from typing import Dict

class DataTransform(BaseEstimator, TransformerMixin):
    def __init__(self, policy: DataPolicy = DataPolicy()):
        self.policy = policy
        self.caps = {}

    def binaryCol(self, c: pd.Series):
        return set(pd.unique(c.dropna())) == {0, 1}
        
    def fit(self, X: pd.DataFrame, y=None):
        df = X.copy()
        # Find cap for each column in non_negative policy
        for col in self.policy.non_negative:
            if col in df.columns and is_numeric_dtype(df[col]) and not self.binaryCol(df[col]):
                self.caps[col] = float(df[col].quantile(self.policy.upper_cap))

        return self

    def transform(self, X: pd.DataFrame):
        df = X.copy()

        skip = set(self.policy.skip) # For safety
        
        for col in self.policy.percent:
            if col in df.columns and col not in skip:
                df[col] = pd.to_numeric(df[col], errors="coerce").clip(0, 100) # Limit to between 0, 100

        for col in self.policy.non_negative:
            if col in df.columns and col not in skip:
                df[col] = pd.to_numeric(df[col], errors="coerce").clip(lower=0)
                if col in self.caps:
                    df[col] = pd.to_numeric(df[col], errors="coerce").clip(upper=self.caps[col])

        # Bounds
        bounds = self.policy.bounds
        for col in bounds:
            if col in df.columns and col not in skip:
                df[col] = pd.to_numeric(df[col], errors="coerce").clip(bounds[col][0], bounds[col][1])

        return df

In [108]:
# LightGBM (CPU)

from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, average_precision_score
import lightgbm as lgb

lgbm_model = Pipeline([
    ("transform", DataTransform()),
    ("clf", lgb.LGBMClassifier(
        n_estimators=4000, learning_rate=0.02,
        num_leaves=31, subsample=0.8,
        min_child_samples=100, reg_alpha=1.0, reg_lambda=1.0,
        colsample_bytree=0.8, objective="binary", 
        is_unbalance=True, random_state=42
    ))
])

In [109]:
lgbm_model.fit(
    X_train, y_train,
    clf__eval_set=[(X_val, y_val)],
    clf__eval_metric="average_precision",
    clf__callbacks=[
        lgb.early_stopping(stopping_rounds=200, first_metric_only=True),
        lgb.log_evaluation(period=50)
    ]
)
print("Done training.")
val_pred = lgbm_model.predict_proba(X_val)[:, 1]
print("Finished predicitons.")
print(f"Avg prec. score: {average_precision_score(y_val, val_pred)}\nROC AUC Score: {roc_auc_score(y_val, val_pred)}")


[LightGBM] [Info] Number of positive: 199033, number of negative: 1380977
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041215 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2401
[LightGBM] [Info] Number of data points in the train set: 1580010, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.125969 -> initscore=-1.937076
[LightGBM] [Info] Start training from score -1.937076
Training until validation scores don't improve for 200 rounds
[50]	valid_0's average_precision: 0.252527	valid_0's binary_logloss: 0.517476
[100]	valid_0's average_precision: 0.25502	valid_0's binary_logloss: 0.623692
[150]	valid_0's average_precision: 0.256999	valid_0's binary_logloss: 0.667496
[200]	valid_0's average_precision: 0.258864	valid_0's binary_logloss: 0.680724
[250]	valid_0's average_precision: 0.260199	valid_0's binary_

In [115]:
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import brier_score_loss

# Calibrate
cal = IsotonicRegression(out_of_bounds="clip")
cal.fit(val_pred, y_val.astype(int))

val_pd = cal.predict(val_pred)
print("Val Brier:", brier_score_loss(y_val, val_pd))


Val Brier: 0.11149527235774853


In [116]:
print(f"Avg prec. score: {average_precision_score(y_val, val_pd)}\nROC AUC Score: {roc_auc_score(y_val, val_pd)}")

Avg prec. score: 0.26302077987063066
ROC AUC Score: 0.6996964334222409


In [None]:
# XGBoost (GPU)
