# Install libs

In [1]:
!pip install catboost



# Global seed

In [2]:
import os
GLOBAL_SEED = 42

os.environ["PYTHONHASHSEED"] = str(GLOBAL_SEED)

# Load dataset

In [3]:
import numpy as np
import pandas as pd


dataset = pd.read_csv('/content/train.csv')

dataset = dataset.drop(columns=[
                      "customer_ref",
                      "application_id",
                      "referral_code",
                      "account_status_code",
                      "loan_officer_id",
                      "marketing_campaign",
                      "previous_zip_code",
                      "debt_service_ratio",
                      "revolving_balance",
                      "total_monthly_debt_payment",
                      "recent_inquiry_count",
                      "annual_income",
                      "oldest_credit_line_age",
                      "oldest_account_age_months",
                      "loan_term",
                      ])

# Undersampling

In [4]:
dataset = dataset.sample(frac=1, random_state=GLOBAL_SEED)

default_dataset = dataset.loc[dataset['default'] == 1]
non_default_dataset = dataset.loc[dataset['default'] == 0][:10000]

normal_distributed_df = pd.concat([default_dataset, non_default_dataset])

dataset = normal_distributed_df.sample(frac=1, random_state=GLOBAL_SEED)

dataset.head()

Unnamed: 0,application_hour,application_day_of_week,account_open_year,preferred_contact,num_login_sessions,num_customer_service_calls,has_mobile_app,paperless_billing,default,age,...,annual_debt_payment,loan_to_annual_income,total_debt_amount,monthly_free_cash_flow,state,regional_unemployment_rate,regional_median_income,regional_median_rent,housing_price_index,cost_of_living_index
62915,7.0,1.0,2015.0,Email,20.0,2.0,1,1.0,0.0,60.0,...,37373.76,3.900685,245188.0,1752.19,MO,4.0,55000.0,1340.0,86.0,73.0
84318,22.0,3.0,2017.0,Email,18.0,3.0,1,1.0,0.0,60.0,...,14600.76,0.048128,33087.9,1899.936667,FL,4.8,55000.0,1210.0,119.0,83.0
84868,18.0,5.0,2011.0,Phone,5.0,2.0,1,1.0,1.0,18.0,...,9467.4,0.265,24263.0,877.72,NC,3.9,54000.0,1250.0,112.0,100.0
30357,11.0,2.0,2010.0,Email,6.0,0.0,1,0.0,0.0,27.0,...,16109.64,0.214592,71020.0,2540.86,WI,4.2,60000.0,1450.0,102.0,81.0
48384,15.0,4.0,2012.0,Email,5.0,3.0,1,1.0,1.0,29.0,...,13457.88,0.435897,30572.5,1153.51,FL,5.2,55000.0,1090.0,118.0,99.0


In [5]:
y = dataset.default
X = dataset.drop(columns=['default'])

In [11]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler


num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

class PercentileWinsorizer(BaseEstimator, TransformerMixin):
    """Columnwise winsorization by percentiles (works on ndarray)."""
    def __init__(self, low=0.05, high=0.95):
        self.low = low
        self.high = high
        self.lo_ = None
        self.hi_ = None
    def fit(self, X, y=None):
        X = np.asarray(X, dtype=float)
        self.lo_ = np.nanpercentile(X, self.low*100, axis=0)
        self.hi_ = np.nanpercentile(X, self.high*100, axis=0)
        return self
    def transform(self, X):
        X = np.asarray(X, dtype=float)
        return np.clip(X, self.lo_, self.hi_)

numeric = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("winsor", PercentileWinsorizer(low=0.01, high=0.99)),
    ("sc",  StandardScaler(with_mean=True))
])

categorical = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", min_frequency=0.05, sparse_output=False))
])

pre = ColumnTransformer(
    transformers=[
        ("num", numeric, num_cols),
        ("cat", categorical, cat_cols),
    ],
    remainder="drop",
    sparse_threshold=1.0
)

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=GLOBAL_SEED)

X_train = pre.fit_transform(X_train, y_train)
X_test = pre.transform(X_test)

In [13]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",

    bootstrap_type="Bayesian",
    bagging_temperature=0.7,

    auto_class_weights="Balanced",
    depth=4,
    learning_rate=0.01,
    l2_leaf_reg=8.0,
    iterations=15000,
    early_stopping_rounds=1000,
    use_best_model=True,
    random_seed=GLOBAL_SEED,
    verbose=200,
)

In [14]:
model.fit(X_train, y_train, eval_set=(X_test, y_test))

0:	test: 0.7390540	best: 0.7390540 (0)	total: 8.26ms	remaining: 2m 3s
200:	test: 0.7927323	best: 0.7927477 (199)	total: 1.35s	remaining: 1m 39s
400:	test: 0.7993200	best: 0.7993200 (400)	total: 2.68s	remaining: 1m 37s
600:	test: 0.8021377	best: 0.8021377 (600)	total: 4.53s	remaining: 1m 48s
800:	test: 0.8035659	best: 0.8035659 (800)	total: 6.81s	remaining: 2m
1000:	test: 0.8044187	best: 0.8044187 (1000)	total: 8.13s	remaining: 1m 53s
1200:	test: 0.8049970	best: 0.8050714 (1127)	total: 9.46s	remaining: 1m 48s
1400:	test: 0.8051256	best: 0.8051691 (1364)	total: 10.8s	remaining: 1m 44s
1600:	test: 0.8051275	best: 0.8053112 (1426)	total: 12.1s	remaining: 1m 41s
1800:	test: 0.8048451	best: 0.8053112 (1426)	total: 13.5s	remaining: 1m 38s
2000:	test: 0.8045667	best: 0.8053112 (1426)	total: 14.8s	remaining: 1m 36s
2200:	test: 0.8042021	best: 0.8053112 (1426)	total: 16.3s	remaining: 1m 34s
2400:	test: 0.8040619	best: 0.8053112 (1426)	total: 19.1s	remaining: 1m 40s
Stopped by overfitting detecto

<catboost.core.CatBoostClassifier at 0x7f22c4755970>

In [15]:
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, precision_recall_curve


y_score = model.predict_proba(X_test)[:, 1]
prec, rec, thr = precision_recall_curve(y_test, y_score)
f1s = 2*prec*rec/(prec+rec+1e-12)
best = int(np.argmax(f1s))
thr_best = float(thr[best-1]) if best > 0 else 0.5
y_pred = (y_score >= thr_best).astype(int)

roc  = roc_auc_score(y_test, y_score); gini = 2*roc - 1
print("ROC AUC:", roc, " | Gini:", gini)
print("PR  AUC:", average_precision_score(y_test, y_score))
print("F1(best):", f1s[best], " | threshold:", round(thr_best, 4))
print("Confusion [[TN,FP],[FN,TP]]:\n", confusion_matrix(y_test, y_pred))

ROC AUC: 0.8053112148746506  | Gini: 0.6106224297493013
PR  AUC: 0.661930911481315
F1(best): 0.636925188743496  | threshold: 0.5584
Confusion [[TN,FP],[FN,TP]]:
 [[1196  305]
 [ 225  464]]
