# Install libs

In [1]:
!pip install catboost



# Global seed

In [2]:
import os
import random

import numpy as np

# Single seed shared by every stochastic step in the notebook.
GLOBAL_SEED = 42

# NOTE: PYTHONHASHSEED is read when the interpreter starts, so assigning it here
# does not change string hashing in the already-running kernel; it is only
# inherited by child processes launched afterwards.
os.environ["PYTHONHASHSEED"] = str(GLOBAL_SEED)

# Seed the process-wide RNGs as well, so any call that does not receive an
# explicit random_state / seed is still reproducible.
random.seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)

# Load dataset

In [3]:
import numpy as np
import pandas as pd


# Columns removed from the training frame before modelling.
TRAIN_DROP_COLUMNS = [
    "customer_ref",
    "application_id",
    "referral_code",
    "account_status_code",
    "loan_officer_id",
    "marketing_campaign",
    "previous_zip_code",
    "debt_service_ratio",
    "revolving_balance",
    "total_monthly_debt_payment",
    "recent_inquiry_count",
    "annual_income",
    "oldest_credit_line_age",
    "oldest_account_age_months",
    "loan_term",
    "credit_usage_amount",
    "loan_to_value_ratio",
    "debt_to_income_ratio",
    "payment_to_income_ratio",
    "num_inquiries_6mo",
]

# Columns removed from the evaluation frame. Declared once so that X_test_raw
# and X_test cannot drift apart.
# NOTE(review): this list is shorter than TRAIN_DROP_COLUMNS, so the evaluation
# frame may still carry columns the model is not trained on -- confirm the
# preprocessing step selects columns by name and ignores the extras.
TEST_DROP_COLUMNS = [
    "account_status_code",
    "marketing_campaign",
    "previous_zip_code",
    "debt_service_ratio",
    "revolving_balance",
    "total_monthly_debt_payment",
    "recent_inquiry_count",
    "oldest_credit_line_age",
    "loan_term",
]

train_dataset = pd.read_csv('/content/full_dataset.csv')
train_dataset = train_dataset.drop(columns=TRAIN_DROP_COLUMNS)

test_dataset = pd.read_csv('/content/eval_dataset.csv')

# X_test_raw keeps "customer_ref" so predictions can be joined back to customers;
# X_test is the same frame without that identifier and is what gets transformed.
X_test_raw = test_dataset.drop(columns=TEST_DROP_COLUMNS)
X_test = X_test_raw.drop(columns=["customer_ref"])

# Undersampling

In [4]:
# Number of majority-class (default == 0) rows kept after undersampling.
N_NON_DEFAULT_ROWS = 10000

# Shuffle first so the non-default rows kept below are a seeded random subset
# rather than the first rows of the file.
train_dataset = train_dataset.sample(frac=1, random_state=GLOBAL_SEED)

# Keep every default row and at most N_NON_DEFAULT_ROWS non-default rows.
default_dataset = train_dataset.loc[train_dataset['default'] == 1]
non_default_dataset = train_dataset.loc[train_dataset['default'] == 0].head(N_NON_DEFAULT_ROWS)

# Despite its name, this frame is just the undersampled training set (all
# defaults + the retained non-defaults); nothing here makes it "normally
# distributed". The name is kept so later references keep working.
normal_distributed_df = pd.concat([default_dataset, non_default_dataset])

# Shuffle again so the two classes are interleaved instead of stacked.
train_dataset = normal_distributed_df.sample(frac=1, random_state=GLOBAL_SEED)

train_dataset.head()

Unnamed: 0,application_hour,application_day_of_week,account_open_year,preferred_contact,num_login_sessions,num_customer_service_calls,has_mobile_app,paperless_billing,default,age,...,annual_debt_payment,loan_to_annual_income,total_debt_amount,monthly_free_cash_flow,state,regional_unemployment_rate,regional_median_income,regional_median_rent,housing_price_index,cost_of_living_index
62915,7.0,1.0,2015.0,Email,20.0,2.0,1,1.0,0.0,60.0,...,37373.76,3.900685,245188.0,1752.19,MO,4.0,55000.0,1340.0,86.0,73.0
84318,22.0,3.0,2017.0,Email,18.0,3.0,1,1.0,0.0,60.0,...,14600.76,0.048128,33087.9,1899.936667,FL,4.8,55000.0,1210.0,119.0,83.0
84868,18.0,5.0,2011.0,Phone,5.0,2.0,1,1.0,1.0,18.0,...,9467.4,0.265,24263.0,877.72,NC,3.9,54000.0,1250.0,112.0,100.0
30357,11.0,2.0,2010.0,Email,6.0,0.0,1,0.0,0.0,27.0,...,16109.64,0.214592,71020.0,2540.86,WI,4.2,60000.0,1450.0,102.0,81.0
48384,15.0,4.0,2012.0,Email,5.0,3.0,1,1.0,1.0,29.0,...,13457.88,0.435897,30572.5,1153.51,FL,5.2,55000.0,1090.0,118.0,99.0


In [5]:
# Separate the feature matrix from the 'default' target column.
X_train = train_dataset.drop(columns=['default'])
y_train = train_dataset['default']

In [6]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler


# Numeric columns go through the numeric pipeline; every remaining column is
# treated as categorical. Column order follows X_train in both lists.
numeric_frame = X_train.select_dtypes(include=[np.number])
num_cols = list(numeric_frame.columns)
cat_cols = [name for name in X_train.columns if name not in num_cols]

class PercentileWinsorizer(BaseEstimator, TransformerMixin):
    """Columnwise winsorization by percentiles (works on ndarray).

    Values below the ``low`` quantile or above the ``high`` quantile of each
    column (as measured on the data passed to ``fit``) are clipped to those
    bounds. NaNs are ignored when the bounds are computed.

    Parameters
    ----------
    low : float, default=0.05
        Lower quantile, expressed as a fraction in [0, 1].
    high : float, default=0.95
        Upper quantile, expressed as a fraction in [0, 1]; must be >= ``low``.

    Attributes
    ----------
    lo_ : ndarray
        Lower clipping bound per column (computed along axis 0). Set by ``fit``.
    hi_ : ndarray
        Upper clipping bound per column (computed along axis 0). Set by ``fit``.
    """
    def __init__(self, low=0.05, high=0.95):
        # Only hyper-parameters are stored here. Fitted state (trailing
        # underscore) is created in fit() so that scikit-learn utilities such
        # as check_is_fitted can tell a fitted instance from an unfitted one.
        self.low = low
        self.high = high

    def fit(self, X, y=None):
        """Learn the per-column clipping bounds from ``X``; ``y`` is ignored.

        Raises
        ------
        ValueError
            If ``low`` / ``high`` are not ordered fractions within [0, 1].
        """
        if not 0.0 <= self.low <= self.high <= 1.0:
            raise ValueError(
                "Expected 0 <= low <= high <= 1, got low=%r, high=%r"
                % (self.low, self.high)
            )
        X = np.asarray(X, dtype=float)
        # nanpercentile expects percentages (0-100), hence the *100.
        self.lo_ = np.nanpercentile(X, self.low*100, axis=0)
        self.hi_ = np.nanpercentile(X, self.high*100, axis=0)
        return self

    def transform(self, X):
        """Return a float array with each column clipped to [lo_, hi_]."""
        # Imported locally so this class does not depend on a specific import cell.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, ["lo_", "hi_"])
        X = np.asarray(X, dtype=float)
        return np.clip(X, self.lo_, self.hi_)

# Numeric branch: median imputation -> clip to the 1st/99th percentiles -> standardize.
numeric_steps = [
    ("imp", SimpleImputer(strategy="median")),
    ("winsor", PercentileWinsorizer(low=0.01, high=0.99)),
    ("sc", StandardScaler(with_mean=True)),
]
numeric = Pipeline(steps=numeric_steps)

# Categorical branch: mode imputation -> dense one-hot encoding. Categories seen
# in less than 5% of rows are grouped together, and unknown categories at
# transform time are ignored instead of raising.
categorical_steps = [
    ("imp", SimpleImputer(strategy="most_frequent")),
    (
        "ohe",
        OneHotEncoder(
            handle_unknown="ignore",
            min_frequency=0.05,
            sparse_output=False,
        ),
    ),
]
categorical = Pipeline(steps=categorical_steps)

# Route each column list to its branch; columns in neither list are dropped.
branch_assignments = [
    ("num", numeric, num_cols),
    ("cat", categorical, cat_cols),
]
pre = ColumnTransformer(
    transformers=branch_assignments,
    remainder="drop",
    sparse_threshold=1.0,
)

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the training data, stratified on the target, as a validation
# set. The split uses GLOBAL_SEED (instead of a second hard-coded 42) so that
# changing the notebook seed changes every stochastic step consistently.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.15,
    stratify=y_train,
    random_state=GLOBAL_SEED,
)

# Fit the preprocessing on the training fold only, then apply the fitted
# transformer to the validation and evaluation data to avoid leakage.
# NOTE: this cell rebinds X_train / X_val / X_test to their transformed
# versions, so re-running it requires re-running the cells that define those
# inputs first.
X_train = pre.fit_transform(X_train, y_train)
X_val = pre.transform(X_val)
X_test = pre.transform(X_test)

In [8]:
from catboost import CatBoostClassifier

# Hyper-parameters for the gradient-boosted classifier, grouped by purpose.
catboost_params = {
    # Objective and the metric tracked on the eval set.
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    # Row sampling.
    "bootstrap_type": "Bayesian",
    "bagging_temperature": 0.7,
    # Class weighting and tree shape / regularization.
    "auto_class_weights": "Balanced",
    "depth": 4,
    "learning_rate": 0.01,
    "l2_leaf_reg": 8.0,
    # Training length: large budget, cut short by early stopping on the eval set.
    "iterations": 15000,
    "early_stopping_rounds": 500,
    "use_best_model": True,
    # Reproducibility and logging cadence.
    "random_seed": GLOBAL_SEED,
    "verbose": 200,
}

model = CatBoostClassifier(**catboost_params)

In [9]:
# Train on the training fold; the validation fold drives the eval metric and
# early stopping configured on the model.
validation_fold = (X_val, y_val)
model.fit(X_train, y=y_train, eval_set=validation_fold)

0:	test: 0.7481534	best: 0.7481534 (0)	total: 83.1ms	remaining: 20m 45s
200:	test: 0.7925466	best: 0.7925466 (200)	total: 4.38s	remaining: 5m 22s
400:	test: 0.7992640	best: 0.7992640 (400)	total: 9.88s	remaining: 5m 59s
600:	test: 0.8025371	best: 0.8025371 (600)	total: 15.8s	remaining: 6m 18s
800:	test: 0.8038279	best: 0.8038550 (795)	total: 20.9s	remaining: 6m 11s
1000:	test: 0.8044709	best: 0.8046024 (955)	total: 27.5s	remaining: 6m 24s
1200:	test: 0.8048219	best: 0.8048925 (1178)	total: 32.5s	remaining: 6m 13s
1400:	test: 0.8049680	best: 0.8050917 (1355)	total: 37.6s	remaining: 6m 4s
1600:	test: 0.8051990	best: 0.8052832 (1543)	total: 40.7s	remaining: 5m 40s
1800:	test: 0.8051594	best: 0.8052832 (1543)	total: 44.1s	remaining: 5m 23s
2000:	test: 0.8050917	best: 0.8052832 (1543)	total: 46.7s	remaining: 5m 3s
2200:	test: 0.8047407	best: 0.8052832 (1543)	total: 50.5s	remaining: 4m 53s
2400:	test: 0.8048210	best: 0.8052832 (1543)	total: 54.2s	remaining: 4m 44s
Stopped by overfitting dete

<catboost.core.CatBoostClassifier at 0x7c6bdc8a07a0>

In [10]:
# Column 1 of predict_proba is taken as the positive-class score; rows scoring
# at or above 0.5 are labelled 1, the rest 0.
class_probabilities = model.predict_proba(X_test)
prob = class_probabilities[:, 1]
is_positive = prob >= 0.5
pred = is_positive.astype(int)

In [11]:
# Pair each customer identifier with its score and hard label, then write the
# table to disk without the index column.
out = X_test_raw[["customer_ref"]].assign(probability=prob, prediction=pred)
out.to_csv("results.csv", index=False)