In [6]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split

# File system manangement
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns


In [7]:
df = pd.read_csv("../fin_health.csv")

In [8]:
X = df.drop(columns=['ID', 'Target'])
y = df['Target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [9]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc  = le.transform(y_test)

In [10]:

cat_features = X_train.select_dtypes(
    include=['object', 'category', 'bool']
).columns.tolist()

print("Categorical features:", cat_features)


Categorical features: ['country', 'attitude_stable_business_environment', 'attitude_worried_shutdown', 'compliance_income_tax', 'perception_insurance_doesnt_cover_losses', 'perception_cannot_afford_insurance', 'motor_vehicle_insurance', 'has_mobile_money', 'current_problem_cash_flow', 'has_cellphone', 'owner_sex', 'offers_credit_to_customers', 'attitude_satisfied_with_achievement', 'has_credit_card', 'keeps_financial_records', 'perception_insurance_companies_dont_insure_businesses_like_yours', 'perception_insurance_important', 'has_insurance', 'covid_essential_service', 'attitude_more_successful_next_year', 'problem_sourcing_money', 'marketing_word_of_mouth', 'has_loan_account', 'has_internet_banking', 'has_debit_card', 'future_risk_theft_stock', 'medical_insurance', 'funeral_insurance', 'motivation_make_more_money', 'uses_friends_family_savings', 'uses_informal_lender']


In [11]:



# Replace NaN with a string and ensure string dtype
for col in cat_features:
    X_train[col] = X_train[col].fillna('__MISSING__').astype(str)
    X_test[col]  = X_test[col].fillna('__MISSING__').astype(str)


In [12]:
cat_idx = [X.columns.get_loc(col) for col in cat_features]

# Convert cat_features to pd.Categorical dtype
for col in cat_features:
    X_train[col] = pd.Categorical(X_train[col])
    X_test[col] = pd.Categorical(X_test[col])

In [None]:
import optuna
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score
import numpy as np
import time
import gc


# -------------------------------
# Optuna objective for CatBoost
# -------------------------------
def objective(trial):

    params = {
        'loss_function': 'MultiClass',
        'eval_metric': 'MultiClass',
        # 'task_type': 'GPU',          # GPU training
        'devices': '0',
        'random_seed': 2026,
        'verbose': 0,
        'auto_class_weights':'Balanced',

        # Core boosting params
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 100.0, log=True),

        # Sampling / regularization
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'random_strength': trial.suggest_float('random_strength', 0.0, 10.0),
        'border_count': trial.suggest_int('border_count', 32, 255),

        # Overfitting control
        'od_type': 'Iter',
        'od_wait': 100,
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    logloss_scores = []

    for fold, (train_idx, valid_idx) in enumerate(skf.split(X_train, y_train_enc)):

        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
        y_tr, y_val = y_train_enc[train_idx], y_train_enc[valid_idx]

        train_pool = Pool(
            data=X_tr,
            label=y_tr,
            cat_features=cat_idx
        )

        valid_pool = Pool(
            data=X_val,
            label=y_val,
            cat_features=cat_idx
        )

        model = CatBoostClassifier(
            **params,
            iterations=5000
        )

        model.fit(
            train_pool,
            eval_set=valid_pool,
            use_best_model=True,
            verbose=False
        )

        pred_val = model.predict_proba(valid_pool)
        fold_logloss = log_loss(y_val, pred_val)
        logloss_scores.append(fold_logloss)

        trial.report(fold_logloss, step=fold)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return np.mean(logloss_scores)


In [16]:
# -------------------------------
# Optuna study
# -------------------------------
debug = True
timeout = 60 if debug else 3600

start = time.time()

study = optuna.create_study(
    direction='minimize',
    pruner=optuna.pruners.SuccessiveHalvingPruner(
        min_resource=2,
        reduction_factor=4,
        min_early_stopping_rate=1
    )
)

study.optimize(objective, n_trials=30)

end = time.time()

print(f"Optuna finished in {end - start:.2f} seconds")
print("Best params:", study.best_params)
print("Best CV MultiClass logloss:", study.best_value)


[I 2026-01-02 17:40:23,929] A new study created in memory with name: no-name-c2eeec70-c5d5-49f4-b925-a3afd34936cb
[I 2026-01-02 17:40:51,318] Trial 0 finished with value: 0.32402345159522644 and parameters: {'learning_rate': 0.16700163078468566, 'depth': 8, 'l2_leaf_reg': 0.024308162229904305, 'bagging_temperature': 0.061978602717648545, 'random_strength': 5.922144742974767, 'border_count': 67}. Best is trial 0 with value: 0.32402345159522644.
[I 2026-01-02 17:41:57,883] Trial 1 finished with value: 0.31144767885535396 and parameters: {'learning_rate': 0.06147650320180685, 'depth': 5, 'l2_leaf_reg': 1.1223317106964992, 'bagging_temperature': 0.6137307548387722, 'random_strength': 2.6113438933611235, 'border_count': 148}. Best is trial 1 with value: 0.31144767885535396.
[I 2026-01-02 17:44:37,575] Trial 2 finished with value: 0.32236048019228336 and parameters: {'learning_rate': 0.03522259550259098, 'depth': 10, 'l2_leaf_reg': 1.0086602510035325, 'bagging_temperature': 0.205182605085300

Optuna finished in 3864.16 seconds
Best params: {'learning_rate': 0.06725811144528497, 'depth': 5, 'l2_leaf_reg': 0.0906511943796095, 'bagging_temperature': 0.30252261042580786, 'random_strength': 1.0075657547236032, 'border_count': 161}
Best CV MultiClass logloss: 0.3108645274390119


In [17]:
# -------------------------------
# Train final CatBoost model
# -------------------------------
best_params = study.best_params
best_params.update({
    'loss_function': 'MultiClass',
    'eval_metric': 'MultiClass',
    'task_type': 'GPU',
    'devices': '0',
    'random_seed': 2026,
    'verbose': 100
})

train_pool_full = Pool(
    X_train,
    label=y_train_enc,
    cat_features=cat_idx
)

test_pool_full = Pool(
    X_test,
    label=y_test_enc,
    cat_features=cat_idx
)

print("\nTraining final CatBoost model...")
final_model = CatBoostClassifier(
    **best_params,
    iterations=5000
)

final_model.fit(
    train_pool_full,
    eval_set=test_pool_full,
    use_best_model=True
)

# -------------------------------
# Evaluate on test set
# -------------------------------
pred_test = final_model.predict_proba(test_pool_full)

roc_auc = roc_auc_score(
    y_test_enc,
    pred_test,
    multi_class='ovr',
    average='weighted'
)

logloss = log_loss(y_test_enc, pred_test)
accuracy = accuracy_score(y_test_enc, np.argmax(pred_test, axis=1))

print("\n--- Test set performance ---")
print(f"ROC AUC (OVR weighted): {roc_auc:.5f}")
print(f"Log Loss:              {logloss:.5f}")
print(f"Accuracy:              {accuracy:.5f}")

gc.collect()



Training final CatBoost model...
0:	learn: 1.0561426	test: 1.0580108	best: 1.0580108 (0)	total: 21.4ms	remaining: 1m 46s
100:	learn: 0.3111104	test: 0.3362601	best: 0.3362601 (100)	total: 1.9s	remaining: 1m 32s
200:	learn: 0.2765845	test: 0.3205601	best: 0.3205601 (200)	total: 3.77s	remaining: 1m 29s
300:	learn: 0.2548898	test: 0.3172264	best: 0.3172127 (298)	total: 5.62s	remaining: 1m 27s
400:	learn: 0.2366623	test: 0.3163379	best: 0.3160725 (345)	total: 7.17s	remaining: 1m 22s
500:	learn: 0.2206654	test: 0.3153575	best: 0.3150845 (497)	total: 9.12s	remaining: 1m 21s
600:	learn: 0.2073300	test: 0.3151167	best: 0.3149005 (596)	total: 11.1s	remaining: 1m 21s
700:	learn: 0.1940264	test: 0.3145963	best: 0.3145963 (700)	total: 13s	remaining: 1m 19s
800:	learn: 0.1827348	test: 0.3153163	best: 0.3142121 (703)	total: 14.9s	remaining: 1m 18s
900:	learn: 0.1720115	test: 0.3165357	best: 0.3142121 (703)	total: 16.8s	remaining: 1m 16s
1000:	learn: 0.1626376	test: 0.3179681	best: 0.3142121 (703)	t

0

In [8]:
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score
from scipy.stats import randint, uniform
import time


#param_dist = {
#    'depth': randint(4, 10),
#    'learning_rate': uniform(0.01, 0.15),
#    'l2_leaf_reg': uniform(1.0, 9.0),
#    'bagging_temperature': uniform(0.0, 1.0),
#}

param_dist = {
    'depth': randint(5, 9),
    'learning_rate': uniform(0.03, 0.1),
    'l2_leaf_reg': uniform(2.0, 6.0),
    'bagging_temperature': uniform(0.0, 0.7)
    
}

base_model = CatBoostClassifier(
    loss_function='MultiClass',
    eval_metric='MultiClass',
    iterations=800,
    early_stopping_rounds=50,    # âœ… STOP BAD MODELS
    random_seed=42,
    cat_features=cat_features,
    verbose=0
)

search = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    scoring='neg_log_loss',
    refit=True,
    random_state=42,
    verbose=1,
    n_jobs=-1
)

start = time.time()
search.fit(X_train, y_train_enc)
end = time.time()

print("Best params:", search.best_params_)
print("Best CV score (neg log loss):", search.best_score_)
print(f"Search time: {end - start:.2f} seconds")




Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best params: {'bagging_temperature': 0.5658781436815228, 'depth': 5, 'l2_leaf_reg': 2.095797513321285, 'learning_rate': 0.053089382562214904}
Best CV score (neg log loss): -0.31063124952755955
Search time: 1287.80 seconds


In [9]:
num_classes = len(np.unique(y_train_enc))

final_model = CatBoostClassifier(
    **search.best_params_,
    loss_function='MultiClass',
    eval_metric='MultiClass',
    iterations=5000,
    random_seed=42,
    cat_features=cat_features,
    verbose=50,
    od_type='Iter',
    od_wait=100
)

start = time.time()
final_model.fit(
    X_train, y_train_enc,
    eval_set=(X_test, y_test_enc),
    use_best_model=True
)
end = time.time()

print(f"\nBest iteration: {final_model.get_best_iteration()}")
print(f"Training time: {end - start:.2f} seconds")


0:	learn: 1.0433335	test: 1.0439402	best: 1.0439402 (0)	total: 14ms	remaining: 1m 10s
50:	learn: 0.3882167	test: 0.3951008	best: 0.3951008 (50)	total: 731ms	remaining: 1m 10s
100:	learn: 0.3259079	test: 0.3367241	best: 0.3367241 (100)	total: 1.47s	remaining: 1m 11s
150:	learn: 0.3121247	test: 0.3250059	best: 0.3250059 (150)	total: 2.2s	remaining: 1m 10s
200:	learn: 0.3035733	test: 0.3198475	best: 0.3198475 (200)	total: 2.94s	remaining: 1m 10s
250:	learn: 0.2974071	test: 0.3165250	best: 0.3165250 (250)	total: 3.68s	remaining: 1m 9s
300:	learn: 0.2919034	test: 0.3143257	best: 0.3143257 (300)	total: 4.39s	remaining: 1m 8s
350:	learn: 0.2875650	test: 0.3126246	best: 0.3126129 (346)	total: 5.13s	remaining: 1m 8s
400:	learn: 0.2829771	test: 0.3117743	best: 0.3117743 (400)	total: 5.89s	remaining: 1m 7s
450:	learn: 0.2785417	test: 0.3107034	best: 0.3107034 (450)	total: 6.68s	remaining: 1m 7s
500:	learn: 0.2738285	test: 0.3097478	best: 0.3097471 (497)	total: 7.43s	remaining: 1m 6s
550:	learn: 0

In [10]:
proba = final_model.predict_proba(X_test)
pred  = np.argmax(proba, axis=1)

print("\n--- CatBoost Test Performance ---")
print(f"ROC AUC (ovr weighted): {roc_auc_score(y_test_enc, proba, multi_class='ovr', average='weighted'):.5f}")
print(f"Log Loss: {log_loss(y_test_enc, proba):.5f}")
print(f"Accuracy: {accuracy_score(y_test_enc, pred):.5f}")



--- CatBoost Test Performance ---
ROC AUC (ovr weighted): 0.94567
Log Loss: 0.30498
Accuracy: 0.87734
