In [5]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split

# File system management
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns


In [4]:
# Load the dataset from the CSV one directory above the notebook.
df = pd.read_csv(filepath_or_buffer="../fin_health.csv")

In [6]:
# Separate the label from the predictors; 'ID' is dropped together with 'Target'.
y = df['Target']
X = df.drop(['ID', 'Target'], axis=1)

# Hold out 20% of the rows, stratified on the label, with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42, test_size=0.2
)

In [8]:
from sklearn.preprocessing import LabelEncoder

# Fit the label -> integer mapping on the training labels only, then apply
# that same fitted mapping to the training and the test labels.
le = LabelEncoder().fit(y_train)
y_train_enc, y_test_enc = (le.transform(labels) for labels in (y_train, y_test))

In [None]:

# Treat every object / category / bool column of the training frame as a
# categorical feature and keep their names as a plain list.
cat_features = list(
    X_train.select_dtypes(include=['object', 'category', 'bool']).columns
)

print("Categorical features:", cat_features)


Categorical features: ['country', 'attitude_stable_business_environment', 'attitude_worried_shutdown', 'compliance_income_tax', 'perception_insurance_doesnt_cover_losses', 'perception_cannot_afford_insurance', 'motor_vehicle_insurance', 'has_mobile_money', 'current_problem_cash_flow', 'has_cellphone', 'owner_sex', 'offers_credit_to_customers', 'attitude_satisfied_with_achievement', 'has_credit_card', 'keeps_financial_records', 'perception_insurance_companies_dont_insure_businesses_like_yours', 'perception_insurance_important', 'has_insurance', 'covid_essential_service', 'attitude_more_successful_next_year', 'problem_sourcing_money', 'marketing_word_of_mouth', 'has_loan_account', 'has_internet_banking', 'has_debit_card', 'future_risk_theft_stock', 'medical_insurance', 'funeral_insurance', 'motivation_make_more_money', 'uses_friends_family_savings', 'uses_informal_lender']


In [10]:


# Replace NaN in every categorical feature with an explicit '__MISSING__'
# token and force string dtype.
#
# Work on explicit copies: X_train / X_test come straight out of
# train_test_split, and assigning columns into them in place is exactly the
# pattern pandas warns about (the warning is silenced globally at the top of
# this notebook). Copying also keeps this cell safe to re-run.
X_train = X_train.copy()
X_test = X_test.copy()

for col in cat_features:
    # Cast to object before filling: cat_features may contain 'category'
    # columns, and fillna() on a categorical raises when the fill value is not
    # already one of its categories.
    X_train[col] = X_train[col].astype(object).fillna('__MISSING__').astype(str)
    X_test[col] = X_test[col].astype(object).fillna('__MISSING__').astype(str)


In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score
from scipy.stats import randint, uniform
import time

# Search space for the randomized hyper-parameter search.
# Gotcha: scipy's uniform(loc, scale) samples from [loc, loc + scale] (the
# second argument is a width, not an upper bound), and randint(low, high)
# excludes `high`. The effective ranges are spelled out per entry.
param_dist = {
    'depth': randint(5, 9),                    # integers 5..8
    'learning_rate': uniform(0.03, 0.1),       # [0.03, 0.13]
    'l2_leaf_reg': uniform(2.0, 6.0),          # [2.0, 8.0]
    'bagging_temperature': uniform(0.0, 0.7),  # [0.0, 0.7]
}

# Estimator template that RandomizedSearchCV clones for every candidate/fold.
# No early stopping is configured here on purpose: the search calls fit()
# without an eval_set, so CatBoost has no validation data to drive its
# overfitting detector and every candidate trains for the full 800 iterations.
base_model = CatBoostClassifier(
    loss_function='MultiClass',
    eval_metric='MultiClass',
    iterations=800,
    random_seed=42,
    cat_features=cat_features,
    verbose=0
)

# 20 sampled candidates x 3 folds, ranked by negative log loss (higher is
# better); refit=True retrains the best candidate on all of X_train.
# NOTE(review): n_jobs=-1 runs fits in parallel while each CatBoost fit may
# itself use several threads -- consider pinning thread_count if the machine
# gets oversubscribed.
search = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    scoring='neg_log_loss',
    refit=True,
    random_state=42,
    verbose=1,
    n_jobs=-1
)

# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments.
start = time.perf_counter()
search.fit(X_train, y_train_enc)
end = time.perf_counter()

print("Best params:", search.best_params_)
print("Best CV score (neg log loss):", search.best_score_)
print(f"Search time: {end - start:.2f} seconds")




Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [None]:
num_classes = len(np.unique(y_train_enc))

# Carve a validation fold out of the training data for early stopping and
# best-iteration selection. Using (X_test, y_test_enc) as the eval_set would
# tune the number of trees on the very data the test metrics are computed on,
# making those metrics optimistically biased.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train_enc,
    test_size=0.2,
    random_state=42,
    stratify=y_train_enc
)

# Final model: best hyper-parameters from the search, a large iteration budget,
# and the iteration-based overfitting detector (stop after 100 rounds without
# improvement on the eval set).
final_model = CatBoostClassifier(
    **search.best_params_,
    loss_function='MultiClass',
    eval_metric='MultiClass',
    iterations=5000,
    random_seed=42,
    cat_features=cat_features,
    verbose=50,
    od_type='Iter',
    od_wait=100
)

# perf_counter() is a monotonic clock, appropriate for measuring durations.
start = time.perf_counter()
final_model.fit(
    X_fit, y_fit,
    eval_set=(X_val, y_val),
    use_best_model=True
)
end = time.perf_counter()

print(f"\nBest iteration: {final_model.get_best_iteration()}")
print(f"Training time: {end - start:.2f} seconds")


In [None]:
# Per-class probabilities for the held-out rows; the predicted class is the
# column index with the highest probability.
proba = final_model.predict_proba(X_test)
pred = np.argmax(a=proba, axis=1)

print("\n--- CatBoost Test Performance ---")

# Each metric is computed immediately before it is reported.
test_auc = roc_auc_score(y_test_enc, proba, average='weighted', multi_class='ovr')
print(f"ROC AUC (ovr weighted): {test_auc:.5f}")

test_log_loss = log_loss(y_test_enc, proba)
print(f"Log Loss: {test_log_loss:.5f}")

test_accuracy = accuracy_score(y_test_enc, pred)
print(f"Accuracy: {test_accuracy:.5f}")
