In [1]:
import warnings
warnings.filterwarnings('ignore')

from autogluon.tabular import TabularPredictor
from concurrent.futures import ThreadPoolExecutor
from feature_engine.selection import (
    DropDuplicateFeatures,
    DropConstantFeatures,
)
from lightgbm import LGBMRegressor, LGBMClassifier
import numpy as np

from openfe import OpenFE, transform, tree_to_formula, formula_to_tree, get_candidate_features
import os
import pandas as pd
from pprint import pprint

import random

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate, StratifiedKFold, cross_val_score

from tqdm.notebook import tqdm

pd.set_option('display.max_columns', None)

experiment_name = 'modelling'

In [2]:
n_splits = 3

k3 = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=5)

In [4]:
train = pd.read_csv(r'.\data\train.csv')
test = pd.read_csv(r'.\data\val.csv')

train.shape, test.shape

((52780, 13), (5865, 13))

In [5]:
train.sample(3)

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
48012,46490,23,60000,RENT,1.0,DEBTCONSOLIDATION,C,4000,13.11,0.07,N,4,0
21289,52129,23,40000,RENT,1.0,DEBTCONSOLIDATION,A,7500,5.79,0.19,N,3,0
33634,14112,22,54000,MORTGAGE,6.0,EDUCATION,A,6800,6.03,0.13,N,4,0


In [6]:
ohe = pd.get_dummies(train, columns=['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file'], drop_first=True, dtype='int')
ohe_test = pd.get_dummies(test, columns=['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file'], drop_first=True, dtype='int')
ohe.head()

Unnamed: 0,id,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_Y
0,46159,27,100000,11.0,20000,10.75,0.2,9,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
1,33263,22,78000,6.0,4400,7.9,0.06,2,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0
2,26874,24,43200,6.0,12250,8.94,0.28,4,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0
3,24524,31,38896,12.0,8125,10.59,0.21,6,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0
4,30961,28,34000,3.0,10000,14.91,0.29,9,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0


In [7]:
def rename_columns(data):
    df = data.copy()
    df.columns = df.columns.str.lower() #only lower case
    df.columns = df.columns.str.replace(r'[\(\)\']', '', regex=True) #remove special characters
    df.columns = df.columns.str.replace(r'[ \/]', '_', regex = True) #replace white space and / with _
    df.columns = df.columns.str.replace(r'[.]', '', regex = True) #remove . in names
    df.columns = df.columns.str.replace(r'[%]', 'perc', regex = True) #replace % with word
    df.columns = df.columns.str.replace(r'[ ()]', '_', regex = True) #replace % with word
    return df

In [8]:
train_rename = rename_columns(ohe)
test_rename = rename_columns(ohe_test)
train_rename.head()

Unnamed: 0,id,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status,person_home_ownership_other,person_home_ownership_own,person_home_ownership_rent,loan_intent_education,loan_intent_homeimprovement,loan_intent_medical,loan_intent_personal,loan_intent_venture,loan_grade_b,loan_grade_c,loan_grade_d,loan_grade_e,loan_grade_f,loan_grade_g,cb_person_default_on_file_y
0,46159,27,100000,11.0,20000,10.75,0.2,9,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
1,33263,22,78000,6.0,4400,7.9,0.06,2,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0
2,26874,24,43200,6.0,12250,8.94,0.28,4,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0
3,24524,31,38896,12.0,8125,10.59,0.21,6,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0
4,30961,28,34000,3.0,10000,14.91,0.29,9,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0


In [9]:
# Define categorical columns
cat_cols = ['person_home_ownership_other', 'person_home_ownership_own', 'person_home_ownership_rent', 'loan_intent_education', 'loan_intent_homeimprovement', 'loan_intent_medical', 'loan_intent_personal', 'loan_intent_venture', 'loan_grade_b', 'loan_grade_c', 'loan_grade_d', 'loan_grade_e', 'loan_grade_f', 'loan_grade_g', 'cb_person_default_on_file_y']

# Define numerical columns
num_cols = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']

# Define columns to be dropped because all values are NaN or not dependable based on scrape
drop_cols = ['id']

# Define the target variable
TARGET = 'loan_status'

In [10]:
# Drop the features not required
ohe_drop = train_rename.drop(drop_cols, axis=1)
ohe_drop_test = test_rename.drop(drop_cols, axis=1)

In [11]:
ohe_drop.shape, ohe_drop_test.shape

((52780, 23), (5865, 23))

In [12]:
# Define epsilon
epsilon = 1e-10

columns_for_epsilon = [col for col in list(ohe_drop.columns) if col != TARGET]

for column in columns_for_epsilon:
    ohe_drop[column] += epsilon

In [13]:
# Iterate through columns_for_epsilon
for column in columns_for_epsilon:
    if column in ohe_drop_test.columns:
        # Only add epsilon if the column exists in the DataFrame
        ohe_drop_test[column] += epsilon
    else:
        # Optionally, log a message or handle missing columns
        print(f"Column '{column}' not found in the DataFrame")

In [14]:
# with tol=1 we tell the transformer to remove constant features
# with tol=0.95 we tell the transformer that we want to remove
# all features that show the same value in more than 95% (quasi-constant)
constant = DropConstantFeatures(tol=1)

# finds the constant features on the train set
constant.fit(ohe_drop.drop(TARGET, axis=1))

len(constant.features_to_drop_)

0

In [15]:
constant_drop = constant.features_to_drop_
constant_drop

[]

In [16]:
# Assign X and y
X_feat_eng = ohe_drop.drop([TARGET] + constant_drop, axis=1)
y_feat_eng = ohe_drop[TARGET]

X_feat_eng.shape, y_feat_eng.shape

((52780, 22), (52780,))

# Train-Test CV

In [17]:
# Assign X and y for baseline
X = X_feat_eng.copy()
# X = train_w_domain_feats.copy()
# X = X_train.copy()
# X = dup_cols.drop(duplicate_drop, axis=1)
y = y_feat_eng.copy()

test_X = ohe_drop_test.copy()
# test_X = test_w_domain_feats.copy()
# test_X = X_test.copy()
# test_X = dup_cols_test.copy()

X.shape, y.shape, test_X.shape

((52780, 22), (52780,), (5865, 23))

In [18]:
missing_columns = set(X.columns) - set(test_X.columns)
missing_columns

set()

In [19]:
for col in missing_columns:
    test_X[col] = 0

In [20]:
X.shape, test_X.shape

((52780, 22), (5865, 23))

In [21]:
# Define regression models
reg_models = [
    # LGBMRegressor(n_jobs=-1, random_state=5, objective='regression'),
    LGBMClassifier(n_jobs=-1, random_state=5, objective='binary'),
]

In [22]:
def reg_evaluate_models(models, X, y, important_features, cv_split, experiment_name):
    Model_compare = pd.DataFrame(columns=['Model Name', 
                                        'Model Parameters', 
                                        'Model Train MAE', 
                                        'Model Test MAE', 
                                        'Model Test MAE Std', 
                                        'Model Time'])
    
    def evaluate_model(alg, idx):
        if hasattr(alg, 'name'):
            model_name = alg.name
        else:
            model_name = alg.__class__.__name__
        features = important_features.get(model_name, [])

        # Check if the list of important features is empty
        if len(features) == 0:
            # If empty, return results with zero values
            print(f'Skipping {model_name} due to no important features.')
            return {
                'Model Name': model_name,
                'Model Parameters': str(alg.get_params()),
                'Model Train MAE': 0,
                'Model Test MAE': 0,
                'Model Test MAE Std': 0,
                'Model Time': "0 min 0.00 sec",
            }
        
        cv_results = cross_validate(alg, 
                                    X[features], 
                                    y, cv=cv_split, 
                                    scoring='roc_auc',
                                    return_train_score=True, 
                                    n_jobs=-1)

        # Time formatting
        mean_fit_time = cv_results['fit_time'].mean()
        minutes, seconds = divmod(mean_fit_time, 60)

        # Results population
        result = {
            'Model Name': model_name,
            'Model Parameters': str(alg.get_params()),
            'Model Train MAE': cv_results['train_score'].mean(),
            'Model Test MAE': cv_results['test_score'].mean(),
            'Model Test MAE Std': cv_results['test_score'].std(),
            'Model Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

        print(f'Done with {model_name}.')
        return result

    results_list = []

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(evaluate_model, alg, idx) for idx, alg in enumerate(tqdm(models, desc='Models'))]
        for future in tqdm(futures, total=len(futures), desc='Progress'):
            result = future.result()
            results_list.append(result)

    model_compare = pd.DataFrame(results_list)

    model_compare.sort_values(by=['Model Test MAE'], ascending=False, inplace=True)
    # model_compare.to_csv(f'{experiment_name}_reg_combined.csv', index=False)

    return model_compare

In [24]:
baseline_features_reg = {}

for model in reg_models:
    if hasattr(model, 'name'):
        model_name = model.name
    else:
        model_name = model.__class__.__name__

    baseline_features_reg[model_name] = list(X.columns)

In [25]:
%%time

baseline_models_reg = reg_evaluate_models(reg_models, X, y, baseline_features_reg, k3, f'{experiment_name}')
baseline_models_reg

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with LGBMClassifier.
CPU times: total: 188 ms
Wall time: 7.34 s


Unnamed: 0,Model Name,Model Parameters,Model Train MAE,Model Test MAE,Model Test MAE Std,Model Time
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.978123,0.955275,0.00255,0 min 0.75 sec


# Validation Test

In [26]:
model = LGBMClassifier(n_jobs=-1, random_state=5, objective='binary')

model.fit(X, y)

In [28]:
# Obtain gain feature importance
gain_importance = model.feature_importances_

# Display feature importance with feature names
feature_names = X.columns
gain_importance_df = pd.DataFrame({'Feature': feature_names, 'Gain': gain_importance})
gain_importance_df.sort_values(by='Gain', ascending=False).head(5)

Unnamed: 0,Feature,Gain
1,person_income,941
4,loan_int_rate,414
5,loan_percent_income,215
3,loan_amnt,205
2,person_emp_length,185


In [31]:
pred = model.predict_proba(test_X[X.columns])[:, 1]
pred_df = pd.DataFrame(pred, columns=[TARGET])
pred_df.head()

Unnamed: 0,loan_status
0,0.018256
1,0.108971
2,0.001927
3,0.014481
4,0.004412


In [35]:
val_roc = roc_auc_score(test_X[TARGET], pred_df)
print(f"ROC AUC is {val_roc:.4f}")

ROC AUC is 0.9554


# Autogluon

In [36]:
autogluon_data = pd.concat([X, y], axis=1)
autogluon_data.shape

(52780, 23)

In [37]:
predictor = TabularPredictor(problem_type='binary', 
                             label=TARGET, 
                             eval_metric='roc_auc', 
                             verbosity=4)

predictor.fit(train_data=autogluon_data,
                # presets='good_quality',
                # presets=['optimize_for_deployment'],
                # feature_prune_kwargs={},
                time_limit= 5 * 60,
)

No path specified. Models will be saved in: "AutogluonModels\ag-20241004_194801\"
User Specified kwargs:
{'feature_prune_kwargs': {}}
Full kwargs:
{'_feature_generator_kwargs': None,
 '_save_bag_folds': None,
 'ag_args': None,
 'ag_args_ensemble': None,
 'ag_args_fit': None,
 'auto_stack': False,
 'calibrate': 'auto',
 'excluded_model_types': None,
 'feature_generator': 'auto',
 'feature_prune_kwargs': {},
 'holdout_frac': None,
 'hyperparameter_tune_kwargs': None,
 'keep_only_best': False,
 'name_suffix': None,
 'num_bag_folds': None,
 'num_bag_sets': None,
 'num_stack_levels': None,
 'pseudo_data': None,
 'refit_full': False,
 'save_space': False,
 'set_best_to_refit_full': False,
 'unlabeled_data': None,
 'use_bag_holdout': False,
 'verbosity': 4}
Saving AutogluonModels\ag-20241004_194801\learner.pkl
Saving AutogluonModels\ag-20241004_194801\predictor.pkl
Beginning AutoGluon training ... Time limit = 300s
AutoGluon will save models to "AutogluonModels\ag-20241004_194801\"
AutoGluon 

[1]	valid_set's binary_logloss: 0.387589
[2]	valid_set's binary_logloss: 0.372506
[3]	valid_set's binary_logloss: 0.358669
[4]	valid_set's binary_logloss: 0.341903
[5]	valid_set's binary_logloss: 0.330382
[6]	valid_set's binary_logloss: 0.323329
[7]	valid_set's binary_logloss: 0.314807
[8]	valid_set's binary_logloss: 0.307175
[9]	valid_set's binary_logloss: 0.299479
[10]	valid_set's binary_logloss: 0.294046
[11]	valid_set's binary_logloss: 0.287123
[12]	valid_set's binary_logloss: 0.281429
[13]	valid_set's binary_logloss: 0.276754
[14]	valid_set's binary_logloss: 0.271478
[15]	valid_set's binary_logloss: 0.2675
[16]	valid_set's binary_logloss: 0.262768
[17]	valid_set's binary_logloss: 0.259953
[18]	valid_set's binary_logloss: 0.254522
[19]	valid_set's binary_logloss: 0.25033
[20]	valid_set's binary_logloss: 0.246902
[21]	valid_set's binary_logloss: 0.244469
[22]	valid_set's binary_logloss: 0.241234
[23]	valid_set's binary_logloss: 0.239025
[24]	valid_set's binary_logloss: 0.236706
[25]

Saving AutogluonModels\ag-20241004_194801\models\LightGBMXT\model.pkl
Saving AutogluonModels\ag-20241004_194801\utils\attr\LightGBMXT\y_pred_proba_val.pkl
	0.9351	 = Validation score   (roc_auc)
	4.46s	 = Training   runtime
	0.07s	 = Validation runtime
Saving AutogluonModels\ag-20241004_194801\models\trainer.pkl
Fitting model: LightGBM ... Training model for up to 286.97s of the 286.95s of remaining time.
	Dropped 0 of 22 features.
	Fitting LightGBM with 'num_gpus': 0, 'num_cpus': 4
	Fitting 10000 rounds... Hyperparameters: {'learning_rate': 0.05}


[1]	valid_set's binary_logloss: 0.380326
[2]	valid_set's binary_logloss: 0.358274
[3]	valid_set's binary_logloss: 0.340475
[4]	valid_set's binary_logloss: 0.325796
[5]	valid_set's binary_logloss: 0.31317
[6]	valid_set's binary_logloss: 0.302187
[7]	valid_set's binary_logloss: 0.292463
[8]	valid_set's binary_logloss: 0.283812
[9]	valid_set's binary_logloss: 0.275959
[10]	valid_set's binary_logloss: 0.26899
[11]	valid_set's binary_logloss: 0.262471
[12]	valid_set's binary_logloss: 0.256666
[13]	valid_set's binary_logloss: 0.251517
[14]	valid_set's binary_logloss: 0.24652
[15]	valid_set's binary_logloss: 0.241785
[16]	valid_set's binary_logloss: 0.237636
[17]	valid_set's binary_logloss: 0.233572
[18]	valid_set's binary_logloss: 0.229923
[19]	valid_set's binary_logloss: 0.226589
[20]	valid_set's binary_logloss: 0.223467
[21]	valid_set's binary_logloss: 0.220295
[22]	valid_set's binary_logloss: 0.217414
[23]	valid_set's binary_logloss: 0.214742
[24]	valid_set's binary_logloss: 0.212371
[25]

Saving AutogluonModels\ag-20241004_194801\models\LightGBM\model.pkl
Saving AutogluonModels\ag-20241004_194801\utils\attr\LightGBM\y_pred_proba_val.pkl
	0.9574	 = Validation score   (roc_auc)
	2.02s	 = Training   runtime
	0.03s	 = Validation runtime
Saving AutogluonModels\ag-20241004_194801\models\trainer.pkl
Fitting model: RandomForestGini ... Training model for up to 284.83s of the 284.81s of remaining time.
	Dropped 0 of 22 features.
	Fitting RandomForestGini with 'num_gpus': 0, 'num_cpus': 8
Saving AutogluonModels\ag-20241004_194801\models\RandomForestGini\model.pkl
Saving AutogluonModels\ag-20241004_194801\utils\attr\RandomForestGini\y_pred_proba_val.pkl
	0.9408	 = Validation score   (roc_auc)
	6.48s	 = Training   runtime
	0.18s	 = Validation runtime
Saving AutogluonModels\ag-20241004_194801\models\trainer.pkl
Fitting model: RandomForestEntr ... Training model for up to 277.53s of the 277.51s of remaining time.
	Dropped 0 of 22 features.
	Fitting RandomForestEntr with 'num_gpus': 0

0:	learn: 0.6297041	test: 0.6305439	best: 0.6305439 (0)	total: 172ms	remaining: 28m 37s
1:	learn: 0.5738859	test: 0.5749111	best: 0.5749111 (1)	total: 191ms	remaining: 15m 54s
2:	learn: 0.5251338	test: 0.5264642	best: 0.5264642 (2)	total: 212ms	remaining: 11m 46s
3:	learn: 0.4825818	test: 0.4844179	best: 0.4844179 (3)	total: 232ms	remaining: 9m 38s
4:	learn: 0.4454567	test: 0.4476618	best: 0.4476618 (4)	total: 247ms	remaining: 8m 14s
5:	learn: 0.4146453	test: 0.4172829	best: 0.4172829 (5)	total: 259ms	remaining: 7m 10s
6:	learn: 0.3873001	test: 0.3904600	best: 0.3904600 (6)	total: 274ms	remaining: 6m 31s
7:	learn: 0.3630134	test: 0.3665332	best: 0.3665332 (7)	total: 288ms	remaining: 5m 59s
8:	learn: 0.3426856	test: 0.3466361	best: 0.3466361 (8)	total: 299ms	remaining: 5m 32s
9:	learn: 0.3246723	test: 0.3290091	best: 0.3290091 (9)	total: 312ms	remaining: 5m 12s
10:	learn: 0.3091731	test: 0.3138680	best: 0.3138680 (10)	total: 328ms	remaining: 4m 57s
11:	learn: 0.2965135	test: 0.3016260	b

Saving AutogluonModels\ag-20241004_194801\models\CatBoost\model.pkl
Saving AutogluonModels\ag-20241004_194801\utils\attr\CatBoost\y_pred_proba_val.pkl
	0.9578	 = Validation score   (roc_auc)
	38.24s	 = Training   runtime
	0.01s	 = Validation runtime
Saving AutogluonModels\ag-20241004_194801\models\trainer.pkl
Fitting model: ExtraTreesGini ... Training model for up to 232.32s of the 232.3s of remaining time.
	Dropped 0 of 22 features.
	Fitting ExtraTreesGini with 'num_gpus': 0, 'num_cpus': 8


2052:	learn: 0.0951128	test: 0.1542059	best: 0.1538481 (1563)	total: 30s	remaining: 1m 56s

bestTest = 0.1538481228
bestIteration = 1563

Shrink model to first 1564 iterations.


Saving AutogluonModels\ag-20241004_194801\models\ExtraTreesGini\model.pkl
Saving AutogluonModels\ag-20241004_194801\utils\attr\ExtraTreesGini\y_pred_proba_val.pkl
	0.9259	 = Validation score   (roc_auc)
	2.82s	 = Training   runtime
	0.13s	 = Validation runtime
Saving AutogluonModels\ag-20241004_194801\models\trainer.pkl
Fitting model: ExtraTreesEntr ... Training model for up to 228.72s of the 228.7s of remaining time.
	Dropped 0 of 22 features.
	Fitting ExtraTreesEntr with 'num_gpus': 0, 'num_cpus': 8
Saving AutogluonModels\ag-20241004_194801\models\ExtraTreesEntr\model.pkl
Saving AutogluonModels\ag-20241004_194801\utils\attr\ExtraTreesEntr\y_pred_proba_val.pkl
	0.9235	 = Validation score   (roc_auc)
	2.67s	 = Training   runtime
	0.1s	 = Validation runtime
Saving AutogluonModels\ag-20241004_194801\models\trainer.pkl
Fitting model: NeuralNetFastAI ... Training model for up to 225.42s of the 225.41s of remaining time.
	Dropped 0 of 22 features.
	Fitting NeuralNetFastAI with 'num_gpus': 0

[0]	validation_0-logloss:0.61767
[1]	validation_0-logloss:0.55611
[2]	validation_0-logloss:0.50501
[3]	validation_0-logloss:0.46215
[4]	validation_0-logloss:0.42579
[5]	validation_0-logloss:0.39491
[6]	validation_0-logloss:0.36835
[7]	validation_0-logloss:0.34562
[8]	validation_0-logloss:0.32570
[9]	validation_0-logloss:0.30827
[10]	validation_0-logloss:0.29301
[11]	validation_0-logloss:0.27989
[12]	validation_0-logloss:0.26830
[13]	validation_0-logloss:0.25782
[14]	validation_0-logloss:0.24879
[15]	validation_0-logloss:0.24077
[16]	validation_0-logloss:0.23399
[17]	validation_0-logloss:0.22814
[18]	validation_0-logloss:0.22278
[19]	validation_0-logloss:0.21793
[20]	validation_0-logloss:0.21364
[21]	validation_0-logloss:0.20996
[22]	validation_0-logloss:0.20563
[23]	validation_0-logloss:0.20206
[24]	validation_0-logloss:0.19930
[25]	validation_0-logloss:0.19700
[26]	validation_0-logloss:0.19502
[27]	validation_0-logloss:0.19292
[28]	validation_0-logloss:0.19107
[29]	validation_0-loglos

Saving AutogluonModels\ag-20241004_194801\models\XGBoost\model.pkl
Saving AutogluonModels\ag-20241004_194801\utils\attr\XGBoost\y_pred_proba_val.pkl
	0.955	 = Validation score   (roc_auc)
	10.12s	 = Training   runtime
	0.06s	 = Validation runtime
Saving AutogluonModels\ag-20241004_194801\models\trainer.pkl
Fitting model: NeuralNetTorch ... Training model for up to 40.4s of the 40.38s of remaining time.
	Dropped 0 of 22 features.
	Fitting NeuralNetTorch with 'num_gpus': 0, 'num_cpus': 4
Tabular Neural Network treats features as the following types:
{
    "continuous": [
        "loan_int_rate",
        "loan_percent_income"
    ],
    "skewed": [
        "person_age",
        "person_income",
        "person_emp_length",
        "loan_amnt",
        "cb_person_cred_hist_length"
    ],
    "onehot": [],
    "embed": [],
    "language": [],
    "bool": [
        "person_home_ownership_other",
        "person_home_ownership_own",
        "person_home_ownership_rent",
        "loan_intent_e

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x19eb5cbc4c0>

In [38]:
predictor.leaderboard()

                  model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2   0.963046       0.531133  240.354493                0.001980           1.675946            2       True         13
1              CatBoost   0.957810       0.007685   38.238526                0.007685          38.238526            1       True          7
2              LightGBM   0.957442       0.033511    2.023470                0.033511           2.023470            1       True          4
3               XGBoost   0.954967       0.060253   10.120573                0.060253          10.120573            1       True         11
4      RandomForestGini   0.940784       0.184987    6.483560                0.184987           6.483560            1       True          5
5      RandomForestEntr   0.939633       0.224730    6.011616                0.224730           6.011616            1       True          6
6            LightGB

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.963046,0.531133,240.354493,0.00198,1.675946,2,True,13
1,CatBoost,0.95781,0.007685,38.238526,0.007685,38.238526,1,True,7
2,LightGBM,0.957442,0.033511,2.02347,0.033511,2.02347,1,True,4
3,XGBoost,0.954967,0.060253,10.120573,0.060253,10.120573,1,True,11
4,RandomForestGini,0.940784,0.184987,6.48356,0.184987,6.48356,1,True,5
5,RandomForestEntr,0.939633,0.22473,6.011616,0.22473,6.011616,1,True,6
6,LightGBMXT,0.935075,0.070366,4.463126,0.070366,4.463126,1,True,3
7,ExtraTreesGini,0.925897,0.132724,2.819449,0.132724,2.819449,1,True,8
8,ExtraTreesEntr,0.923458,0.096104,2.670124,0.096104,2.670124,1,True,9
9,NeuralNetFastAI,0.923029,0.186339,174.335777,0.186339,174.335777,1,True,10


In [39]:
# Evaluate various metrics
predictor.evaluate(test_X)

Loading: AutogluonModels\ag-20241004_194801\models\CatBoost\model.pkl
Loading: AutogluonModels\ag-20241004_194801\models\KNeighborsUnif\model.pkl
Loading: AutogluonModels\ag-20241004_194801\models\LightGBM\model.pkl
Loading: AutogluonModels\ag-20241004_194801\models\NeuralNetFastAI\model.pkl
Loading: AutogluonModels\ag-20241004_194801\models\NeuralNetFastAI\model-internals.pkl
Loading: AutogluonModels\ag-20241004_194801\models\RandomForestGini\model.pkl
Loading: AutogluonModels\ag-20241004_194801\models\XGBoost\model.pkl
Loading: AutogluonModels\ag-20241004_194801\models\WeightedEnsemble_L2\model.pkl
Evaluation: roc_auc on test data: 0.9569495601242842
Evaluations on test data:
{
    "roc_auc": 0.9569495601242842,
    "accuracy": 0.9515771526001705,
    "balanced_accuracy": 0.8569076558612398,
    "mcc": 0.7898021300266573,
    "f1": 0.8099062918340028,
    "precision": 0.9180576631259484,
    "recall": 0.7245508982035929
}


{'roc_auc': 0.9569495601242842,
 'accuracy': 0.9515771526001705,
 'balanced_accuracy': 0.8569076558612398,
 'mcc': 0.7898021300266573,
 'f1': 0.8099062918340028,
 'precision': 0.9180576631259484,
 'recall': 0.7245508982035929}