VISUALIZATION OF GRIDSEARCH WITH IN CATBOOST

In [1]:
#Gridsearchcv and Randomizedsearchcv - let's use a combined validation that combines random partitioning into training and test samples and k-block cross-validation

import pandas as pd
import numpy as np
from sklearn.model_selection import (train_test_split,
                                    KFold,
                                    ParameterGrid,
                                    cross_val_score,
                                    GridSearchCV,
                                    RandomizedSearchCV)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (StandardScaler,
                                   OneHotEncoder)
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline, Pipeline
from category_encoders import WOEEncoder, SumEncoder
from tqdm import tqdm_notebook
from datasets import load_dataset
from tqdm import tqdm
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV

import catboost
import xgboost
from catboost import CatBoostClassifier
from catboost import CatBoostRegressor
from xgboost import XGBClassifier
import nbformat
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

In [2]:
#https://huggingface.co/datasets/imodels/credit-card
dataset = load_dataset("imodels/credit-card")['train']


In [3]:
dataset = pd.DataFrame(dataset)
dataset

Unnamed: 0,limit_bal,age,pay_0,pay_2,pay_3,pay_4,pay_5,pay_6,bill_amt1,bill_amt2,...,education:2,education:3,education:4,education:5,education:6,marriage:0,marriage:1,marriage:2,marriage:3,default.payment.next.month
0,80000.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,75125.0,77353.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
1,30000.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,29242.0,29507.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
2,180000.0,44.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,20916.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
3,60000.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,58839.0,53235.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
4,130000.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,111587.0,112348.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23995,50000.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,52475.0,53600.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
23996,200000.0,37.0,2.0,2.0,2.0,2.0,2.0,2.0,157131.0,166590.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
23997,50000.0,26.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
23998,70000.0,25.0,0.0,0.0,0.0,0.0,2.0,2.0,73939.0,70488.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1


In [4]:
dataset['default.payment.next.month'].value_counts() #-> not fine distribution

default.payment.next.month
0    18677
1     5323
Name: count, dtype: int64

In [5]:
#Add less class - important. Shows better results for GradientBoostingClassifier, but worse for LR
diff = len(dataset.loc[dataset['default.payment.next.month']==0])//len(dataset.loc[dataset['default.payment.next.month']==1])
df_1 = dataset.loc[dataset['default.payment.next.month']==1]
df_1 = df_1.loc[df_1.index.repeat(diff)]
datasetnew = pd.concat([dataset.loc[dataset['default.payment.next.month']==0], df_1]).sample(frac=1)

In [6]:
datasetnew['default.payment.next.month'].value_counts() 

default.payment.next.month
0    18677
1    15969
Name: count, dtype: int64

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
        datasetnew.drop('default.payment.next.month', axis=1),
        datasetnew['default.payment.next.month'],
        test_size=0.2,
        stratify=datasetnew['default.payment.next.month'],
        random_state=42)

In [8]:
#Categorical and numerical columns
cat_columns = X_train.select_dtypes(
    include='object').columns.tolist()
num_columns = X_train.select_dtypes(
    exclude='object').columns.tolist()


In [9]:
cat_columns

[]

In [11]:
#Pipeline for num and cat
num_pipe = Pipeline([
    ('imp', SimpleImputer()),
    ('scaler', StandardScaler())
])

cat_pipe = Pipeline([
    ('imp', SimpleImputer(strategy='constant')),
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])


transformers = [('num', num_pipe, num_columns),
                ('cat', cat_pipe, cat_columns)]

transformer = ColumnTransformer(transformers=transformers)


#Pipeline for LR
ml_pipe_LR = Pipeline([
    ('tf', transformer),
    ('logreg', LogisticRegression(solver='lbfgs',
    max_iter=200))
])

#Pipeline for GBC
ml_pipe_GBC = Pipeline([
    ('preprocessor', transformer),
    ('classifier', GradientBoostingClassifier(
    random_state=42, subsample=0.8))
])

#Pipeline for ridge
ml_pipe_ridge = Pipeline([
    ('preprocessor', transformer),
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(2, interaction_only=True, include_bias=False)),  #3 works better, ROC-AUC show bad results
    ('model', Ridge())
])


#Pipeline for Catboost
cb_params = {'verbose': False,
             'task_type': "GPU",
             'boost_from_average': True,
             'loss_function': 'Logloss'
            }
ml_pipe_catboost = Pipeline([
    ('preprocessor', transformer),
    ('classifier', CatBoostClassifier(**cb_params))
])


cb_best_params = {'learning_rate': 0.1,
                  'depth': 10,
                  'task_type': "GPU",
                  'boost_from_average': True,
                  'loss_function': 'Logloss',
                  'l2_leaf_reg': 7
#                       'bootstrap_type': ['Bayesian', 'Bernoulli', 'MVS']
                    }


#Pipeline for XGboost

xgb_params = {#'n_estimators': 280,
              #'learning_rate': 0.05,
              #'max_depth': 10,
              'subsample': 1.0,
              'colsample_bytree': 1.0,
              '@tree_method': 'hist',
              'enable_categorical': True,
              'verbosity': 1,
              'min_child_weight': 3,
              'random_state': 1}

ml_pipe_xgboost = Pipeline([
    ('preprocessor', transformer),
    ('classifier', XGBClassifier(**xgb_params))
])

#Gridsearch parameters
param_grid_LR = {
    'tf__num__imp__strategy': ['mean', 'median', 'constant'],
    'tf__cat__imp__strategy': ['most_frequent', 'constant'],
    'logreg__C': [.01, .1, .5, 1, 5, 10, 100]
}

param_grid_GBC = [{'classifier__max_depth': [2, 4],
                   'classifier__n_estimators': [50, 100]}]

param_grid_Ridge = {
    'model__alpha': np.arange(0, 0.2, 0.01)}

#from here - https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier_grid_search     
#HARD           
#param_grid_catboost = {'learning_rate': [0.03, 0.1],
#                       'depth': [4, 6, 10],
#                       'l2_leaf_reg': [1, 3, 5, 7, 9]
#                    }
#EASY
param_grid_catboost = {'learning_rate': [0.08, 0.1],
                       'depth': [9, 10],
                       'l2_leaf_reg': [7, 9]
                    }



param_grid_xgboost = {
     'max_depth': range (2, 10, 2),
     'max_depth': [2, 4],
     'n_estimators': range(100, 200, 50),
     'learning_rate': [0.1, 0.01, 0.05]
}


CATBOOST

In [12]:
#Test gridsearch Catboost
#https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier_fit
#https://catboost.ai/en/docs/concepts/python-reference_catboost_grid_search
Modelcb = CatBoostClassifier(**cb_params)
Modelcb.grid_search(param_grid_catboost, 
                                       X=X_train, 
                                       y=y_train,                                       
                                       plot=True)



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

bestTest = 0.4626023594
bestIteration = 995
0:	loss: 0.4626024	best: 0.4626024 (0)	total: 2m 8s	remaining: 15m 1s
bestTest = 0.4530548932
bestIteration = 999
1:	loss: 0.4530549	best: 0.4530549 (1)	total: 4m 14s	remaining: 12m 42s
bestTest = 0.4522164305
bestIteration = 999
2:	loss: 0.4522164	best: 0.4522164 (2)	total: 6m 27s	remaining: 10m 45s
bestTest = 0.4486076498
bestIteration = 998
3:	loss: 0.4486076	best: 0.4486076 (3)	total: 8m 38s	remaining: 8m 38s
bestTest = 0.4424684203
bestIteration = 995
4:	loss: 0.4424684	best: 0.4424684 (4)	total: 12m 22s	remaining: 7m 25s
bestTest = 0.4374462309
bestIteration = 999
5:	loss: 0.4374462	best: 0.4374462 (5)	total: 16m 17s	remaining: 5m 25s
bestTest = 0.4447747653
bestIteration = 997
6:	loss: 0.4447748	best: 0.4374462 (5)	total: 20m 17s	remaining: 2m 53s
bestTest = 0.4381181461
bestIteration = 999
7:	loss: 0.4381181	best: 0.4374462 (5)	total: 24m 12s	remaining: 0us
Estimating final quality...
Training on fold [0/3]
bestTest = 0.4368039878
bes

{'params': {'depth': 10, 'l2_leaf_reg': 7, 'learning_rate': 0.1},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               45,
               46,
    

In [14]:
display(Modelcb.best_score_)
display(Modelcb.is_fitted())
display(Modelcb.get_params())
display(Modelcb.get_all_params())
display(Modelcb.score(X_test, y_test))

{'learn': {'Logloss': 0.312197510857898}}

True

{'loss_function': 'Logloss',
 'verbose': False,
 'task_type': 'GPU',
 'boost_from_average': True,
 'depth': 10,
 'l2_leaf_reg': 7,
 'learning_rate': 0.1}

{'nan_mode': 'Min',
 'gpu_ram_part': 0.95,
 'eval_metric': 'Logloss',
 'iterations': 1000,
 'fold_permutation_block': 64,
 'leaf_estimation_method': 'Newton',
 'observations_to_bootstrap': 'TestOnly',
 'random_score_type': 'NormalWithModelSizeDecrease',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Ordered',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'devices': '-1',
 'eval_fraction': 0,
 'pinned_memory_bytes': '104857600',
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 7,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': True,
 'gpu_cat_features_storage': 'GpuRam',
 'fold_size_loss_normalization': False,
 'model_size_reg': 0.5,
 'pool_metainfo_options': {'tags': {}},
 'use_best_model': False,
 'meta_l2_frequency': 0,
 'class_names': [0, 1],
 'random_seed': 0,
 'depth': 10,
 'has_time': False,
 'fold_len_multiplier': 2,
 'border_count': 128,
 'min_fold_size': 100,
 'data_partition': 'FeaturePa

0.8106782106782107

In [15]:
#Save
Modelcb.save_model('Modelcb',
                    format="json",
                    export_parameters=None,
                    pool=None)

#Load
Modelcbloaded = CatBoostClassifier()
Modelcbloaded.load_model('Modelcb', format='json')

<catboost.core.CatBoostClassifier at 0x1f57dda63d0>

In [41]:
#Statistics per feature. Mean prediction is betweein 0 and 1
res = Modelcbloaded.calc_feature_statistics(X_train,
                                    y_train,
                                    feature=11,
                                    plot=True)

XGBOOST

In [17]:
#Test gridsearch XGboost
#https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBClassifier
#scoring - https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
Modelxgboost = XGBClassifier(**xgb_params)

grid_search_xgb = GridSearchCV(Modelxgboost, param_grid_xgboost, cv=5, scoring='roc_auc', return_train_score=True, verbose=True, n_jobs = 10)
grid_search_xgb.fit(X_train, y_train)

display(grid_search_xgb.best_estimator_)
display(grid_search_xgb.best_params_)
display(grid_search_xgb.best_score_)

Modelxgboost = grid_search_xgb.best_estimator_.fit(X_train, y_train)


Fitting 5 folds for each of 12 candidates, totalling 60 fits



is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


Parameters: { "@tree_method" } are not used.




{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150}

0.8091721035271882


is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


Parameters: { "@tree_method" } are not used.




#Use and test model

In [18]:
#Use better model and parametres
#ModelLR = pipe_ML(ml_pipe_LR, param_grid_LR)[1]
Modelcb.score(X_test, y_test)

0.8106782106782107

In [19]:
#ModelGBC = pipe_ML(ml_pipe_GBC, param_grid_GBC)[1]
Modelxgboost.score(X_test, y_test)


is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



0.7314574314574315

In [22]:
#Predict data that must give 1
test1 = dataset[dataset['default.payment.next.month']==1].iloc[[11]].drop('default.payment.next.month', axis=1)
display(Modelcb.predict(test1)[0])
display(Modelxgboost.predict(test1)[0])

1


is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



0

In [25]:
data = dataset[dataset['default.payment.next.month']==1]
data = data.drop('default.payment.next.month', axis = 1)
data

Unnamed: 0,limit_bal,age,pay_0,pay_2,pay_3,pay_4,pay_5,pay_6,bill_amt1,bill_amt2,...,education:1,education:2,education:3,education:4,education:5,education:6,marriage:0,marriage:1,marriage:2,marriage:3
10,220000.0,38.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,22145.0,5529.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12,100000.0,27.0,-1.0,2.0,0.0,0.0,0.0,0.0,17553.0,10628.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13,80000.0,30.0,2.0,0.0,0.0,-1.0,-1.0,-2.0,4794.0,4989.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
14,120000.0,61.0,1.0,2.0,0.0,0.0,0.0,0.0,121709.0,78369.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
19,140000.0,24.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,16343.0,1462.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23992,280000.0,32.0,1.0,-2.0,-1.0,0.0,0.0,-2.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
23993,160000.0,42.0,0.0,0.0,0.0,0.0,0.0,0.0,165686.0,169969.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
23996,200000.0,37.0,2.0,2.0,2.0,2.0,2.0,2.0,157131.0,166590.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
23998,70000.0,25.0,0.0,0.0,0.0,0.0,2.0,2.0,73939.0,70488.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [26]:
resultCatboost = pd.DataFrame(Modelcb.predict(data))
resultCatboost.value_counts()

1    4523
0     800
Name: count, dtype: int64

In [27]:
resultxgboost = pd.DataFrame(Modelxgboost.predict(data))
resultxgboost.value_counts()


is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



1    3414
0    1909
Name: count, dtype: int64

In [28]:
#From saved model
resultCatboostloaded = pd.DataFrame(Modelcbloaded.predict(data))
resultCatboostloaded.value_counts()

1    4523
0     800
Name: count, dtype: int64

Work with best model -> Catboost and best hyperparameters

In [None]:
dataset

Pipeline with catboost

In [29]:
#Then try bootstrap_type in catboost parameters. Use better hyperparameters 'learning_rate': [0.1], 'depth': [10], 'l2_leaf_reg': [7].
#https://makesomecode.me/2021/11/intro-to-optuna/
#without this method of scaling

estimator = CatBoostClassifier(**cb_best_params, verbose=False)

CB_pipeline = Pipeline([
    ('preprocessor', transformer),
    ('classifier', estimator)
])
CB_pipeline.fit(X_train, y_train)

In [30]:
display(CB_pipeline.score(X_test, y_test))

0.8124098124098124

In [31]:
data1 = dataset[dataset['default.payment.next.month']==1]
data1 = data1.drop('default.payment.next.month', axis = 1)

In [32]:
resultCatboostpipe = pd.DataFrame(CB_pipeline.predict(data1))
resultCatboostpipe.value_counts()

1    4478
0     845
Name: count, dtype: int64

In [33]:
data0 = dataset[dataset['default.payment.next.month']==0]
data0 = data0.drop('default.payment.next.month', axis = 1)

In [34]:
resultCatboostpipe = pd.DataFrame(CB_pipeline.predict(data0))
resultCatboostpipe.value_counts()

0    16534
1     2143
Name: count, dtype: int64

CLASS BALANCING WITH CLASS WEIGHTS

In [44]:
from sklearn.utils.class_weight import compute_class_weight

In [72]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(
        dataset.drop('default.payment.next.month', axis=1),
        dataset['default.payment.next.month'],
        test_size=0.2,
        stratify=dataset['default.payment.next.month'],
        random_state=42)

In [73]:
#Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train1), y=y_train1)

In [74]:
class_weights

array([0.64248427, 2.25457961])

In [75]:
estimator1 = CatBoostClassifier(**cb_best_params, verbose=False, class_weights=dict(enumerate(class_weights)))

CB_pipeline1 = Pipeline([
    ('preprocessor', transformer),
    ('classifier', estimator1)
])
CB_pipeline1.fit(X_train1, y_train1)

In [76]:
#Lower with not multipled dataset, using only computed weights
display(CB_pipeline1.score(X_test1, y_test1))

0.7764583333333334

In [77]:
y_test1.value_counts() 

default.payment.next.month
0    3735
1    1065
Name: count, dtype: int64

In [78]:
#with multiplied data
display(CB_pipeline1.score(X_test, y_test))

0.8867243867243867

In [79]:
y_test.value_counts() 

default.payment.next.month
0    3736
1    3194
Name: count, dtype: int64

In [80]:
resultCatboostpipe1 = pd.DataFrame(CB_pipeline1.predict(data0))
resultCatboostpipe1.value_counts()

0    16881
1     1796
Name: count, dtype: int64

In [82]:
#With static weights distribution (like the result code in the beginning for datasetnew 1/3):

class_weights2 = {
    0: 1.0,  
    1: 3.0,  #Lower the class, bigger the weight
}

In [85]:
estimator2 = CatBoostClassifier(**cb_best_params, verbose=False, class_weights=class_weights2)

CB_pipeline2 = Pipeline([
    ('preprocessor', transformer),
    ('classifier', estimator2)
])
CB_pipeline2.fit(X_train1, y_train1)

In [86]:
display(CB_pipeline2.score(X_test1, y_test1))

0.79

In [87]:
#with multiplied data
display(CB_pipeline2.score(X_test, y_test))

0.8658008658008658

In [None]:
# It works - the score is even better with imputed data (was 0.8124098124098124, now 0.8867243867243867). The result with class_weights2 is worse (0.8658008658008658). So better use compute_class_weight.
#But the score is worse, because test data is unbalanced...

# So may be better to Oversample or Undersample data - this in next video