VISUALIZATION OF GRIDSEARCH WITH TQDM, USING PIPELINES AND SEVERAL METHODS

In [1]:
#Gridsearchcv and Randomizedsearchcv - let's use a combined validation that combines random partitioning into training and test samples and k-block cross-validation

import pandas as pd
import numpy as np
from sklearn.model_selection import (train_test_split,
                                    KFold,
                                    ParameterGrid,
                                    cross_val_score,
                                    GridSearchCV,
                                    RandomizedSearchCV)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (StandardScaler,
                                   OneHotEncoder)
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline, Pipeline
from category_encoders import WOEEncoder, SumEncoder
from tqdm import tqdm_notebook
from datasets import load_dataset
from tqdm import tqdm
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV

import catboost
import xgboost
from catboost import CatBoostClassifier
from catboost import CatBoostRegressor
from xgboost import XGBClassifier
import nbformat

In [2]:
#https://huggingface.co/datasets/mstz/speeddating
#dataset = load_dataset("mstz/speeddating")["train"]
#https://huggingface.co/datasets/imodels/credit-card
dataset = load_dataset("imodels/credit-card")['train']


In [3]:
dataset = pd.DataFrame(dataset)
dataset

Unnamed: 0,limit_bal,age,pay_0,pay_2,pay_3,pay_4,pay_5,pay_6,bill_amt1,bill_amt2,...,education:2,education:3,education:4,education:5,education:6,marriage:0,marriage:1,marriage:2,marriage:3,default.payment.next.month
0,80000.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,75125.0,77353.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
1,30000.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,29242.0,29507.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
2,180000.0,44.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,20916.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
3,60000.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,58839.0,53235.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
4,130000.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,111587.0,112348.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23995,50000.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,52475.0,53600.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
23996,200000.0,37.0,2.0,2.0,2.0,2.0,2.0,2.0,157131.0,166590.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
23997,50000.0,26.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
23998,70000.0,25.0,0.0,0.0,0.0,0.0,2.0,2.0,73939.0,70488.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1


In [4]:
dataset['default.payment.next.month'].value_counts() #-> not fine distribution

default.payment.next.month
0    18677
1     5323
Name: count, dtype: int64

In [5]:
#Add less class - important. Shows better results for GradientBoostingClassifier, but worse for LR
diff = len(dataset.loc[dataset['default.payment.next.month']==0])//len(dataset.loc[dataset['default.payment.next.month']==1])
df_1 = dataset.loc[dataset['default.payment.next.month']==1]
df_1 = df_1.loc[df_1.index.repeat(diff)]
datasetnew = pd.concat([dataset.loc[dataset['default.payment.next.month']==0], df_1]).sample(frac=1)

In [6]:
datasetnew['default.payment.next.month'].value_counts() 

default.payment.next.month
0    18677
1    15969
Name: count, dtype: int64

In [134]:
X_train, X_test, y_train, y_test = train_test_split(
        datasetnew.drop('default.payment.next.month', axis=1),
        datasetnew['default.payment.next.month'],
        test_size=0.2,
        stratify=datasetnew['default.payment.next.month'],
        random_state=42)

In [8]:
#Categorical and numerical columns
cat_columns = X_train.select_dtypes(
    include='object').columns.tolist()
num_columns = X_train.select_dtypes(
    exclude='object').columns.tolist()


In [9]:
cat_columns

[]

In [121]:
#Pipeline for num and cat
num_pipe = Pipeline([
    ('imp', SimpleImputer()),
    ('scaler', StandardScaler())
])

cat_pipe = Pipeline([
    ('imp', SimpleImputer(strategy='constant')),
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])


transformers = [('num', num_pipe, num_columns),
                ('cat', cat_pipe, cat_columns)]

transformer = ColumnTransformer(transformers=transformers)


#Pipeline for LR
ml_pipe_LR = Pipeline([
    ('tf', transformer),
    ('logreg', LogisticRegression(solver='lbfgs',
    max_iter=200))
])

#Pipeline for GBC
ml_pipe_GBC = Pipeline([
    ('preprocessor', transformer),
    ('classifier', GradientBoostingClassifier(
    random_state=42, subsample=0.8))
])

#Pipeline for ridge
ml_pipe_ridge = Pipeline([
    ('preprocessor', transformer),
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(2, interaction_only=True, include_bias=False)),  #3 works better, ROC-AUC show bad results
    ('model', Ridge())
])


#Pipeline for Catboost
cb_params = {'verbose': False,
             'task_type': "GPU",
             'boost_from_average': True,
             'loss_function': 'Logloss'
            }
ml_pipe_catboost = Pipeline([
    ('preprocessor', transformer),
    ('classifier', CatBoostClassifier(**cb_params))
])


cb_best_params = {'learning_rate': 0.1,
                  'depth': 10,
                  'task_type': "GPU",
                  'boost_from_average': True,
                  'loss_function': 'Logloss',
                  'l2_leaf_reg': 7
#                       'bootstrap_type': ['Bayesian', 'Bernoulli', 'MVS']
                    }


#Pipeline for XGboost

xgb_params = {#'n_estimators': 280,
              #'learning_rate': 0.05,
              #'max_depth': 10,
              'subsample': 1.0,
              'colsample_bytree': 1.0,
              '@tree_method': 'hist',
              'enable_categorical': True,
              'verbosity': 1,
              'min_child_weight': 3,
              'random_state': 1}

ml_pipe_xgboost = Pipeline([
    ('preprocessor', transformer),
    ('classifier', XGBClassifier(**xgb_params))
])

#Gridsearch parameters
param_grid_LR = {
    'tf__num__imp__strategy': ['mean', 'median', 'constant'],
    'tf__cat__imp__strategy': ['most_frequent', 'constant'],
    'logreg__C': [.01, .1, .5, 1, 5, 10, 100]
}

param_grid_GBC = [{'classifier__max_depth': [2, 4],
                   'classifier__n_estimators': [50, 100]}]

param_grid_Ridge = {
    'model__alpha': np.arange(0, 0.2, 0.01)}

#from here - https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier_grid_search                
param_grid_catboost = {'learning_rate': [0.03, 0.1],
                       'depth': [4, 6, 10],
                       'l2_leaf_reg': [1, 3, 5, 7, 9]
                    }

param_grid_xgboost = {
     'max_depth': range (2, 10, 2),
     'max_depth': [2, 4],
     'n_estimators': range(100, 200, 50),
     'learning_rate': [0.1, 0.01, 0.05]
}


In [11]:
def pipe_ML(pipe, grid):
       scores = []
       scoring = 'roc_auc' 
       print("Looking for best model hyperparameters...")

       #Use tqdm and ParameterGrid

       for param in tqdm(list(ParameterGrid(grid)),
                            desc='Done'):

              pipe.set_params(**param)

              scores.append([param, cross_val_score(pipe,
                                                        X_train,
                                                        y_train,
                                                        scoring=scoring,
                                                        cv=5)])

              scores[-1].append(sum(scores[-1][1]) / len(scores[-1][1]))

       scores.sort(reverse=True, key=lambda x: x[2])

       best_params = scores[0][0]
       print("Best hyperparameters:",
              best_params, sep='\n', end='\n')

       best_score = scores[0][2]
       print("Best meaning is %s: %.3f" % (scoring, best_score))

       model = pipe.set_params(**best_params).fit(X_train, y_train)

       test_score = pipe.score(X_test, y_test)

       print("Meaning %s on test: %.3f" % (scoring, test_score))
       
       return ({best_score: best_params}, model) 

In [12]:
#2 Works:
#grids = [(ml_pipe_LR, param_grid_LR), (ml_pipe_GBC, param_grid_GBC)]
#grid_dict = {0: 'LR', 1: 'GBC'}

#only ridge
#grids = [(ml_pipe_ridge, param_grid_Ridge)]
#grid_dict = {0: 'Ridge'}

#only Catboost - did not work with gridsearch!?
#grids = [(ml_pipe_catboost, param_grid_catboost)]
#grid_dict = {0: 'Catboost'}

#only XGboost
#grids = [(ml_pipe_xgboost, param_grid_xgboost)]
#grid_dict = {0: 'xgboost'}


#3 works, no catboost
grids = [(ml_pipe_LR, param_grid_LR), (ml_pipe_GBC, param_grid_GBC), (ml_pipe_ridge, param_grid_Ridge)]
grid_dict = {0: 'LR', 1: 'GBC', 2: 'Ridge'}



In [13]:
auc_list = []
best_clf = 0
for idx, gs in enumerate(grids):
    print("Method: %s" % grid_dict[idx])
    #pipe_ML(grids[idx][0], grids[idx][1])
    exec(f"Model{grid_dict[idx]} = pipe_ML(grids[idx][0], grids[idx][1])[1]")
    print()

Method: LR
Looking for best model hyperparameters...


Done: 100%|██████████| 42/42 [00:26<00:00,  1.61it/s]


Best hyperparameters:
{'logreg__C': 5, 'tf__cat__imp__strategy': 'most_frequent', 'tf__num__imp__strategy': 'mean'}
Best meaning is roc_auc: 0.725
Meaning roc_auc on test: 0.692

Method: GBC
Looking for best model hyperparameters...


Done: 100%|██████████| 4/4 [02:30<00:00, 37.64s/it]


Best hyperparameters:
{'classifier__max_depth': 4, 'classifier__n_estimators': 100}
Best meaning is roc_auc: 0.808
Meaning roc_auc on test: 0.730

Method: Ridge
Looking for best model hyperparameters...


Done: 100%|██████████| 20/20 [00:33<00:00,  1.68s/it]


Best hyperparameters:
{'model__alpha': 0.01}
Best meaning is roc_auc: 0.749
Meaning roc_auc on test: -1.170



CATBOOST

In [14]:
#Test gridsearch Catboost
#https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier_fit
#https://catboost.ai/en/docs/concepts/python-reference_catboost_grid_search
Modelcb = CatBoostClassifier(**cb_params)
Modelcb.grid_search(param_grid_catboost, 
                                       X=X_train, 
                                       y=y_train,                                       
                                       plot=True)



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

bestTest = 0.5361964018
bestIteration = 999
0:	loss: 0.5361964	best: 0.5361964 (0)	total: 24.9s	remaining: 12m 1s
bestTest = 0.519305693
bestIteration = 960
1:	loss: 0.5193057	best: 0.5193057 (1)	total: 49.6s	remaining: 11m 34s
bestTest = 0.536768089
bestIteration = 999
2:	loss: 0.5367681	best: 0.5193057 (1)	total: 1m 14s	remaining: 11m 13s
bestTest = 0.5191778978
bestIteration = 999
3:	loss: 0.5191779	best: 0.5191779 (3)	total: 1m 39s	remaining: 10m 46s
bestTest = 0.5373810387
bestIteration = 999
4:	loss: 0.5373810	best: 0.5191779 (3)	total: 2m 4s	remaining: 10m 21s
bestTest = 0.5181617021
bestIteration = 999
5:	loss: 0.5181617	best: 0.5181617 (5)	total: 2m 29s	remaining: 9m 56s
bestTest = 0.5374153875
bestIteration = 999
6:	loss: 0.5374154	best: 0.5181617 (5)	total: 2m 53s	remaining: 9m 30s
bestTest = 0.5234199733
bestIteration = 996
7:	loss: 0.5234200	best: 0.5181617 (5)	total: 3m 18s	remaining: 9m 5s
bestTest = 0.5381173799
bestIteration = 999
8:	loss: 0.5381174	best: 0.5181617 (5)

{'params': {'depth': 10, 'l2_leaf_reg': 7, 'learning_rate': 0.1},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               45,
               46,
    

In [15]:
Modelcb.get_best_iteration()

In [16]:
display(Modelcb.best_score_)
display(Modelcb.is_fitted())
display(Modelcb.get_params())
display(Modelcb.get_all_params())
display(Modelcb.score(X_test, y_test))

{'learn': {'Logloss': 0.2931492392988256}}

True

{'loss_function': 'Logloss',
 'verbose': False,
 'task_type': 'GPU',
 'boost_from_average': True,
 'depth': 10,
 'l2_leaf_reg': 7,
 'learning_rate': 0.1}

{'nan_mode': 'Min',
 'gpu_ram_part': 0.95,
 'eval_metric': 'Logloss',
 'iterations': 1000,
 'fold_permutation_block': 64,
 'leaf_estimation_method': 'Newton',
 'observations_to_bootstrap': 'TestOnly',
 'random_score_type': 'NormalWithModelSizeDecrease',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Ordered',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'devices': '-1',
 'eval_fraction': 0,
 'pinned_memory_bytes': '104857600',
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 7,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': True,
 'gpu_cat_features_storage': 'GpuRam',
 'fold_size_loss_normalization': False,
 'model_size_reg': 0.5,
 'pool_metainfo_options': {'tags': {}},
 'use_best_model': False,
 'meta_l2_frequency': 0,
 'class_names': [0, 1],
 'random_seed': 0,
 'depth': 10,
 'has_time': False,
 'fold_len_multiplier': 2,
 'border_count': 128,
 'min_fold_size': 100,
 'data_partition': 'FeaturePa

0.8238095238095238

In [43]:
#Save
Modelcb.save_model('Modelcb',
                    format="json",
                    export_parameters=None,
                    pool=None)

#Load
Modelcbloaded = CatBoostClassifier()
Modelcbloaded.load_model('Modelcb', format='json')

<catboost.core.CatBoostClassifier at 0x2cb4c485bd0>

In [19]:
res = Modelcatboost.calc_feature_statistics(X_train,
                                    y_train,
                                    feature=2,
                                    plot=True)

#XGBOOST

In [20]:
#Test gridsearch XGboost
#https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBClassifier
#scoring - https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
Modelxgboost = XGBClassifier(**xgb_params)

grid_search_xgb = GridSearchCV(Modelxgboost, param_grid_xgboost, cv=5, scoring='roc_auc', return_train_score=True, verbose=True, n_jobs = 10)
grid_search_xgb .fit(X_train, y_train)

display(grid_search_xgb.best_estimator_)
display(grid_search_xgb.best_params_)
display(grid_search_xgb.best_score_)

Modelxgboost = grid_search_xgb.best_estimator_.fit(X_train, y_train)


Fitting 5 folds for each of 12 candidates, totalling 60 fits



is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


Parameters: { "@tree_method" } are not used.




{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150}

0.8090916032029822


is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


Parameters: { "@tree_method" } are not used.




#Use and test model

In [21]:
#Use better model and parametres
#ModelLR = pipe_ML(ml_pipe_LR, param_grid_LR)[1]
ModelLR.score(X_test, y_test)

0.6923520923520924

In [22]:
#ModelGBC = pipe_ML(ml_pipe_GBC, param_grid_GBC)[1]
ModelGBC.score(X_test, y_test)

0.7304473304473305

In [23]:
#Predict data that must give 1
test1 = dataset[dataset['default.payment.next.month']==1].iloc[[11]].drop('default.payment.next.month', axis=1)
display(ModelLR.predict(test1)[0])
display(ModelGBC.predict(test1)[0])

1

0

In [83]:
data = dataset[dataset['default.payment.next.month']==1]
data = data.drop('default.payment.next.month', axis = 1)
data

Unnamed: 0,limit_bal,age,pay_0,pay_2,pay_3,pay_4,pay_5,pay_6,bill_amt1,bill_amt2,...,education:1,education:2,education:3,education:4,education:5,education:6,marriage:0,marriage:1,marriage:2,marriage:3
10,220000.0,38.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,22145.0,5529.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12,100000.0,27.0,-1.0,2.0,0.0,0.0,0.0,0.0,17553.0,10628.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13,80000.0,30.0,2.0,0.0,0.0,-1.0,-1.0,-2.0,4794.0,4989.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
14,120000.0,61.0,1.0,2.0,0.0,0.0,0.0,0.0,121709.0,78369.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
19,140000.0,24.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,16343.0,1462.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23992,280000.0,32.0,1.0,-2.0,-1.0,0.0,0.0,-2.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
23993,160000.0,42.0,0.0,0.0,0.0,0.0,0.0,0.0,165686.0,169969.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
23996,200000.0,37.0,2.0,2.0,2.0,2.0,2.0,2.0,157131.0,166590.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
23998,70000.0,25.0,0.0,0.0,0.0,0.0,2.0,2.0,73939.0,70488.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [25]:
resultLR = pd.DataFrame(ModelLR.predict(data))
resultLR.value_counts()

#Many False Negative

1    2949
0    2374
Name: count, dtype: int64

In [26]:
resultGBC = pd.DataFrame(ModelGBC.predict(data))
resultGBC.value_counts()

#Many False Negative

1    3367
0    1956
Name: count, dtype: int64

In [27]:
resultRidge = pd.DataFrame(ModelRidge.predict(data))
resultRidge.value_counts()

0.539477    2
0.601285    2
0.871622    2
0.610600    2
0.592988    2
           ..
0.449398    1
0.449349    1
0.449271    1
0.449021    1
2.273782    1
Name: count, Length: 5318, dtype: int64

In [33]:
resultCatboost = pd.DataFrame(Modelcb.predict(data))
resultCatboost.value_counts()

1    4613
0     710
Name: count, dtype: int64

In [29]:
resultxgboost = pd.DataFrame(Modelxgboost.predict(data))
resultxgboost.value_counts()


is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead



1    3392
0    1931
Name: count, dtype: int64

In [44]:
#From saved model
resultCatboostloaded = pd.DataFrame(Modelcbloaded.predict(data))
resultCatboostloaded.value_counts()

1    4613
0     710
Name: count, dtype: int64

DATA Analysis

In [45]:
dataset

Unnamed: 0,limit_bal,age,pay_0,pay_2,pay_3,pay_4,pay_5,pay_6,bill_amt1,bill_amt2,...,education:2,education:3,education:4,education:5,education:6,marriage:0,marriage:1,marriage:2,marriage:3,default.payment.next.month
0,80000.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,75125.0,77353.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
1,30000.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,29242.0,29507.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
2,180000.0,44.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,20916.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
3,60000.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,58839.0,53235.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
4,130000.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,111587.0,112348.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23995,50000.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,52475.0,53600.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
23996,200000.0,37.0,2.0,2.0,2.0,2.0,2.0,2.0,157131.0,166590.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
23997,50000.0,26.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
23998,70000.0,25.0,0.0,0.0,0.0,0.0,2.0,2.0,73939.0,70488.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1


In [46]:
X_train, X_test, y_train, y_test = train_test_split(
        datasetnew.drop('default.payment.next.month', axis=1),
        datasetnew['default.payment.next.month'],
        test_size=0.2,
        stratify=datasetnew['default.payment.next.month'],
        random_state=42)


standardscaler = StandardScaler()
standardscaler.fit(X_train)
X_train_standardscaled = standardscaler.transform(X_train)
X_test_standardscaled = standardscaler.transform(X_test)

In [58]:
Modelcbscaled = CatBoostClassifier(**cb_params)
Modelcbscaled.grid_search(param_grid_catboost, 
                                       X=X_train_standardscaled, 
                                       y=y_train,                                       
                                       plot=True)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

bestTest = 0.5356983003
bestIteration = 999
0:	loss: 0.5356983	best: 0.5356983 (0)	total: 26.7s	remaining: 12m 55s
bestTest = 0.5219141312
bestIteration = 999
1:	loss: 0.5219141	best: 0.5219141 (1)	total: 53.5s	remaining: 12m 28s
bestTest = 0.536768089
bestIteration = 999
2:	loss: 0.5367681	best: 0.5219141 (1)	total: 1m 20s	remaining: 12m 4s
bestTest = 0.5203946376
bestIteration = 999
3:	loss: 0.5203946	best: 0.5203946 (3)	total: 1m 47s	remaining: 11m 39s
bestTest = 0.5373842094
bestIteration = 999
4:	loss: 0.5373842	best: 0.5203946 (3)	total: 2m 14s	remaining: 11m 11s
bestTest = 0.5180760944
bestIteration = 995
5:	loss: 0.5180761	best: 0.5180761 (5)	total: 2m 40s	remaining: 10m 43s
bestTest = 0.5374153434
bestIteration = 999
6:	loss: 0.5374153	best: 0.5180761 (5)	total: 3m 6s	remaining: 10m 11s
bestTest = 0.5231540785
bestIteration = 999
7:	loss: 0.5231541	best: 0.5180761 (5)	total: 3m 32s	remaining: 9m 43s
bestTest = 0.5381174239
bestIteration = 999
8:	loss: 0.5381174	best: 0.5180761

{'params': {'depth': 10, 'l2_leaf_reg': 3, 'learning_rate': 0.1},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               45,
               46,
    

In [60]:
display(Modelcbscaled.best_score_)
display(Modelcbscaled.is_fitted())
display(Modelcbscaled.get_params())
display(Modelcbscaled.get_all_params())
display(Modelcbscaled.score(X_test, y_test))  #in unscaled was 0.8238095238095238

#HERE IS - FILL IT 0.8164502164502164 Why it became worse after sceling? But in the pipeline below with scaling - the result is 0.9111111111111111!

{'learn': {'Logloss': 0.30742515886581756}}

True

{'loss_function': 'Logloss',
 'verbose': False,
 'task_type': 'GPU',
 'boost_from_average': True,
 'depth': 10,
 'l2_leaf_reg': 3,
 'learning_rate': 0.1}

{'nan_mode': 'Min',
 'gpu_ram_part': 0.95,
 'eval_metric': 'Logloss',
 'iterations': 1000,
 'fold_permutation_block': 64,
 'leaf_estimation_method': 'Newton',
 'observations_to_bootstrap': 'TestOnly',
 'random_score_type': 'NormalWithModelSizeDecrease',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Ordered',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'devices': '-1',
 'eval_fraction': 0,
 'pinned_memory_bytes': '104857600',
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 3,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': True,
 'gpu_cat_features_storage': 'GpuRam',
 'fold_size_loss_normalization': False,
 'model_size_reg': 0.5,
 'pool_metainfo_options': {'tags': {}},
 'use_best_model': False,
 'meta_l2_frequency': 0,
 'class_names': [0, 1],
 'random_seed': 0,
 'depth': 10,
 'has_time': False,
 'fold_len_multiplier': 2,
 'border_count': 128,
 'min_fold_size': 100,
 'data_partition': 'FeaturePa

0.5411255411255411

In [66]:
resultModelcbscaled = pd.DataFrame(Modelcbscaled.predict(data))
resultModelcbscaled.value_counts()
#Very bad result after this method of scaling!!!

0    5298
1      25
Name: count, dtype: int64

In [67]:
#Save
Modelcbscaled.save_model('Modelcbscaled',
                    format="json",
                    export_parameters=None,
                    pool=None)

Pipeline with catboost

In [148]:
#Then try bootstrap_type in catboost parameters. Use better hyperparameters 'learning_rate': [0.1], 'depth': [10], 'l2_leaf_reg': [7].
#https://makesomecode.me/2021/11/intro-to-optuna/
#without this method of scaling

estimator = CatBoostClassifier(**cb_best_params, verbose=False)

CB_pipeline = Pipeline([
    ('preprocessor', transformer),
    ('classifier', estimator)
])
CB_pipeline.fit(X_train, y_train)

In [150]:
display(CB_pipeline.score(X_test, y_test))

0.9261183261183261

In [151]:
data1 = dataset[dataset['default.payment.next.month']==1]
data1 = data1.drop('default.payment.next.month', axis = 1)

In [152]:
resultCatboostpipe = pd.DataFrame(CB_pipeline.predict(data1))
resultCatboostpipe.value_counts()

1    5290
0      33
Name: count, dtype: int64

In [153]:
data0 = dataset[dataset['default.payment.next.month']==0]
data0 = data0.drop('default.payment.next.month', axis = 1)

In [154]:
resultCatboostpipe = pd.DataFrame(CB_pipeline.predict(data0))
resultCatboostpipe.value_counts()

0    18349
1      328
Name: count, dtype: int64

In [137]:
#Catboost with real dataset - worseresult

X_train1, X_test1, y_train1, y_test1 = train_test_split(
        dataset.drop('default.payment.next.month', axis=1),
        dataset['default.payment.next.month'],
        test_size=0.2,
        stratify=dataset['default.payment.next.month'],
        random_state=42)

In [138]:
estimator1 = CatBoostClassifier(**cb_best_params, verbose=False)

CB_pipeline1 = Pipeline([
    ('preprocessor', transformer),
    ('classifier', estimator1)
])
CB_pipeline1.fit(X_train1, y_train1)

In [139]:
display(CB_pipeline1.score(X_test1, y_test1))

0.80125

In [140]:
resultCatboostpipe = pd.DataFrame(CB_pipeline1.predict(data0))
resultCatboostpipe.value_counts()

0    18536
1      141
Name: count, dtype: int64

In [141]:
resultCatboostpipe = pd.DataFrame(CB_pipeline1.predict(data1))
resultCatboostpipe.value_counts()

1    4728
0     595
Name: count, dtype: int64

In [None]:
#On 19.10.2023:
'''
- use multiplication of low target class
- use     cb_best_params = {'learning_rate': 0.1,
                  'depth': 10,
                  'task_type': "GPU",
                  'boost_from_average': True,
                  'loss_function': 'Logloss',
                  'l2_leaf_reg': 7
#                       'bootstrap_type': ['Bayesian', 'Bernoulli', 'MVS']
                    }

- use 
    
        
    estimator = CatBoostClassifier(**cb_best_params, verbose=False)

    CB_pipeline = Pipeline([
        ('preprocessor', transformer),
        ('classifier', estimator)
    ])
    CB_pipeline.fit(X_train, y_train)    
    
- use split - 20% test, 80% train
   

Task - use feature creation, importance and corellation    
    
'''