VISUALIZATION OF GRIDSEARCH WITH TQDM, USING PIPELINES AND SEVERAL METHODS

In [28]:
#Gridsearchcv and Randomizedsearchcv - let's use a combined validation that combines random partitioning into training and test samples and k-block cross-validation

import pandas as pd
import numpy as np
from sklearn.model_selection import (train_test_split,
                                    KFold,
                                    ParameterGrid,
                                    cross_val_score,
                                    GridSearchCV,
                                    RandomizedSearchCV)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (StandardScaler,
                                   OneHotEncoder)
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from category_encoders import WOEEncoder, SumEncoder
from tqdm import tqdm_notebook
from datasets import load_dataset
from tqdm import tqdm
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.preprocessing import PolynomialFeatures

import catboost
import xgboost
from catboost import CatBoostClassifier
from catboost import CatBoostRegressor
from xgboost import XGBClassifier

In [2]:
#https://huggingface.co/datasets/mstz/speeddating
#dataset = load_dataset("mstz/speeddating")["train"]
#https://huggingface.co/datasets/imodels/credit-card
dataset = load_dataset("imodels/credit-card")['train']


In [3]:
dataset = pd.DataFrame(dataset)
dataset

Unnamed: 0,limit_bal,age,pay_0,pay_2,pay_3,pay_4,pay_5,pay_6,bill_amt1,bill_amt2,...,education:2,education:3,education:4,education:5,education:6,marriage:0,marriage:1,marriage:2,marriage:3,default.payment.next.month
0,80000.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,75125.0,77353.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
1,30000.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,29242.0,29507.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
2,180000.0,44.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,20916.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
3,60000.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,58839.0,53235.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
4,130000.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,111587.0,112348.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23995,50000.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,52475.0,53600.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
23996,200000.0,37.0,2.0,2.0,2.0,2.0,2.0,2.0,157131.0,166590.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
23997,50000.0,26.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
23998,70000.0,25.0,0.0,0.0,0.0,0.0,2.0,2.0,73939.0,70488.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1


In [4]:
dataset['default.payment.next.month'].value_counts() #-> not fine distribution

default.payment.next.month
0    18677
1     5323
Name: count, dtype: int64

In [5]:
#Add less class - important. Shows better results for GradientBoostingClassifier, but worse for LR
diff = len(dataset.loc[dataset['default.payment.next.month']==0])//len(dataset.loc[dataset['default.payment.next.month']==1])
df_1 = dataset.loc[dataset['default.payment.next.month']==1]
df_1 = df_1.loc[df_1.index.repeat(diff)]
datasetnew = pd.concat([dataset.loc[dataset['default.payment.next.month']==0], df_1]).sample(frac=1)

In [6]:
datasetnew['default.payment.next.month'].value_counts() 

default.payment.next.month
0    18677
1    15969
Name: count, dtype: int64

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
        datasetnew.drop('default.payment.next.month', axis=1),
        datasetnew['default.payment.next.month'],
        test_size=0.2,
        stratify=datasetnew['default.payment.next.month'],
        random_state=42)

In [8]:
#Categorical and numerical columns
cat_columns = X_train.select_dtypes(
    include='object').columns.tolist()
num_columns = X_train.select_dtypes(
    exclude='object').columns.tolist()


In [9]:
cat_columns

[]

In [122]:
#Pipeline for num and cat
num_pipe = Pipeline([
    ('imp', SimpleImputer()),
    ('scaler', StandardScaler())
])

cat_pipe = Pipeline([
    ('imp', SimpleImputer(strategy='constant')),
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])


transformers = [('num', num_pipe, num_columns),
                ('cat', cat_pipe, cat_columns)]

transformer = ColumnTransformer(transformers=transformers)


#Pipeline for LR
ml_pipe_LR = Pipeline([
    ('tf', transformer),
    ('logreg', LogisticRegression(solver='lbfgs',
    max_iter=200))
])

#Pipeline for GBC
ml_pipe_GBC = Pipeline([
    ('preprocessor', transformer),
    ('classifier', GradientBoostingClassifier(
    random_state=42, subsample=0.8))
])

#Pipeline for ridge
ml_pipe_ridge = Pipeline([
    ('preprocessor', transformer),
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(2, interaction_only=True, include_bias=False)),  #3 works better, ROC-AUC show bad results
    ('model', Ridge())
])


#Pipeline for Catboost
cb_params = {'verbose': False,
             'boost_from_average': True,
             'loss_function': 'Logloss'
            }
ml_pipe_catboost = Pipeline([
    ('preprocessor', transformer),
    ('classifier', CatBoostClassifier(**cb_params, task_type="GPU"))
])


#Pipeline for XGboost

xgb_params = {'n_estimators': 280,
              'learning_rate': 0.05,
              'max_depth': 10,
              'subsample': 1.0,
              'colsample_bytree': 1.0,
              '@tree_method': 'hist',
              'enable_categorical': True,
              'verbosity': 1,
              'min_child_weight': 3,
              'random_state': 1}

ml_pipe_xgboost = Pipeline([
    ('preprocessor', transformer),
    ('classifier', XGBClassifier(**xgb_params))
])

#Gridsearch parameters
param_grid_LR = {
    'tf__num__imp__strategy': ['mean', 'median', 'constant'],
    'tf__cat__imp__strategy': ['most_frequent', 'constant'],
    'logreg__C': [.01, .1, .5, 1, 5, 10, 100]
}

param_grid_GBC = [{'classifier__max_depth': [2, 4],
                   'classifier__n_estimators': [50, 100]}]

param_grid_Ridge = {
    'model__alpha': np.arange(0, 0.2, 0.01)}

#from here - https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier_grid_search                
param_grid_catboost = {'learning_rate': [0.03, 0.1],
                       'depth': [4, 6, 10],
                       'l2_leaf_reg': [1, 3, 5, 7, 9]
                    }

param_grid_xgboost = {
#     'max_depth': range (2, 10, 2),
#     'n_estimators': range(100, 200, 50),
#     'learning_rate': [0.1, 0.01, 0.05]
}


In [123]:
def pipe_ML(pipe, grid):
       scores = []
       scoring = 'roc_auc' 
       print("Looking for best model hyperparameters...")

       #Use tqdm and ParameterGrid

       for param in tqdm(list(ParameterGrid(grid)),
                            desc='Done'):

              pipe.set_params(**param)

              scores.append([param, cross_val_score(pipe,
                                                        X_train,
                                                        y_train,
                                                        scoring=scoring,
                                                        cv=5)])

              scores[-1].append(sum(scores[-1][1]) / len(scores[-1][1]))

       scores.sort(reverse=True, key=lambda x: x[2])

       best_params = scores[0][0]
       print("Best hyperparameters:",
              best_params, sep='\n', end='\n')

       best_score = scores[0][2]
       print("Best meaning is %s: %.3f" % (scoring, best_score))

       model = pipe.set_params(**best_params).fit(X_train, y_train)

       test_score = pipe.score(X_test, y_test)

       print("Meaning %s on test: %.3f" % (scoring, test_score))
       
       return ({best_score: best_params}, model) 

In [124]:
#2 Works:
#grids = [(ml_pipe_LR, param_grid_LR), (ml_pipe_GBC, param_grid_GBC)]
#grid_dict = {0: 'LR', 1: 'GBC'}

#only ridge
#grids = [(ml_pipe_ridge, param_grid_Ridge)]
#grid_dict = {0: 'Ridge'}

#only Catboost - did not work with gridsearch!?
#grids = [(ml_pipe_catboost, param_grid_catboost)]
#grid_dict = {0: 'Catboost'}

#only XGboost
grids = [(ml_pipe_xgboost, param_grid_xgboost)]
grid_dict = {0: 'xgboost'}


#3 works, no catboost
#grids = [(ml_pipe_LR, param_grid_LR), (ml_pipe_GBC, param_grid_GBC), (ml_pipe_ridge, param_grid_Ridge)]
#grid_dict = {0: 'LR', 1: 'GBC', 2: 'Ridge'}



In [125]:
auc_list = []
best_clf = 0
for idx, gs in enumerate(grids):
    print("Method: %s" % grid_dict[idx])
    #pipe_ML(grids[idx][0], grids[idx][1])
    exec(f"Model{grid_dict[idx]} = pipe_ML(grids[idx][0], grids[idx][1])[1]")
    print()

Method: xgboost
Looking for best model hyperparameters...


  if is_sparse(data):
Parameters: { "@tree_method" } are not used.

  if is_sparse(data):
Parameters: { "@tree_method" } are not used.

  if is_sparse(data):
Parameters: { "@tree_method" } are not used.

  if is_sparse(data):
Parameters: { "@tree_method" } are not used.

  if is_sparse(data):
Parameters: { "@tree_method" } are not used.

Done: 100%|██████████| 1/1 [00:06<00:00,  6.33s/it]
  if is_sparse(data):
Parameters: { "@tree_method" } are not used.



Best hyperparameters:
{}
Best meaning is roc_auc: 0.903
Meaning roc_auc on test: 0.844



In [14]:
#Test gridsearch Catboost
ModelCatboost = CatBoostClassifier(task_type="GPU")
grid_search_result = ModelCatboost.grid_search(param_grid_catboost, 
                                       X=X_train, 
                                       y=y_train,                                       
                                       plot=True)
print(ModelCatboost.best_score_)
print(ModelCatboost.best_iteration_)
print(ModelCatboost.eval_metrics)
print(ModelCatboost.score(X_test, y_test))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6862356	test: 0.6862002	best: 0.6862002 (0)	total: 27.6ms	remaining: 27.5s
1:	learn: 0.6792929	test: 0.6791694	best: 0.6791694 (1)	total: 54.1ms	remaining: 27s
2:	learn: 0.6731580	test: 0.6730142	best: 0.6730142 (2)	total: 80.5ms	remaining: 26.7s
3:	learn: 0.6670614	test: 0.6668677	best: 0.6668677 (3)	total: 106ms	remaining: 26.3s
4:	learn: 0.6614858	test: 0.6614155	best: 0.6614155 (4)	total: 131ms	remaining: 26.1s
5:	learn: 0.6561107	test: 0.6560450	best: 0.6560450 (5)	total: 156ms	remaining: 25.8s
6:	learn: 0.6510554	test: 0.6510572	best: 0.6510572 (6)	total: 181ms	remaining: 25.7s
7:	learn: 0.6465854	test: 0.6465529	best: 0.6465529 (7)	total: 206ms	remaining: 25.5s
8:	learn: 0.6420191	test: 0.6419832	best: 0.6419832 (8)	total: 231ms	remaining: 25.5s
9:	learn: 0.6380206	test: 0.6379522	best: 0.6379522 (9)	total: 256ms	remaining: 25.3s
10:	learn: 0.6341021	test: 0.6341200	best: 0.6341200 (10)	total: 282ms	remaining: 25.4s
11:	learn: 0.6302724	test: 0.6302130	best: 0.630213

In [26]:
ModelCatboost

TypeError: 'CatBoostClassifier' object is not callable

#Use and test model

In [16]:
#Use better model and parametres
#ModelLR = pipe_ML(ml_pipe_LR, param_grid_LR)[1]
ModelLR.score(X_test, y_test)

0.6965367965367966

In [17]:
#ModelGBC = pipe_ML(ml_pipe_GBC, param_grid_GBC)[1]
ModelGBC.score(X_test, y_test)

0.7396825396825397

In [18]:
#Predict data that must give 1
test1 = dataset[dataset['default.payment.next.month']==1].iloc[[11]].drop('default.payment.next.month', axis=1)
display(ModelLR.predict(test1)[0])
display(ModelGBC.predict(test1)[0])

1

0

In [19]:
data = dataset[dataset['default.payment.next.month']==1]
data = data.drop('default.payment.next.month', axis = 1)
data

Unnamed: 0,limit_bal,age,pay_0,pay_2,pay_3,pay_4,pay_5,pay_6,bill_amt1,bill_amt2,...,education:1,education:2,education:3,education:4,education:5,education:6,marriage:0,marriage:1,marriage:2,marriage:3
10,220000.0,38.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,22145.0,5529.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12,100000.0,27.0,-1.0,2.0,0.0,0.0,0.0,0.0,17553.0,10628.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13,80000.0,30.0,2.0,0.0,0.0,-1.0,-1.0,-2.0,4794.0,4989.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
14,120000.0,61.0,1.0,2.0,0.0,0.0,0.0,0.0,121709.0,78369.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
19,140000.0,24.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,16343.0,1462.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23992,280000.0,32.0,1.0,-2.0,-1.0,0.0,0.0,-2.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
23993,160000.0,42.0,0.0,0.0,0.0,0.0,0.0,0.0,165686.0,169969.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
23996,200000.0,37.0,2.0,2.0,2.0,2.0,2.0,2.0,157131.0,166590.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
23998,70000.0,25.0,0.0,0.0,0.0,0.0,2.0,2.0,73939.0,70488.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [20]:
resultLR = pd.DataFrame(ModelLR.predict(data))
resultLR.value_counts()

#Many False Negative

1    2975
0    2348
Name: count, dtype: int64

In [21]:
resultGBC = pd.DataFrame(ModelGBC.predict(data))
resultGBC.value_counts()

#Many False Negative

1    3374
0    1949
Name: count, dtype: int64

In [22]:
resultRidge = pd.DataFrame(ModelRidge.predict(data))
resultRidge.value_counts()

0.587507    2
0.603352    2
0.582077    2
0.551028    2
0.879634    2
           ..
0.450199    1
0.450163    1
0.450157    1
0.450135    1
2.392722    1
Name: count, Length: 5318, dtype: int64

In [23]:
resultCatboost = pd.DataFrame(ModelCatboost.predict(data))
resultCatboost.value_counts()

1    4422
0     901
Name: count, dtype: int64