VISUALIZATION OF GRIDSEARCH WITH TQDM, USING PIPELINES AND SEVERAL METHODS

In [1]:
#Gridsearchcv and Randomizedsearchcv - let's use a combined validation that combines random partitioning into training and test samples and k-block cross-validation

import pandas as pd
import numpy as np
from sklearn.model_selection import (train_test_split,
                                    KFold,
                                    ParameterGrid,
                                    cross_val_score,
                                    GridSearchCV,
                                    RandomizedSearchCV)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (StandardScaler,
                                   OneHotEncoder)
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from category_encoders import WOEEncoder, SumEncoder
from tqdm import tqdm_notebook
from datasets import load_dataset
from tqdm import tqdm
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.preprocessing import PolynomialFeatures

import catboost
from catboost import CatBoostClassifier
from catboost import CatBoostRegressor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#https://huggingface.co/datasets/mstz/speeddating
#dataset = load_dataset("mstz/speeddating")["train"]
#https://huggingface.co/datasets/imodels/credit-card
dataset = load_dataset("imodels/credit-card")['train']


In [3]:
dataset = pd.DataFrame(dataset)
dataset

Unnamed: 0,limit_bal,age,pay_0,pay_2,pay_3,pay_4,pay_5,pay_6,bill_amt1,bill_amt2,...,education:2,education:3,education:4,education:5,education:6,marriage:0,marriage:1,marriage:2,marriage:3,default.payment.next.month
0,80000.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,75125.0,77353.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
1,30000.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,29242.0,29507.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
2,180000.0,44.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,20916.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
3,60000.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,58839.0,53235.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
4,130000.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,111587.0,112348.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23995,50000.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,52475.0,53600.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
23996,200000.0,37.0,2.0,2.0,2.0,2.0,2.0,2.0,157131.0,166590.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
23997,50000.0,26.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
23998,70000.0,25.0,0.0,0.0,0.0,0.0,2.0,2.0,73939.0,70488.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1


In [4]:
dataset['default.payment.next.month'].value_counts() #-> not fine distribution

default.payment.next.month
0    18677
1     5323
Name: count, dtype: int64

In [5]:
#Add less class - important. Shows better results for GradientBoostingClassifier, but worse for LR
diff = len(dataset.loc[dataset['default.payment.next.month']==0])//len(dataset.loc[dataset['default.payment.next.month']==1])
df_1 = dataset.loc[dataset['default.payment.next.month']==1]
df_1 = df_1.loc[df_1.index.repeat(diff)]
datasetnew = pd.concat([dataset.loc[dataset['default.payment.next.month']==0], df_1]).sample(frac=1)

In [6]:
datasetnew['default.payment.next.month'].value_counts() 

default.payment.next.month
0    18677
1    15969
Name: count, dtype: int64

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
        datasetnew.drop('default.payment.next.month', axis=1),
        datasetnew['default.payment.next.month'],
        test_size=0.2,
        stratify=datasetnew['default.payment.next.month'],
        random_state=42)

In [8]:
#Categorical and numerical columns
cat_columns = X_train.select_dtypes(
    include='object').columns.tolist()
num_columns = X_train.select_dtypes(
    exclude='object').columns.tolist()


In [9]:
cat_columns

[]

In [10]:
#Pipeline for num and cat
num_pipe = Pipeline([
    ('imp', SimpleImputer()),
    ('scaler', StandardScaler())
])

cat_pipe = Pipeline([
    ('imp', SimpleImputer(strategy='constant')),
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])


transformers = [('num', num_pipe, num_columns),
                ('cat', cat_pipe, cat_columns)]

transformer = ColumnTransformer(transformers=transformers)


#Pipeline for LR
ml_pipe_LR = Pipeline([
    ('tf', transformer),
    ('logreg', LogisticRegression(solver='lbfgs',
    max_iter=200))
])

#Pipeline for GBC
ml_pipe_GBC = Pipeline([
    ('preprocessor', transformer),
    ('classifier', GradientBoostingClassifier(
    random_state=42, subsample=0.8))
])

#Pipeline for ridge
ml_pipe_ridge = Pipeline([
    ('preprocessor', transformer),
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(2, interaction_only=True, include_bias=False)),  #3 works better, ROC-AUC show bad results
    ('model', Ridge())
])


#Pipeline for Catboost
cb_params = {'verbose': False,
             'boost_from_average': True,
             'loss_function': 'Logloss'
            }
ml_pipe_catboost = Pipeline([
    ('preprocessor', transformer),
    ('classifier', CatBoostClassifier(**cb_params))
])

#Gridsearch parameters
param_grid_LR = {
    'tf__num__imp__strategy': ['mean', 'median', 'constant'],
    'tf__cat__imp__strategy': ['most_frequent', 'constant'],
    'logreg__C': [.01, .1, .5, 1, 5, 10, 100]
}

param_grid_GBC = [{'classifier__max_depth': [2, 4],
                   'classifier__n_estimators': [50, 100]}]

param_grid_Ridge = {
    'model__alpha': np.arange(0, 0.2, 0.01)}

#from here - https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier_grid_search
param_grid_catboost = {
#    'learning_rate': [0.03, 0.1],
#                       'depth': [4, 6, 10],
#                       'l2_leaf_reg': [1, 3, 5, 7, 9]
                }


In [11]:
def pipe_ML(pipe, grid):
       scores = []
       scoring = 'roc_auc' 
       print("Looking for best model hyperparameters...")

       #Use tqdm and ParameterGrid

       for param in tqdm(list(ParameterGrid(grid)),
                            desc='Done'):

              pipe.set_params(**param)

              scores.append([param, cross_val_score(pipe,
                                                        X_train,
                                                        y_train,
                                                        scoring=scoring,
                                                        cv=5)])

              scores[-1].append(sum(scores[-1][1]) / len(scores[-1][1]))

       scores.sort(reverse=True, key=lambda x: x[2])

       best_params = scores[0][0]
       print("Best hyperparameters:",
              best_params, sep='\n', end='\n')

       best_score = scores[0][2]
       print("Best meaning is %s: %.3f" % (scoring, best_score))

       model = pipe.set_params(**best_params).fit(X_train, y_train)

       test_score = pipe.score(X_test, y_test)

       print("Meaning %s on test: %.3f" % (scoring, test_score))
       
       return ({best_score: best_params}, model) 

In [12]:
#2 Works:
#grids = [(ml_pipe_LR, param_grid_LR), (ml_pipe_GBC, param_grid_GBC)]
#grid_dict = {0: 'LR', 1: 'GBC'}

#only ridge
#grids = [(ml_pipe_ridge, param_grid_Ridge)]
#grid_dict = {0: 'Ridge'}

#only Catboost - did not work with gridsearch!?
#grids = [(ml_pipe_catboost, param_grid_catboost)]
#grid_dict = {0: 'Catboost'}

#4 works
grids = [(ml_pipe_LR, param_grid_LR), (ml_pipe_GBC, param_grid_GBC), (ml_pipe_ridge, param_grid_Ridge), (ml_pipe_catboost, param_grid_catboost)]
grid_dict = {0: 'LR', 1: 'GBC', 2: 'Ridge', 3: 'Catboost'}



In [13]:
auc_list = []
best_clf = 0
for idx, gs in enumerate(grids):
    print("Method: %s" % grid_dict[idx])
    #pipe_ML(grids[idx][0], grids[idx][1])
    exec(f"Model{grid_dict[idx]} = pipe_ML(grids[idx][0], grids[idx][1])[1]")
    print()

Method: LR
Looking for best model hyperparameters...


Done: 100%|██████████| 42/42 [00:30<00:00,  1.36it/s]


Best hyperparameters:
{'logreg__C': 10, 'tf__cat__imp__strategy': 'most_frequent', 'tf__num__imp__strategy': 'mean'}
Best meaning is roc_auc: 0.720
Meaning roc_auc on test: 0.708

Method: GBC
Looking for best model hyperparameters...


Done: 100%|██████████| 4/4 [02:33<00:00, 38.37s/it]


Best hyperparameters:
{'classifier__max_depth': 4, 'classifier__n_estimators': 100}
Best meaning is roc_auc: 0.805
Meaning roc_auc on test: 0.748

Method: Ridge
Looking for best model hyperparameters...


Done: 100%|██████████| 20/20 [00:37<00:00,  1.87s/it]


Best hyperparameters:
{'model__alpha': 0.19}
Best meaning is roc_auc: 0.744
Meaning roc_auc on test: 0.184

Method: Catboost
Looking for best model hyperparameters...


Done: 100%|██████████| 1/1 [00:42<00:00, 42.38s/it]


Best hyperparameters:
{}
Best meaning is roc_auc: 0.852
Meaning roc_auc on test: 0.787



In [14]:
ModelCatboost #now the best

#Use and test model

In [15]:
#Use better model and parametres
#ModelLR = pipe_ML(ml_pipe_LR, param_grid_LR)[1]
ModelLR.score(X_test, y_test)

0.707936507936508

In [16]:
#ModelGBC = pipe_ML(ml_pipe_GBC, param_grid_GBC)[1]
ModelGBC.score(X_test, y_test)

0.7480519480519481

In [17]:
#Predict data that must give 1
test1 = dataset[dataset['default.payment.next.month']==1].iloc[[11]].drop('default.payment.next.month', axis=1)
display(ModelLR.predict(test1)[0])
display(ModelGBC.predict(test1)[0])

1

0

In [18]:
data = dataset[dataset['default.payment.next.month']==1]
data = data.drop('default.payment.next.month', axis = 1)
data

Unnamed: 0,limit_bal,age,pay_0,pay_2,pay_3,pay_4,pay_5,pay_6,bill_amt1,bill_amt2,...,education:1,education:2,education:3,education:4,education:5,education:6,marriage:0,marriage:1,marriage:2,marriage:3
10,220000.0,38.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,22145.0,5529.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12,100000.0,27.0,-1.0,2.0,0.0,0.0,0.0,0.0,17553.0,10628.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13,80000.0,30.0,2.0,0.0,0.0,-1.0,-1.0,-2.0,4794.0,4989.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
14,120000.0,61.0,1.0,2.0,0.0,0.0,0.0,0.0,121709.0,78369.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
19,140000.0,24.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,16343.0,1462.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23992,280000.0,32.0,1.0,-2.0,-1.0,0.0,0.0,-2.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
23993,160000.0,42.0,0.0,0.0,0.0,0.0,0.0,0.0,165686.0,169969.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
23996,200000.0,37.0,2.0,2.0,2.0,2.0,2.0,2.0,157131.0,166590.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
23998,70000.0,25.0,0.0,0.0,0.0,0.0,2.0,2.0,73939.0,70488.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [19]:
resultLR = pd.DataFrame(ModelLR.predict(data))
resultLR.value_counts()

#Many False Negative

1    2937
0    2386
Name: count, dtype: int64

In [22]:
resultGBC = pd.DataFrame(ModelGBC.predict(data))
resultGBC.value_counts()

#Many False Negative

1    3383
0    1940
Name: count, dtype: int64

In [21]:
resultRidge = pd.DataFrame(ModelRidge.predict(data))
resultRidge.value_counts()

0.577051    2
0.573616    2
0.555590    2
0.870893    2
0.604738    2
           ..
0.448469    1
0.448336    1
0.448327    1
0.448254    1
2.124362    1
Name: count, Length: 5318, dtype: int64

In [23]:
resultCatboost = pd.DataFrame(ModelCatboost.predict(data))
resultCatboost.value_counts()

1    3987
0    1336
Name: count, dtype: int64