In [106]:
#Gridsearchcv and Randomizedsearchcv - let's use a combined validation that combines random partitioning into training and test samples and k-block cross-validation

import pandas as pd
import numpy as np
from sklearn.model_selection import (train_test_split,
                                    KFold,
                                    ParameterGrid,
                                    cross_val_score,
                                    GridSearchCV,
                                    RandomizedSearchCV)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (StandardScaler,
                                   OneHotEncoder)
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from category_encoders import WOEEncoder, SumEncoder
from tqdm import tqdm_notebook
from datasets import load_dataset
from tqdm import tqdm

In [107]:
#https://huggingface.co/datasets/mstz/speeddating
#dataset = load_dataset("mstz/speeddating")["train"]
#https://huggingface.co/datasets/imodels/credit-card
dataset = load_dataset("imodels/credit-card")['train']


In [108]:
dataset = pd.DataFrame(dataset)
dataset

Unnamed: 0,limit_bal,age,pay_0,pay_2,pay_3,pay_4,pay_5,pay_6,bill_amt1,bill_amt2,...,education:2,education:3,education:4,education:5,education:6,marriage:0,marriage:1,marriage:2,marriage:3,default.payment.next.month
0,80000.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,75125.0,77353.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
1,30000.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,29242.0,29507.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
2,180000.0,44.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,20916.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
3,60000.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,58839.0,53235.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
4,130000.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,111587.0,112348.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23995,50000.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,52475.0,53600.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
23996,200000.0,37.0,2.0,2.0,2.0,2.0,2.0,2.0,157131.0,166590.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
23997,50000.0,26.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
23998,70000.0,25.0,0.0,0.0,0.0,0.0,2.0,2.0,73939.0,70488.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1


In [109]:
X_train, X_test, y_train, y_test = train_test_split(
        dataset.drop('default.payment.next.month', axis=1),
        dataset['default.payment.next.month'],
        test_size=0.3,
        stratify=dataset['default.payment.next.month'],
        random_state=42)

In [110]:
#Categorical and numerical columns
cat_columns = X_train.select_dtypes(
    include='object').columns.tolist()
num_columns = X_train.select_dtypes(
    exclude='object').columns.tolist()


In [111]:
#Pipeline for LR
num_pipe = Pipeline([
    ('imp', SimpleImputer()),
    ('scaler', StandardScaler())
])

cat_pipe = Pipeline([
    ('imp', SimpleImputer(strategy='constant')),
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])


transformers = [('num', num_pipe, num_columns),
                ('cat', cat_pipe, cat_columns)]

transformer = ColumnTransformer(transformers=transformers)


#Pipeline for LR
ml_pipe_LR = Pipeline([
    ('tf', transformer),
    ('logreg', LogisticRegression(solver='lbfgs',
    max_iter=200))
])

#Pipleline for GBC
ml_pipe_GBC = Pipeline([
    ('preprocessor', transformer),
    ('classifier', GradientBoostingClassifier(
    random_state=42, subsample=0.8))
])

#Gridsearch parameters
param_grid_LR = {
    'tf__num__imp__strategy': ['mean', 'median', 'constant'],
    'tf__cat__imp__strategy': ['most_frequent', 'constant'],
    'logreg__C': [.01, .1, .5, 1, 5, 10, 100]
}

param_grid_GBC = [{'classifier__max_depth': [2, 4],
                   'classifier__n_estimators': [50, 100]}]



In [123]:
def pipe_LR():
        scores = []
        scoring = 'roc_auc' #classification task
        print("Looking for best LR hyperparameters...")

        #Use tqdm and ParameterGrid

        for param in tqdm(list(ParameterGrid(param_grid_LR)),
                                desc='Done'):

            ml_pipe_LR.set_params(**param)

            scores.append([param, cross_val_score(ml_pipe_LR,
                                                    X_train,
                                                    y_train,
                                                    scoring=scoring,
                                                    cv=5)])

            scores[-1].append(sum(scores[-1][1]) / len(scores[-1][1]))

        scores.sort(reverse=True, key=lambda x: x[2])

        best_params = scores[0][0]
        print("Best hyperparameters LR:",
            best_params, sep='\n', end='\n')

        best_score = scores[0][2]
        print("Best meaning is %s: %.3f" % (scoring, best_score))

        ml_pipe_LR.set_params(**best_params).fit(X_train, y_train)

        test_score = ml_pipe_LR.score(X_test, y_test)

        print("Meaning %s on test: %.3f" % (scoring, test_score))
        
        return (best_score, test_score) 

In [113]:
#progress bar for GBC:
def pipe_GBC():
       scores = []
       scoring = 'roc_auc' #classification task
       print("Looking for best GBC hyperparameters...")

       #Use tqdm and ParameterGrid

       for param in tqdm(list(ParameterGrid(param_grid_GBC)),
                            desc='Done'):

       ml_pipe_GBC.set_params(**param)

       scores.append([param, cross_val_score(ml_pipe_GBC,
                                                 X_train,
                                                 y_train,
                                                 scoring=scoring,
                                                 cv=5)])

       scores[-1].append(sum(scores[-1][1]) / len(scores[-1][1]))

       scores.sort(reverse=True, key=lambda x: x[2])

       best_params = scores[0][0]
       print("Best hyperparameters GBC:",
              best_params, sep='\n', end='\n')

       best_score = scores[0][2]
       print("Best meaning is %s: %.3f" % (scoring, best_score))

       ml_pipe_GBC.set_params(**best_params).fit(X_train, y_train)

       test_score = ml_pipe_GBC.score(X_test, y_test)

       print("Meaning %s on test: %.3f" % (scoring, test_score))
       
       return (best_score, test_score) 

Looking for best GBC hyperparameters...


Done: 100%|██████████| 4/4 [01:45<00:00, 26.43s/it]


Best hyperparameters GBC:
{'classifier__max_depth': 4, 'classifier__n_estimators': 100}
Best meaning is roc_auc: 0.779
Meaning roc_auc on test: 0.822


In [None]:
pipe_LR()

In [None]:
pipe_GBC()