In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Import data and prepare DFs

In [2]:
train_no_label = pd.read_csv('../../data/artificial_train.data', sep=' ', header=None)
train_label = pd.read_csv('../../data/artificial_train.labels', sep=' ', header=None)
test_no_label = pd.read_csv('../../data/artificial_test.data', sep=' ', header=None)

In [3]:
# change the label to 0 and 1
train_label = train_label.replace(-1, 0)

In [4]:
#drop NaN columns
train_no_label = train_no_label.dropna(axis=1)
test_no_label = test_no_label.dropna(axis=1)

In [5]:
train_no_label_columns = ["c" + str(i) for i in range(1, len(train_no_label.columns) + 1)]
train_no_label.columns = train_no_label_columns

test_no_label_columns = ["c" + str(i) for i in range(1, len(test_no_label.columns) + 1)]
test_no_label.columns = test_no_label_columns

In [6]:
train = pd.concat([train_no_label, train_label], axis=1)
train.rename(columns={0: 'label'}, inplace=True)

test_no_label.rename(columns={0: 'label'}, inplace=True)

In [7]:
import copy as cp
test = cp.deepcopy(test_no_label)

In [8]:
from sklearn.model_selection import train_test_split

train, val = train_test_split(train, test_size=0.2, random_state=42, stratify=train['label'])

## Preprocess data

### Assess variables based on Model with 1 variable

In [9]:
## Train logistic regression model with one variable at a time
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier


def train_logistic_regression(train, test, feature, model):
    X_train = train[feature].values.ravel().reshape(-1, 1)
    y_train = train['label'].values.ravel()
    X_test = test[feature].values.ravel().reshape(-1, 1)
    y_test = test['label'].values.ravel()

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    from sklearn.metrics import balanced_accuracy_score
    return balanced_accuracy_score(y_test, y_pred)

--------------------------------------------------------------------------------

  CuPy may not function correctly because multiple CuPy packages are installed
  in your environment:

    cupy, cupy-cuda12x

  Follow these steps to resolve this issue:

    1. For all packages listed above, run the following command to remove all
       existing CuPy installations:

         $ pip uninstall <package_name>

      If you previously installed CuPy via conda, also run the following:

         $ conda uninstall cupy

    2. Install the appropriate CuPy package.
       Refer to the Installation Guide for detailed instructions.

         https://docs.cupy.dev/en/stable/install.html

--------------------------------------------------------------------------------



In [10]:
def return_model_name(model):
    if isinstance(model, LogisticRegression):
        return 'log_reg'
    elif isinstance(model, RandomForestClassifier):
        return 'random_forest'
    elif isinstance(model, CatBoostClassifier):
        return 'catboost'
    elif isinstance(model, LGBMClassifier):
        return 'lgbm'
    elif isinstance(model, XGBClassifier):
        return 'xgb'
    else:
        return 'unknown'

In [11]:
from tqdm.auto import tqdm
models = [CatBoostClassifier(verbose=0, random_state=42, eval_metric='BalancedAccuracy'), LogisticRegression(random_state=42, class_weight='balanced', solver='liblinear'), RandomForestClassifier(random_state=42, class_weight='balanced'), LGBMClassifier(random_state=42), XGBClassifier(random_state=42)]
results_logistic_regression = dict()
for model in models:
    results_logistic_regression[return_model_name(model)] = cp.deepcopy(dict())
    for feature in train_no_label_columns:
        results_logistic_regression[return_model_name(model)][feature] = None


### Logistic Regression - feature selection

In [12]:
# train if results are not available in results folder
import os
import pickle

if os.path.exists('results/results_logistic_regression.pickle'):
    with open('results/results_logistic_regression.pickle', 'rb') as handle:
        results_logistic_regression = pickle.load(handle)
else:
    for model in tqdm(models, desc='models'):
        for feature in tqdm(train_no_label_columns, desc=f'features for {return_model_name(model)}'):
            results_logistic_regression[return_model_name(model)][feature] = train_logistic_regression(train, val, feature, model)
    if not os.path.exists('results'):
        os.makedirs('results')

    if not os.path.exists('results/results_logistic_regression.pickle'):
        with open('results/results_logistic_regression.pickle', 'wb') as handle:
            pickle.dump(results_logistic_regression, handle, protocol=pickle.HIGHEST_PROTOCOL)


models:   0%|          | 0/5 [00:00<?, ?it/s]

features for catboost:   0%|          | 0/500 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
results_logistic_regression = pd.DataFrame(results_logistic_regression)

In [None]:
catboost_over50 = results_logistic_regression[results_logistic_regression['catboost'] > 0.5]
log_reg_over50 = results_logistic_regression[results_logistic_regression['log_reg'] > 0.5]
random_forest_over50 = results_logistic_regression[results_logistic_regression['random_forest'] > 0.5]
lgbm_over50 = results_logistic_regression[results_logistic_regression['lgbm'] > 0.5]
xgb_over50 = results_logistic_regression[results_logistic_regression['xgb'] > 0.5]

In [None]:
catboost_over50.shape, log_reg_over50.shape, random_forest_over50.shape, lgbm_over50.shape, xgb_over50.shape

((249, 5), (239, 5), (245, 5), (243, 5), (249, 5))

We will keep the intersection of the variables selected by the two methods

In [None]:
catboost_over50 = catboost_over50.index
log_reg_over50 = log_reg_over50.index
random_forest_over50 = random_forest_over50.index
lgbm_over50 = lgbm_over50.index
xgb_over50 = xgb_over50.index

# intersection of all models

intersection = set(catboost_over50).intersection(set(log_reg_over50)).intersection(set(random_forest_over50)).intersection(set(lgbm_over50)).intersection(set(xgb_over50))
len(intersection)

columns_to_keep_lr = list(intersection) + ['label']

In [None]:
train_lr = train[columns_to_keep_lr]
val_lr = val[columns_to_keep_lr]
columns_to_keep_lr.remove('label')
test_lr = test[columns_to_keep_lr]

### Boruta - feature selection

In [None]:
from boruta_unreleased.boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier

# Boruta for all features
rfc = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
boruta_selector_all = BorutaPy(rfc, n_estimators='auto', verbose=2, random_state=42)
boruta_selector_all.fit(train.drop(['label'], axis=1).values, train['label'].values)

# Boruta for selected features by logistic regression
rfc = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
boruta_selector_lr = BorutaPy(rfc, n_estimators='auto', verbose=2, random_state=42)
boruta_selector_lr.fit(train_lr.drop(['label'], axis=1).values, train_lr['label'].values)

columns_to_keep_boruta = np.array(train.drop(['label'], axis=1).columns)[boruta_selector_all.support_].tolist()
columns_to_keep_lr_boruta = np.array(train_lr.drop(['label'], axis=1).columns)[boruta_selector_lr.support_].tolist()

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	500
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	500
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	500
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	500
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	500
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	500
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	500
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	24
Rejected: 	476
Iteration: 	9 / 100
Confirmed: 	14
Tentative: 	10
Rejected: 	476
Iteration: 	10 / 100
Confirmed: 	14
Tentative: 	10
Rejected: 	476
Iteration: 	11 / 100
Confirmed: 	14
Tentative: 	10
Rejected: 	476
Iteration: 	12 / 100
Confirmed: 	19
Tentative: 	5
Rejected: 	476
Iteration: 	13 / 100
Confirmed: 	19
Tentative: 	5
Rejected: 	476
Iteration: 	14 / 100
Confirmed: 	19
Tentative: 	5
Rejected: 	476
Iteration: 	15 / 100
Confirmed: 	19
Tentative: 	5
Rejected: 	476
Iteration: 	16 / 100
Confirmed: 	19
Te

In [None]:
columns_to_keep_boruta = columns_to_keep_boruta + ['label']

train_boruta = train[columns_to_keep_boruta]
val_boruta = val[columns_to_keep_boruta]
columns_to_keep_boruta.remove('label')
test_boruta = test[columns_to_keep_boruta]

columns_to_keep_lr_boruta = columns_to_keep_lr_boruta + ['label']

train_lr_boruta = train[columns_to_keep_lr_boruta]
val_lr_boruta = val[columns_to_keep_lr_boruta]
columns_to_keep_lr_boruta.remove('label')
test_lr_boruta = test[columns_to_keep_lr_boruta]

In [None]:
print(columns_to_keep_boruta)
print(columns_to_keep_lr_boruta)
print(len(columns_to_keep_lr, columns_to_keep_lr))

['c29', 'c49', 'c65', 'c106', 'c129', 'c154', 'c242', 'c282', 'c283', 'c319', 'c337', 'c339', 'c379', 'c434', 'c443', 'c452', 'c454', 'c473', 'c476', 'c494']
['c49', 'c283', 'c242', 'c106', 'c337', 'c494', 'c129', 'c339', 'c476', 'c443', 'c65']
['c353', 'c457', 'c177', 'c49', 'c283', 'c299', 'c5', 'c84', 'c201', 'c358', 'c437', 'c242', 'c11', 'c214', 'c392', 'c432', 'c73', 'c310', 'c426', 'c297', 'c189', 'c335', 'c46', 'c300', 'c141', 'c202', 'c394', 'c93', 'c401', 'c121', 'c249', 'c62', 'c239', 'c106', 'c337', 'c497', 'c308', 'c159', 'c128', 'c132', 'c494', 'c382', 'c244', 'c271', 'c45', 'c43', 'c129', 'c126', 'c69', 'c475', 'c252', 'c493', 'c56', 'c370', 'c7', 'c451', 'c27', 'c340', 'c264', 'c339', 'c496', 'c54', 'c476', 'c349', 'c443', 'c65', 'c438', 'c218', 'c403', 'c208', 'c131', 'c215']


In [21]:
# save selected features as pickle
if not os.path.exists('selected_features'):
    os.makedirs('selected_features')

if not os.path.exists('selected_features/columns_to_keep_boruta.pickle'):
    with open('selected_features/columns_to_keep_boruta.pickle', 'wb') as handle:
        pickle.dump(columns_to_keep_boruta, handle, protocol=pickle.HIGHEST_PROTOCOL)

if not os.path.exists('selected_features/columns_to_keep_lr_boruta.pickle'):
    with open('selected_features/columns_to_keep_lr_boruta.pickle', 'wb') as handle:
        pickle.dump(columns_to_keep_lr_boruta, handle, protocol=pickle.HIGHEST_PROTOCOL)

if not os.path.exists('selected_features/columns_to_keep_lr.pickle'):
    with open('selected_features/columns_to_keep_lr.pickle', 'wb') as handle:
        pickle.dump(columns_to_keep_lr, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Grid Search

In [100]:
grids = {
        "lgbm" : {
            "n_estimators": [100, 200, 300, 400, 500],
            "learning_rate": [0.001, 0.01, 0.1, 0.2, 0.3],
            "max_depth": [3, 4, 5, 6, 7, 8],
            "num_leaves": [31, 62, 93, 124, 155, 186, 217, 248, 279, 310]
        },
        
        "catboost": {
            "iterations": [100, 200, 300, 400, 500],
            "learning_rate": [0.001, 0.01, 0.1, 0.2, 0.3],
            "depth": [3, 4, 5, 6, 7, 8],
            "l2_leaf_reg": [1, 3, 5, 7, 9]
        },
    
        "xgb": {
            "n_estimators": [100, 200, 300, 400, 500],
            "learning_rate": [0.001, 0.01, 0.1, 0.2, 0.3],
            "max_depth": [3, 4, 5, 6, 7, 8],
            "gamma": [0, 1, 2, 3, 4, 5]
        },

        "random_forest" : {
            "n_estimators": [100, 200, 300, 400, 500],
            "max_depth": [3, 4, 5, 6, 7, 8],
            "max_features": ["sqrt", "log2"]
        },
    
        "log_reg" : {
            "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
            "penalty": ["l1", "l2"]
        }

}

In [101]:
# supress ConvergenceWarning, FutureWarning
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter(action='ignore', category=FutureWarning)

In [106]:

import copy as cp
best_params = dict().fromkeys([return_model_name(model) for model in models], cp.deepcopy(dict().fromkeys(['lr', 'bor', 'lr+bor'], dict())))
best_params = dict()

for model in models:
    best_params[return_model_name(model)] = cp.deepcopy(dict())
    for feature in ['lr', 'bor', 'lr+bor']:
        best_params[return_model_name(model)][feature] = None

In [None]:
from sklearn.model_selection import GridSearchCV

for model in tqdm(models):
        for columns, name in zip([columns_to_keep_lr, columns_to_keep_boruta, columns_to_keep_lr_boruta], ['lr', 'bor', 'lr+bor']):
            print(f'Grid search for {return_model_name(model)} with feature selection: {name}')
            clf = GridSearchCV(model, grids[return_model_name(model)], scoring='balanced_accuracy', n_jobs=-1, verbose=1)
            clf.fit(train[columns], train['label'])
            best_params[return_model_name(model)][name] = cp.deepcopy(clf.best_params_)

In [None]:
best_params

In [None]:
# train the models with the best parameters
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score
results = dict().fromkeys([return_model_name(model) for model in models], dict().fromkeys(['lr', 'bor', 'lr+bor'], dict()))

models_grid_lr = [CatBoostClassifier(verbose=0, random_state=42, eval_metric='BalancedAccuracy', **best_params['catboost']['lr']),
                  LogisticRegression(random_state=42, class_weight='balanced', solver='liblinear', **best_params['log_reg']['lr']), 
                  RandomForestClassifier(random_state=42, class_weight='balanced', **best_params['random_forest']['lr']), 
                  LGBMClassifier(random_state=42, class_weight='balanced', **best_params['lgbm']['lr']), 
                  XGBClassifier(random_state=42, **best_params['xgb']['lr'])]

models_grid_bor = [CatBoostClassifier(verbose=0, random_state=42, eval_metric='BalancedAccuracy', **best_params['catboost']['bor']),
                    LogisticRegression(random_state=42, class_weight='balanced', solver='liblinear', **best_params['log_reg']['bor']), 
                    RandomForestClassifier(random_state=42, class_weight='balanced', **best_params['random_forest']['bor']), 
                    LGBMClassifier(random_state=42, class_weight='balanced', **best_params['lgbm']['bor']), 
                    XGBClassifier(random_state=42, **best_params['xgb']['bor'])]

models_grid_lr_bor = [CatBoostClassifier(verbose=0, random_state=42, eval_metric='BalancedAccuracy', **best_params['catboost']['lr+bor']),
                      LogisticRegression(random_state=42, class_weight='balanced', solver='liblinear', **best_params['log_reg']['lr+bor']),
                      RandomForestClassifier(random_state=42, class_weight='balanced', **best_params['random_forest']['lr+bor']),
                      LGBMClassifier(random_state=42, class_weight='balanced', **best_params['lgbm']['lr+bor']),
                      XGBClassifier(random_state=42, **best_params['xgb']['lr+bor'])]

models_grid_dict = {'lr': models_grid_lr, 'bor': models_grid_bor, 'lr+bor': models_grid_lr_bor}

for column_sel_type in tqdm(models_grid_dict):
    for model, name in zip(models_grid_dict[column_sel_type], ['catboost', 'log_reg', 'random_forest', 'lgbm', 'xgb']):
        print(f'Training {return_model_name(model)} for feature selection: {column_sel_type}')
        columns = columns_to_keep_lr if column_sel_type == 'lr' else columns_to_keep_boruta if column_sel_type == 'bor' else columns_to_keep_lr_boruta
        print(f'Columns: {columns}')
        model.fit(train[columns], train['label'])
        y_pred = model.predict(val[columns])
        y_pred_proba = model.predict_proba(val[columns])[:, 1]
        auc = roc_auc_score(val['label'], y_pred_proba)
        balanced_accuracy = balanced_accuracy_score(val['label'], y_pred)
        results[return_model_name(model)][column_sel_type] = cp.deepcopy({'auc': auc, 'balanced_accuracy': balanced_accuracy, 'overfit': y_pred.sum() / len(y_pred)})

In [114]:
# save best parameters and results to grid folder
if not os.path.exists('grid'):
    os.makedirs('grid')

if not os.path.exists('grid/best_params.pickle'):
    with open('grid/best_params.pickle', 'wb') as handle:
        pickle.dump(best_params, handle, protocol=pickle.HIGHEST_PROTOCOL)

if not os.path.exists('grid/results.pickle'):
    with open('grid/results.pickle', 'wb') as handle:
        pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)


### Optuna

In [121]:
# create optuna for these models
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

def create_trial_params_catboost(trial):

    params = dict()
    params['iterations'] = trial.suggest_int('iterations', 100, 500)
    params['learning_rate'] = trial.suggest_float('learning_rate', 0.001, 0.3)
    params['depth'] = trial.suggest_int('depth', 3, 8)
    params['l2_leaf_reg'] = trial.suggest_int('l2_leaf_reg', 1, 9)

    return params

def create_trial_params_lgbm(trial):
    
        params = dict()
        params['n_estimators'] = trial.suggest_int('n_estimators', 100, 500)
        params['learning_rate'] = trial.suggest_float('learning_rate', 0.001, 0.3)
        params['max_depth'] = trial.suggest_int('max_depth', 3, 8)
        params['num_leaves'] = trial.suggest_int('num_leaves', 31, 310)
    
        return params

def create_trial_params_xgb(trial):
     
        params = dict()
        params['n_estimators'] = trial.suggest_int('n_estimators', 100, 500)
        params['learning_rate'] = trial.suggest_float('learning_rate', 0.001, 0.3)
        params['max_depth'] = trial.suggest_int('max_depth', 3, 8)
        params['gamma'] = trial.suggest_int('gamma', 0, 5)
    
        return params

def create_trial_params_random_forest(trial):
      
        params = dict()
        params['n_estimators'] = trial.suggest_int('n_estimators', 100, 500)
        params['max_depth'] = trial.suggest_int('max_depth', 3, 8)
        params['max_features'] = trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    
        return params

def create_trial_params_log_reg(trial):
      
        params = dict()
        params['C'] = trial.suggest_float('C', 0.001, 1000)
        params['penalty'] = trial.suggest_categorical('penalty', ['l1', 'l2'])
    
        return params

def create_trial_params(trial, model_name):
        if model_name == 'catboost':
                return create_trial_params_catboost(trial)
        elif model_name == 'lgbm':
                return create_trial_params_lgbm(trial)
        elif model_name == 'xgb':
                return create_trial_params_xgb(trial)
        elif model_name == 'random_forest':
                return create_trial_params_random_forest(trial)
        elif model_name == 'log_reg':
                return create_trial_params_log_reg(trial)
        else:
                return None


def objective(trial, model_name, train, columns_to_keep):

        params = create_trial_params(trial, model_name)
        model = None
        if model_name == 'catboost':
                model = CatBoostClassifier(**params, verbose=0, random_state=42, eval_metric='BalancedAccuracy')
        elif model_name == 'lgbm':
                model = LGBMClassifier(**params, random_state=42, eval_metric='BalancedAccuracy')
        elif model_name == 'xgb':
                model = XGBClassifier(**params, random_state=42)
        elif model_name == 'random_forest':
                model = RandomForestClassifier(**params, random_state=42, class_weight='balanced')
        elif model_name == 'log_reg':
                model = LogisticRegression(**params, random_state=42, class_weight='balanced', solver='liblinear')
        else:
                return None

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        scores = cross_val_score(model, train[columns_to_keep], train['label'], scoring='balanced_accuracy', cv=skf, n_jobs=-1)
        return scores.mean()


In [None]:
best_params = dict()
for model in models:
    best_params[return_model_name(model)] = cp.deepcopy(dict())
    for feature in ['lr', 'bor', 'lr+bor']:
        best_params[return_model_name(model)][feature] = None

study = dict()
for model in models:
    study[return_model_name(model)] = cp.deepcopy(dict())
    for feature in ['lr', 'bor', 'lr+bor']:
        study[return_model_name(model)][feature] = None
        
for model in tqdm(models):
        for columns, name in zip([columns_to_keep_lr, columns_to_keep_boruta, columns_to_keep_lr_boruta], ['lr', 'bor', 'lr+bor']):
                print(f'Optuna for {return_model_name(model)} with feature selection: {name}')
                study[return_model_name(model)][name] = optuna.create_study(direction='maximize')
                study[return_model_name(model)][name].optimize(lambda trial: objective(trial, return_model_name(model), train, columns), n_trials=30)
                best_params[return_model_name(model)][name] = study[return_model_name(model)][name].best_params

In [None]:
# results = dict().fromkeys([return_model_name(model) for model in models], dict().fromkeys(['lr', 'bor', 'lr+bor'], dict()))


results = dict()
for model in models:
    results[return_model_name(model)] = cp.deepcopy(dict())
    for feature in ['lr', 'bor', 'lr+bor']:
        results[return_model_name(model)][feature] = None

models_optuna_lr = [CatBoostClassifier(verbose=0, random_state=42, eval_metric='BalancedAccuracy', **best_params['catboost']['lr']),
                        LogisticRegression(random_state=42, class_weight='balanced', solver='liblinear', **best_params['log_reg']['lr']),
                        RandomForestClassifier(random_state=42, class_weight='balanced', **best_params['random_forest']['lr']),
                        LGBMClassifier(random_state=42, class_weight='balanced', **best_params['lgbm']['lr']),
                        XGBClassifier(random_state=42, **best_params['xgb']['lr'])]
models_optuna_bor = [CatBoostClassifier(verbose=0, random_state=42, eval_metric='BalancedAccuracy', **best_params['catboost']['bor']),
                        LogisticRegression(random_state=42, class_weight='balanced', solver='liblinear', **best_params['log_reg']['bor']),
                        RandomForestClassifier(random_state=42, class_weight='balanced', **best_params['random_forest']['bor']),
                        LGBMClassifier(random_state=42, class_weight='balanced', **best_params['lgbm']['bor']),
                        XGBClassifier(random_state=42, **best_params['xgb']['bor'])]
models_optuna_lr_bor = [CatBoostClassifier(verbose=0, random_state=42, eval_metric='BalancedAccuracy', **best_params['catboost']['lr+bor']),
                        LogisticRegression(random_state=42, class_weight='balanced', solver='liblinear', **best_params['log_reg']['lr+bor']),
                        RandomForestClassifier(random_state=42, class_weight='balanced', **best_params['random_forest']['lr+bor']),
                        LGBMClassifier(random_state=42, class_weight='balanced', **best_params['lgbm']['lr+bor']),
                        XGBClassifier(random_state=42, **best_params['xgb']['lr+bor'])]

models_optuna_dict = {'lr': models_optuna_lr, 'bor': models_optuna_bor, 'lr+bor': models_optuna_lr_bor}



for column_sel_type in tqdm(models_optuna_dict):
        for model_optuna, name in zip(models_optuna_dict[column_sel_type], [return_model_name(modell) for modell in models_optuna_dict[column_sel_type]]):
                print(f'Training {return_model_name(model_optuna)} for feature selection: {column_sel_type}')
                columns = columns_to_keep_lr if column_sel_type == 'lr' else columns_to_keep_boruta if column_sel_type == 'bor' else columns_to_keep_lr_boruta
                print(f'Columns: {columns}')
                model_optuna.fit(train[columns], train['label'])
                y_pred = model_optuna.predict(val[columns])
                y_pred_proba = model_optuna.predict_proba(val[columns])[:, 1]
                auc = roc_auc_score(val['label'], y_pred_proba)
                balanced_accuracy = balanced_accuracy_score(val['label'], y_pred)
                results[return_model_name(model_optuna)][column_sel_type] = cp.deepcopy({'auc': auc, 'balanced_accuracy': balanced_accuracy, 'overfit': y_pred.sum() / len(y_pred)})
    

In [125]:
# save best parameters and results to optuna folder
if not os.path.exists('optuna'):
    os.makedirs('optuna')

if not os.path.exists('optuna/best_params.pickle'):
    with open('optuna/best_params.pickle', 'wb') as handle:
        pickle.dump(best_params, handle, protocol=pickle.HIGHEST_PROTOCOL)

if not os.path.exists('optuna/results.pickle'):
    with open('optuna/results.pickle', 'wb') as handle:
        pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [126]:
# open results from grid and optuna and find the best balanced accuracy
with open('grid/results.pickle', 'rb') as handle:
    results_grid = pickle.load(handle)

with open('optuna/results.pickle', 'rb') as handle:
    results_optuna = pickle.load(handle)

best_balanced_accuracy = 0
best_feature_selection = None
best_model = None
optimization_method = None

for model in results_grid:
    for feature_selection in results_grid[model]:
        if results_grid[model][feature_selection]['balanced_accuracy'] > best_balanced_accuracy:
            best_balanced_accuracy = results_grid[model][feature_selection]['balanced_accuracy']
            best_feature_selection = feature_selection
            best_model = model
            optimization_method = 'grid'

for model in results_optuna:
    for feature_selection in results_optuna[model]:
        if results_optuna[model][feature_selection]['balanced_accuracy'] > best_balanced_accuracy:
            best_balanced_accuracy = results_optuna[model][feature_selection]['balanced_accuracy']
            best_feature_selection = feature_selection
            best_model = model
            optimization_method = 'optuna'

print(f'Best balanced accuracy: {best_balanced_accuracy}, best feature selection: {best_feature_selection}, best model: {best_model}, optimization method: {optimization_method}')

Best balanced accuracy: 0.9075, best feature selection: bor, best model: catboost, optimization method: optuna


In [127]:
# find the best parameters for the best model
with open(f'{optimization_method}/best_params.pickle', 'rb') as handle:
    best_params = pickle.load(handle)

best_params[best_model][best_feature_selection]

{'iterations': 322,
 'learning_rate': 0.15542656215841885,
 'depth': 8,
 'l2_leaf_reg': 9}