# Подбор моделей и их гиперпараметров

Рассматриваются различные модели классификации с параллельным подбором гиперпараметров. Большая часть ячеек блокнота по умолчанию не считается (см. константу MODEL_SWITCHER), так как отдельные ячейки могут выполняться долго. Расчёт включается по мере интереса или необходимости. Результаты подбора гиперпараметров сохраняются в .pickle-файлы. При запуске блокнота необходимо провести подбор с нуля!

In [1]:
from pathlib import Path

In [2]:
import numpy as np
from scipy.stats import mvsdist
from scipy.optimize import minimize
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import phik
from joblib import Parallel, delayed

In [3]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

In [4]:
import optuna
from optuna.samplers import TPESampler

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from catboost import CatBoostClassifier

In [6]:
from utils import RANDOM_SEED, TARGET_FEATURE, FIG_SIZES, DFS_NAME, SCORER, FEATURE_NAMES, OPTUNA_STUDY_NAME
from utils import read, write, split, score, test_bt, get_pars_from_tune_res

In [7]:
# Run the Optuna search before evaluating a model (False: only reuse stored results).
NEED_TUNE = False
# Number of cross-validation folds used inside the tuning objectives.
TUNE_CV = 3
# Worker count used for tuning and for the parallel model evaluation.
TUNE_THREADS = 1
# Default number of Optuna trials per tuning run.
NUM_OF_TRIALS = 200

# Names of the model cells that are actually computed; uncomment an entry to enable it.
MODEL_SWITCHER = [
    # 'lr_1',
    # 'lr_2',
    # 'knn_1',
    # 'svc_1',
    # 'nbc_1',
    # 'cb_1',
    # 'cb_2',
    'cb_3',
    # 'cb_cp_1',
    # 'rf_1',
    # 'rf_2',
]

In [8]:
# Load the prepared datasets. The rest of the notebook unpacks this sequence as
# (first, train, *tests, last) and uses dfs[1] as the training set.
dfs = read(DFS_NAME)

# Optionally drop the features found to be useless at the feature-importance stage
# features = read(FEATURE_NAMES)
# dfs = [df[features + ([TARGET_FEATURE] if TARGET_FEATURE in df else [])] for df in dfs]

In [9]:
def test_model(model, name, dfs=dfs, train_df=None, fitted=False, **kwargs):
    """Fit (unless already fitted) and score ``model``, returning one result row.

    Args:
        model: Estimator exposing ``fit``; it is scored with the project helpers
            ``score`` / ``test_bt``.
        name: Label stored in the ``'name'`` field of the result.
        dfs: Sequence unpacked as ``(_, train, *tests, end_test)``; it must hold
            at least three items.
        train_df: Optional replacement for the default training set.
        fitted: When true, ``model.fit`` is skipped.
        **kwargs: Accepted and ignored, so callers can forward shared options.

    Returns:
        dict with keys ``name``, ``model`` (class name), ``train`` (score on the
        training data), ``test`` and ``test_sem`` (mean and standard deviation
        of the ``mvsdist`` estimate of the mean test score).
    """
    _, default_train, *tests, end_test = dfs
    train = default_train if train_df is None else train_df

    if not fitted:
        model.fit(*split(train))

    if len(tests) != 1:
        # Several test sets: one score per set.
        r = [score(model, test) for test in tests]
    else:
        # Single test set: delegate to test_bt (presumably a bootstrap test -- confirm in utils).
        print('bt test')
        r = test_bt(model, tests[0])

    mean_dist, _, _ = mvsdist(r)
    return {
        'name': name,
        'model': model.__class__.__name__,
        'train': score(model, train),
        'test': mean_dist.mean(),
        'test_sem': mean_dist.std(),
    }

def end_test_model(model, dfs=dfs):
    """Return ``model.predict`` applied to the last dataset in ``dfs``."""
    final_df = dfs[-1]
    return model.predict(final_df)

def tune(opt_study_name, objective, parallel=True, num_of_trials=NUM_OF_TRIALS, **kwargs):
    """Run (or resume) an Optuna study for ``objective`` and save it after each batch.

    Args:
        opt_study_name: Study key; ``OPTUNA_STUDY_NAME`` maps it to the storage
            location (an object exposing ``exists()``).
        objective: Callable ``objective(trial) -> float`` to maximise.
        parallel: ``True`` uses ``TUNE_THREADS`` workers, ``False`` uses one, and
            an integer sets the worker count explicitly.
        num_of_trials: Requested number of trials. They run in batches of
            ``4 * threads``.
        **kwargs: Accepted and ignored, so callers can forward shared options.

    Returns:
        None. The study is written via ``write`` after every batch.
    """
    if isinstance(parallel, bool):
        threads = TUNE_THREADS if parallel else 1
    else:
        # An explicit worker count was requested; it must not be overridden.
        threads = parallel

    study_path = OPTUNA_STUDY_NAME(opt_study_name)
    if study_path.exists():
        # Resume a previously saved study, so new trials are added to the old ones.
        study = read(study_path)
    else:
        study = optuna.create_study(sampler=TPESampler(), direction='maximize')

    batch_size = 4 * threads
    # NOTE(review): trials beyond a whole number of batches are not run
    # (num_of_trials // batch_size batches), as in the original code.
    for _ in range(num_of_trials // batch_size):
        study.optimize(objective, n_trials=batch_size, n_jobs=threads)
        # Save after each batch so an interrupted run keeps its progress.
        write(study_path, study)
        print('study is saved')

In [10]:
def get_cols_to_del_by_correlation(pc, edge):
    """Greedily choose columns to drop so that no strongly correlated pair remains.

    Args:
        pc: Square correlation matrix as a DataFrame whose index holds the
            column names (strings, since the longer name of a pair is dropped).
        edge: Threshold; a pair is correlated when ``abs(value) > edge``.

    Returns:
        Sorted list of column names to delete.

    Pairs are processed in sorted order. Iterating a ``set`` of string tuples
    would make the greedy choice depend on hash ordering, which can differ
    between interpreter runs.
    """
    labels = pc.index
    rows, cols = np.nonzero(np.abs(pc.values) > edge)
    # Normalise each pair to (smaller, larger) so (a, b) and (b, a) collapse into one.
    pairs = {tuple(sorted((labels[r], labels[c]))) for r, c in zip(rows, cols)}

    deleted = set()
    for x, y in sorted(pairs):
        # Skip self-correlation and pairs already resolved by an earlier deletion.
        if x == y or x in deleted or y in deleted:
            continue
        # Drop the longer name; on a tie the lexicographically smaller one goes.
        deleted.add(max((x, y), key=len))
    return sorted(deleted)

def col_deleter(cols):
    """Build a ``FunctionTransformer`` that drops ``cols`` from a DataFrame."""
    def _drop(df, *args):
        # Extra positional arguments are accepted and ignored.
        unwanted = list(cols)
        return df.drop(columns=unwanted)

    return FunctionTransformer(_drop)

def col_deleter_pipe(model, cols):
    """Wrap ``model`` in a pipeline that first drops ``cols`` from the input."""
    steps = [
        ('col_deleter', col_deleter(cols)),
        ('model', model),
    ]
    return Pipeline(steps)

In [11]:
# Models collected across the notebook cells.
models = []
# Result rows, one dict per evaluated model.
res = []


def test(*args, **kwargs):
    """Evaluate a model with ``test_model`` and record the result in ``res``.

    Returns:
        tuple: (one-row DataFrame with the new result, the first positional
        argument, i.e. the model).
    """
    outcome = test_model(*args, **kwargs)
    res.append(outcome)
    return pd.DataFrame([outcome]), args[0]

def model_preparing(name, model_class, objective, common_param=dict(), models=models, **kwargs):
    """Tune (optionally), rebuild and evaluate the models stored for ``name``.

    Does nothing unless ``name`` is listed in ``MODEL_SWITCHER``.

    Args:
        name: Study/model key; evaluated models are labelled ``f'{name}_{i}'``.
        model_class: Estimator class instantiated with ``common_param | param``.
        objective: Optuna objective passed to ``tune`` when ``NEED_TUNE`` is set.
        common_param: Fixed constructor parameters shared by all candidates.
        models: List extended in place with the evaluated models.
        **kwargs: Forwarded to ``tune``, ``get_pars_from_tune_res`` and ``test``.
    """
    if name not in MODEL_SWITCHER:
        return
    if NEED_TUNE:
        tune(name, objective, **kwargs)

    # Build every candidate up front, one per parameter set from the tuning results.
    candidates = [
        (model_class(**(common_param | param)), f'{name}_{i}')
        for i, param in enumerate(get_pars_from_tune_res(name, **kwargs))
    ]
    runner = Parallel(n_jobs=TUNE_THREADS)
    results = runner(delayed(test)(candidate, label, **kwargs) for candidate, label in candidates)

    models += [evaluated for _, evaluated in results]
    display(pd.concat([table for table, _ in results]))

In [12]:
%%time
# Logistic regression with balanced class weights.
# This cell is not gated by MODEL_SWITCHER, so it runs on every execution.
model = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_SEED)
# test() fits and scores the model, and appends the result row to the global `res`.
t, model = test(model, 'lr_1')
models.append(model)
display(t)

bt test


Unnamed: 0,name,model,train,test,test_sem
0,lr_1,LogisticRegression,0.784163,0.795069,0.002956


Wall time: 673 ms


In [13]:
%%time
# Study/model key; model_preparing() only does work when it is listed in MODEL_SWITCHER.
name = 'knn_1'
# Estimator class instantiated by objective() and model_preparing().
model_class = KNeighborsClassifier
def param_filter(param):
    """Replace metrics that tree-based neighbour search cannot use.

    When ``algorithm`` is ``'ball_tree'`` or ``'kd_tree'`` and ``metric`` is
    ``'cosine'`` or ``'nan_euclidean'``, the metric is changed to
    ``'euclidean'``. The dict is modified in place and also returned.
    """
    uses_tree = param['algorithm'] in ('ball_tree', 'kd_tree')
    if uses_tree and param['metric'] in ('cosine', 'nan_euclidean'):
        param['metric'] = 'euclidean'
    return param

def objective(trial):
    """Optuna objective: mean cross-validated score of a k-NN classifier on ``dfs[1]``."""
    param = {
        'n_neighbors': trial.suggest_int('n_neighbors', 3, 30),
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
        'algorithm': trial.suggest_categorical('algorithm', ['ball_tree', 'kd_tree', 'brute']),
        'metric': trial.suggest_categorical('metric', ['cityblock', 'cosine', 'euclidean']),
    }
    # leaf_size is only sampled for the tree-based search algorithms.
    if param['algorithm'] in ('ball_tree', 'kd_tree'):
        param['leaf_size'] = trial.suggest_int('leaf_size', 10, 50)
    estimator = model_class(**param_filter(param))
    fold_scores = cross_val_score(estimator, *split(dfs[1]), scoring=SCORER, cv=TUNE_CV, verbose=0)
    return np.mean(fold_scores)


# param_filter is forwarded through **kwargs to the helpers called by model_preparing.
model_preparing(name, model_class, objective, param_filter=param_filter)

Wall time: 4.47 ms


In [14]:
%%time
name = 'svc_1'
common_param = {'C': 0.01, 'verbose': 0, 'probability': True, 'random_state': RANDOM_SEED}
model_class = SVC
# Tune and evaluate SVC on a subsample of the training set. The sample is seeded
# so that repeated runs use the same rows, and capped at the dataset size so
# that small datasets do not raise.
df = dfs[1].sample(min(10000, len(dfs[1])), random_state=RANDOM_SEED)


def objective(trial):
    """Optuna objective: mean cross-validated score of an SVC on the subsample ``df``."""
    param = {'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
             'shrinking': trial.suggest_categorical('shrinking', [True, False]),}
    # Kernel-specific hyperparameters are sampled only for the kernels that use them.
    if param['kernel'] == 'poly':
        param['degree'] = trial.suggest_int('degree', 1, 6)
    if param['kernel'] in ['poly', 'rbf', 'sigmoid']:
        param['gamma'] = trial.suggest_categorical('gamma', ['scale', 'auto'])
    if param['kernel'] in ['poly', 'sigmoid']:
        param['coef0'] = trial.suggest_float('coef0', 0, 10)
    model = model_class(**(common_param | param))
    res = cross_val_score(model, *split(df), scoring=SCORER, cv=TUNE_CV, verbose=0)
    return np.mean(res)


# train_df=df makes the final evaluation fit on the same subsample used for tuning.
model_preparing(name, model_class, objective, common_param, train_df=df)

Wall time: 0 ns


In [15]:
%%time
name = 'cb_3'
# Columns of the first dataset with pandas 'category' dtype are passed to CatBoost as categorical.
cat_features = list(dfs[0].select_dtypes(include='category').columns)
common_param = {'verbose': 0, 'random_state': RANDOM_SEED,
                'cat_features': cat_features,}
model_class = CatBoostClassifier


def objective(trial):
    """Optuna objective: mean cross-validated score of a CatBoost classifier on ``dfs[1]``.

    The sampled parameters and ``common_param`` are stored as trial user
    attributes under the keys ``'param'`` and ``'common_param'``.
    """
    param = {
        'objective': trial.suggest_categorical('objective', ['Logloss', 'CrossEntropy']),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.01, 0.12),
        'depth': trial.suggest_int('depth', 4, 12),
        'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        'iterations': trial.suggest_int('iterations', 500, 2000),
        # 'grow_policy': trial.suggest_categorical('grow_policy', ['Lossguide', 'SymmetricTree', 'Depthwise']),
        'leaf_estimation_backtracking': trial.suggest_categorical('leaf_estimation_backtracking', ['AnyImprovement', 'No']),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.1, 20),
    }

    # Bootstrap-specific parameters are sampled only for the bootstrap type that uses them.
    bootstrap = param['bootstrap_type']
    if bootstrap == 'Bayesian':
        param['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    elif bootstrap == 'Bernoulli':
        param['subsample'] = trial.suggest_float('subsample', 0.1, 1)

    trial.set_user_attr('param', param)
    trial.set_user_attr('common_param', common_param)

    estimator = model_class(**(common_param | param))
    fold_scores = cross_val_score(estimator, *split(dfs[1]), scoring=SCORER, cv=TUNE_CV, verbose=0)
    return np.mean(fold_scores)


model_preparing(name, model_class, objective, common_param, parallel=False)

bt test


Unnamed: 0,name,model,train,test,test_sem
0,cb_3_0,CatBoostClassifier,0.809533,0.804583,0.003008


Wall time: 7.44 s


In [16]:
%%time
# Refine the best parameters of the previous study. `name`, `model_class` and
# `common_param` are the globals left by the preceding CatBoost cell.
opt_study_name = OPTUNA_STUDY_NAME(name)
if opt_study_name.exists():
    old_params = read(opt_study_name).best_params
    name = 'cb_cp_1'

    def objective(trial):
        """Optuna objective: resample depth/iterations, then polish the continuous
        parameters with Nelder-Mead and return the best mean CV score found.
        """
        # Work on a per-trial copy. An in-place `old_params |= ...` would make
        # `old_params` local to this function and raise UnboundLocalError.
        params = old_params | {'depth': trial.suggest_int('depth', 4, 12),
                               'iterations': trial.suggest_int('iterations', 500, 2000)}

        pars_to_tune = ['colsample_bylevel', 'l2_leaf_reg']
        if 'bootstrap_type' in params:
            if params['bootstrap_type'] == 'Bayesian':
                pars_to_tune.append('bagging_temperature')
            elif params['bootstrap_type'] == 'Bernoulli':
                pars_to_tune.append('subsample')

        eval_fun = (lambda param, df=dfs[1], model_class=model_class, cv=TUNE_CV, scoring=SCORER:
                    cross_val_score(model_class(**param), *split(df), scoring=scoring, cv=cv, verbose=0))

        def obj(vals, pars_to_tune=pars_to_tune, eval_fun=eval_fun,
                old_params=params, common_param=common_param):
            res = np.mean(eval_fun(common_param | old_params | dict(zip(pars_to_tune, vals))))
            print(round(res, 4), end=', ')
            # scipy minimises, so return the negated score to maximise it.
            return -res

        # NOTE(review): the search is unbounded, so Nelder-Mead may step outside
        # the valid range of a parameter -- consider passing `bounds`.
        res = minimize(obj, [params[x] for x in pars_to_tune], method='Nelder-Mead',
                       tol=1e-3, options={'maxiter': 100, 'fatol': 1e-4, 'disp': True})
        trial.set_user_attr('param', params | dict(zip(pars_to_tune, res.x)))
        trial.set_user_attr('common_param', common_param)
        print()
        # res.fun is the minimised negative score, so -res.fun is the best CV score.
        return -res.fun

    model_preparing(name, model_class, objective, common_param, parallel=1)

Wall time: 20.4 ms


In [17]:
%%time
name = 'rf_2'
common_param = {'random_state': RANDOM_SEED, 'class_weight': 'balanced'}
model_class = RandomForestClassifier


def objective(trial):
    """Optuna objective: mean cross-validated score of a random forest on ``dfs[1]``."""
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 500),
        'max_depth': trial.suggest_int('max_depth', 2, 100),
    }
    forest = RandomForestClassifier(**sampled, **common_param)
    fold_scores = cross_val_score(forest, *split(dfs[1]), scoring=SCORER, cv=TUNE_CV, verbose=0)
    return np.mean(fold_scores)


model_preparing(name, model_class, objective, common_param)

Wall time: 0 ns


In [18]:
# %%time
# model = StackingClassifier([(str(i), model) for i, model in enumerate(models)],
#                            final_estimator=LogisticRegression(max_iter=1000),
#                            cv='prefit', n_jobs=TUNE_THREADS, passthrough=False)
# t, model = test(model, 'sc_1', fitted=False)
# models.append(model)
# display(t)