In [1]:
from math import nan
from copy import deepcopy
from datetime import datetime
import itertools
from multiprocessing import Pool
import pickle

In [2]:
import numpy as np
import numpy.random
import pandas as pd
import matplotlib.pyplot as plt
import joblib

In [3]:
from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, StackingClassifier
from sklearn.metrics import roc_auc_score, roc_curve, RocCurveDisplay
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

In [4]:
from load import read_data, read, write, get_full_prepared_data_with_upsample
from processing import start_processing, family_of_knn_features
from parameters import RANDOM_SEED, SCORERS, TARGET_FEATURE, TEST_SIZE, THREADS, TARGET_REPLACER, INVERSE_TARGET_REPLACER
from preparing import (MyOheHotEncoder, MyOrdinalEncoder, MyMinMaxScaler, ColumnsSorter,
                       EmptyColFiller, MyPolynomialFeatures, ordinal_encoding, one_hot_encoding,
                       col_cutter, col_retainer)
from model_preparing import CBC, CBCt, LRC, ConsecutiveEstimator, ConsecutiveEstimatorProba, StackEstimator
from tuning import try_each_col, try_wo_each_col, try_cols_in_order

In [5]:
# Notebook-wide pandas settings, applied as (option, value) pairs in one call:
# show up to 500 rows, turn off the chained-assignment check, and let
# DataFrame.info() list up to 1000 columns.
pd.set_option(
    'display.max_rows', 500,
    'mode.chained_assignment', None,
    'display.max_info_columns', 1000,
)

In [6]:
%%time
# Load the three data objects used by the rest of the notebook.
#   f  - exposes `.columns`; used below to keep only feature names that exist.
#   ft - handed to `model.test(...)` when scoring a model.
#   fp - only referenced from a commented-out `predict_final(ft, fp, ...)` call.
# NOTE(review): the exact structure of each object (and where the upsampling
# named in the loader happens) is defined in `load` - confirm there.
f, ft, fp = get_full_prepared_data_with_upsample()

Wall time: 21 ms


In [7]:
%%time
# Feature names offered to every candidate model.
cols_base = [
    'condition', 'faculty', 'start_year', 'start_year_val', 'mean_mark_type2',
    'city', 'birthday_year', 'school_type', 'mean_mark', 'region',
    'diff_between_school_n_start', 'has_not_family', 'gender',
    'mean_mark_add1', 'mean_mark_add2', 'country', 'relativies_country',
    'language', 'school', 'school_location',
]

# Extra feature groups. Each entry is placed in front of cols_base and yields
# one model; the empty tuple gives the base-only feature set.
cols_add = [
    (),
    ('group_code',),
    ('group_code_num',),
    ('group_code_add_1', 'group_code_add_2', 'group_code_add_3', 'group_code_add_4'),
    ('k_5',),
    ('same_group_target',),
    ('same_group_target', 'group_code_num'),
]
# Notes kept from earlier experiments:
#   tuple(),
#   ('group_code', ),
#   per_float_feature_quantization=['0:border_count=1024', '1:border_count=1024']

# One column list per extra group (extras first, then the base features),
# keeping only the names that are present in f.columns.
# NOTE(review): names missing from `f` are dropped silently, so two groups can
# end up with the same column list - confirm that is intended.
cols = [
    [name for name in (*extra, *cols_base) if name in f.columns]
    for extra in cols_add
]

# One CBCt per column list, each given a col_retainer built from that list.
models = [CBCt(transformers=[col_retainer(feature_names)]) for feature_names in cols]
# Alternatives tried earlier:
# models = [LRC(transformers=[col_retainer(x)]) for x in cols]
# models = [CBC(transformers=[col_retainer(x)]) for x in cols] + [LRC(transformers=[col_retainer(x)]) for x in cols]
# models = [x.fit(*ft[0]) for x in models]

# Accumulator for the score tables appended by later cells.
res = []
# res = [x.test(ft) for x in models]
# pd.concat(res)

Wall time: 0 ns


In [8]:
# import optuna
# from optuna.samplers import TPESampler
# import pathlib

# def objective(trial):
#     param = {'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.01, 0.1, log=True),
#              'depth': trial.suggest_int('depth', 1, 12),
#              'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
#              'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
#              'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.1, 10),
#              'random_strength': trial.suggest_float('random_strength', 0.1, 10),}
#              #'border_count': trial.suggest_categorical('border_count', [254, 1024]),}

#     if param['bootstrap_type'] == 'Bayesian':
#         param['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
#     elif param['bootstrap_type'] == 'Bernoulli':
#         param['subsample'] = trial.suggest_float('subsample', 0.1, 1, log=True)
#     if False and param['boosting_type'] == 'Plain':
#         param['grow_policy'] = trial.suggest_categorical('grow_policy', ['Depthwise', 'Lossguide'])
#     else:
#         param['grow_policy'] = 'SymmetricTree'

#     # pruning_callback = CatBoostPruningCallback(trial, 'Accuracy')
#     # callbacks=[pruning_callback]
#     # pruning_callback.check_pruned()
#     print(param)
#     return CBC(model_pars=param).test(ft)['test']

# file_name = 'study.pkl'
# if pathlib.Path(file_name).exists():
#     print('loaded')
#     study = joblib.load(file_name)
# else:
#     print('created')
#     study = optuna.create_study(sampler=TPESampler(), direction='maximize')
# print('start')
# while True:
#     study.optimize(objective, n_trials=10)
#     print('saving')
#     joblib.dump(study, file_name)

In [9]:
# %%time
# model = StackEstimator(models, LRC())
# res.append(model.test(ft))
# display(pd.concat(res))
# CBC
# nontuned 0.857792	0.860158	-0.002366	0.044328
# tuned 0.909756	0.855297	0.054458	0.040217
# LRC
# 0.858018	0.829517	0.028501	0.03825

In [10]:
# %%time
# final_estimator = CBC()
# model = StackEstimator(models, final_estimator)
# res.append(model.test(ft))
# display(pd.concat(res))
# CBC
    # test 20%
    # cv2 0.889794	0.877558	0.012236	0.046382
    # cv5 0.895343	0.876642	0.018701	0.047652
    # test 40%
    # cv2 0.891388	0.862891	0.028497	0.040038
# LRC
    # test 20%
    # cv2 0.886346	0.860545	0.025802	0.040699

In [11]:
%%time
# Stack the base models with an LRC as the second argument of StackEstimator,
# then give ConsecutiveEstimator two independent deep copies of that stack.
stacked = StackEstimator(models, LRC())
stack_first, stack_second = deepcopy(stacked), deepcopy(stacked)
model = ConsecutiveEstimator(stack_first, stack_second)

# Score the combined model, add the result to the running list and show
# everything collected so far as one table.
scores = model.test(ft)
res.append(scores)
display(pd.concat(res))
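# Each logged row below lists: train, test, delta (train - test), std -
# the same columns as the table this cell displays.
# CBC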
    # LR 0.852251	0.806898	0.045354	0.038858
    # LR tuned 0.925821	0.859381	0.06644	0.040885
    # CBC 0.898411	0.864759	0.033652	0.040996
    # LR tuned 0.904646	0.83091	0.073736	0.04028
    # LR tuned wo 1, 2 0.92494	0.860949	0.063991	0.041264
    # CBC tuned wo 1, 2 0.898471	0.825322	0.073149	0.039107
# LR
    # LR 0.849615	0.824088	0.025526	0.044408
    # CBC 0.892914	0.872629	0.020284	0.039681
# LR + CB
    # CBC 0.904463	0.876612	0.027851	0.039438
    # LRC 0.866441	0.83324	0.033201	0.042145

Unnamed: 0,train,test,delta,std
0,0.923475,0.861155,0.06232,0.04046


Wall time: 22min 57s


In [12]:
# %%time
# model = StackEstimator(models, CBC())
# model = ConsecutiveEstimatorProba(deepcopy(model), deepcopy(model))
# res.append(model.test(ft))
# display(pd.concat(res))
# CBC tuned / LRC 0.944448	0.867402	0.077046	0.04873
# CBC tuned / CBC 0.942608	0.864347	0.07826	0.051726

In [1]:
# t0 = model.predict_final(ft, fp, refit=True)