In [1]:
from math import nan
from copy import deepcopy
from datetime import datetime
import itertools
from multiprocessing import Pool
import pickle

In [2]:
import numpy as np
import numpy.random
import pandas as pd
import matplotlib.pyplot as plt

In [19]:
from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, StackingClassifier
from sklearn.metrics import roc_auc_score, roc_curve, RocCurveDisplay
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

In [20]:
from load import read_data, read, write, get_full_prepared_data_with_upsample
from processing import start_processing, family_of_knn_features
from parameters import RANDOM_SEED, SCORERS, TARGET_FEATURE, TEST_SIZE, THREADS, TARGET_REPLACER, INVERSE_TARGET_REPLACER
from preparing import (MyOheHotEncoder, MyOrdinalEncoder, MyMinMaxScaler, ColumnsSorter,
                       EmptyColFiller, MyPolynomialFeatures, ordinal_encoding, one_hot_encoding,
                       col_cutter, col_retainer)
from model_preparing import CBC, ConsecutiveEstimator, StackEstimator
from tuning import try_each_col, try_wo_each_col, try_cols_in_order

In [5]:
# Notebook-wide pandas settings, applied through the `pd.options` attribute API.
# Show up to 500 rows when a frame/series is rendered.
pd.options.display.max_rows = 500
# Turn off the chained-assignment (SettingWithCopy) check entirely.
pd.options.mode.chained_assignment = None
# Raise the column-count threshold used by `DataFrame.info()` output.
pd.options.display.max_info_columns = 1000

In [6]:
%%time
# Load the prepared dataset in a single call and unpack its three parts.
# From their use in later cells: `f` exposes `.columns` (used to filter feature
# lists), `ft` is passed to `model.test(...)`, and `ft` together with `fp` is
# passed to `model.predict_final(...)`.
# NOTE(review): presumably f = full frame, ft = train/validation splits,
# fp = data to predict on — confirm against `load.get_full_prepared_data_with_upsample`.
f, ft, fp = get_full_prepared_data_with_upsample()

Wall time: 26 ms


In [7]:
# def get(val, col, ft=ft):
#     res = []
#     for x1, x2 in ft:
#         f = x1[col] == val
#         res.append((x1[f].drop(columns=[col]), x2[f]))
#     return res

# col = 'start_year'
# for col in ('start_year', 'faculty', 'same_group_target'):
#     res = []
#     for val in ft[0][0][col].unique():
#         res.append({'val': val} | CBC().eval_on_test(get(val, col)))
#     print(col)
#     display(pd.DataFrame(res).drop(columns='model').sort_values('std'))

In [8]:
# %%time
# class TestModel(CommonEstimator):
#     def __init__(self):
#         self.fitted = False
#         self.col = 'same_group_target'
#         self.filt = lambda x: x[self.col] != 0
#         self.clear_cols = lambda x: x.drop(columns=[self.col])
    
#     def fit(self, x, y):
#         filt = self.filt(x)
#         x = self.clear_cols(x)
#         self.model = CBC()
#         self.model.fit(x[filt], y[filt].replace(0, 1))
#         return self
    
#     def predict(self, x, y=None):
#         filt = self.filt(x)
#         x = self.clear_cols(x)
#         f_index = filt[filt].index
#         y = pd.Series(0, x.index)
#         y0 = pd.Series(self.model.predict(x[filt]), index=f_index)
#         y[f_index] = y0
#         return y.astype(int)

# model = TestModel()
# display(pd.DataFrame([model.eval_on_test(ft)]))
# predict_on_test(model, ft, fp, refit=False)
# print()

In [9]:
# %%time
# class ConsecEstimator():
#     def __init__(self, model_class):
#         self.model_class = model_class
#         self.replacer = [{2: 1}, {1: 0, 2: 1}]
#         self.invercer = [{v: k for k, v in x.items()} for x in self.replacer]

#     def fit(self, x, y):
#         self.model = (self.model_class(), self.model_class())
#         self.model[0].fit(x, y.replace(self.replacer[0]))
#         self.model[1].fit(x[y != 0], y[y != 0].replace(self.replacer[1]))
#         return self

#     def predict(self, x, y=None):
#         res_1 = pd.Series(self.model[0].predict(x).squeeze(), index=x.index)
#         temp = res_1 != 0
#         res_2 = pd.Series(self.model[1].predict(x[temp]).squeeze(), index=x[temp].index)
#         res_1[x[temp].index] = res_2.replace(self.invercer[1])
#         return res_1.values

# a = ConsecEstimator(CBC)
# pd.DataFrame([eval_pipe(a, ft)])
# #predict_on_test(a, ft, fp, refit=True)
# #print()

In [1]:
# import optuna
# from optuna.samplers import TPESampler

# x, y = ft[0][0], ft[0][1].replace(2, 1)
# for col in x.columns:
#     x[col] = x[col].astype(float)

# def objective(trial):
#     n_estimators = trial.suggest_int("n_estimators", 10, 500)
#     max_depth = trial.suggest_int("max_depth", 2, 20)
#     model = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators)

#     score = cross_val_score(model, x, y, cv=3, scoring=SCORERS[0])
#     return score.mean()

# study = optuna.create_study(sampler=TPESampler(), direction="maximize")
# study.optimize(objective, n_trials=10)

In [None]:
# %%time
# a = CBC()
# pd.DataFrame([a.eval_on_test(ft)])
# # predict_on_test(a, ft, fp, refit=True)
# # print()

In [None]:
%%time
cols_base = ['condition', 'faculty', 'start_year', 'start_year_val', 'mean_mark_type2', 'city', 'birthday_year', 'school_type', 'mean_mark', 'region', 'diff_between_school_n_start', 'has_not_family', 'gender', 'mean_mark_add1', 'mean_mark_add2', 'country', 'relativies_country', 'language', 'school', 'school_location']
cols_add = [tuple(),
            ('group_code', ),
            ('group_code_num', ),
            ('group_code_add_1', 'group_code_add_2', 'group_code_add_3', 'group_code_add_4',),
            ('k_5', ),
            ('same_group_target', ),
            ('same_group_target', 'group_code'),]
cols = [list(x) + cols_base for x in cols_add]
cols = [[y for y in x if y in f.columns] for x in cols]
models = [CBC(transformers=[col_retainer(x)]) for x in cols]
res = []
# res = [x.test(ft) for x in models]
# pd.concat(res)

In [None]:
%%time
linm = LogisticRegression(C=0.01, max_iter=1000, class_weight='balanced')
model = StackEstimator(models, linm)
res.append(model.test(ft))
pd.concat(res)

In [None]:
# Run the final prediction step of the `model` built in the previous cell,
# passing it `ft` and `fp` from the data-loading cell.
# NOTE(review): presumably (re)fits on `ft` and writes/returns predictions for
# `fp` — confirm against the `predict_final` implementation in `model_preparing`.
model.predict_final(ft, fp)

In [None]:
# m = CBC()
# m.test(ft)

In [None]:
# import sklearn.utils.estimator_checks as a
# a.check_estimator(CBC())

In [None]:
# # models[0].fit(*ft[0])
# models[0]._estimator_type
# models[0].get_params

In [None]:
# %%time
# get_cf = lambda x: list(x.select_dtypes(include='category').columns)
# cat_features = get_cf(ft[0][0])
# classes_count = len(ft[0][1].unique())
# # res = [fast_catboost(ft),
# #        consecutive_prediction(fast_catboost, ft),
# #        bagging_prediction(fast_catboost, ft)]
# # res = [fast_catboost(ft),
# #        consecutive_prediction(fast_catboost, ft)]
# temp_cols_1 = ['same_group_target', 'condition', 'faculty', 'group1_code_add_3', 'start_year', 'birthday_year', 'num_group_code', 'num_group_code_rexp3_scale_fun', 'num_group_code_exp3_scale_fun', 'sub_group_code_5', 'num_group_code_exp5_scale_fun', 'num_group_code_expexp_scale_fun', 'mean_mark_type2', 'num_group_code_sqrt_scale_fun', 'num_group_code_rexp5_scale_fun', 'group1_code_add_2', 'school_type', 'num_group_code_r_scale_fun', 'relativies_country', 'num_group_code_2_scale_fun', 'region', 'language', 'group1_code_add_1', 'city', 'country', 'school_location', 'school', 'group1_code_add_4', 'sub_group_code_20', 'mean_mark_sin_scale_fun']
# add_kwargs = [{'_transformers': [x]} for x in [col_retainer(temp_cols_1),
#                                                col_cutter(['same_group_target']),
#                                                col_cutter(['same_group_target', 'group_code']),
#                                                col_cutter([x for x in ft[0][0].columns if 'scale_fun' in x]),
#                                                col_cutter([x for x in ft[0][0].columns if 'scale_fun' in x or 'group_code' in x])]] + [dict()]
# add_cat_features = [get_cf(deepcopy(x['_transformers'][0]).fit_transform(ft[0][0])) for x in add_kwargs if len(x) > 0] + [cat_features]
# add_kwargs = [x | {'cat_features': y, 'classes_count': classes_count} for x, y in zip(add_kwargs, add_cat_features)]
#res = [fast_catboost(ft, **kwargs) for kwargs in add_kwargs]
#res = [fast_catboost(**kwargs) for kwargs in add_kwargs]
#res = [simple_stacking_catboost([x['model'] for x in res], ft)]
#res += [consecutive_prediction(fast_catboost, ft, **kwargs) for kwargs in add_kwargs]

In [None]:
# pd.DataFrame(res).drop(columns=['model']) * 100

In [None]:
# %%time
# for ires in res:
#     predict_on_test(ires['model'], ft, fp, refit=False)