In [3]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.utils.class_weight import compute_class_weight

import itertools
from matplotlib import pyplot as plt

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=False)

from math import sin,cos
# !pip install scikit-optimize

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from xgboost import XGBClassifier

import pickle
import traceback

from sklearn.model_selection import cross_val_score
#crossval_scores = cross_val_score(best_clf, X_train, y_train, scoring='roc_auc', cv=5)
from sklearn.model_selection import cross_val_predict
#cross_val_pred = cross_val_predict(clf, train[features], train['smoking'], cv=10, method='predict_proba')[:, 1]

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [2]:
train=pd.read_csv('./playground-series-s3e24/train.csv')
test=pd.read_csv('./playground-series-s3e24/test.csv')

import datetime

def process_features(df,bin_size=0,leftright=False,feature_range=0,means=None,mins=None,maxs=None):
    
    df['imt']=df['weight(kg)']/(df['height(cm)']/100)**2
    
    df['pressure_diff']=df['systolic']/df['relaxation']
    
    # принудительные бины, так как остальная часть столбца с бинами 5
    for col in ['age','height(cm)','weight(kg)']:
        df[col]=(df[col])//5*5
 

    df['eyesight_min']=df.parallel_apply(lambda x:min(x['eyesight(left)'],x['eyesight(right)']),axis=1)
    df['eyesight_max']=df.parallel_apply(lambda x:max(x['eyesight(left)'],x['eyesight(right)']),axis=1)

    df['hearing_min']=df.parallel_apply(lambda x:min(x['hearing(left)'],x['hearing(right)']),axis=1)
    df['hearing_max']=df.parallel_apply(lambda x:max(x['hearing(left)'],x['hearing(right)']),axis=1)

    df=df.drop(columns=['eyesight(left)','eyesight(right)','hearing(left)','hearing(right)'])
    
    
    col='age'
    col_name=col+'_sin'
    df[col_name]=df[col].apply(lambda x:sin(x))

    for col in df.columns:
        df[col]=round(df[col],4)
        
        if ' ' in col:
            df=df.rename(columns={col:col.replace(' ','_')})
        if '(' in col:
            df=df.rename(columns={col:col.replace('(','_').replace(')','')})

    return df
train=process_features(train)
train=train.drop(columns='id')

In [610]:
opt = BayesSearchCV(XGBClassifier(random_state=0),\
                    search_space_xgb1, cv=2, n_iter=4, scoring='roc_auc', random_state=8,n_jobs=1)
opt.fit(train.drop(columns='smoking'), train.smoking)
opt.best_score_

0.8542924464575236

In [625]:
from itertools import product 
cv=[7,8,9]
n_iter=[1,2,3,4,5,6,7,8,9,10]

for comb in product(cv,n_iter):
    
    opt = BayesSearchCV(XGBClassifier(random_state=0),\
                        search_space_xgb1, cv=comb[0], n_iter=comb[1], scoring='roc_auc', random_state=8,n_jobs=1)
    x=datetime.datetime.now()
    opt.fit(train.drop(columns='smoking'), train.smoking)
    y=datetime.datetime.now()
    print(comb[0],comb[1],str(y-x),opt.best_score_)

7 1 0:00:03.736975 0.8510285957728365
7 2 0:00:07.115432 0.852415150776939
7 3 0:00:10.190524 0.852415150776939
7 4 0:00:13.595984 0.8568414358598017
7 5 0:00:17.782919 0.8641165646285268
7 6 0:00:21.237002 0.8641165646285268
7 7 0:00:30.569173 0.8641165646285268
7 8 0:00:34.863393 0.8641165646285268
7 9 0:00:35.802241 0.8641165646285268
7 10 0:00:44.722528 0.8641165646285268
8 1 0:00:04.877942 0.8503981390065669
8 2 0:00:08.549928 0.8526777230341573
8 3 0:00:12.697967 0.8526777230341573
8 4 0:00:17.363258 0.8565990586215249
8 5 0:00:21.147812 0.8639652169224514
8 6 0:00:31.619295 0.8639652169224514
8 7 0:00:30.695122 0.8639652169224514
8 8 0:00:33.746949 0.8639652169224514
8 9 0:00:37.846624 0.8639652169224514
8 10 0:00:40.941642 0.8639652169224514
9 1 0:00:04.834458 0.8523303118139278
9 2 0:00:09.168405 0.8530704358649265
9 3 0:00:13.333870 0.8530704358649265
9 4 0:00:17.793419 0.8573109208072107
9 5 0:00:22.813263 0.8643688903048841
9 6 0:00:28.337088 0.8643688903048841
9 7 0:00:34.

In [615]:
from sklearn.model_selection import cross_val_score
x=datetime.datetime.now()
scores = cross_val_score(XGBClassifier(random_state=0), train.drop(columns='smoking'), train.smoking, cv=20)
y=datetime.datetime.now()
print(str(y-x))
print(scores)

0:00:12.611754
[0.78349868 0.78374984 0.78211729 0.7715685  0.78425217 0.78061032
 0.77320105 0.77897777 0.78123823 0.78488007 0.78324752 0.780108
 0.7735778  0.78211729 0.78638704 0.78789401 0.79000251 0.79113288
 0.77895001 0.779578  ]


In [5]:


search_space_xgb = {
    'max_depth': Integer(2,8),
    'learning_rate': Real(0.001, 1.0, prior='log-uniform'),
    'subsample': Real(0.5, 1.0),
    'colsample_bytree': Real(0.2, 1.0),
    'colsample_bylevel': Real(0.2, 1.0),
    'colsample_bynode' : Real(0.2, 1.0),
    'reg_alpha': Real(0.0, 60.0),
    'reg_lambda': Real(0.0, 60.0),
    'gamma': Real(0.0, 30.0),
    'min_child_weight': Integer(1, 300),
    'max_delta_step': Integer(1, 30),
    'n_estimators':  Integer(400, 5000)
}
    
search_space_xgb1=search_space_xgb.copy()
del search_space_xgb1['n_estimators']
del search_space_xgb1['learning_rate']

    
opt_big = BayesSearchCV(XGBClassifier(random_state=0,eval_metric='auc'),\
                    search_space_xgb1, cv=6, n_iter=10, scoring='roc_auc', random_state=8,n_jobs=1)


In [1788]:
def apply_filters(df,filters):
    #x=datetime.datetime.now()
    for f in filters:
        df=df.query(f)
    #y=datetime.datetime.now()
    #print(str(y-x))
    return df

def clear_filters(filters):
    #x=datetime.datetime.now()
    xxx=pd.DataFrame([x.split(' ') for x in filters],columns=['col','sign','col_value'])
    xxx.col_value=xxx.col_value.astype(float)
    new_xxx=pd.concat([xxx.query('sign==">"').sort_values('col_value',ascending=False).drop_duplicates(keep='first',subset=['col','sign']),\
           xxx.query('sign=="<="').sort_values('col_value',ascending=True).drop_duplicates(keep='first',subset=['col','sign'])])
    svod=[]
    for i,row in new_xxx.iterrows():
        s=f'{row.col} {row.sign} {row.col_value}'
        svod.append(s)
    svod.sort()
    #y=datetime.datetime.now()
    #print(str(y-x))
    return svod
    
def create_svod(df,base_columns,quantiles,opt,opt_big):
    global big_df
    global actual_tree
    global actual_queries
    svod=[]
    
    if not actual_tree.empty:
        #act=actual_tree.filters_split.sum() # на выходе список
        act=actual_tree.iloc[-1].filters_split
    else:
        act=[]
    
    if str(act) in actual_queries.keys():
        #print('Взяли svod для',act)
        return actual_queries[str(act)]
    
    for col in base_columns:
        for col_value in quantiles[col]:

        #for col_value in df[col].quantile(np.arange(0.05,1,step=0.15)).drop_duplicates():
            #print(col,col_value)
            
                
            filters_left=act+[f'{col} <= {col_value}',]
            filters_left=clear_filters(filters_left)
            
            #if str(filters_left) in actual_queries:
            #    continue
            #else:
            #    actual_queries.append(filters_left)
            left=apply_filters(df,filters_left)

            if left.empty:# or left.shape[0]<min_border05 or left.smoking.nunique()==1:
                continue
            
            filters_right=act+[f'{col} > {col_value}',]
            filters_right=clear_filters(filters_right)
            
            #if str(filters_right) in actual_queries:
            #    continue
            #else:
            #    actual_queries.append(filters_right)
            right=apply_filters(df,filters_right)
            

            
            if right.empty:# or right.shape[0]<min_border05 or right.smoking.nunique()==1:
                continue
            

            try:
                auc_left=big_df.loc[big_df['filters'].apply(lambda x:x==filters_left),'auc'].drop_duplicates().item()
                #fromfull_auc_left=big_df.loc[big_df['filters'].apply(lambda x:x==filters_left),'fromfull_auc'].drop_duplicates().item()
                #if fromfull_auc_left is None:
                #    fromfull_auc_left=cross_val_score(best_clf, left.drop(columns='smoking'), left.smoking, scoring='roc_auc', cv=opt.cv).mean()
                #    big_df.loc[big_df['filters'].apply(lambda x:x==filters_left),'fromfull_auc']=fromfull_auc_left
            except:
                try: # вдруг мало элементов для рок-аука, либо только один класс
                    opt.fit(left.drop(columns='smoking'), left.smoking) 
                    auc_left=round(opt.best_score_,5)
                except:
                    auc_left=0
                #fromfull_auc_left=cross_val_score(best_clf, left.drop(columns='smoking'), left.smoking, scoring='roc_auc', cv=opt.cv).mean()

                if auc_left==0.5:
                    try:
                        opt_big.fit(left.drop(columns='smoking'), left.smoking)
                        auc_left=round(opt_big.best_score_,5)
                    except:
                        auc_left=0
                        
                    

                temp = {'filters': filters_left, 'auc': auc_left}#,'fromfull_auc':fromfull_auc_left
                big_df = big_df.append(temp, ignore_index = True)

            #if auc_left>0.5 and auc_left<min_border: # если слева уже плохая разбивка - правую не считаем. можем что оптимизирует
            #    continue

            try:
                auc_right=big_df.loc[big_df['filters'].apply(lambda x:x==filters_right),'auc'].drop_duplicates().item()
                #fromfull_auc_right=big_df.loc[big_df['filters'].apply(lambda x:x==filters_right),'fromfull_auc'].drop_duplicates().item()
                #if fromfull_auc_right is None:
                #    fromfull_auc_right=cross_val_score(best_clf, right.drop(columns='smoking'), right.smoking, scoring='roc_auc', cv=opt.cv).mean()
                #    big_df.loc[big_df['filters'].apply(lambda x:x==filters_right),'fromfull_auc']=fromfull_auc_right
            except:
                try: # вдруг мало элементов для рок-аука, либо только один класс
                    opt.fit(right.drop(columns='smoking'), right.smoking)
                    auc_right=round(opt.best_score_,5)
                except:
                    auc_right=0
                #fromfull_auc_right=cross_val_score(best_clf, right.drop(columns='smoking'), right.smoking, scoring='roc_auc', cv=opt.cv).mean()

                if auc_right==0.5:
                    try:
                        opt_big.fit(right.drop(columns='smoking'), right.smoking)
                        auc_right=round(opt_big.best_score_,5)
                    except:
                        auc_right==0

                temp = {'filters': filters_right, 'auc': auc_right}#,'fromfull_auc':fromfull_auc_right
                big_df = big_df.append(temp, ignore_index = True)
                


            # ,fromfull_auc_left,fromfull_auc_right
            svod.append([col,col_value,auc_left,auc_right,left.shape[0],right.shape[0],filters_left,filters_right])
            #except Exception as e:
            #    print('!!! '+str(e)+' !!!')
    actual_queries[str(act)]=svod # если дошли досюда - значит, раньше в словаре не было
    return svod

def leaf_and_split(svod,min_border,max_border,min_leaf,base_auc,min_border05):
    
    
    temp=pd.DataFrame(svod)
    temp.columns=['col','col_value','auc_left','auc_right','cnt_left','cnt_right','filters_left','filters_right']
    conditions = [ 
        (temp.auc_left>=max_border) & (temp.auc_right>=min_border) & (temp.cnt_left>=min_leaf) & (temp.cnt_right>=min_leaf),
        (temp.auc_right>=max_border) & (temp.auc_left>=min_border) & (temp.cnt_right>=min_leaf) & (temp.cnt_left>=min_leaf),
        (temp.auc_left==0.5) & (temp.auc_right>=min_border) & (temp.cnt_left>=min_border05) & (temp.cnt_right>=min_leaf),
        (temp.auc_right==0.5) & (temp.auc_left>=min_border) & (temp.cnt_right>=min_border05) & (temp.cnt_left>=min_leaf),
        (temp.auc_left>=base_auc) & (temp.auc_right>=base_auc) & (temp.cnt_left>=min_leaf) & (temp.cnt_right>=min_leaf) & (temp.auc_left>temp.auc_right),
        (temp.auc_left>=base_auc) & (temp.auc_right>=base_auc) & (temp.cnt_left>=min_leaf) & (temp.cnt_right>=min_leaf) & (temp.auc_left<=temp.auc_right)]

    choices = [temp.filters_right,temp.filters_left,temp.filters_right,temp.filters_left,temp.filters_right,temp.filters_left]
    temp['filters_split'] = np.select(conditions, choices, default=False)
    
    choices = [temp.cnt_right,temp.cnt_left,temp.cnt_right,temp.cnt_left,temp.cnt_right,temp.cnt_left]
    temp['cnt_split'] = np.select(conditions, choices, default=False)
    
    choices = [temp.auc_right,temp.auc_left,temp.auc_right,temp.auc_left,temp.auc_right,temp.auc_left]
    temp['auc_split'] = np.select(conditions, choices, default=False)
    
    
    
    choices = [temp.filters_left,temp.filters_right,temp.filters_left,temp.filters_right,temp.filters_left,temp.filters_right]
    temp['filters_leaf'] = np.select(conditions, choices, default=False)

    choices = [temp.cnt_left,temp.cnt_right,temp.cnt_left,temp.cnt_right,temp.cnt_left,temp.cnt_right]
    temp['cnt_leaf'] = np.select(conditions, choices, default=False)

    choices = [temp.auc_left,temp.auc_right,temp.auc_left,temp.auc_right,temp.auc_left,temp.auc_right]
    temp['auc_leaf'] = np.select(conditions, choices, default=False)

    temp=temp.query('cnt_split>0').sort_values('cnt_split',ascending=True) # чем меньше разделяемая часть - тем больше лист
    return temp

def leaf_and_split05(svod,min_border,max_border,min_leaf,base_auc,min_border05,auc_add):
    
    global actual_tree
    if not actual_tree.empty:
        base_auc=actual_tree.iloc[-1].auc_split+auc_add
    else:
        base_auc=base_auc+auc_add
    #print(base_auc)
    
    if svod==[]:
        return pd.DataFrame()
    temp=pd.DataFrame(svod)
    temp.columns=['col','col_value','auc_left','auc_right','cnt_left','cnt_right','filters_left','filters_right']
    
    #temp.loc[temp.auc_left<0.8,'auc_left']=0.5
    
    conditions = [ 
        (temp.auc_left>temp.auc_right) & (temp.auc_left>=base_auc) & (temp.cnt_right>=min_border05) & (temp.auc_right>0.51),
        (temp.auc_left>temp.auc_right) & (temp.auc_left>=base_auc) & (temp.cnt_left>=min_border05) & (temp.cnt_right>=min_border05) & (temp.auc_right<=0.51),
        (temp.auc_left<=temp.auc_right) & (temp.auc_right>base_auc) & (temp.cnt_left>=min_border05) & (temp.auc_left>0.51),
        (temp.auc_left<=temp.auc_right) & (temp.auc_right>base_auc) & (temp.cnt_right>=min_border05) & (temp.cnt_left>=min_border05) & (temp.auc_left<=0.51)
                ]

    choices = [temp.filters_right,temp.filters_left,temp.filters_left,temp.filters_right]
    temp['filters_split'] = np.select(conditions, choices, default=False)
    
    choices = [temp.cnt_right,temp.cnt_left,temp.cnt_left,temp.cnt_right]
    temp['cnt_split'] = np.select(conditions, choices, default=False)
    
    choices = [temp.auc_right,temp.auc_left,temp.auc_left,temp.auc_right]
    temp['auc_split'] = np.select(conditions, choices, default=False)
    
    
    
    choices = [temp.filters_left,temp.filters_right,temp.filters_right,temp.filters_left]
    temp['filters_leaf'] = np.select(conditions, choices, default=False)

    choices = [temp.cnt_left,temp.cnt_right,temp.cnt_right,temp.cnt_left]
    temp['cnt_leaf'] = np.select(conditions, choices, default=False)

    choices = [temp.auc_left,temp.auc_right,temp.auc_right,temp.auc_left]
    temp['auc_leaf'] = np.select(conditions, choices, default=False)
    #to_csv(temp)
    temp=temp.query('cnt_split>0').sort_values('cnt_split',ascending=True) # чем меньше разделяемая часть - тем больше лист
    return temp

def step(temp,actual_tree,cut_max):
    global final_trees
    global work_dfs
    if temp.empty:
 
        temp_tree=actual_tree.tail(1).copy()
        temp_tree.filters_leaf=temp_tree.filters_split
        temp_tree.cnt_leaf=temp_tree.cnt_split
        temp_tree.auc_leaf=temp_tree.auc_split
        temp_tree=temp_tree[['filters_leaf','cnt_leaf','auc_leaf']]

        actual_tree=pd.concat([actual_tree,temp_tree],ignore_index = True,axis=0)
        metric=sum(actual_tree.auc_leaf*actual_tree.cnt_leaf)/sum(actual_tree.cnt_leaf)
        
        temp = {'data': actual_tree, 'metric': metric, 'cnt_leaves':actual_tree.shape[0],'cut_max':cut_max,'sumcolvalue':actual_tree.col_value.sum()}
        final_trees = final_trees.append(temp, ignore_index = True)
        #final_trees.append(actual_tree)
        
        print(actual_tree.shape[0],round(metric,5),[ra.shape[0] for ra in work_dfs])
        actual_tree=actual_tree[:-1]
        
        while True:
            if work_dfs==[]:
                break
            actual_tree=actual_tree[:-1]
            work_dfs[-1]=work_dfs[-1][1:]
            if not work_dfs[-1].empty:
                break
            else:
                work_dfs=work_dfs[:-1]
        if work_dfs==[]:
            raise Exception('конец вечеринки')
        else:
            actual_tree=pd.concat([actual_tree,work_dfs[-1].head(1)])
    else:
        work_dfs.append(temp)
        #to_csv(temp)
        actual_tree=pd.concat([actual_tree,temp.head(1)])
    return actual_tree

In [7]:
X=train.drop(columns=['smoking'])
y=train.smoking

X_train, X_test, y_train, y_test = train_test_split(X, 
            y,
            test_size=0.2, 
            random_state=1)

xx=pd.concat([X_train,y_train],axis=1)

In [104]:
opt = BayesSearchCV(XGBClassifier(random_state=0,eval_metric='auc',n_estimators=1500),\
                    search_space_xgb1, cv=10, n_iter=50, scoring='roc_auc', random_state=8,n_jobs=1)
x=datetime.datetime.now()
opt.fit(X_train, y_train)
y=datetime.datetime.now()
print(str(y-x))

0:33:40.850372


In [106]:
opt.best_score_

0.8684634358142805

In [107]:
models=[]
for feature in X_train.columns:
    temp=pd.get_dummies(X_train[feature],drop_first=True,prefix=feature)
    X_train_new=X_train.drop(columns=feature)
    X_train_new=pd.concat([X_train_new,temp],axis=1)
    opt.fit(X_train_new, y_train)
    models.append(opt)
    print(feature,X_train[feature].nunique(),opt.best_score_)

age 14 0.8688507640892741
height_cm 12 0.8686262259225158
weight_kg 21 0.8693768889940504
waist_cm 520 0.8693889942968729
systolic 108 0.8676402611941141
relaxation 74 0.8686774746040717
fasting_blood_sugar 222 0.8681136452544278
Cholesterol 225 0.8688280770352105
triglyceride 391 0.8681600796134994
HDL 107 0.8689627251165165
LDL 216 0.8686892092217038
hemoglobin 131 0.867143571718793
Urine_protein 6 0.8688277179309152
serum_creatinine 28 0.8679601840565331
AST 133 0.8685726367844863
ALT 181 0.8681111882916067
Gtp 349 0.8673908627539945
dental_caries 2 0.8691887476755135
imt 157 0.868498494599895
pressure_diff 2222 0.8692023821629202
eyesight_min 17 0.8681895930666161
eyesight_max 18 0.868396685281757
hearing_min 2 0.869089693862005
hearing_max 2 0.8695824441148783
age_sin 14 0.8687672019116995


In [101]:
feature='dental_caries'
temp=pd.get_dummies(X_train[feature],drop_first=True,prefix=feature)
temp

Unnamed: 0,dental_caries_1
153156,1
113337,0
90733,0
67435,0
113634,0
...,...
73349,0
109259,0
50057,0
5192,0


In [102]:
X_train[feature]

153156    1
113337    0
90733     0
67435     0
113634    0
         ..
73349     0
109259    0
50057     0
5192      0
128037    1
Name: dental_caries, Length: 127404, dtype: int64

In [10]:
pred=opt.predict_proba(X_test)[:,1]
print(roc_auc_score(y_test,pred))

0.8655540770531274


In [63]:
scores = cross_val_score(opt, X_train, y_train, cv=opt.cv)

In [64]:
scores.mean()

0.8623635702516639

In [52]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=opt.cv, random_state=8,shuffle=True)



In [53]:
# Тут мы храним ошибки модели
losses_test = []
losses_train = []
clf=opt.best_estimator_

for train_index, test_index in kf.split(X_train):
    X_train0, X_test0 = X_train.values[train_index], X_train.values[test_index]
    y_train0, y_test0 = y_train.values[train_index], y_train.values[test_index]
    # На каждой итерации строим лин.регрессию на train
    clf.fit(X_train0, y_train0)
    
    pred=clf.predict_proba(X_test0)[:,1]
    losses_test.append(roc_auc_score(y_test0,pred))
    
    pred=clf.predict_proba(X_train0)[:,1]
    losses_train.append(roc_auc_score(y_train0,pred))


In [54]:
print(sum(losses_train)/5,sum(losses_test)/5)

0.8688174958221513 0.8621281803316367


In [14]:
cross_val_pred = cross_val_predict(opt, X_train, y_train, cv=opt.cv, method='predict_proba')[:, 1]
cross_val_pred

array([0.4829404 , 0.5300007 , 0.51716655, ..., 0.72636044, 0.49553254,
       0.7445972 ], dtype=float32)

In [15]:
print(roc_auc_score(y_train,cross_val_pred))

0.8623669454826048


In [1790]:
opt = BayesSearchCV(XGBClassifier(random_state=0,eval_metric='auc'),\
                    search_space_xgb1, cv=5, n_iter=5, scoring='roc_auc', random_state=8,n_jobs=1)
#'serum_creatinine','age_sin','Cholesterol','imt',
#base_columns=['height_cm','imt']
base_columns=xx.drop(columns='smoking').columns
quantiles={}
for col in base_columns:
    quantiles[col]=xx[col].quantile(np.arange(0.05,1,step=0.1)).drop_duplicates().to_list()

opt.fit(xx.drop(columns='smoking'), xx.smoking)
best_clf=opt.best_estimator_

base_auc=round(opt.best_score_,5)
print(base_auc)
min_border=base_auc-0.05
max_border=base_auc

cut_max=20
min_leaf=xx.shape[0]*cut_max//100
min_border05=2000
auc_add=0.02
    
#final_trees=[]
final_trees=pd.DataFrame()
work_dfs=[]
#actual_queries={}
actual_tree=pd.DataFrame(columns=['filters_split'])
#big_df=pd.DataFrame(columns=['filters','auc','fromfull_auc'])
#big_df['fromfull_auc']=None

0.86236


In [None]:
actual_queries

In [None]:
for cut_max in [20,]:
    x=datetime.datetime.now()
    min_leaf=xx.shape[0]*cut_max//100

    work_dfs=[]
    actual_tree=pd.DataFrame(columns=['filters_split'])

    while True:

        svod=create_svod(xx,base_columns,quantiles,opt,opt_big)
        temp=leaf_and_split05(svod,min_border,max_border,min_leaf,base_auc,min_border05,auc_add=auc_add)
        try:
            actual_tree=step(temp,actual_tree,cut_max)
        except Exception as e:
            print('!!! '+str(e)+' !!!')
            print(traceback.format_exc())
            break
    y=datetime.datetime.now()
    print(str(y-x))
    #with open(f"final_trees03_withfullauc_{cut_max}.pickle", "wb") as file:
    #    pickle.dump(final_trees, file)    

    to_csv(big_df,'big_df',new_version=True)
    print(final_trees.shape[0])

3 0.83921 [53, 7]
3 0.83906 [53, 6]
3 0.83743 [53, 5]
3 0.83781 [53, 4]
3 0.83722 [53, 3]




4 0.83737 [53, 2, 2]
4 0.83743 [53, 2, 1]




3 0.83675 [53, 1]
2 0.83124 [52]




3 0.84594 [51, 41]




3 0.84542 [51, 40]




3 0.84455 [51, 39]




3 0.84204 [51, 38]




3 0.84693 [51, 37]




4 0.84537 [51, 36, 16]




4 0.84537 [51, 36, 15]




4 0.84413 [51, 36, 14]




4 0.84492 [51, 36, 13]




4 0.84384 [51, 36, 12]




5 0.84445 [51, 36, 11, 4]




5 0.84424 [51, 36, 11, 3]




5 0.84388 [51, 36, 11, 2]




5 0.84482 [51, 36, 11, 1]




4 0.84382 [51, 36, 10]
4 0.84513 [51, 36, 9]




4 0.84385 [51, 36, 8]




5 0.84405 [51, 36, 7, 1]
4 0.84373 [51, 36, 6]




4 0.8429 [51, 36, 5]




5 0.84392 [51, 36, 4, 4]




5 0.84381 [51, 36, 4, 3]
5 0.84389 [51, 36, 4, 2]




5 0.84426 [51, 36, 4, 1]




5 0.84446 [51, 36, 3, 15]




5 0.84485 [51, 36, 3, 14]




5 0.84366 [51, 36, 3, 13]




5 0.8441 [51, 36, 3, 12]




5 0.84327 [51, 36, 3, 11]




5 0.84451 [51, 36, 3, 10]




5 0.84339 [51, 36, 3, 9]




5 0.84482 [51, 36, 3, 8]




5 0.84361 [51, 36, 3, 7]




5 0.84393 [51, 36, 3, 6]




6 0.84334 [51, 36, 3, 5, 5]




6 0.84263 [51, 36, 3, 5, 4]




6 0.84307 [51, 36, 3, 5, 3]




6 0.84264 [51, 36, 3, 5, 2]




6 0.8424 [51, 36, 3, 5, 1]




5 0.84393 [51, 36, 3, 4]
5 0.84317 [51, 36, 3, 3]




6 0.84342 [51, 36, 3, 2, 1]




6 0.84253 [51, 36, 3, 1, 3]




6 0.84293 [51, 36, 3, 1, 2]
6 0.84279 [51, 36, 3, 1, 1]




5 0.84346 [51, 36, 2, 5]




5 0.84226 [51, 36, 2, 4]




5 0.8428 [51, 36, 2, 3]
5 0.84393 [51, 36, 2, 2]
5 0.84256 [51, 36, 2, 1]




5 0.84467 [51, 36, 1, 14]




5 0.84499 [51, 36, 1, 13]




5 0.84366 [51, 36, 1, 12]




5 0.84418 [51, 36, 1, 11]




5 0.84336 [51, 36, 1, 10]




6 0.84339 [51, 36, 1, 9, 3]




6 0.84366 [51, 36, 1, 9, 2]




6 0.84411 [51, 36, 1, 9, 1]




5 0.84319 [51, 36, 1, 8]
5 0.84457 [51, 36, 1, 7]




5 0.8434 [51, 36, 1, 6]




5 0.84412 [51, 36, 1, 5]




5 0.84359 [51, 36, 1, 4]
5 0.84299 [51, 36, 1, 3]




6 0.84334 [51, 36, 1, 2, 2]
6 0.84307 [51, 36, 1, 2, 1]




6 0.84331 [51, 36, 1, 1, 5]




6 0.84239 [51, 36, 1, 1, 4]




6 0.84287 [51, 36, 1, 1, 3]




6 0.84311 [51, 36, 1, 1, 2]
6 0.84255 [51, 36, 1, 1, 1]




4 0.84226 [51, 35, 1]




4 0.84378 [51, 34, 14]




4 0.84211 [51, 34, 13]




4 0.84292 [51, 34, 12]




4 0.84177 [51, 34, 11]
4 0.84176 [51, 34, 10]




5 0.8425 [51, 34, 9, 5]




5 0.84141 [51, 34, 9, 4]




5 0.84189 [51, 34, 9, 3]
5 0.84142 [51, 34, 9, 2]
5 0.84274 [51, 34, 9, 1]
4 0.84398 [51, 34, 8]




5 0.84115 [51, 34, 7, 2]




5 0.84224 [51, 34, 7, 1]




4 0.84193 [51, 34, 6]
4 0.84204 [51, 34, 5]




4 0.84082 [51, 34, 4]




5 0.84189 [51, 34, 3, 6]




5 0.84112 [51, 34, 3, 5]




5 0.8415 [51, 34, 3, 4]
5 0.84076 [51, 34, 3, 3]




5 0.84278 [51, 34, 3, 2]




5 0.84109 [51, 34, 3, 1]




5 0.84251 [51, 34, 2, 4]
5 0.84122 [51, 34, 2, 3]
5 0.84293 [51, 34, 2, 2]
5 0.84222 [51, 34, 2, 1]




4 0.84204 [51, 34, 1]




4 0.84555 [51, 33, 17]




4 0.84399 [51, 33, 16]




4 0.84491 [51, 33, 15]




4 0.84354 [51, 33, 14]




5 0.84327 [51, 33, 13, 6]




5 0.84277 [51, 33, 13, 5]




5 0.84345 [51, 33, 13, 4]




5 0.84316 [51, 33, 13, 3]




5 0.84242 [51, 33, 13, 2]




5 0.84392 [51, 33, 13, 1]




4 0.84367 [51, 33, 12]




5 0.84319 [51, 33, 11, 4]




5 0.84241 [51, 33, 11, 3]




5 0.84253 [51, 33, 11, 2]




5 0.84297 [51, 33, 11, 1]




4 0.84529 [51, 33, 10]




5 0.84337 [51, 33, 9, 8]




6 0.84303 [51, 33, 9, 7, 1]




5 0.84403 [51, 33, 9, 6]




5 0.84306 [51, 33, 9, 5]




5 0.84376 [51, 33, 9, 4]




5 0.8433 [51, 33, 9, 3]




5 0.8429 [51, 33, 9, 2]




6 0.84281 [51, 33, 9, 1, 1]




4 0.84357 [51, 33, 8]




In [None]:
with open(f"actual_queries.pickle", "wb") as file:
        pickle.dump(actual_queries, file)    

In [1787]:
big_df.loc[big_df['filters'].apply(lambda x:x==filters),'auc']

Series([], Name: auc, dtype: float64)

In [1786]:
filters=['height_cm <= 150.0', 'serum_creatinine > 0.7', 'weight_kg > 70.0']

In [1784]:
apply_filters(xx,filters)

Unnamed: 0,age,height_cm,weight_kg,waist_cm,systolic,relaxation,fasting_blood_sugar,Cholesterol,triglyceride,HDL,...,Gtp,dental_caries,imt,pressure_diff,eyesight_min,eyesight_max,hearing_min,hearing_max,age_sin,smoking
113337,30,190,75,91.6,96,58,94,212,171,35,...,18,0,20.7756,1.6552,1.2,1.2,1.0,1.0,-0.9880,1
90733,35,175,60,75.0,126,76,110,152,59,54,...,23,0,19.5918,1.6579,1.5,1.5,1.0,1.0,-0.4282,0
73521,45,175,60,74.0,100,60,88,199,83,62,...,19,0,19.5918,1.6667,0.8,1.0,1.0,1.0,0.8509,1
4206,40,175,60,73.0,114,65,82,203,43,81,...,19,0,19.5918,1.7538,0.6,0.7,1.0,1.0,0.7451,1
126262,40,185,70,81.0,142,100,103,191,120,49,...,91,0,20.4529,1.4200,1.2,1.2,1.0,1.0,0.7451,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67948,55,175,60,75.0,116,63,87,161,46,53,...,52,0,19.5918,1.8413,0.9,1.2,1.0,1.0,-0.9998,1
44694,25,175,60,74.0,112,69,89,150,84,49,...,16,1,19.5918,1.6232,0.7,0.8,1.0,1.0,-0.1324,1
83094,20,175,60,75.0,118,74,129,144,64,49,...,17,0,19.5918,1.5946,0.9,0.9,1.0,1.0,0.9129,1
142346,30,175,60,74.0,103,60,93,173,195,43,...,40,0,19.5918,1.7167,1.0,1.0,1.0,1.0,-0.9880,1


In [1782]:
big_df.loc[big_df['filters'].apply(lambda x:x==filters),'auc'].drop_duplicates().item()

0.5

In [1772]:
big_df.shape

(144315, 3)

In [1763]:
final_trees.iloc[0].data

Unnamed: 0,filters_split,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right,cnt_split,auc_split,filters_leaf,cnt_leaf,auc_leaf
0,[imt > 27.6817],imt,27.6817,0.86814,0.80128,108753.0,18651.0,[imt <= 27.6817],[imt > 27.6817],18651.0,0.80128,[imt <= 27.6817],108753.0,0.86814
1,"[height_cm > 175.0, imt > 27.6817]",height_cm,175.0,0.81017,0.63941,16406.0,2245.0,"[height_cm <= 175.0, imt > 27.6817]","[height_cm > 175.0, imt > 27.6817]",2245.0,0.63941,"[height_cm <= 175.0, imt > 27.6817]",16406.0,0.81017
2,,,,,,,,,,,,"[height_cm > 175.0, imt > 27.6817]",2245.0,0.63941


In [1762]:
[d.auc_leaf.min() for d in final_trees.data]

[0.63941,
 0.68135,
 0.69396,
 0.70131,
 0.68135,
 0.64555,
 0.68159,
 0.70301,
 0.63941,
 0.68135,
 0.69396,
 0.70131,
 0.71534,
 0.6661,
 0.682,
 0.69717,
 0.69396,
 0.69717,
 0.67449,
 0.70169,
 0.63941,
 0.68135,
 0.69396,
 0.70131,
 0.69396,
 0.70169,
 0.6661,
 0.682,
 0.69717,
 0.69396,
 0.69717,
 0.68135,
 0.67332,
 0.70632,
 0.63941,
 0.68135,
 0.69396,
 0.70131,
 0.69396,
 0.70632,
 0.6661,
 0.682,
 0.69717,
 0.69396,
 0.69717,
 0.73033,
 0.69114,
 0.70978,
 0.69396,
 0.70978,
 0.6661,
 0.682,
 0.69717,
 0.69396,
 0.69717,
 0.70978,
 0.70131,
 0.71534,
 0.68135,
 0.69717,
 0.70169,
 0.73033,
 0.70978,
 0.68135,
 0.71532,
 0.70131,
 0.71534,
 0.71737,
 0.69717,
 0.70169,
 0.73033,
 0.73123,
 0.70978,
 0.71532,
 0.70131,
 0.71534,
 0.68159,
 0.69717,
 0.69717,
 0.70169,
 0.73033,
 0.73435,
 0.70978,
 0.71532,
 0.70131,
 0.71534,
 0.71737,
 0.69717,
 0.70169,
 0.73033,
 0.73123,
 0.70978,
 0.71532,
 0.71532,
 0.6661,
 0.682,
 0.69717,
 0.69396,
 0.69717,
 0.69396,
 0.71532,
 0.67

In [1551]:
min([row.data.auc_leaf.min() for i,row in final_trees.iterrows()])

0.682

In [1534]:
final_trees.sort_values('cnt_leaves',ascending=False)

Unnamed: 0,data,metric,cnt_leaves,cut_max,sumcolvalue
2649,filter...,0.810581,11.0,20.0,676.4318
2643,filter...,0.810479,11.0,20.0,817.5567
2642,filter...,0.810469,11.0,20.0,819.0038
2637,filter...,0.810736,11.0,20.0,677.0465
2607,filters...,0.825047,10.0,20.0,659.0038
...,...,...,...,...,...
15,filters_split c...,0.857027,3.0,20.0,201.2346
28,filters_split c...,0.853757,3.0,20.0,200.7117
255,filters_split c...,0.803386,3.0,20.0,190.8642
19,filters_split c...,0.854120,3.0,20.0,196.2346


In [1495]:
to_csv(final_trees.iloc[64677].data)

In [1484]:
with open(f"final_trees_withqueries.pickle", "wb") as file:
        pickle.dump(final_trees, file)    

In [1444]:
with open("final_trees_heightimt_before(139 , 12 min).pickle", "rb") as file:
    before = pickle.load(file)

In [1527]:
final_trees

Unnamed: 0,data,metric,cnt_leaves,cut_max,sumcolvalue
0,filters_split c...,0.859228,3.0,20.0,200.8642
1,filters_split c...,0.858153,3.0,20.0,195.8642
2,filters_split c...,0.856866,3.0,20.0,190.8642
3,filters_split c...,0.856148,3.0,20.0,202.6817
4,filters_split c...,0.856176,3.0,20.0,201.2346
...,...,...,...,...,...
2645,filters...,0.813643,9.0,20.0,480.1972
2646,filters...,0.811790,10.0,20.0,507.8789
2647,filters...,0.812091,10.0,20.0,650.1972
2648,filters...,0.811485,10.0,20.0,506.4318


In [1517]:
xxx=[]
for i,row in final_trees.iterrows():
    x=row.data.query('auc_leaf>=0.8')
    cnt=sum(x.auc_leaf*x.cnt_leaf)/sum(x.cnt_leaf)
    #print(x.cnt_leaf.sum(),cnt)
    xxx.append([x.cnt_leaf.sum(),cnt])

In [1520]:
xxx2=pd.DataFrame(xxx,columns=['cnt','metric'])
to_csv(xxx2.sort_values('metric',ascending=False))

In [1555]:
xxx2.query('cnt>110000 and metric>0.8689')

Unnamed: 0,cnt,metric
102,111530.0,0.868916
103,111530.0,0.868916
104,111530.0,0.868916
105,111530.0,0.868916
106,111530.0,0.868916


In [1424]:
max(good)

5172

In [1385]:
with open(f"final_trees_testafter.pickle", "wb") as file:
        pickle.dump(final_trees, file)    

In [1384]:
with open("final_trees_testbefore.pickle", "rb") as file:
    before = pickle.load(file)

In [1411]:
with open(f"big_df_20231229.pickle", "wb") as file:
        pickle.dump(big_df, file)  

0.5

In [1556]:
with open("final_trees_withqueries.pickle", "rb") as file:
    good = pickle.load(file)

In [1663]:
svod05=[]
for i,row in good.iterrows():
    t=row.data.query('auc_leaf<=0.51')
    if not t.empty:
        svod05.append(t)

In [1667]:
df05=pd.concat(svod05)['filters_leaf'].drop_duplicates()

In [1678]:
for filters in df05:

    right=apply_filters(xx,filters)
    opt.fit(right.drop(columns='smoking'), right.smoking)
    print(opt.best_score_,filters)


0.6309806489057502 ['height_cm > 180.0', 'imt > 22.8928']
0.6788418263646923 ['height_cm > 175.0', 'imt <= 26.2346', 'imt > 25.7117']
0.6824331864796325 ['height_cm <= 165.0', 'height_cm > 160.0', 'imt > 30.8642']
0.6788418263646923 ['height_cm <= 180.0', 'height_cm > 175.0', 'imt <= 26.2346', 'imt > 25.7117']
0.6309806489057502 ['height_cm > 180.0', 'imt > 22.2222']
0.6692225817240706 ['height_cm > 180.0', 'imt > 20.8117']
0.6686021718638392 ['height_cm > 170.0', 'imt <= 20.8117', 'imt > 19.5312']


In [1621]:
good['cnt05']=good.data.apply(lambda x:x.query('auc_leaf<=0.51').shape[0])

In [1627]:
good['row05']=good.data.apply(lambda x:x.query('auc_leaf<=0.51').cnt_leaf.sum())

In [1634]:
good.query('cnt05>=2').sort_values('row05',ascending=False)

Unnamed: 0,data,metric,cnt_leaves,cut_max,sumcolvalue,cnt05,row05
34870,filters...,0.836270,8.0,20.0,469.8116,2,2445.0
36154,filters...,0.795182,10.0,20.0,798.9601,2,2445.0
36130,filters...,0.796729,9.0,20.0,774.9297,2,2445.0
36131,filters...,0.796728,9.0,20.0,632.7639,2,2445.0
36132,filters...,0.796414,9.0,20.0,776.8997,2,2445.0
...,...,...,...,...,...,...,...
45920,filters...,0.811707,9.0,20.0,626.2221,2,2066.0
46053,filters...,0.810714,10.0,20.0,652.0672,2,2066.0
46080,filters...,0.810095,10.0,20.0,650.0972,2,2066.0
46465,filters...,0.824886,9.0,20.0,626.2221,2,2066.0


In [1680]:
x=good.iloc[34870].data#.query('auc_leaf>=0.8')

x

Unnamed: 0,filters_split,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right,cnt_split,auc_split,filters_leaf,cnt_leaf,auc_leaf
0,[imt > 19.5312],imt,19.5312,0.89838,0.85646,8878.0,118526.0,[imt <= 19.5312],[imt > 19.5312],118526.0,0.85646,[imt <= 19.5312],8878.0,0.89838
1,"[height_cm > 170.0, imt > 19.5312]",height_cm,170.0,0.87577,0.71763,91384.0,27142.0,"[height_cm <= 170.0, imt > 19.5312]","[height_cm > 170.0, imt > 19.5312]",27142.0,0.71763,"[height_cm <= 170.0, imt > 19.5312]",91384.0,0.87577
2,"[height_cm > 170.0, imt > 20.8117]",imt,20.8117,0.5,0.71737,1257.0,25885.0,"[height_cm > 170.0, imt <= 20.8117, imt > 19.5...","[height_cm > 170.0, imt > 20.8117]",25885.0,0.71737,"[height_cm > 170.0, imt <= 20.8117, imt > 19.5...",1257.0,0.5
3,"[height_cm <= 180.0, height_cm > 170.0, imt > ...",height_cm,180.0,0.71741,0.5,24697.0,1188.0,"[height_cm <= 180.0, height_cm > 170.0, imt > ...","[height_cm > 180.0, imt > 20.8117]",24697.0,0.71741,"[height_cm > 180.0, imt > 20.8117]",1188.0,0.5
4,"[height_cm <= 180.0, height_cm > 170.0, imt > ...",imt,22.8928,0.71537,0.71101,6309.0,18388.0,"[height_cm <= 180.0, height_cm > 170.0, imt <=...","[height_cm <= 180.0, height_cm > 170.0, imt > ...",18388.0,0.71101,"[height_cm <= 180.0, height_cm > 170.0, imt <=...",6309.0,0.71537
5,"[height_cm <= 180.0, height_cm > 170.0, imt > ...",imt,25.7117,0.70802,0.69786,6474.0,11914.0,"[height_cm <= 180.0, height_cm > 170.0, imt <=...","[height_cm <= 180.0, height_cm > 170.0, imt > ...",11914.0,0.69786,"[height_cm <= 180.0, height_cm > 170.0, imt <=...",6474.0,0.70802
6,"[height_cm <= 180.0, height_cm > 170.0, imt > ...",imt,30.8642,0.702,0.61668,10201.0,1713.0,"[height_cm <= 180.0, height_cm > 170.0, imt <=...","[height_cm <= 180.0, height_cm > 170.0, imt > ...",1713.0,0.61668,"[height_cm <= 180.0, height_cm > 170.0, imt <=...",10201.0,0.702
7,,,,,,,,,,,,"[height_cm <= 180.0, height_cm > 170.0, imt > ...",1713.0,0.61668


In [1691]:
# по конкретному дереву
bad_train=[]
bad_test=[]
for i,row in x.iterrows():
    if row.auc_leaf<0.51:
        right=apply_filters(xx,row.filters_leaf)
        bad_train.extend(list(right.index))
        
        right=apply_filters(xx_test,row.filters_leaf)
        to_csv(right,'hz',new_version=True)
        bad_test.extend(list(right.index))
        
good_xx=xx[~xx.index.isin(bad_train)]
good_xx_test=xx_test[~xx_test.index.isin(bad_test)]

In [1693]:
good_xx.smoking.mean()

0.432013700493762

In [1673]:
# по всем найденным 05
bad_train=[]
bad_test=[]
for filters in df05:

    right=apply_filters(xx,filters)
    bad_train.extend(list(right.index))

    right=apply_filters(xx_test,filters)
    bad_test.extend(list(right.index))
        
good_xx=xx[~xx.index.isin(bad_train)]
good_xx_test=xx_test[~xx_test.index.isin(bad_test)]

In [1764]:
x=final_trees.iloc[0].data
# по конкретному дереву
bad_train=[]
bad_test=[]
for i,row in x.iterrows():
    if row.auc_leaf<0.7:
        right=apply_filters(xx,row.filters_leaf)
        bad_train.extend(list(right.index))
        
        right=apply_filters(xx_test,row.filters_leaf)
        bad_test.extend(list(right.index))
        
good_xx=xx[~xx.index.isin(bad_train)]
good_xx_test=xx_test[~xx_test.index.isin(bad_test)]

In [1675]:
opt.fit(xx.drop(columns='smoking'), xx.smoking)

print('обучение на всём трейне, общий рок аук',opt.best_score_)


pred=opt.predict_proba(xx_test.drop(columns='smoking'))[:,1]
#final=xx_test.drop(columns='smoking').copy()
#final['smoking']=pred
#final=final[['smoking',]]
print('рок аук всего теста, обучено на всём трейне',roc_auc_score(xx_test.smoking,pred))

pred=opt.predict_proba(good_xx_test.drop(columns='smoking'))[:,1]
final=good_xx_test.drop(columns='smoking').copy()
final['smoking']=pred
final=final[['smoking',]]
print('рок аук хорошего теста, обучено на всём трейне',roc_auc_score(good_xx_test.smoking,pred))

обучение на всём трейне, общий рок аук 0.8652864866498593
рок аук всего теста, обучено на всём трейне 0.8672019815411491
рок аук хорошего теста, обучено на всём трейне 0.870075283168404


In [1676]:
opt.fit(good_xx.drop(columns='smoking'), good_xx.smoking)

print('обучение на хорошем трейне, общий рок аук',opt.best_score_)


pred=opt.predict_proba(xx_test.drop(columns='smoking'))[:,1]
final=xx_test.drop(columns='smoking').copy()
final['smoking']=pred
final=final[['smoking',]]
print('рок аук всего теста, обучено на хорошем трейне',roc_auc_score(xx_test.smoking,pred))

pred=opt.predict_proba(good_xx_test.drop(columns='smoking'))[:,1]
final=good_xx_test.drop(columns='smoking').copy()
final['smoking']=pred
final=final[['smoking',]]
print('рок аук хорошего теста, обучено на хорошем трейне',roc_auc_score(good_xx_test.smoking,pred))

обучение на хорошем трейне, общий рок аук 0.8688396334835691
рок аук всего теста, обучено на хорошем трейне 0.8668804271266028
рок аук хорошего теста, обучено на хорошем трейне 0.8698290294309472


In [1698]:
opt = BayesSearchCV(XGBClassifier(random_state=0),\
                    search_space_xgb1, cv=5, n_iter=25, scoring='roc_auc', random_state=8,n_jobs=1)
xx1=xx.iloc[:50000]
xx2=xx.iloc[50000:]
opt.fit(xx1.drop(columns='smoking'), xx1.smoking)
print('общий рок аук',round(opt.best_score_,7))
print(opt.best_estimator_)
opt.fit(xx2.drop(columns='smoking'), xx2.smoking)
print('общий рок аук',round(opt.best_score_,7))
print(opt.best_estimator_)

общий рок аук 0.8608489
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=1.0, colsample_bynode=1.0, colsample_bytree=1.0,
              device=None, early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0.0, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=11, max_depth=8,
              max_leaves=None, min_child_weight=131, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=None,
              n_jobs=None, num_parallel_tree=None, random_state=0, ...)
общий рок аук 0.8630007
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=1.0, colsample_bynode=0.2738578591347763,
              colsample_bytree=0.8540471450203599, device=None,
              early_st

In [1700]:
opt1 = BayesSearchCV(XGBClassifier(random_state=0,eval_metric='auc'),\
                    search_space_xgb1, cv=5, n_iter=5, scoring='roc_auc', random_state=8,n_jobs=1)

opt2 = BayesSearchCV(XGBClassifier(random_state=0),\
                    search_space_xgb1, cv=10, n_iter=30, scoring='roc_auc', random_state=8,n_jobs=1)

opt3 = BayesSearchCV(XGBClassifier(random_state=0),\
                    search_space_xgb, cv=5, n_iter=5, scoring='roc_auc', random_state=8,n_jobs=1)

opt4 = BayesSearchCV(XGBClassifier(random_state=0),\
                    search_space_xgb, cv=10, n_iter=30, scoring='roc_auc', random_state=8,n_jobs=1)


full_models=[]
good_models=[]
for opt in [opt1,opt2,opt3,opt4]:
    print('полный трейн:')
    opt.fit(xx.drop(columns='smoking'), xx.smoking)
    full_models.append(opt)
    print('общий рок аук',round(opt.best_score_,7))

    pred=opt.predict_proba(xx_test.drop(columns='smoking'))[:,1]
    print('рок аук теста',round(roc_auc_score(xx_test.smoking,pred),7))
    
    pred=opt.predict_proba(good_xx_test.drop(columns='smoking'))[:,1]
    print('рок аук хорошего теста',round(roc_auc_score(good_xx_test.smoking,pred),7))
    
    print('хороший трейн:')
    opt.fit(good_xx.drop(columns='smoking'), good_xx.smoking)
    good_models.append(opt)
    print('общий рок аук',round(opt.best_score_,7))

    pred=opt.predict_proba(xx_test.drop(columns='smoking'))[:,1]
    print('рок аук теста',round(roc_auc_score(xx_test.smoking,pred),7))
    
    pred=opt.predict_proba(good_xx_test.drop(columns='smoking'))[:,1]
    print('рок аук хорошего теста',round(roc_auc_score(good_xx_test.smoking,pred),7))
    print()

полный трейн:
общий рок аук 0.8623636
рок аук теста 0.8655541
рок аук хорошего теста 0.8671282
хороший трейн:
общий рок аук 0.8642734
рок аук теста 0.8659408
рок аук хорошего теста 0.867605

полный трейн:
общий рок аук 0.8652865
рок аук теста 0.867202
рок аук хорошего теста 0.8688572
хороший трейн:
общий рок аук 0.8677661
рок аук теста 0.8676239
рок аук хорошего теста 0.8692511

полный трейн:
общий рок аук 0.8602456
рок аук теста 0.864503
рок аук хорошего теста 0.8662397
хороший трейн:
общий рок аук 0.8616875
рок аук теста 0.8628235
рок аук хорошего теста 0.864543

полный трейн:
общий рок аук 0.8693319
рок аук теста 0.8709941
рок аук хорошего теста 0.8725685
хороший трейн:
общий рок аук 0.8695633
рок аук теста 0.8698871
рок аук хорошего теста 0.8715231



In [1714]:
scale_pos_weight=xx.query('smoking==0').shape[0]/xx.query('smoking==1').shape[0]
opt1 = BayesSearchCV(XGBClassifier(random_state=0,scale_pos_weight=scale_pos_weight),\
                    search_space_xgb1, cv=5, n_iter=5, scoring='roc_auc', random_state=8,n_jobs=1)

opt2 = BayesSearchCV(XGBClassifier(random_state=0,scale_pos_weight=scale_pos_weight),\
                    search_space_xgb1, cv=10, n_iter=30, scoring='roc_auc', random_state=8,n_jobs=1)

opt3 = BayesSearchCV(XGBClassifier(random_state=0,scale_pos_weight=scale_pos_weight),\
                    search_space_xgb, cv=5, n_iter=5, scoring='roc_auc', random_state=8,n_jobs=1)

opt4 = BayesSearchCV(XGBClassifier(random_state=0,scale_pos_weight=scale_pos_weight),\
                    search_space_xgb, cv=10, n_iter=30, scoring='roc_auc', random_state=8,n_jobs=1)


full_models1=[]
good_models1=[]
for opt in [opt1,opt2,opt3,opt4]:
    print('полный трейн:')
    opt.fit(xx.drop(columns='smoking'), xx.smoking)
    full_models1.append(opt)
    print('общий рок аук',round(opt.best_score_,7))

    pred=opt.predict_proba(xx_test.drop(columns='smoking'))[:,1]
    print('рок аук теста',round(roc_auc_score(xx_test.smoking,pred),7))
    
    pred=opt.predict_proba(good_xx_test.drop(columns='smoking'))[:,1]
    print('рок аук хорошего теста',round(roc_auc_score(good_xx_test.smoking,pred),7))
    
    print('хороший трейн:')
    opt.fit(good_xx.drop(columns='smoking'), good_xx.smoking)
    good_models1.append(opt)
    print('общий рок аук',round(opt.best_score_,7))

    pred=opt.predict_proba(xx_test.drop(columns='smoking'))[:,1]
    print('рок аук теста',round(roc_auc_score(xx_test.smoking,pred),7))
    
    pred=opt.predict_proba(good_xx_test.drop(columns='smoking'))[:,1]
    print('рок аук хорошего теста',round(roc_auc_score(good_xx_test.smoking,pred),7))
    print()

полный трейн:
общий рок аук 0.8629525
рок аук теста 0.8659088
рок аук хорошего теста 0.8676115
хороший трейн:
общий рок аук 0.8647741
рок аук теста 0.8664054
рок аук хорошего теста 0.8681845

полный трейн:
общий рок аук 0.8657205
рок аук теста 0.8685395
рок аук хорошего теста 0.8701226
хороший трейн:
общий рок аук 0.8677713
рок аук теста 0.8682315
рок аук хорошего теста 0.8699892

полный трейн:
общий рок аук 0.8608316
рок аук теста 0.864486
рок аук хорошего теста 0.8663537
хороший трейн:
общий рок аук 0.8629286
рок аук теста 0.8614467
рок аук хорошего теста 0.8631065

полный трейн:


KeyboardInterrupt: 

In [1766]:
# прибавка 0.005, отсечка 0.7
scale_pos_weight=xx.query('smoking==0').shape[0]/xx.query('smoking==1').shape[0]
opt1 = BayesSearchCV(XGBClassifier(random_state=0,scale_pos_weight=scale_pos_weight),\
                    search_space_xgb1, cv=5, n_iter=5, scoring='roc_auc', random_state=8,n_jobs=1)

opt2 = BayesSearchCV(XGBClassifier(random_state=0,scale_pos_weight=scale_pos_weight),\
                    search_space_xgb1, cv=10, n_iter=30, scoring='roc_auc', random_state=8,n_jobs=1)

opt3 = BayesSearchCV(XGBClassifier(random_state=0,scale_pos_weight=scale_pos_weight),\
                    search_space_xgb, cv=5, n_iter=5, scoring='roc_auc', random_state=8,n_jobs=1)

opt4 = BayesSearchCV(XGBClassifier(random_state=0,scale_pos_weight=scale_pos_weight),\
                    search_space_xgb, cv=10, n_iter=30, scoring='roc_auc', random_state=8,n_jobs=1)


full_models1=[]
good_models1=[]
for opt in [opt1,opt2,opt3,opt4]:
    print('полный трейн:')
    opt.fit(xx.drop(columns='smoking'), xx.smoking)
    full_models1.append(opt)
    print('общий рок аук',round(opt.best_score_,7))

    pred=opt.predict_proba(xx_test.drop(columns='smoking'))[:,1]
    print('рок аук теста',round(roc_auc_score(xx_test.smoking,pred),7))
    
    pred=opt.predict_proba(good_xx_test.drop(columns='smoking'))[:,1]
    print('рок аук хорошего теста',round(roc_auc_score(good_xx_test.smoking,pred),7))
    
    print('хороший трейн:')
    opt.fit(good_xx.drop(columns='smoking'), good_xx.smoking)
    good_models1.append(opt)
    print('общий рок аук',round(opt.best_score_,7))

    pred=opt.predict_proba(xx_test.drop(columns='smoking'))[:,1]
    print('рок аук теста',round(roc_auc_score(xx_test.smoking,pred),7))
    
    pred=opt.predict_proba(good_xx_test.drop(columns='smoking'))[:,1]
    print('рок аук хорошего теста',round(roc_auc_score(good_xx_test.smoking,pred),7))
    print()

полный трейн:
общий рок аук 0.8629525
рок аук теста 0.8659088
рок аук хорошего теста 0.8674878
хороший трейн:
общий рок аук 0.8641359
рок аук теста 0.8662907
рок аук хорошего теста 0.8680006

полный трейн:
общий рок аук 0.8657205
рок аук теста 0.8685395
рок аук хорошего теста 0.8701657
хороший трейн:
общий рок аук 0.8681745
рок аук теста 0.8693701
рок аук хорошего теста 0.8710332

полный трейн:
общий рок аук 0.8608316
рок аук теста 0.864486
рок аук хорошего теста 0.8661669
хороший трейн:
общий рок аук 0.8621284
рок аук теста 0.8639362
рок аук хорошего теста 0.8655489

полный трейн:
общий рок аук 0.8696127
рок аук теста 0.8717823
рок аук хорошего теста 0.873387
хороший трейн:
общий рок аук 0.87073
рок аук теста 0.8711003
рок аук хорошего теста 0.8727299



In [1710]:
good_models[-1].best_estimator_.get_xgb_params()

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'colsample_bylevel': 0.5121939673724468,
 'colsample_bynode': 0.2,
 'colsample_bytree': 1.0,
 'device': None,
 'eval_metric': None,
 'gamma': 0.0,
 'grow_policy': None,
 'interaction_constraints': None,
 'learning_rate': 0.018898917010970705,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': 30,
 'max_depth': 8,
 'max_leaves': None,
 'min_child_weight': 300,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': 0,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': 0.5,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [1687]:

for n_iter in range(1,30):
    print('!!!',n_iter,'!!!')
    opt = BayesSearchCV(XGBClassifier(random_state=0),\
                    search_space_xgb, cv=10, n_iter=n_iter, scoring='roc_auc', random_state=8,n_jobs=1)
    print('полный трейн:')
    opt.fit(xx.drop(columns='smoking'), xx.smoking)
    print('общий рок аук',round(opt.best_score_,7))

    pred=opt.predict_proba(xx_test.drop(columns='smoking'))[:,1]
    print('рок аук теста',round(roc_auc_score(xx_test.smoking,pred),7))
    print(opt.best_estimator_)
    
    print('хороший трейн:')
    opt.fit(good_xx.drop(columns='smoking'), good_xx.smoking)
    print('общий рок аук',round(opt.best_score_,7))

    pred=opt.predict_proba(xx_test.drop(columns='smoking'))[:,1]
    print('рок аук теста',round(roc_auc_score(xx_test.smoking,pred),7))
    print(opt.best_estimator_)
    print()

!!! 1 !!!
полный трейн:
общий рок аук 0.8137558
рок аук теста 0.8186519
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=0.2529729028453782,
              colsample_bynode=0.5619713174317407,
              colsample_bytree=0.25792840290170194, device=None,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=22.309734412984962,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.0011002938624638172,
              max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=17, max_depth=5, max_leaves=None,
              min_child_weight=60, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=638, n_jobs=None,
              num_parallel_tree=None, random_state=0, ...)
хороший трейн:
общий рок аук 0.8156899
рок аук теста 0.8177629
XGBClassifier(ba

общий рок аук 0.8629499
рок аук теста 0.8628235
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=0.9080614696788323,
              colsample_bynode=0.8553530889983751,
              colsample_bytree=0.5298058869783279, device=None,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=4.979440623419699,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.7929828265552742,
              max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=27, max_depth=6, max_leaves=None,
              min_child_weight=127, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=4993, n_jobs=None,
              num_parallel_tree=None, random_state=0, ...)

!!! 6 !!!
полный трейн:
общий рок аук 0.8609059
рок аук теста 0.864503
XGBClassifier(base_score=None, boo

общий рок аук 0.8639395
рок аук теста 0.8666773
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=0.5456071136054008,
              colsample_bynode=0.30157480628321565,
              colsample_bytree=0.8230640641908062, device=None,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0.3852264188612543,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.006917774244564682,
              max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=8, max_depth=8, max_leaves=None,
              min_child_weight=271, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=2607, n_jobs=None,
              num_parallel_tree=None, random_state=0, ...)
хороший трейн:
общий рок аук 0.8657176
рок аук теста 0.8665607
XGBClassifier(base_score=None, booster=N

общий рок аук 0.8693319
рок аук теста 0.8709941
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=1.0, colsample_bynode=0.2, colsample_bytree=1.0,
              device=None, early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0.0, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.013031465885884932, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=22,
              max_depth=8, max_leaves=None, min_child_weight=209, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=5000,
              n_jobs=None, num_parallel_tree=None, random_state=0, ...)
хороший трейн:


KeyboardInterrupt: 

In [1049]:
temp=pd.DataFrame(svod)
temp.columns=['col','col_value','auc_left','auc_right','cnt_left','cnt_right','filters_left','filters_right','fromfull_auc_left','fromfull_auc_right']
temp

Unnamed: 0,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right,fromfull_auc_left,fromfull_auc_right
0,serum_creatinine,0.6,0.8575,0.84543,11221,116183,[serum_creatinine <= 0.6],[serum_creatinine > 0.6],0.849804,0.845434
1,serum_creatinine,0.8,0.92037,0.78472,51033,76371,[serum_creatinine <= 0.8],[serum_creatinine > 0.8],0.920368,0.78472
2,serum_creatinine,1.0,0.87916,0.73805,104612,22792,[serum_creatinine <= 1.0],[serum_creatinine > 1.0],0.879163,0.738046
3,age_sin,-0.9998,0.87608,0.85885,10788,116616,[age_sin <= -0.9998],[age_sin > -0.9998],0.876077,0.858847
4,age_sin,-0.3048,0.83714,0.87192,47657,79747,[age_sin <= -0.3048],[age_sin > -0.3048],0.837136,0.871918
5,age_sin,0.7451,0.85903,0.86606,104940,22464,[age_sin <= 0.7451],[age_sin > 0.7451],0.859032,0.866057
6,age_sin,0.8509,0.86482,0.65764,124357,3047,[age_sin <= 0.8509],[age_sin > 0.8509],0.864817,0.64868
7,Cholesterol,150.0,0.76923,0.86539,6722,120682,[Cholesterol <= 150.0],[Cholesterol > 150.0],0.769229,0.865393
8,Cholesterol,184.0,0.83582,0.87192,45111,82293,[Cholesterol <= 184.0],[Cholesterol > 184.0],0.835818,0.871925
9,Cholesterol,208.0,0.844,0.88734,83267,44137,[Cholesterol <= 208.0],[Cholesterol > 208.0],0.844003,0.887339


In [1050]:
to_csv(big_df,'big_df',new_version=True)

In [1009]:
qwe=final_trees.query('cut_max==20').merge(final_trees.query('cut_max==19'),on='metric',how='outer')
qwe=qwe[qwe.data_x.isna()]
qwe.iloc[0].data_y

Unnamed: 0,filters_split,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right,cnt_split,auc_split,filters_leaf,cnt_leaf,auc_leaf
0,[relaxation > 73.0],relaxation,73.0,0.87199,0.84954,46149.0,81255.0,[relaxation <= 73.0],[relaxation > 73.0],81255.0,0.84954,[relaxation <= 73.0],46149.0,0.87199
1,"[imt > 22.8928, relaxation > 73.0]",imt,22.8928,0.88059,0.82825,24528.0,56727.0,"[imt <= 22.8928, relaxation > 73.0]","[imt > 22.8928, relaxation > 73.0]",56727.0,0.82825,"[imt <= 22.8928, relaxation > 73.0]",24528.0,0.88059
2,"[imt > 22.8928, relaxation > 73.0]",imt,22.8928,0.88059,0.82825,24528.0,56727.0,"[imt <= 22.8928, relaxation > 73.0]","[imt > 22.8928, relaxation > 73.0]",56727.0,0.82825,"[imt > 22.8928, relaxation > 73.0]",56727.0,0.82825


In [995]:
final_trees.query('metric==0.85417')

Unnamed: 0,data,metric,cnt_leaves,cut_max,sumcolvalue


In [961]:
df=xx.copy()
filters=['age <= 25.0']
#filters=['ALT <= 55.0', 'AST <= 41.0', 'Cholesterol <= 184.0', 'Gtp <= 88.0', 'age <= 65.0', 'fasting_blood_sugar <= 100.0', 'height_cm > 150.0', 'hemoglobin <= 16.8', 'weight_kg <= 90.0']
x=datetime.datetime.now()
for f in filters:
    df=df.query(f)
y=datetime.datetime.now()
print(str(y-x))

#filters=pd.DataFrame([x.split(' ') for x in filters],columns=['col','sign','col_value'])
#filters.col_value=filters.col_value.astype(float)

df=xx.copy().head(1000)
x=datetime.datetime.now()
for f in filters:
    df=df.query(f)
y=datetime.datetime.now()
print(str(y-x))

0:00:00.009997
0:00:00.004997


In [920]:
filters=pd.DataFrame([x.split(' ') for x in filters],columns=['col','sign','col_value'])
filters.col_value=filters.col_value.astype(float)
filters

Unnamed: 0,col,sign,col_value
0,age,<=,25.0


In [982]:
def to_csv(df,name='',new_version=False):
    # если не задаём имя файла, то берётся название сохраняемого датафрейма
    if name=='':
        name='temp_'+datetime.datetime.now(tz).strftime("%Y-%m-%d %H-%M-%S")
    if new_version:
        if os.path.isfile(name+'.csv'):
            name=name+'_'+datetime.datetime.now(tz).strftime("%Y-%m-%d %H-%M-%S")
    df.to_csv(name+'.csv',sep=';',encoding='utf-8-sig', float_format="%.8f",decimal=',',index=False)
to_csv(big_df,'big_df',new_version=True)

In [840]:

 
with open("data.pickle", "wb") as file:
    pickle.dump(final_trees, file)
 
with open("data.pickle", "rb") as file:
    deserialized_data = pickle.load(file)



In [841]:
deserialized_data

Unnamed: 0,data,metric,cnt_leaves,cut_max
0,filters_split col col_val...,0.857417,2.0,20.0
1,filters_split col col_value ...,0.859384,2.0,20.0
2,filters_split col col_value auc_left ...,0.855760,2.0,20.0
3,filters_split ...,0.840001,3.0,20.0
4,filters_...,0.841365,3.0,20.0
...,...,...,...,...
58,filters_split col col_value ...,0.850219,3.0,20.0
59,filters_split co...,0.849538,3.0,20.0
60,filters...,0.835201,4.0,20.0
61,filters...,0.836483,4.0,20.0


In [861]:
final_trees.sort_values('metric',ascending=False).iloc[18].data

Unnamed: 0,filters_split,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right,cnt_split,auc_split,filters_leaf,cnt_leaf,auc_leaf
0,[Cholesterol <= 208.0],Cholesterol,208.0,0.844,0.88734,83267.0,44137.0,[Cholesterol <= 208.0],[Cholesterol > 208.0],83267.0,0.844,[Cholesterol > 208.0],44137.0,0.88734
1,"[Cholesterol <= 208.0, imt > 22.8928]",imt,22.8928,0.87258,0.81398,32887.0,50380.0,"[Cholesterol <= 208.0, imt <= 22.8928]","[Cholesterol <= 208.0, imt > 22.8928]",50380.0,0.81398,"[Cholesterol <= 208.0, imt <= 22.8928]",32887.0,0.87258
2,"[Cholesterol <= 208.0, imt > 22.8928]",imt,22.8928,0.87258,0.81398,32887.0,50380.0,"[Cholesterol <= 208.0, imt <= 22.8928]","[Cholesterol <= 208.0, imt > 22.8928]",50380.0,0.81398,"[Cholesterol <= 208.0, imt > 22.8928]",50380.0,0.81398


In [804]:
to_csv(big_df)

In [868]:
final_trees.sort_values(['cnt_leaves','metric'],ascending=[False,False]).iloc[0].data

Unnamed: 0,filters_split,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right,cnt_split,auc_split,filters_leaf,cnt_leaf,auc_leaf
0,[pressure_diff <= 1.6301],pressure_diff,1.6301,0.85506,0.86315,82922.0,44482.0,[pressure_diff <= 1.6301],[pressure_diff > 1.6301],82922.0,0.85506,[pressure_diff > 1.6301],44482.0,0.86315
1,"[Cholesterol <= 208.0, pressure_diff <= 1.6301]",Cholesterol,208.0,0.83575,0.87816,53363.0,29559.0,"[Cholesterol <= 208.0, pressure_diff <= 1.6301]","[Cholesterol > 208.0, pressure_diff <= 1.6301]",53363.0,0.83575,"[Cholesterol > 208.0, pressure_diff <= 1.6301]",29559.0,0.87816
2,"[Cholesterol <= 208.0, height_cm > 150.0, pres...",height_cm,150.0,0.5,0.8196,3144.0,50219.0,"[Cholesterol <= 208.0, height_cm <= 150.0, pre...","[Cholesterol <= 208.0, height_cm > 150.0, pres...",50219.0,0.8196,"[Cholesterol <= 208.0, height_cm <= 150.0, pre...",3144.0,0.5
3,"[Cholesterol <= 208.0, height_cm > 150.0, pres...",height_cm,150.0,0.5,0.8196,3144.0,50219.0,"[Cholesterol <= 208.0, height_cm <= 150.0, pre...","[Cholesterol <= 208.0, height_cm > 150.0, pres...",50219.0,0.8196,"[Cholesterol <= 208.0, height_cm > 150.0, pres...",50219.0,0.8196


In [761]:

opt.fit(xx.drop(columns='smoking'), xx.smoking)
best_clf=opt.best_estimator_
pred_train=best_clf.predict_proba(X_train)[:,1]
auc_train=roc_auc_score(y_train,pred_train)

pred_test=best_clf.predict_proba(X_test)[:,1]
auc_test=roc_auc_score(y_test,pred_test)

print(opt.best_score_,auc_train,auc_test)

test=pd.concat([X_test,y_test],axis=1)

leaves=final_trees.sort_values('metric',ascending=False).iloc[3]
preds=pd.DataFrame()
for i,row in leaves.data.iterrows():
    temp_train=apply_filters(xx,row.filters_leaf)
    opt.fit(temp_train.drop(columns='smoking'), temp_train.smoking)
    best_clf=opt.best_estimator_
    pred_train=best_clf.predict_proba(temp_train.drop(columns='smoking'))[:,1]
    auc_train=roc_auc_score(temp_train.smoking,pred_train)
    
    
    temp_test=apply_filters(test,row.filters_leaf)
    opt.fit(temp_test.drop(columns='smoking'), temp_test.smoking)
    best_clf=opt.best_estimator_
    pred_test=best_clf.predict_proba(temp_test.drop(columns='smoking'))[:,1]
    temp_test.loc[:,'pred']=pred_test
    preds=pd.concat([preds,temp_test.pred])
    auc_test=roc_auc_score(temp_test.smoking,pred_test)
    
    print(opt.best_score_,auc_train,auc_test,temp_train.shape[0],temp_test.shape[0])
    
print('final roc-auc test',roc_auc_score(y_test.sort_index(),preds.sort_index().loc[:,0]))

0.8522121736229529 0.858205859246622 0.8587349362750627


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_test['pred']=pred_test


0.8608855794244481 0.8730368477096438 0.8700194919059487 96411 24207
0.7597228045505692 0.7982351122538445 0.777051813214047 30993 7645
final roc-auc test 0.8510957351170675


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_test['pred']=pred_test


In [741]:
roc_auc_score(y_test.sort_index(),preds.sort_index().loc[:,0])

0.8523791618924703

In [740]:
y_test.sort_index()

1         0
10        0
13        1
19        1
20        1
         ..
159231    1
159238    0
159241    0
159250    0
159254    1
Name: smoking, Length: 31852, dtype: int64

In [730]:
y_test.sort_index()

1         0
10        0
13        1
19        1
20        1
         ..
159231    1
159238    0
159241    0
159250    0
159254    1
Name: smoking, Length: 31852, dtype: int64

In [772]:
to_csv(big_df)

In [705]:
test=pd.concat([X_test,y_test],axis=1)
test

Unnamed: 0,age,height(cm),weight(kg),waist(cm),systolic,relaxation,fasting_blood_sugar,Cholesterol,triglyceride,HDL,...,Gtp,dental_caries,imt,pressure_diff,eyesight_min,eyesight_max,hearing_min,hearing_max,age_sin,smoking
118031,45,155,45,65.0,170,110,112,228,105,100,...,19,0,18.7305,1.5455,1.0,1.2,1.0,1.0,0.8509,0
20900,30,170,70,74.0,116,72,89,194,146,56,...,97,1,24.2215,1.6111,0.9,1.0,1.0,1.0,-0.9880,1
50382,40,175,90,100.0,140,100,92,179,125,46,...,21,0,29.3878,1.4000,0.9,1.2,1.0,1.0,0.7451,1
75642,50,165,70,84.0,131,84,94,210,137,35,...,34,0,25.7117,1.5595,1.0,1.2,1.0,1.0,-0.2624,1
151809,25,170,70,76.0,118,68,106,211,79,98,...,24,0,24.2215,1.7353,1.2,1.5,1.0,1.0,-0.1324,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82274,35,175,75,89.9,118,80,100,202,196,36,...,19,0,24.4898,1.4750,0.8,1.0,1.0,1.0,-0.4282,1
110751,55,165,65,79.0,135,83,159,191,141,38,...,154,0,23.8751,1.6265,1.2,1.5,1.0,1.0,-0.9998,1
79072,65,155,50,77.0,110,70,102,184,147,58,...,18,0,20.8117,1.5714,0.4,0.8,1.0,1.0,0.8268,0
105430,40,150,45,64.0,99,67,83,200,75,73,...,12,1,20.0000,1.4776,0.5,0.9,1.0,1.0,0.7451,0


In [600]:
xxx.query('sign=="<="').sort_values('col_value',ascending=True).drop_duplicates(keep='first',subset=['col','sign'])

Unnamed: 0,col,sign,col_value
1,age_sin,<=,-0.9998
0,Cholesterol,<=,188.0


In [591]:
xxx=['Cholesterol <= 188.0', 'age_sin > -0.9998', 'age_sin > 0.7451', 'age_sin > 0.7451']
xxx=pd.DataFrame([x.split(' ') for x in xxx],columns=['col','sign','col_value'])
xxx

Unnamed: 0,col,sign,col_value
0,Cholesterol,<=,188.0
1,age_sin,>,-0.9998
2,age_sin,>,0.7451
3,age_sin,>,0.7451


In [601]:
new_xxx=pd.concat([xxx.query('sign==">"').sort_values('col_value',ascending=False).drop_duplicates(keep='first',subset=['col','sign']),\
           xxx.query('sign=="<="').sort_values('col_value',ascending=True).drop_duplicates(keep='first',subset=['col','sign'])])
new_xxx

Unnamed: 0,col,sign,col_value
1,age_sin,<=,-0.9998
0,Cholesterol,<=,188.0


In [602]:
svod=[]
for i,row in new_xxx.iterrows():
    s=f'{row.col} {row.sign} {row.col_value}'
    svod.append(s)
svod.sort()
svod

['Cholesterol <= 188.0', 'age_sin <= -0.9998']

In [574]:
xxx=final_trees[4]
sum(xxx.auc_leaf*xxx.cnt_leaf)/sum(xxx.cnt_leaf)

0.84749699835485

In [505]:
svod=create_svod(train,base_columns,quantiles,opt,opt_big)
temp=leaf_and_split(svod,min_border,max_border,min_leaf,base_auc)
actual_tree=step(temp,actual_tree)
print(len(final_trees))

serum_creatinine 0.6
serum_creatinine 0.9
serum_creatinine 1.0
age_sin -0.9998
age_sin -0.2624
age_sin 0.7451
Cholesterol 150.0
Cholesterol 188.0
Cholesterol 217.0
конец вечеринки
6


Unnamed: 0,filters_split,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right,cnt_split,auc_split,filters_leaf,cnt_leaf,auc_leaf
5,[age_sin<=0.7451],age_sin,0.7451,0.85042,0.85566,131079.0,28177.0,[age_sin<=0.7451],[age_sin>0.7451],131079.0,0.85042,[age_sin>0.7451],28177.0,0.85566
6,"[Cholesterol<=188.0, age_sin<=0.7451]",Cholesterol,188.0,0.82437,0.85988,52187.0,78892.0,"[Cholesterol<=188.0, age_sin<=0.7451]","[Cholesterol>188.0, age_sin<=0.7451]",52187.0,0.82437,"[Cholesterol<=188.0, age_sin<=0.7451]",52187.0,0.82437
6,"[Cholesterol<=188.0, age_sin<=0.7451]",Cholesterol,188.0,0.82437,0.85988,52187.0,78892.0,"[Cholesterol<=188.0, age_sin<=0.7451]","[Cholesterol>188.0, age_sin<=0.7451]",52187.0,0.82437,"[Cholesterol<=188.0, age_sin<=0.7451]",52187.0,0.82437


In [424]:
#big_df=pd.DataFrame(columns=['filters','auc'])
svod=[]
for col in base_columns:
    for col_value in quantiles[col]:
        print(col,col_value)
        filters_left=[f'{col}<={col_value}',]
        left=apply_filters(train,filters_left)
        
        filters_right=[f'{col}>{col_value}',]
        right=apply_filters(train,filters_right)
        
        if left.empty or right.empty:
            continue
        
        try:
            auc_left=big_df.loc[big_df['filters'].apply(lambda x:x==filters_left),'auc'].drop_duplicates().item()
        except:
            opt.fit(left.drop(columns='smoking'), left.smoking)
            auc_left=round(opt.best_score_,5)

            temp = {'filters': filters_left, 'auc': auc_left}
            big_df = big_df.append(temp, ignore_index = True)

        try:
            auc_right=big_df.loc[big_df['filters'].apply(lambda x:x==filters_right),'auc'].drop_duplicates().item()
        except:
            opt.fit(right.drop(columns='smoking'), right.smoking)
            auc_right=round(opt.best_score_,5)

            temp = {'filters': filters_right, 'auc': auc_right}
            big_df = big_df.append(temp, ignore_index = True)

        

        svod.append([col,col_value,auc_left,auc_right,left.shape[0],right.shape[0],filters_left,filters_right])
        #except Exception as e:
        #    print('!!! '+str(e)+' !!!')

serum_creatinine 0.6
serum_creatinine 0.9
serum_creatinine 1.0
age_sin -0.9998
age_sin -0.2624
age_sin 0.7451
Cholesterol 150.0
Cholesterol 188.0
Cholesterol 217.0


In [425]:
temp=pd.DataFrame(svod)
temp.columns=['col','col_value','auc_left','auc_right','cnt_left','cnt_right','filters_left','filters_right']
temp

Unnamed: 0,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right
0,serum_creatinine,0.6,0.85043,0.83686,14113,145143,[serum_creatinine<=0.6],[serum_creatinine>0.6]
1,serum_creatinine,0.9,0.8965,0.74037,99473,59783,[serum_creatinine<=0.9],[serum_creatinine>0.9]
2,serum_creatinine,1.0,0.87125,0.7265,130786,28470,[serum_creatinine<=1.0],[serum_creatinine>1.0]
3,age_sin,-0.9998,0.86366,0.85064,13448,145808,[age_sin<=-0.9998],[age_sin>-0.9998]
4,age_sin,-0.2624,0.84147,0.85966,75210,84046,[age_sin<=-0.2624],[age_sin>-0.2624]
5,age_sin,0.7451,0.85042,0.85566,131079,28177,[age_sin<=0.7451],[age_sin>0.7451]
6,Cholesterol,150.0,0.75815,0.85561,8399,150857,[Cholesterol<=150.0],[Cholesterol>150.0]
7,Cholesterol,188.0,0.82577,0.86595,64353,94903,[Cholesterol<=188.0],[Cholesterol>188.0]
8,Cholesterol,217.0,0.83804,0.88947,120489,38767,[Cholesterol<=217.0],[Cholesterol>217.0]


In [426]:
temp=leaf_and_split(svod,min_border,max_border,min_leaf,base_auc)
temp

Unnamed: 0,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right,filters_split,filters_leaf,cnt_split,cnt_leaf,auc_split,auc_leaf
7,Cholesterol,188.0,0.82577,0.86595,64353,94903,[Cholesterol<=188.0],[Cholesterol>188.0],[Cholesterol<=188.0],[Cholesterol>188.0],64353,94903,0.82577,0.86595
4,age_sin,-0.2624,0.84147,0.85966,75210,84046,[age_sin<=-0.2624],[age_sin>-0.2624],[age_sin<=-0.2624],[age_sin>-0.2624],75210,84046,0.84147,0.85966
8,Cholesterol,217.0,0.83804,0.88947,120489,38767,[Cholesterol<=217.0],[Cholesterol>217.0],[Cholesterol<=217.0],[Cholesterol>217.0],120489,38767,0.83804,0.88947
5,age_sin,0.7451,0.85042,0.85566,131079,28177,[age_sin<=0.7451],[age_sin>0.7451],[age_sin<=0.7451],[age_sin>0.7451],131079,28177,0.85042,0.85566


In [427]:
if temp.empty:
    final_trees.append(actual_tree)
    
    
    while True:
        if work_dfs==[]:
            break
        actual_tree=actual_tree[:-1]
        work_dfs[-1]=work_dfs[-1][1:]
        if not work_dfs[-1].empty:
            break
        else:
            work_dfs=work_dfs[:-1]
    if work_dfs==[]:
        print('конец вечеринки')
    else:
        actual_tree=pd.concat([actual_tree,work_dfs[-1].head(1)])
else:
    work_dfs.append(temp)
    actual_tree=pd.concat([actual_tree,temp.head(1)])
temp

Unnamed: 0,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right,filters_split,filters_leaf,cnt_split,cnt_leaf,auc_split,auc_leaf
7,Cholesterol,188.0,0.82577,0.86595,64353,94903,[Cholesterol<=188.0],[Cholesterol>188.0],[Cholesterol<=188.0],[Cholesterol>188.0],64353,94903,0.82577,0.86595
4,age_sin,-0.2624,0.84147,0.85966,75210,84046,[age_sin<=-0.2624],[age_sin>-0.2624],[age_sin<=-0.2624],[age_sin>-0.2624],75210,84046,0.84147,0.85966
8,Cholesterol,217.0,0.83804,0.88947,120489,38767,[Cholesterol<=217.0],[Cholesterol>217.0],[Cholesterol<=217.0],[Cholesterol>217.0],120489,38767,0.83804,0.88947
5,age_sin,0.7451,0.85042,0.85566,131079,28177,[age_sin<=0.7451],[age_sin>0.7451],[age_sin<=0.7451],[age_sin>0.7451],131079,28177,0.85042,0.85566


In [428]:
svod=[]
for col in base_columns:
    for col_value in quantiles[col]:
        print(col,col_value)
        filters_left=sorted(actual_tree.filters_split.sum()+[f'{col}<={col_value}',])
        left=apply_filters(train,filters_left)
        
        filters_right=sorted(actual_tree.filters_split.sum()+[f'{col}>{col_value}',])
        right=apply_filters(train,filters_right)
        
        if left.empty or right.empty:
            continue
        
        try:
            auc_left=big_df.loc[big_df['filters'].apply(lambda x:x==filters_left),'auc'].drop_duplicates().item()
        except:
            opt.fit(left.drop(columns='smoking'), left.smoking)
            auc_left=round(opt.best_score_,5)

            temp = {'filters': filters_left, 'auc': auc_left}
            big_df = big_df.append(temp, ignore_index = True)

        try:
            auc_right=big_df.loc[big_df['filters'].apply(lambda x:x==filters_right),'auc'].drop_duplicates().item()
        except:
            opt.fit(right.drop(columns='smoking'), right.smoking)
            auc_right=round(opt.best_score_,5)

            temp = {'filters': filters_right, 'auc': auc_right}
            big_df = big_df.append(temp, ignore_index = True)

        

        svod.append([col,col_value,auc_left,auc_right,left.shape[0],right.shape[0],filters_left,filters_right])
        #except Exception as e:
        #    print('!!! '+str(e)+' !!!')

serum_creatinine 0.6
serum_creatinine 0.9
serum_creatinine 1.0
age_sin -0.9998
age_sin -0.2624
age_sin 0.7451
Cholesterol 150.0
Cholesterol 188.0
Cholesterol 217.0


In [429]:
temp=pd.DataFrame(svod)
temp.columns=['col','col_value','auc_left','auc_right','cnt_left','cnt_right','filters_left','filters_right']
temp

Unnamed: 0,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right
0,serum_creatinine,0.6,0.77441,0.80907,5554,58799,"[Cholesterol<=188.0, serum_creatinine<=0.6]","[Cholesterol<=188.0, serum_creatinine>0.6]"
1,serum_creatinine,0.9,0.86634,0.73072,40909,23444,"[Cholesterol<=188.0, serum_creatinine<=0.9]","[Cholesterol<=188.0, serum_creatinine>0.9]"
2,serum_creatinine,1.0,0.84207,0.70971,53754,10599,"[Cholesterol<=188.0, serum_creatinine<=1.0]","[Cholesterol<=188.0, serum_creatinine>1.0]"
3,age_sin,-0.9998,0.78416,0.82725,4444,59909,"[Cholesterol<=188.0, age_sin<=-0.9998]","[Cholesterol<=188.0, age_sin>-0.9998]"
4,age_sin,-0.2624,0.79188,0.8467,28211,36142,"[Cholesterol<=188.0, age_sin<=-0.2624]","[Cholesterol<=188.0, age_sin>-0.2624]"
5,age_sin,0.7451,0.82437,0.81813,52187,12166,"[Cholesterol<=188.0, age_sin<=0.7451]","[Cholesterol<=188.0, age_sin>0.7451]"
6,Cholesterol,150.0,0.75815,0.83282,8399,55954,"[Cholesterol<=150.0, Cholesterol<=188.0]","[Cholesterol<=188.0, Cholesterol>150.0]"


In [430]:
temp=leaf_and_split(svod,min_border,max_border,min_leaf,base_auc)
temp

Unnamed: 0,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right,filters_split,filters_leaf,cnt_split,cnt_leaf,auc_split,auc_leaf


In [431]:
if temp.empty:
    final_trees.append(actual_tree)
    
    
    while True:
        if work_dfs==[]:
            break
        actual_tree=actual_tree[:-1]
        work_dfs[-1]=work_dfs[-1][1:]
        if not work_dfs[-1].empty:
            break
        else:
            work_dfs=work_dfs[:-1]
    if work_dfs==[]:
        print('конец вечеринки')
    else:
        actual_tree=pd.concat([actual_tree,work_dfs[-1].head(1)])
else:
    work_dfs.append(temp)
    actual_tree=pd.concat([actual_tree,temp.head(1)])

In [432]:
svod=[]
for col in base_columns:
    for col_value in quantiles[col]:
        print(col,col_value)
        filters_left=sorted(actual_tree.filters_split.sum()+[f'{col}<={col_value}',])
        left=apply_filters(train,filters_left)
        
        filters_right=sorted(actual_tree.filters_split.sum()+[f'{col}>{col_value}',])
        right=apply_filters(train,filters_right)
        
        if left.empty or right.empty:
            continue
        
        try:
            auc_left=big_df.loc[big_df['filters'].apply(lambda x:x==filters_left),'auc'].drop_duplicates().item()
        except:
            opt.fit(left.drop(columns='smoking'), left.smoking)
            auc_left=round(opt.best_score_,5)
            
            if auc_left==0.5:
                opt_big.fit(left.drop(columns='smoking'), left.smoking)
                auc_left=round(opt_big.best_score_,5)
            temp = {'filters': filters_left, 'auc': auc_left}
            big_df = big_df.append(temp, ignore_index = True)

        try:
            auc_right=big_df.loc[big_df['filters'].apply(lambda x:x==filters_right),'auc'].drop_duplicates().item()
        except:
            opt.fit(right.drop(columns='smoking'), right.smoking)
            auc_right=round(opt.best_score_,5)
            
            if auc_right==0.5:
                opt_big.fit(right.drop(columns='smoking'), right.smoking)
                auc_right=round(opt_big.best_score_,5)

            temp = {'filters': filters_right, 'auc': auc_right}
            big_df = big_df.append(temp, ignore_index = True)

        

        svod.append([col,col_value,auc_left,auc_right,left.shape[0],right.shape[0],filters_left,filters_right])
        #except Exception as e:
        #    print('!!! '+str(e)+' !!!')

serum_creatinine 0.6
serum_creatinine 0.9
serum_creatinine 1.0
age_sin -0.9998
age_sin -0.2624
age_sin 0.7451
Cholesterol 150.0
Cholesterol 188.0
Cholesterol 217.0


In [433]:
temp=pd.DataFrame(svod)
temp.columns=['col','col_value','auc_left','auc_right','cnt_left','cnt_right','filters_left','filters_right']
temp

Unnamed: 0,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right
0,serum_creatinine,0.6,0.79456,0.82475,6190,69020,"[age_sin<=-0.2624, serum_creatinine<=0.6]","[age_sin<=-0.2624, serum_creatinine>0.6]"
1,serum_creatinine,0.9,0.88643,0.72533,46974,28236,"[age_sin<=-0.2624, serum_creatinine<=0.9]","[age_sin<=-0.2624, serum_creatinine>0.9]"
2,serum_creatinine,1.0,0.8575,0.71267,61897,13313,"[age_sin<=-0.2624, serum_creatinine<=1.0]","[age_sin<=-0.2624, serum_creatinine>1.0]"
3,age_sin,-0.9998,0.86366,0.83164,13448,61762,"[age_sin<=-0.2624, age_sin<=-0.9998]","[age_sin<=-0.2624, age_sin>-0.9998]"
4,Cholesterol,150.0,0.71523,0.84527,3635,71575,"[Cholesterol<=150.0, age_sin<=-0.2624]","[Cholesterol>150.0, age_sin<=-0.2624]"
5,Cholesterol,188.0,0.79188,0.86125,28211,46999,"[Cholesterol<=188.0, age_sin<=-0.2624]","[Cholesterol>188.0, age_sin<=-0.2624]"
6,Cholesterol,217.0,0.81348,0.89062,55036,20174,"[Cholesterol<=217.0, age_sin<=-0.2624]","[Cholesterol>217.0, age_sin<=-0.2624]"


In [434]:
temp=leaf_and_split(svod,min_border,max_border,min_leaf,base_auc)
temp

Unnamed: 0,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right,filters_split,filters_leaf,cnt_split,cnt_leaf,auc_split,auc_leaf
5,Cholesterol,188.0,0.79188,0.86125,28211,46999,"[Cholesterol<=188.0, age_sin<=-0.2624]","[Cholesterol>188.0, age_sin<=-0.2624]","[Cholesterol<=188.0, age_sin<=-0.2624]","[Cholesterol>188.0, age_sin<=-0.2624]",28211,46999,0.79188,0.86125
6,Cholesterol,217.0,0.81348,0.89062,55036,20174,"[Cholesterol<=217.0, age_sin<=-0.2624]","[Cholesterol>217.0, age_sin<=-0.2624]","[Cholesterol<=217.0, age_sin<=-0.2624]","[Cholesterol>217.0, age_sin<=-0.2624]",55036,20174,0.81348,0.89062


In [435]:
if temp.empty:
    final_trees.append(actual_tree)
    
    
    while True:
        if work_dfs==[]:
            break
        actual_tree=actual_tree[:-1]
        work_dfs[-1]=work_dfs[-1][1:]
        if not work_dfs[-1].empty:
            break
        else:
            work_dfs=work_dfs[:-1]
    if work_dfs==[]:
        print('конец вечеринки')
    else:
        actual_tree=pd.concat([actual_tree,work_dfs[-1].head(1)])
else:
    work_dfs.append(temp)
    actual_tree=pd.concat([actual_tree,temp.head(1)])

In [437]:
svod=[]
for col in base_columns:
    for col_value in quantiles[col]:
        print(col,col_value)
        filters_left=sorted(actual_tree.filters_split.sum()+[f'{col}<={col_value}',])
        left=apply_filters(train,filters_left)
        
        filters_right=sorted(actual_tree.filters_split.sum()+[f'{col}>{col_value}',])
        right=apply_filters(train,filters_right)
        
        if left.empty or right.empty:
            continue
        
        try:
            auc_left=big_df.loc[big_df['filters'].apply(lambda x:x==filters_left),'auc'].drop_duplicates().item()
        except:
            opt.fit(left.drop(columns='smoking'), left.smoking)
            auc_left=round(opt.best_score_,5)
            
            if auc_left==0.5:
                opt_big.fit(left.drop(columns='smoking'), left.smoking)
                auc_left=round(opt_big.best_score_,5)
            temp = {'filters': filters_left, 'auc': auc_left}
            big_df = big_df.append(temp, ignore_index = True)

        try:
            auc_right=big_df.loc[big_df['filters'].apply(lambda x:x==filters_right),'auc'].drop_duplicates().item()
        except:
            opt.fit(right.drop(columns='smoking'), right.smoking)
            auc_right=round(opt.best_score_,5)
            
            if auc_right==0.5:
                opt_big.fit(right.drop(columns='smoking'), right.smoking)
                auc_right=round(opt_big.best_score_,5)

            temp = {'filters': filters_right, 'auc': auc_right}
            big_df = big_df.append(temp, ignore_index = True)

        

        svod.append([col,col_value,auc_left,auc_right,left.shape[0],right.shape[0],filters_left,filters_right])
        #except Exception as e:
        #    print('!!! '+str(e)+' !!!')

serum_creatinine 0.6
serum_creatinine 0.9
serum_creatinine 1.0
age_sin -0.9998
age_sin -0.2624
age_sin 0.7451
Cholesterol 150.0
Cholesterol 188.0
Cholesterol 217.0


In [438]:
temp=pd.DataFrame(svod)
temp.columns=['col','col_value','auc_left','auc_right','cnt_left','cnt_right','filters_left','filters_right']
temp

Unnamed: 0,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right
0,serum_creatinine,0.6,0.82437,0.77755,1734,26477,"[Cholesterol<=188.0, age_sin<=-0.2624, age_sin...","[Cholesterol<=188.0, age_sin<=-0.2624, age_sin..."
1,serum_creatinine,0.9,0.82845,0.71234,16981,11230,"[Cholesterol<=188.0, age_sin<=-0.2624, age_sin...","[Cholesterol<=188.0, age_sin<=-0.2624, age_sin..."
2,serum_creatinine,1.0,0.8049,0.6877,23122,5089,"[Cholesterol<=188.0, age_sin<=-0.2624, age_sin...","[Cholesterol<=188.0, age_sin<=-0.2624, age_sin..."
3,age_sin,-0.9998,0.78416,0.78868,4444,23767,"[Cholesterol<=188.0, age_sin<=-0.2624, age_sin...","[Cholesterol<=188.0, age_sin<=-0.2624, age_sin..."
4,Cholesterol,150.0,0.71523,0.79658,3635,24576,"[Cholesterol<=150.0, Cholesterol<=188.0, age_s...","[Cholesterol<=188.0, Cholesterol>150.0, age_si..."


In [439]:
temp=leaf_and_split(svod,min_border,max_border,min_leaf,base_auc)
temp

Unnamed: 0,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right,filters_split,filters_leaf,cnt_split,cnt_leaf,auc_split,auc_leaf


In [440]:
if temp.empty:
    final_trees.append(actual_tree)
    
    
    while True:
        if work_dfs==[]:
            break
        actual_tree=actual_tree[:-1]
        work_dfs[-1]=work_dfs[-1][1:]
        if not work_dfs[-1].empty:
            break
        else:
            work_dfs=work_dfs[:-1]
    if work_dfs==[]:
        print('конец вечеринки')
    else:
        actual_tree=pd.concat([actual_tree,work_dfs[-1].head(1)])
else:
    work_dfs.append(temp)
    actual_tree=pd.concat([actual_tree,temp.head(1)])

In [441]:
svod=[]
for col in base_columns:
    for col_value in quantiles[col]:
        print(col,col_value)
        filters_left=sorted(actual_tree.filters_split.sum()+[f'{col}<={col_value}',])
        left=apply_filters(train,filters_left)
        
        filters_right=sorted(actual_tree.filters_split.sum()+[f'{col}>{col_value}',])
        right=apply_filters(train,filters_right)
        
        if left.empty or right.empty:
            continue
        
        try:
            auc_left=big_df.loc[big_df['filters'].apply(lambda x:x==filters_left),'auc'].drop_duplicates().item()
        except:
            opt.fit(left.drop(columns='smoking'), left.smoking)
            auc_left=round(opt.best_score_,5)
            
            if auc_left==0.5:
                opt_big.fit(left.drop(columns='smoking'), left.smoking)
                auc_left=round(opt_big.best_score_,5)
            temp = {'filters': filters_left, 'auc': auc_left}
            big_df = big_df.append(temp, ignore_index = True)

        try:
            auc_right=big_df.loc[big_df['filters'].apply(lambda x:x==filters_right),'auc'].drop_duplicates().item()
        except:
            opt.fit(right.drop(columns='smoking'), right.smoking)
            auc_right=round(opt.best_score_,5)
            
            if auc_right==0.5:
                opt_big.fit(right.drop(columns='smoking'), right.smoking)
                auc_right=round(opt_big.best_score_,5)

            temp = {'filters': filters_right, 'auc': auc_right}
            big_df = big_df.append(temp, ignore_index = True)

        

        svod.append([col,col_value,auc_left,auc_right,left.shape[0],right.shape[0],filters_left,filters_right])
        #except Exception as e:
        #    print('!!! '+str(e)+' !!!')

serum_creatinine 0.6
serum_creatinine 0.9
serum_creatinine 1.0
age_sin -0.9998
age_sin -0.2624
age_sin 0.7451
Cholesterol 150.0
Cholesterol 188.0
Cholesterol 217.0


In [442]:
temp=pd.DataFrame(svod)
temp.columns=['col','col_value','auc_left','auc_right','cnt_left','cnt_right','filters_left','filters_right']
temp

Unnamed: 0,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right
0,serum_creatinine,0.6,0.86938,0.79649,3745,51291,"[Cholesterol<=217.0, age_sin<=-0.2624, age_sin...","[Cholesterol<=217.0, age_sin<=-0.2624, age_sin..."
1,serum_creatinine,0.9,0.85344,0.72001,32970,22066,"[Cholesterol<=217.0, age_sin<=-0.2624, age_sin...","[Cholesterol<=217.0, age_sin<=-0.2624, age_sin..."
2,serum_creatinine,1.0,0.82602,0.70957,44772,10264,"[Cholesterol<=217.0, age_sin<=-0.2624, age_sin...","[Cholesterol<=217.0, age_sin<=-0.2624, age_sin..."
3,age_sin,-0.9998,0.82097,0.80378,9064,45972,"[Cholesterol<=217.0, age_sin<=-0.2624, age_sin...","[Cholesterol<=217.0, age_sin<=-0.2624, age_sin..."
4,Cholesterol,150.0,0.71523,0.81488,3635,51401,"[Cholesterol<=150.0, Cholesterol<=217.0, age_s...","[Cholesterol<=217.0, Cholesterol>150.0, age_si..."
5,Cholesterol,188.0,0.79188,0.82198,28211,26825,"[Cholesterol<=188.0, Cholesterol<=217.0, age_s...","[Cholesterol<=217.0, Cholesterol>188.0, age_si..."


In [443]:
temp=leaf_and_split(svod,min_border,max_border,min_leaf,base_auc)
temp

Unnamed: 0,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right,filters_split,filters_leaf,cnt_split,cnt_leaf,auc_split,auc_leaf


In [444]:
if temp.empty:
    final_trees.append(actual_tree)
    
    
    while True:
        if work_dfs==[]:
            break
        actual_tree=actual_tree[:-1]
        work_dfs[-1]=work_dfs[-1][1:]
        if not work_dfs[-1].empty:
            break
        else:
            work_dfs=work_dfs[:-1]
    if work_dfs==[]:
        print('конец вечеринки')
    else:
        actual_tree=pd.concat([actual_tree,work_dfs[-1].head(1)])
else:
    work_dfs.append(temp)
    actual_tree=pd.concat([actual_tree,temp.head(1)])

In [445]:
svod=[]
for col in base_columns:
    for col_value in quantiles[col]:
        print(col,col_value)
        filters_left=sorted(actual_tree.filters_split.sum()+[f'{col}<={col_value}',])
        left=apply_filters(train,filters_left)
        
        filters_right=sorted(actual_tree.filters_split.sum()+[f'{col}>{col_value}',])
        right=apply_filters(train,filters_right)
        
        if left.empty or right.empty:
            print('пусто слева или справа')
            continue
        
        try:
            auc_left=big_df.loc[big_df['filters'].apply(lambda x:x==filters_left),'auc'].drop_duplicates().item()
        except:
            opt.fit(left.drop(columns='smoking'), left.smoking)
            auc_left=round(opt.best_score_,5)
            
            if auc_left==0.5:
                opt_big.fit(left.drop(columns='smoking'), left.smoking)
                auc_left=round(opt_big.best_score_,5)
            temp = {'filters': filters_left, 'auc': auc_left}
            big_df = big_df.append(temp, ignore_index = True)

        try:
            auc_right=big_df.loc[big_df['filters'].apply(lambda x:x==filters_right),'auc'].drop_duplicates().item()
        except:
            opt.fit(right.drop(columns='smoking'), right.smoking)
            auc_right=round(opt.best_score_,5)
            
            if auc_right==0.5:
                opt_big.fit(right.drop(columns='smoking'), right.smoking)
                auc_right=round(opt_big.best_score_,5)

            temp = {'filters': filters_right, 'auc': auc_right}
            big_df = big_df.append(temp, ignore_index = True)

        

        svod.append([col,col_value,auc_left,auc_right,left.shape[0],right.shape[0],filters_left,filters_right])
        #except Exception as e:
        #    print('!!! '+str(e)+' !!!')

serum_creatinine 0.6
serum_creatinine 0.9
serum_creatinine 1.0
age_sin -0.9998
age_sin -0.2624
age_sin 0.7451
Cholesterol 150.0
Cholesterol 188.0
Cholesterol 217.0
пусто слева или справа


In [446]:
temp=pd.DataFrame(svod)
temp.columns=['col','col_value','auc_left','auc_right','cnt_left','cnt_right','filters_left','filters_right']
temp

Unnamed: 0,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right
0,serum_creatinine,0.6,0.82822,0.81942,9977,110512,"[Cholesterol<=217.0, serum_creatinine<=0.6]","[Cholesterol<=217.0, serum_creatinine>0.6]"
1,serum_creatinine,0.9,0.87861,0.73431,74272,46217,"[Cholesterol<=217.0, serum_creatinine<=0.9]","[Cholesterol<=217.0, serum_creatinine>0.9]"
2,serum_creatinine,1.0,0.85492,0.72328,98915,21574,"[Cholesterol<=217.0, serum_creatinine<=1.0]","[Cholesterol<=217.0, serum_creatinine>1.0]"
3,age_sin,-0.9998,0.82097,0.83693,9064,111425,"[Cholesterol<=217.0, age_sin<=-0.9998]","[Cholesterol<=217.0, age_sin>-0.9998]"
4,age_sin,-0.2624,0.81348,0.85177,55036,65453,"[Cholesterol<=217.0, age_sin<=-0.2624]","[Cholesterol<=217.0, age_sin>-0.2624]"
5,age_sin,0.7451,0.83403,0.83957,98696,21793,"[Cholesterol<=217.0, age_sin<=0.7451]","[Cholesterol<=217.0, age_sin>0.7451]"
6,Cholesterol,150.0,0.75815,0.84108,8399,112090,"[Cholesterol<=150.0, Cholesterol<=217.0]","[Cholesterol<=217.0, Cholesterol>150.0]"
7,Cholesterol,188.0,0.82577,0.84134,64353,56136,"[Cholesterol<=188.0, Cholesterol<=217.0]","[Cholesterol<=217.0, Cholesterol>188.0]"


In [447]:
temp=leaf_and_split(svod,min_border,max_border,min_leaf,base_auc)
temp

Unnamed: 0,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right,filters_split,filters_leaf,cnt_split,cnt_leaf,auc_split,auc_leaf


In [448]:
if temp.empty:
    final_trees.append(actual_tree)
    
    
    while True:
        if work_dfs==[]:
            break
        actual_tree=actual_tree[:-1]
        work_dfs[-1]=work_dfs[-1][1:]
        if not work_dfs[-1].empty:
            break
        else:
            work_dfs=work_dfs[:-1]
    if work_dfs==[]:
        print('конец вечеринки')
    else:
        actual_tree=pd.concat([actual_tree,work_dfs[-1].head(1)])
else:
    work_dfs.append(temp)
    actual_tree=pd.concat([actual_tree,temp.head(1)])

In [451]:
svod=[]
for col in base_columns:
    for col_value in quantiles[col]:
        print(col,col_value)
        filters_left=sorted(actual_tree.filters_split.sum()+[f'{col}<={col_value}',])
        left=apply_filters(train,filters_left)
        
        filters_right=sorted(actual_tree.filters_split.sum()+[f'{col}>{col_value}',])
        right=apply_filters(train,filters_right)
        
        if left.empty or right.empty:
            print('пусто слева или справа')
            continue
        
        try:
            auc_left=big_df.loc[big_df['filters'].apply(lambda x:x==filters_left),'auc'].drop_duplicates().item()
        except:
            opt.fit(left.drop(columns='smoking'), left.smoking)
            auc_left=round(opt.best_score_,5)
            
            if auc_left==0.5:
                opt_big.fit(left.drop(columns='smoking'), left.smoking)
                auc_left=round(opt_big.best_score_,5)
            temp = {'filters': filters_left, 'auc': auc_left}
            big_df = big_df.append(temp, ignore_index = True)

        try:
            auc_right=big_df.loc[big_df['filters'].apply(lambda x:x==filters_right),'auc'].drop_duplicates().item()
        except:
            opt.fit(right.drop(columns='smoking'), right.smoking)
            auc_right=round(opt.best_score_,5)
            
            if auc_right==0.5:
                opt_big.fit(right.drop(columns='smoking'), right.smoking)
                auc_right=round(opt_big.best_score_,5)

            temp = {'filters': filters_right, 'auc': auc_right}
            big_df = big_df.append(temp, ignore_index = True)

        

        svod.append([col,col_value,auc_left,auc_right,left.shape[0],right.shape[0],filters_left,filters_right])
        #except Exception as e:
        #    print('!!! '+str(e)+' !!!')

serum_creatinine 0.6
serum_creatinine 0.9
serum_creatinine 1.0
age_sin -0.9998
age_sin -0.2624
age_sin 0.7451
пусто слева или справа
Cholesterol 150.0
Cholesterol 188.0
Cholesterol 217.0


In [452]:
temp=pd.DataFrame(svod)
temp.columns=['col','col_value','auc_left','auc_right','cnt_left','cnt_right','filters_left','filters_right']
temp

Unnamed: 0,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right
0,serum_creatinine,0.6,0.84218,0.83278,11344,119735,"[age_sin<=0.7451, serum_creatinine<=0.6]","[age_sin<=0.7451, serum_creatinine>0.6]"
1,serum_creatinine,0.9,0.89248,0.7377,81545,49534,"[age_sin<=0.7451, serum_creatinine<=0.9]","[age_sin<=0.7451, serum_creatinine>0.9]"
2,serum_creatinine,1.0,0.86728,0.72528,107519,23560,"[age_sin<=0.7451, serum_creatinine<=1.0]","[age_sin<=0.7451, serum_creatinine>1.0]"
3,age_sin,-0.9998,0.86366,0.84533,13448,117631,"[age_sin<=-0.9998, age_sin<=0.7451]","[age_sin<=0.7451, age_sin>-0.9998]"
4,age_sin,-0.2624,0.84147,0.85307,75210,55869,"[age_sin<=-0.2624, age_sin<=0.7451]","[age_sin<=0.7451, age_sin>-0.2624]"
5,Cholesterol,150.0,0.75614,0.85264,6485,124594,"[Cholesterol<=150.0, age_sin<=0.7451]","[Cholesterol>150.0, age_sin<=0.7451]"
6,Cholesterol,188.0,0.82437,0.85988,52187,78892,"[Cholesterol<=188.0, age_sin<=0.7451]","[Cholesterol>188.0, age_sin<=0.7451]"
7,Cholesterol,217.0,0.83403,0.88331,98696,32383,"[Cholesterol<=217.0, age_sin<=0.7451]","[Cholesterol>217.0, age_sin<=0.7451]"


In [453]:
temp=leaf_and_split(svod,min_border,max_border,min_leaf,base_auc)
temp

Unnamed: 0,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right,filters_split,filters_leaf,cnt_split,cnt_leaf,auc_split,auc_leaf
6,Cholesterol,188.0,0.82437,0.85988,52187,78892,"[Cholesterol<=188.0, age_sin<=0.7451]","[Cholesterol>188.0, age_sin<=0.7451]","[Cholesterol<=188.0, age_sin<=0.7451]","[Cholesterol>188.0, age_sin<=0.7451]",52187,78892,0.82437,0.85988
7,Cholesterol,217.0,0.83403,0.88331,98696,32383,"[Cholesterol<=217.0, age_sin<=0.7451]","[Cholesterol>217.0, age_sin<=0.7451]","[Cholesterol<=217.0, age_sin<=0.7451]","[Cholesterol>217.0, age_sin<=0.7451]",98696,32383,0.83403,0.88331


In [454]:
if temp.empty:
    final_trees.append(actual_tree)
    
    
    while True:
        if work_dfs==[]:
            break
        actual_tree=actual_tree[:-1]
        work_dfs[-1]=work_dfs[-1][1:]
        if not work_dfs[-1].empty:
            break
        else:
            work_dfs=work_dfs[:-1]
    if work_dfs==[]:
        print('конец вечеринки')
    else:
        actual_tree=pd.concat([actual_tree,work_dfs[-1].head(1)])
else:
    work_dfs.append(temp)
    actual_tree=pd.concat([actual_tree,temp.head(1)])

In [456]:
svod=[]
for col in base_columns:
    for col_value in quantiles[col]:
        print(col,col_value)
        filters_left=sorted(actual_tree.filters_split.sum()+[f'{col}<={col_value}',])
        left=apply_filters(train,filters_left)
        
        filters_right=sorted(actual_tree.filters_split.sum()+[f'{col}>{col_value}',])
        right=apply_filters(train,filters_right)
        
        if left.empty or right.empty:
            print('пусто слева или справа')
            continue
        
        try:
            auc_left=big_df.loc[big_df['filters'].apply(lambda x:x==filters_left),'auc'].drop_duplicates().item()
        except:
            opt.fit(left.drop(columns='smoking'), left.smoking)
            auc_left=round(opt.best_score_,5)
            
            if auc_left==0.5:
                opt_big.fit(left.drop(columns='smoking'), left.smoking)
                auc_left=round(opt_big.best_score_,5)
            temp = {'filters': filters_left, 'auc': auc_left}
            big_df = big_df.append(temp, ignore_index = True)

        try:
            auc_right=big_df.loc[big_df['filters'].apply(lambda x:x==filters_right),'auc'].drop_duplicates().item()
        except:
            opt.fit(right.drop(columns='smoking'), right.smoking)
            auc_right=round(opt.best_score_,5)
            
            if auc_right==0.5:
                opt_big.fit(right.drop(columns='smoking'), right.smoking)
                auc_right=round(opt_big.best_score_,5)

            temp = {'filters': filters_right, 'auc': auc_right}
            big_df = big_df.append(temp, ignore_index = True)

        

        svod.append([col,col_value,auc_left,auc_right,left.shape[0],right.shape[0],filters_left,filters_right])
        #except Exception as e:
        #    print('!!! '+str(e)+' !!!')

serum_creatinine 0.6
serum_creatinine 0.9
serum_creatinine 1.0
age_sin -0.9998
age_sin -0.2624
age_sin 0.7451
пусто слева или справа
Cholesterol 150.0
Cholesterol 188.0
пусто слева или справа
Cholesterol 217.0
пусто слева или справа


In [457]:
temp=pd.DataFrame(svod)
temp.columns=['col','col_value','auc_left','auc_right','cnt_left','cnt_right','filters_left','filters_right']
temp

Unnamed: 0,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right
0,serum_creatinine,0.6,0.64547,0.80533,4442,47745,"[Cholesterol<=188.0, age_sin<=0.7451, age_sin<...","[Cholesterol<=188.0, age_sin<=0.7451, age_sin<..."
1,serum_creatinine,0.9,0.86227,0.7253,33315,18872,"[Cholesterol<=188.0, age_sin<=0.7451, age_sin<...","[Cholesterol<=188.0, age_sin<=0.7451, age_sin<..."
2,serum_creatinine,1.0,0.83836,0.70483,43659,8528,"[Cholesterol<=188.0, age_sin<=0.7451, age_sin<...","[Cholesterol<=188.0, age_sin<=0.7451, age_sin<..."
3,age_sin,-0.9998,0.78416,0.82604,4444,47743,"[Cholesterol<=188.0, age_sin<=-0.9998, age_sin...","[Cholesterol<=188.0, age_sin<=0.7451, age_sin<..."
4,age_sin,-0.2624,0.79188,0.85343,28211,23976,"[Cholesterol<=188.0, age_sin<=-0.2624, age_sin...","[Cholesterol<=188.0, age_sin<=0.7451, age_sin<..."
5,Cholesterol,150.0,0.75614,0.82663,6485,45702,"[Cholesterol<=150.0, Cholesterol<=188.0, age_s...","[Cholesterol<=188.0, Cholesterol>150.0, age_si..."


In [459]:
temp=leaf_and_split(svod,min_border,max_border,min_leaf,base_auc)
temp

Unnamed: 0,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right,filters_split,filters_leaf,cnt_split,cnt_leaf,auc_split,auc_leaf


In [460]:
if temp.empty:
    final_trees.append(actual_tree)
    
    
    while True:
        if work_dfs==[]:
            break
        actual_tree=actual_tree[:-1]
        work_dfs[-1]=work_dfs[-1][1:]
        if not work_dfs[-1].empty:
            break
        else:
            work_dfs=work_dfs[:-1]
    if work_dfs==[]:
        print('конец вечеринки')
    else:
        actual_tree=pd.concat([actual_tree,work_dfs[-1].head(1)])
else:
    work_dfs.append(temp)
    actual_tree=pd.concat([actual_tree,temp.head(1)])

In [461]:
svod=[]
for col in base_columns:
    for col_value in quantiles[col]:
        print(col,col_value)
        filters_left=sorted(actual_tree.filters_split.sum()+[f'{col}<={col_value}',])
        left=apply_filters(train,filters_left)
        
        filters_right=sorted(actual_tree.filters_split.sum()+[f'{col}>{col_value}',])
        right=apply_filters(train,filters_right)
        
        if left.empty or right.empty:
            print('пусто слева или справа')
            continue
        
        try:
            auc_left=big_df.loc[big_df['filters'].apply(lambda x:x==filters_left),'auc'].drop_duplicates().item()
        except:
            opt.fit(left.drop(columns='smoking'), left.smoking)
            auc_left=round(opt.best_score_,5)
            
            if auc_left==0.5:
                opt_big.fit(left.drop(columns='smoking'), left.smoking)
                auc_left=round(opt_big.best_score_,5)
            temp = {'filters': filters_left, 'auc': auc_left}
            big_df = big_df.append(temp, ignore_index = True)

        try:
            auc_right=big_df.loc[big_df['filters'].apply(lambda x:x==filters_right),'auc'].drop_duplicates().item()
        except:
            opt.fit(right.drop(columns='smoking'), right.smoking)
            auc_right=round(opt.best_score_,5)
            
            if auc_right==0.5:
                opt_big.fit(right.drop(columns='smoking'), right.smoking)
                auc_right=round(opt_big.best_score_,5)

            temp = {'filters': filters_right, 'auc': auc_right}
            big_df = big_df.append(temp, ignore_index = True)

        

        svod.append([col,col_value,auc_left,auc_right,left.shape[0],right.shape[0],filters_left,filters_right])
        #except Exception as e:
        #    print('!!! '+str(e)+' !!!')

serum_creatinine 0.6
serum_creatinine 0.9
serum_creatinine 1.0
age_sin -0.9998
age_sin -0.2624
age_sin 0.7451
пусто слева или справа
Cholesterol 150.0
Cholesterol 188.0
Cholesterol 217.0
пусто слева или справа


In [462]:
temp=pd.DataFrame(svod)
temp.columns=['col','col_value','auc_left','auc_right','cnt_left','cnt_right','filters_left','filters_right']
temp

Unnamed: 0,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right
0,serum_creatinine,0.6,0.82501,0.81574,7942,90754,"[Cholesterol<=217.0, age_sin<=0.7451, age_sin<...","[Cholesterol<=217.0, age_sin<=0.7451, age_sin<..."
1,serum_creatinine,0.9,0.8743,0.72862,60703,37993,"[Cholesterol<=217.0, age_sin<=0.7451, age_sin<...","[Cholesterol<=217.0, age_sin<=0.7451, age_sin<..."
2,serum_creatinine,1.0,0.84928,0.7229,80980,17716,"[Cholesterol<=217.0, age_sin<=0.7451, age_sin<...","[Cholesterol<=217.0, age_sin<=0.7451, age_sin<..."
3,age_sin,-0.9998,0.82097,0.83268,9064,89632,"[Cholesterol<=217.0, age_sin<=-0.9998, age_sin...","[Cholesterol<=217.0, age_sin<=0.7451, age_sin<..."
4,age_sin,-0.2624,0.81348,0.85154,55036,43660,"[Cholesterol<=217.0, age_sin<=-0.2624, age_sin...","[Cholesterol<=217.0, age_sin<=0.7451, age_sin<..."
5,Cholesterol,150.0,0.75614,0.83538,6485,92211,"[Cholesterol<=150.0, Cholesterol<=217.0, age_s...","[Cholesterol<=217.0, Cholesterol>150.0, age_si..."
6,Cholesterol,188.0,0.82437,0.83299,52187,46509,"[Cholesterol<=188.0, Cholesterol<=217.0, age_s...","[Cholesterol<=217.0, Cholesterol>188.0, age_si..."


In [463]:
temp=leaf_and_split(svod,min_border,max_border,min_leaf,base_auc)
temp

Unnamed: 0,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right,filters_split,filters_leaf,cnt_split,cnt_leaf,auc_split,auc_leaf


In [464]:
if temp.empty:
    final_trees.append(actual_tree)
    
    
    while True:
        if work_dfs==[]:
            break
        actual_tree=actual_tree[:-1]
        work_dfs[-1]=work_dfs[-1][1:]
        if not work_dfs[-1].empty:
            break
        else:
            work_dfs=work_dfs[:-1]
    if work_dfs==[]:
        print('конец вечеринки')
    else:
        actual_tree=pd.concat([actual_tree,work_dfs[-1].head(1)])
else:
    work_dfs.append(temp)
    actual_tree=pd.concat([actual_tree,temp.head(1)])

конец вечеринки


In [465]:
len(final_trees)

6

In [466]:
xxx=final_trees[4]
to_csv(xxx)

In [469]:
for i in range(len(final_trees)):
    to_csv(final_trees[i],f'tree{i}')

In [197]:
from my_functions import *
to_csv(actual_tree)

In [511]:
final_trees[5]

Unnamed: 0,filters_split,col,col_value,auc_left,auc_right,cnt_left,cnt_right,filters_left,filters_right,filters_leaf,cnt_split,cnt_leaf,auc_split,auc_leaf
5,[age_sin<=0.7451],age_sin,0.7451,0.85042,0.85566,131079.0,28177.0,[age_sin<=0.7451],[age_sin>0.7451],[age_sin>0.7451],131079.0,28177.0,0.85042,0.85566
7,"[Cholesterol<=217.0, age_sin<=0.7451]",Cholesterol,217.0,0.83403,0.88331,98696.0,32383.0,"[Cholesterol<=217.0, age_sin<=0.7451]","[Cholesterol>217.0, age_sin<=0.7451]","[Cholesterol>217.0, age_sin<=0.7451]",98696.0,32383.0,0.83403,0.88331


In [512]:
to_csv(big_df)