In [1]:
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

import numpy as np

import pandas as pd
# 모든 행을 출력하도록 설정
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# 기본값으로 설정 (처음 5개와 마지막 5개 행만 출력)
pd.reset_option('display.max_rows')

# 출력 포맷 설정 (소수점 4자리까지)
pd.options.display.float_format = '{:.4f}'.format

import platform
import seaborn as sns

import matplotlib.pyplot as plt

# 운영 체제 확인
if platform.system() == 'Darwin':  # Mac
    print('apple gothic')
    font_name = 'AppleGothic'
elif platform.system() == 'Windows':  # Windows
    font_name = 'NanumGothic'
else:
    font_name = None

# 한글 폰트 설정
if font_name:
    plt.rcParams['font.family'] = font_name

# 마이너스 부호 설정
plt.rcParams['axes.unicode_minus'] = False

apple gothic


## 데이터 로드

In [2]:
X_train = pd.read_csv('../data/dataset_final/X_train.csv', dtype={'거래소코드' : 'object'}).set_index('거래소코드')
X_test = pd.read_csv('../data/dataset_final/X_test.csv', dtype={'거래소코드' : 'object'}).set_index('거래소코드')

y_train = pd.read_csv('../data/dataset_final/y_train.csv', dtype={'거래소코드' : 'object'}).set_index('거래소코드')
y_test = pd.read_csv('../data/dataset_final/y_test.csv', dtype={'거래소코드' : 'object'}).set_index('거래소코드')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1187, 10), (198, 10), (1187, 1), (198, 1))

In [3]:
X_train.head(2)

Unnamed: 0_level_0,순운전자본대비총자본,비유동장기적합률,자기자본순이익률,당좌비율,총자본정상영업이익률,자본금회전율,총자산회전율,배당금지급(-),미처분이익잉여금(결손금),영업손익
거래소코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
40,-0.1204,-0.2665,0.1511,-0.3273,-0.1546,-0.6327,0.0008,-0.2343,-0.1,0.0551
50,-0.86,0.3045,0.4107,-0.3775,0.7126,0.5597,-1.1884,-0.2343,-0.4147,0.3254


## 파라미터 조절 모델링 (f1 기준)

In [4]:
from modules import modeling as ml

In [5]:
res_best = {}

### (1) LR

In [6]:
from sklearn.linear_model import LogisticRegression

In [7]:
# 그리드 서치 결과
# Best Parameters: {'C': 0.001, 'penalty': 'l2', 'solver': 'liblinear'}
# Best Estimator: LogisticRegression(C=0.001, random_state=42, solver='liblinear')

res_best['lr'] = ml.train(
        X_train, y_train, X_test, y_test, 
        model_name='lr', 
        model=LogisticRegression(C=0.001, solver='liblinear', random_state=42)
)

res_best['lr']

{'name': 'lr',
 'model': LogisticRegression(C=0.001, random_state=42, solver='liblinear'),
 'results': {'cf_matrix': array([[114,  10],
         [ 55,  19]]),
  'accuracy': 0.6717171717171717,
  'precision': 0.6551724137931034,
  'recall': 0.25675675675675674,
  'f1': 0.36893203883495146,
  'roc_auc': 0.7307105492589364},
 'cv':                cf_matrix  accuracy  precision  recall     f1  roc_auc
 0  [[131, 25], [60, 22]]    0.6429     0.4681  0.2683 0.3411   0.6345
 1  [[126, 30], [51, 31]]    0.6597     0.5082  0.3780 0.4336   0.6594
 2  [[143, 13], [62, 19]]    0.6835     0.5938  0.2346 0.3363   0.7065
 3  [[136, 19], [59, 23]]    0.6709     0.5476  0.2805 0.3710   0.6489
 4  [[137, 18], [61, 21]]    0.6667     0.5385  0.2561 0.3471   0.6857}

### (2) DT

In [8]:
from sklearn.tree import DecisionTreeClassifier

In [9]:
DecisionTreeClassifier(random_state=42).get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 42,
 'splitter': 'best'}

In [10]:
# 그리드 서치 결과
# Best Parameters: {'criterion': 'entropy', 'max_depth': 11, 'min_samples_leaf': 1, 'min_samples_split': 5}
# Best Estimator: DecisionTreeClassifier(criterion='entropy', max_depth=11, min_samples_split=5, random_state=42)

res_best['dt'] = ml.train(
        X_train, y_train, X_test, y_test, 
        model_name='dt', 
        model=DecisionTreeClassifier(
            criterion='entropy', max_depth=11, min_samples_split=5, 
            random_state=42
        )
)

res_best['dt']

{'name': 'dt',
 'model': DecisionTreeClassifier(criterion='entropy', max_depth=11, min_samples_split=5,
                        random_state=42),
 'results': {'cf_matrix': array([[98, 26],
         [49, 25]]),
  'accuracy': 0.6212121212121212,
  'precision': 0.49019607843137253,
  'recall': 0.33783783783783783,
  'f1': 0.4,
  'roc_auc': 0.6281604184829992},
 'cv':                cf_matrix  accuracy  precision  recall     f1  roc_auc
 0  [[104, 52], [39, 43]]    0.6176     0.4526  0.5244 0.4859   0.6221
 1  [[100, 56], [46, 36]]    0.5714     0.3913  0.4390 0.4138   0.5568
 2  [[108, 48], [41, 40]]    0.6245     0.4545  0.4938 0.4734   0.5789
 3  [[111, 44], [49, 33]]    0.6076     0.4286  0.4024 0.4151   0.5738
 4  [[116, 39], [45, 37]]    0.6456     0.4868  0.4512 0.4684   0.5875}

### (3) SVM

In [11]:
from sklearn.svm import SVC

In [12]:
SVC(probability=True, random_state=42).get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': True,
 'random_state': 42,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [13]:
# 그리드 서치 결과
# Best Parameters: {'C': 10, 'kernel': 'sigmoid'}
# Best Estimator: SVC(C=10, kernel='sigmoid', probability=True, random_state=42)

res_best['svm'] = ml.train(
        X_train, y_train, X_test, y_test, 
        model_name='svm', 
        model=SVC(C=10, kernel='sigmoid', probability=True, random_state=42)
)

res_best['svm']

{'name': 'svm',
 'model': SVC(C=10, kernel='sigmoid', probability=True, random_state=42),
 'results': {'cf_matrix': array([[81, 43],
         [39, 35]]),
  'accuracy': 0.5858585858585859,
  'precision': 0.44871794871794873,
  'recall': 0.47297297297297297,
  'f1': 0.46052631578947373,
  'roc_auc': 0.6012423714036619},
 'cv':                cf_matrix  accuracy  precision  recall     f1  roc_auc
 0   [[93, 63], [35, 47]]    0.5882     0.4273  0.5732 0.4896   0.5722
 1  [[108, 48], [43, 39]]    0.6176     0.4483  0.4756 0.4615   0.6010
 2  [[128, 28], [48, 33]]    0.6793     0.5410  0.4074 0.4648   0.6505
 3  [[104, 51], [44, 38]]    0.5992     0.4270  0.4634 0.4444   0.5946
 4  [[131, 24], [55, 27]]    0.6667     0.5294  0.3293 0.4060   0.6943}

In [14]:
y_train.value_counts()

label 
0.0000    778
1.0000    409
Name: count, dtype: int64

In [15]:
X_train.shape, X_test.shape

((1187, 10), (198, 10))

In [16]:
1187 + 198

1385

In [17]:
y_test.value_counts()

label 
0.0000    124
1.0000     74
Name: count, dtype: int64

### (4) RF

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
RandomForestClassifier(random_state=42).get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [20]:
# 그리드 서치 결과
# Best Parameters: {'criterion': 'gini', 'max_depth': 9, 'min_impurity_decrease': 0.001, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 10}
# Best Estimator: RandomForestClassifier(max_depth=9, min_impurity_decrease=0.001, n_estimators=10, random_state=42)

res_best['rf'] = ml.train(
        X_train, y_train, X_test, y_test, 
        model_name = 'rf', 
        model = RandomForestClassifier(max_depth=9, min_impurity_decrease=0.001, n_estimators=10, random_state=42)
)

res_best['rf']

{'name': 'rf',
 'model': RandomForestClassifier(max_depth=9, min_impurity_decrease=0.001,
                        n_estimators=10, random_state=42),
 'results': {'cf_matrix': array([[106,  18],
         [ 45,  29]]),
  'accuracy': 0.6818181818181818,
  'precision': 0.6170212765957447,
  'recall': 0.3918918918918919,
  'f1': 0.4793388429752067,
  'roc_auc': 0.6912053182214473},
 'cv':                cf_matrix  accuracy  precision  recall     f1  roc_auc
 0  [[127, 29], [45, 37]]    0.6891     0.5606  0.4512 0.5000   0.6277
 1  [[125, 31], [59, 23]]    0.6218     0.4259  0.2805 0.3382   0.6246
 2  [[138, 18], [60, 21]]    0.6709     0.5385  0.2593 0.3500   0.6168
 3  [[122, 33], [52, 30]]    0.6414     0.4762  0.3659 0.4138   0.6279
 4  [[137, 18], [57, 25]]    0.6835     0.5814  0.3049 0.4000   0.6403}

### (5) XGB

In [21]:
from xgboost import XGBClassifier

In [22]:
XGBClassifier(random_state=42).get_params().keys()

dict_keys(['objective', 'base_score', 'booster', 'callbacks', 'colsample_bylevel', 'colsample_bynode', 'colsample_bytree', 'device', 'early_stopping_rounds', 'enable_categorical', 'eval_metric', 'feature_types', 'gamma', 'grow_policy', 'importance_type', 'interaction_constraints', 'learning_rate', 'max_bin', 'max_cat_threshold', 'max_cat_to_onehot', 'max_delta_step', 'max_depth', 'max_leaves', 'min_child_weight', 'missing', 'monotone_constraints', 'multi_strategy', 'n_estimators', 'n_jobs', 'num_parallel_tree', 'random_state', 'reg_alpha', 'reg_lambda', 'sampling_method', 'scale_pos_weight', 'subsample', 'tree_method', 'validate_parameters', 'verbosity'])

In [23]:
XGBClassifier(random_state=42).get_params()['learning_rate']

In [24]:
# 그리드 서치 결과
# Best Parameters: {'learning_rate': 0.1, 'max_depth': 11, 'min_child_weight': 7, 'min_impurity_decrease': 0.001, 'n_estimators': 300}

# Best Estimator: XGBClassifier(base_score=None, booster=None, callbacks=None,
#               colsample_bylevel=None, colsample_bynode=None,
#               colsample_bytree=None, device=None, early_stopping_rounds=None,
#               enable_categorical=False, eval_metric=None, feature_types=None,
#               gamma=None, grow_policy=None, importance_type=None,
#               interaction_constraints=None, learning_rate=0.1, max_bin=None,
#               max_cat_threshold=None, max_cat_to_onehot=None,
#               max_delta_step=None, max_depth=11, max_leaves=None,
#               min_child_weight=7, min_impurity_decrease=0.001, missing=nan,
#               monotone_constraints=None, multi_strategy=None, n_estimators=300,
#               n_jobs=None, num_parallel_tree=None, ...)

res_best['xgb'] = ml.train(
        X_train, y_train, X_test, y_test, 
        model_name='xgb', 
        model=XGBClassifier(
            learning_rate = 0.1, max_depth = 11, min_child_weight=7, min_impurity_decrease = 0.001, n_estimators=300,
            random_state=42
        )
)
res_best['xgb']

{'name': 'xgb',
 'model': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               gamma=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=0.1, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=11, max_leaves=None,
               min_child_weight=7, min_impurity_decrease=0.001, missing=nan,
               monotone_constraints=None, multi_strategy=None, n_estimators=300,
               n_jobs=None, num_parallel_tree=None, ...),
 'results': {'cf_matrix': array([[96, 28],
         [48, 26]]),
  'accuracy': 0.6161616161616161,
  'precision': 0.48148148148148145,
  'recall': 0.35135135135135137,
  'f1': 0.40625,
  'roc_auc': 0.623

### (6) LGBM

In [25]:
from lightgbm import LGBMClassifier

In [26]:
LGBMClassifier(random_state=42).get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'num_leaves': 31,
 'objective': None,
 'random_state': 42,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [27]:
# 그리드 서치 결과
# Best Parameters: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 50, 'reg_alpha': 1, 'reg_lambda': 0.001}
# Best Estimator: LGBMClassifier(max_depth=9, n_estimators=50, random_state=42, reg_alpha=1, reg_lambda=0.001)

res_best['lgbm'] = ml.train(
        X_train, y_train, X_test, y_test, 
        model_name='lgbm', 
        model=LGBMClassifier(
            max_depth=9, n_estimators=50, reg_alpha=1, reg_lambda=0.001,
            random_state=42
        )
)
res_best['lgbm']

[LightGBM] [Info] Number of positive: 409, number of negative: 778
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001300 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 1187, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.344566 -> initscore=-0.643011
[LightGBM] [Info] Start training from score -0.643011
[LightGBM] [Info] Number of positive: 327, number of negative: 622
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000316 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.344573 -> initscore=-0.642980
[LightGBM] [Info] Start training from score -0.642980
[LightGBM] [Info] Numbe

{'name': 'lgbm',
 'model': LGBMClassifier(max_depth=9, n_estimators=50, random_state=42, reg_alpha=1,
                reg_lambda=0.001),
 'results': {'cf_matrix': array([[105,  19],
         [ 49,  25]]),
  'accuracy': 0.6565656565656566,
  'precision': 0.5681818181818182,
  'recall': 0.33783783783783783,
  'f1': 0.423728813559322,
  'roc_auc': 0.6592197035745424},
 'cv':                cf_matrix  accuracy  precision  recall     f1  roc_auc
 0  [[119, 37], [44, 38]]    0.6597     0.5067  0.4634 0.4841   0.6295
 1  [[122, 34], [55, 27]]    0.6261     0.4426  0.3293 0.3776   0.6251
 2  [[131, 25], [56, 25]]    0.6582     0.5000  0.3086 0.3817   0.6103
 3  [[124, 31], [46, 36]]    0.6751     0.5373  0.4390 0.4832   0.6270
 4  [[142, 13], [61, 21]]    0.6878     0.6176  0.2561 0.3621   0.6958}

### (7) catboost

In [28]:
from catboost import CatBoostClassifier

In [29]:
CatBoostClassifier(random_state=42).get_params()

{'random_state': 42}

In [30]:
# 그리드 서치 결과
# Best Parameters: {'depth': 5, 'iterations': 200, 'l2_leaf_reg': 0.001, 'learning_rate': 0.1, 'loss_function': 'Logloss'}

res_best['cat'] = ml.train(
        X_train, y_train, X_test, y_test, 
        model_name='cat', 
        model=CatBoostClassifier(
            depth=5, iterations=200, l2_leaf_reg=0.001, learning_rate=0.1, loss_function='Logloss',
            random_state=42
        )
)

res_best['cat']

0:	learn: 0.6699398	total: 56.4ms	remaining: 11.2s
1:	learn: 0.6555007	total: 57.6ms	remaining: 5.7s
2:	learn: 0.6449111	total: 58.3ms	remaining: 3.83s
3:	learn: 0.6260252	total: 59.2ms	remaining: 2.9s
4:	learn: 0.6134480	total: 60.2ms	remaining: 2.35s
5:	learn: 0.6050270	total: 61ms	remaining: 1.97s
6:	learn: 0.5976189	total: 61.8ms	remaining: 1.7s
7:	learn: 0.5917797	total: 62.7ms	remaining: 1.5s
8:	learn: 0.5878755	total: 63.5ms	remaining: 1.35s
9:	learn: 0.5845806	total: 64.2ms	remaining: 1.22s
10:	learn: 0.5813387	total: 65.4ms	remaining: 1.12s
11:	learn: 0.5768688	total: 67.2ms	remaining: 1.05s
12:	learn: 0.5714617	total: 70.3ms	remaining: 1.01s
13:	learn: 0.5655598	total: 73.3ms	remaining: 973ms
14:	learn: 0.5607042	total: 76.4ms	remaining: 942ms
15:	learn: 0.5555978	total: 77.7ms	remaining: 894ms
16:	learn: 0.5506792	total: 81.1ms	remaining: 873ms
17:	learn: 0.5473529	total: 82.8ms	remaining: 837ms
18:	learn: 0.5445639	total: 85.2ms	remaining: 811ms
19:	learn: 0.5417335	total: 

{'name': 'cat',
 'model': <catboost.core.CatBoostClassifier at 0x17f87e3f0>,
 'results': {'cf_matrix': array([[107,  17],
         [ 49,  25]]),
  'accuracy': 0.6666666666666666,
  'precision': 0.5952380952380952,
  'recall': 0.33783783783783783,
  'f1': 0.43103448275862066,
  'roc_auc': 0.6443984306887532},
 'cv':                cf_matrix  accuracy  precision  recall     f1  roc_auc
 0  [[112, 44], [49, 33]]    0.6092     0.4286  0.4024 0.4151   0.5831
 1  [[113, 43], [53, 29]]    0.5966     0.4028  0.3537 0.3766   0.5871
 2  [[125, 31], [55, 26]]    0.6371     0.4561  0.3210 0.3768   0.6252
 3  [[122, 33], [48, 34]]    0.6582     0.5075  0.4146 0.4564   0.6473
 4  [[136, 19], [63, 19]]    0.6540     0.5000  0.2317 0.3167   0.7011}

## 튜닝 결과 확인

In [31]:
methods = ['lr', 'svm', 'dt', 'rf', 'lgbm', 'xgb', 'cat']

all_trainCV = []
all_test = []

for key in methods:
    print(key)

    trainCV = res_best[key]['cv']
    trainCV = trainCV.drop(columns=['cf_matrix']).mean(axis=0)
    trainCV['model'] = key

    test = res_best[key]['results']
    test['model'] = key
    
    all_trainCV.append(trainCV)
    all_test.append(test)

all_trainCV = pd.DataFrame(all_trainCV).set_index('model')
all_test = pd.DataFrame(all_test).set_index('model')

lr
svm
dt
rf
lgbm
xgb
cat


In [32]:
all_trainCV

Unnamed: 0_level_0,accuracy,precision,recall,f1,roc_auc
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
lr,0.6647,0.5312,0.2835,0.3658,0.667
svm,0.6302,0.4746,0.4498,0.4533,0.6225
dt,0.6133,0.4428,0.4622,0.4513,0.5838
rf,0.6613,0.5165,0.3323,0.4004,0.6275
lgbm,0.6614,0.5209,0.3593,0.4177,0.6375
xgb,0.6471,0.4985,0.3912,0.431,0.6239
cat,0.631,0.459,0.3447,0.3883,0.6288


In [33]:
all_trainCV.sort_values('f1', ascending=False)

Unnamed: 0_level_0,accuracy,precision,recall,f1,roc_auc
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
svm,0.6302,0.4746,0.4498,0.4533,0.6225
dt,0.6133,0.4428,0.4622,0.4513,0.5838
xgb,0.6471,0.4985,0.3912,0.431,0.6239
lgbm,0.6614,0.5209,0.3593,0.4177,0.6375
rf,0.6613,0.5165,0.3323,0.4004,0.6275
cat,0.631,0.459,0.3447,0.3883,0.6288
lr,0.6647,0.5312,0.2835,0.3658,0.667


In [43]:
all_trainCV.sort_values('precision', ascending=False)

Unnamed: 0_level_0,accuracy,precision,recall,f1,roc_auc
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
lr,0.6647,0.5312,0.2835,0.3658,0.667
lgbm,0.6614,0.5209,0.3593,0.4177,0.6375
rf,0.6613,0.5165,0.3323,0.4004,0.6275
xgb,0.6471,0.4985,0.3912,0.431,0.6239
svm,0.6302,0.4746,0.4498,0.4533,0.6225
cat,0.631,0.459,0.3447,0.3883,0.6288
dt,0.6133,0.4428,0.4622,0.4513,0.5838


In [34]:
all_test

Unnamed: 0_level_0,cf_matrix,accuracy,precision,recall,f1,roc_auc
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
lr,"[[114, 10], [55, 19]]",0.6717,0.6552,0.2568,0.3689,0.7307
svm,"[[81, 43], [39, 35]]",0.5859,0.4487,0.473,0.4605,0.6012
dt,"[[98, 26], [49, 25]]",0.6212,0.4902,0.3378,0.4,0.6282
rf,"[[106, 18], [45, 29]]",0.6818,0.617,0.3919,0.4793,0.6912
lgbm,"[[105, 19], [49, 25]]",0.6566,0.5682,0.3378,0.4237,0.6592
xgb,"[[96, 28], [48, 26]]",0.6162,0.4815,0.3514,0.4062,0.6234
cat,"[[107, 17], [49, 25]]",0.6667,0.5952,0.3378,0.431,0.6444


In [35]:
all_test.sort_values('precision', ascending=False)

Unnamed: 0_level_0,cf_matrix,accuracy,precision,recall,f1,roc_auc
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
lr,"[[114, 10], [55, 19]]",0.6717,0.6552,0.2568,0.3689,0.7307
rf,"[[106, 18], [45, 29]]",0.6818,0.617,0.3919,0.4793,0.6912
cat,"[[107, 17], [49, 25]]",0.6667,0.5952,0.3378,0.431,0.6444
lgbm,"[[105, 19], [49, 25]]",0.6566,0.5682,0.3378,0.4237,0.6592
dt,"[[98, 26], [49, 25]]",0.6212,0.4902,0.3378,0.4,0.6282
xgb,"[[96, 28], [48, 26]]",0.6162,0.4815,0.3514,0.4062,0.6234
svm,"[[81, 43], [39, 35]]",0.5859,0.4487,0.473,0.4605,0.6012


In [36]:
pd.merge(
    all_trainCV, all_test.drop(columns=['cf_matrix']),
    left_index=True, right_index=True,
    how='inner', suffixes=('_train', '_test')
)

Unnamed: 0_level_0,accuracy_train,precision_train,recall_train,f1_train,roc_auc_train,accuracy_test,precision_test,recall_test,f1_test,roc_auc_test
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
lr,0.6647,0.5312,0.2835,0.3658,0.667,0.6717,0.6552,0.2568,0.3689,0.7307
svm,0.6302,0.4746,0.4498,0.4533,0.6225,0.5859,0.4487,0.473,0.4605,0.6012
dt,0.6133,0.4428,0.4622,0.4513,0.5838,0.6212,0.4902,0.3378,0.4,0.6282
rf,0.6613,0.5165,0.3323,0.4004,0.6275,0.6818,0.617,0.3919,0.4793,0.6912
lgbm,0.6614,0.5209,0.3593,0.4177,0.6375,0.6566,0.5682,0.3378,0.4237,0.6592
xgb,0.6471,0.4985,0.3912,0.431,0.6239,0.6162,0.4815,0.3514,0.4062,0.6234
cat,0.631,0.459,0.3447,0.3883,0.6288,0.6667,0.5952,0.3378,0.431,0.6444


In [37]:
res_best['lr']['cv']

Unnamed: 0,cf_matrix,accuracy,precision,recall,f1,roc_auc
0,"[[131, 25], [60, 22]]",0.6429,0.4681,0.2683,0.3411,0.6345
1,"[[126, 30], [51, 31]]",0.6597,0.5082,0.378,0.4336,0.6594
2,"[[143, 13], [62, 19]]",0.6835,0.5938,0.2346,0.3363,0.7065
3,"[[136, 19], [59, 23]]",0.6709,0.5476,0.2805,0.371,0.6489
4,"[[137, 18], [61, 21]]",0.6667,0.5385,0.2561,0.3471,0.6857


In [38]:
tmp = res_best['lr']['cv'].drop(columns=['cf_matrix']).mean(axis=0)
tmp

accuracy    0.6647
precision   0.5312
recall      0.2835
f1          0.3658
roc_auc     0.6670
dtype: float64

In [39]:
tmp['model'] = key
tmp

accuracy    0.6647
precision   0.5312
recall      0.2835
f1          0.3658
roc_auc     0.6670
model          cat
dtype: object

In [40]:
tmp_test = res_best['lr']['results']
tmp_test

{'cf_matrix': array([[114,  10],
        [ 55,  19]]),
 'accuracy': 0.6717171717171717,
 'precision': 0.6551724137931034,
 'recall': 0.25675675675675674,
 'f1': 0.36893203883495146,
 'roc_auc': 0.7307105492589364,
 'model': 'lr'}

In [41]:
tmp_test['model'] = key

tmp_test

{'cf_matrix': array([[114,  10],
        [ 55,  19]]),
 'accuracy': 0.6717171717171717,
 'precision': 0.6551724137931034,
 'recall': 0.25675675675675674,
 'f1': 0.36893203883495146,
 'roc_auc': 0.7307105492589364,
 'model': 'cat'}

In [42]:
# res_grid_cv = {}
# for key in res_grid.keys():
#     g = res_grid[key]
#     res_grid_cv[key] = pd.DataFrame(g.cv_results_)