In [156]:
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

import numpy as np

import pandas as pd
# 모든 행을 출력하도록 설정
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# 기본값으로 설정 (처음 5개와 마지막 5개 행만 출력)
pd.reset_option('display.max_rows')

# 출력 포맷 설정 (소수점 4자리까지)
pd.options.display.float_format = '{:.4f}'.format

import platform
import seaborn as sns

import matplotlib.pyplot as plt

# 운영 체제 확인
if platform.system() == 'Darwin':  # Mac
    print('apple gothic')
    font_name = 'AppleGothic'
elif platform.system() == 'Windows':  # Windows
    font_name = 'NanumGothic'
else:
    font_name = None

# 한글 폰트 설정
if font_name:
    plt.rcParams['font.family'] = font_name

# 마이너스 부호 설정
plt.rcParams['axes.unicode_minus'] = False

apple gothic


## 데이터로드

In [157]:
X_train = pd.read_csv('../data/dataset_final/X_train.csv', dtype={'거래소코드' : 'object'}).set_index('거래소코드')
X_test = pd.read_csv('../data/dataset_final/X_test.csv', dtype={'거래소코드' : 'object'}).set_index('거래소코드')

y_train = pd.read_csv('../data/dataset_final/y_train.csv', dtype={'거래소코드' : 'object'}).set_index('거래소코드')
y_test = pd.read_csv('../data/dataset_final/y_test.csv', dtype={'거래소코드' : 'object'}).set_index('거래소코드')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1187, 10), (198, 10), (1187, 1), (198, 1))

In [158]:
X_TRAIN = X_train.copy()
X_TEST = X_test.copy()
y_TRAIN = y_train.copy()
y_TEST = y_test.copy()

## 모델링

### 샘플링 전

In [159]:
import modules.modeling as ml

names = ['lr', 'dt', 'svm', 'rf', 'xgb', 'lgbm', 'cat']
res = {}
for name in names:
    res[name] = ml.train(X_train, y_train, X_test, y_test, model_name=name)


results, results_cv = [], []
for key in res.keys():
    res[key]['results']['name'] = key
    results.append(res[key]['results'])

    res_cv = dict(res[key]['cv'].drop(columns='cf_matrix').mean())
    res_cv['name'] = key
    results_cv.append(res_cv)

results = pd.DataFrame(results).set_index('name')
results_cv = pd.DataFrame(results_cv).set_index('name')

[LightGBM] [Info] Number of positive: 409, number of negative: 778
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000323 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 1187, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.344566 -> initscore=-0.643011
[LightGBM] [Info] Start training from score -0.643011
[LightGBM] [Info] Number of positive: 327, number of negative: 622
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000259 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.344573 -> initscore=-0.642980
[LightGBM] [I

In [160]:
results

Unnamed: 0_level_0,cf_matrix,accuracy,precision,recall,f1,roc_auc
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
lr,"[[117, 7], [62, 12]]",0.6515,0.6316,0.1622,0.2581,0.7005
dt,"[[84, 40], [38, 36]]",0.6061,0.4737,0.4865,0.48,0.582
svm,"[[122, 2], [69, 5]]",0.6414,0.7143,0.0676,0.1235,0.6533
rf,"[[104, 20], [54, 20]]",0.6263,0.5,0.2703,0.3509,0.6461
xgb,"[[93, 31], [44, 30]]",0.6212,0.4918,0.4054,0.4444,0.6381
lgbm,"[[99, 25], [48, 26]]",0.6313,0.5098,0.3514,0.416,0.644
cat,"[[107, 17], [50, 24]]",0.6616,0.5854,0.3243,0.4174,0.6724


In [161]:
results_cv

Unnamed: 0_level_0,accuracy,precision,recall,f1,roc_auc
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
lr,0.6672,0.5696,0.2127,0.3028,0.6776
dt,0.5763,0.3816,0.3714,0.3744,0.5277
svm,0.6554,0.5132,0.0953,0.1542,0.6198
rf,0.6513,0.4981,0.2982,0.3671,0.6455
xgb,0.6369,0.4707,0.3641,0.4052,0.6291
lgbm,0.6361,0.4694,0.3421,0.3872,0.6351
cat,0.6622,0.532,0.3055,0.3753,0.6504


In [162]:
evals = {}

evals['raw'] = [res, results, results_cv]

### 리샘플링 데이터 생성

In [163]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

In [164]:
X_train.shape, y_train.shape

((1187, 10), (1187, 1))

In [165]:
y_train['label'].value_counts()

label
0.0000    778
1.0000    409
Name: count, dtype: int64

#### (1) 랜덤언더샘플링

In [166]:
X_train = X_TRAIN
y_train = y_TRAIN

In [167]:
# Random Undersampling
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_train, y_train)
X_rus.shape, y_rus.shape, y_rus.value_counts()

((818, 10),
 (818, 1),
 label 
 0.0000    409
 1.0000    409
 Name: count, dtype: int64)

In [168]:
X_train = X_rus
y_train = y_rus

In [169]:
# import modules.modeling as ml

names = ['lr', 'dt', 'svm', 'rf', 'xgb', 'lgbm', 'cat']
res = {}
for name in names:
    res[name] = ml.train(X_train, y_train, X_test, y_test, model_name=name)


results, results_cv = [], []
for key in res.keys():
    res[key]['results']['name'] = key
    results.append(res[key]['results'])

    res_cv = dict(res[key]['cv'].drop(columns='cf_matrix').mean())
    res_cv['name'] = key
    results_cv.append(res_cv)

results = pd.DataFrame(results).set_index('name')
results_cv = pd.DataFrame(results_cv).set_index('name')

[LightGBM] [Info] Number of positive: 409, number of negative: 409
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000273 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 818, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 327, number of negative: 327
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000173 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2064
[LightGBM] [Info] Number of data points in the train set: 654, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 327, number of negative: 327
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overh

In [170]:
evals['under'] = [res, results, results_cv]

#### (2) 랜덤오버샘플링

In [171]:
X_train = X_TRAIN
y_train = y_TRAIN

In [172]:
# Random Oversampling
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X_train, y_train)
X_ros.shape, y_ros.shape, y_ros.value_counts()

((1556, 10),
 (1556, 1),
 label 
 0.0000    778
 1.0000    778
 Name: count, dtype: int64)

In [173]:
X_train = X_ros
y_train = y_ros

In [174]:
# import modules.modeling as ml

names = ['lr', 'dt', 'svm', 'rf', 'xgb', 'lgbm', 'cat']
res = {}
for name in names:
    res[name] = ml.train(X_train, y_train, X_test, y_test, model_name=name)


results, results_cv = [], []
for key in res.keys():
    res[key]['results']['name'] = key
    results.append(res[key]['results'])

    res_cv = dict(res[key]['cv'].drop(columns='cf_matrix').mean())
    res_cv['name'] = key
    results_cv.append(res_cv)

results = pd.DataFrame(results).set_index('name')
results_cv = pd.DataFrame(results_cv).set_index('name')

evals['over'] = [res, results, results_cv]

[LightGBM] [Info] Number of positive: 778, number of negative: 778
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000238 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2545
[LightGBM] [Info] Number of data points in the train set: 1556, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 622, number of negative: 622
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000203 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2534
[LightGBM] [Info] Number of data points in the train set: 1244, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 623, number of negative: 622
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the ove

#### (3) SMOTE

In [175]:
X_train = X_TRAIN
y_train = y_TRAIN

In [176]:
# SMOTE
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)
X_smote.shape, y_smote.shape, y_smote.value_counts()

((1556, 10),
 (1556, 1),
 label 
 0.0000    778
 1.0000    778
 Name: count, dtype: int64)

In [177]:
X_train = X_smote
y_train = y_smote

In [178]:
# import modules.modeling as ml

names = ['lr', 'dt', 'svm', 'rf', 'xgb', 'lgbm', 'cat']
res = {}
for name in names:
    res[name] = ml.train(X_train, y_train, X_test, y_test, model_name=name)


results, results_cv = [], []
for key in res.keys():
    res[key]['results']['name'] = key
    results.append(res[key]['results'])

    res_cv = dict(res[key]['cv'].drop(columns='cf_matrix').mean())
    res_cv['name'] = key
    results_cv.append(res_cv)

results = pd.DataFrame(results).set_index('name')
results_cv = pd.DataFrame(results_cv).set_index('name')

[LightGBM] [Info] Number of positive: 778, number of negative: 778
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000330 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2548
[LightGBM] [Info] Number of data points in the train set: 1556, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 622, number of negative: 622
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000223 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 1244, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 623, number of negative: 622
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the ove

In [179]:
evals['smote'] = [res, results, results_cv]

#### (4) ADASYN

In [180]:
X_train = X_TRAIN
y_train = y_TRAIN

In [181]:
#ADASYN
#SMOTE는 linear 상의 샘플만 생성하는 단점
#ADASYN은 KNN범위내로 들어오는 개수에 비례하여 생성함. 스무트에 비해 소수인 레이블이 덜 무시되도록 샘플생성

from imblearn.over_sampling import ADASYN

adasyn = ADASYN(random_state=42)
X_ada, y_ada = adasyn.fit_resample(X_train, y_train)

y_ada.value_counts()

label 
1.0000    822
0.0000    778
Name: count, dtype: int64

In [182]:
X_train = X_ada
y_train = y_ada

In [183]:
# import modules.modeling as ml

names = ['lr', 'dt', 'svm', 'rf', 'xgb', 'lgbm', 'cat']
res = {}
for name in names:
    res[name] = ml.train(X_train, y_train, X_test, y_test, model_name=name)


results, results_cv = [], []
for key in res.keys():
    res[key]['results']['name'] = key
    results.append(res[key]['results'])

    res_cv = dict(res[key]['cv'].drop(columns='cf_matrix').mean())
    res_cv['name'] = key
    results_cv.append(res_cv)

results = pd.DataFrame(results).set_index('name')
results_cv = pd.DataFrame(results_cv).set_index('name')

[LightGBM] [Info] Number of positive: 822, number of negative: 778
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000316 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2547
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.513750 -> initscore=0.055014
[LightGBM] [Info] Start training from score 0.055014
[LightGBM] [Info] Number of positive: 658, number of negative: 622
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000213 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2548
[LightGBM] [Info] Number of data points in the train set: 1280, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.514062 -> initscore=0.056265
[LightGBM] [Info] Start training from score 0.056265
[LightGBM] [Info] Number o

In [184]:
evals['ada'] = [res, results, results_cv]

#### (5) SMTK

In [185]:
X_train = X_TRAIN
y_train = y_TRAIN

In [186]:
# SMOTE-Tomek
# 오버샘플링과 언더샘플링을 동시에 수행
# 스무트 후 토멕함
# Tomek: 소수레이블과 가까운 다수레이블을 제외시키면서 샘플 생성, 즉 모호한 경계선을 명확히 구분시켜줌

from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks

smotetomek = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))
X_smtk, y_smtk = smotetomek.fit_resample(X_train, y_train)
y_smtk.value_counts()

label 
1.0000    778
0.0000    736
Name: count, dtype: int64

In [187]:
X_train = X_smtk
y_train = y_smtk

In [188]:
# import modules.modeling as ml

names = ['lr', 'dt', 'svm', 'rf', 'xgb', 'lgbm', 'cat']
res = {}
for name in names:
    res[name] = ml.train(X_train, y_train, X_test, y_test, model_name=name)


results, results_cv = [], []
for key in res.keys():
    res[key]['results']['name'] = key
    results.append(res[key]['results'])

    res_cv = dict(res[key]['cv'].drop(columns='cf_matrix').mean())
    res_cv['name'] = key
    results_cv.append(res_cv)

results = pd.DataFrame(results).set_index('name')
results_cv = pd.DataFrame(results_cv).set_index('name')

[LightGBM] [Info] Number of positive: 778, number of negative: 736
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000339 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 1514, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.513871 -> initscore=0.055496
[LightGBM] [Info] Start training from score 0.055496
[LightGBM] [Info] Number of positive: 623, number of negative: 588
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000250 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 1211, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.514451 -> initscore=0.057820
[LightGBM] [Info] Start training from score 0.057820
[LightGBM] [Info] Number o

In [189]:
evals['smtk'] = [res, results, results_cv]

#### 리샘플링 데이터 저장

In [190]:
X_ros.shape, y_ros.shape

((1556, 10), (1556, 1))

In [191]:
y_ros.value_counts()

label 
0.0000    778
1.0000    778
Name: count, dtype: int64

In [192]:
X_ros.to_csv('../data/dataset_final_resampling/X_train_over.csv', index=None)
y_ros.to_csv('../data/dataset_final_resampling/y_train_over.csv', index=None)

In [193]:
X_smtk.shape, y_smtk.shape

((1514, 10), (1514, 1))

In [194]:
y_smtk.value_counts()

label 
1.0000    778
0.0000    736
Name: count, dtype: int64

In [195]:
X_TRAIN.shape, y_TRAIN.value_counts()

((1187, 10),
 label 
 0.0000    778
 1.0000    409
 Name: count, dtype: int64)

In [196]:
X_smtk.to_csv('../data/dataset_final_resampling/X_train_smtk.csv', index=None)
y_smtk.to_csv('../data/dataset_final_resampling/y_train_smtk.csv', index=None)

In [197]:
778 + 409

1187

### 성능비교

In [198]:
evals.keys()

dict_keys(['raw', 'under', 'over', 'smote', 'ada', 'smtk'])

In [199]:
idx = 0
key = list(evals.keys())[idx]

print(key)

# test
evals[key][1]

raw


Unnamed: 0_level_0,cf_matrix,accuracy,precision,recall,f1,roc_auc
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
lr,"[[117, 7], [62, 12]]",0.6515,0.6316,0.1622,0.2581,0.7005
dt,"[[84, 40], [38, 36]]",0.6061,0.4737,0.4865,0.48,0.582
svm,"[[122, 2], [69, 5]]",0.6414,0.7143,0.0676,0.1235,0.6533
rf,"[[104, 20], [54, 20]]",0.6263,0.5,0.2703,0.3509,0.6461
xgb,"[[93, 31], [44, 30]]",0.6212,0.4918,0.4054,0.4444,0.6381
lgbm,"[[99, 25], [48, 26]]",0.6313,0.5098,0.3514,0.416,0.644
cat,"[[107, 17], [50, 24]]",0.6616,0.5854,0.3243,0.4174,0.6724


In [200]:
# train + cv
evals[key][2]

Unnamed: 0_level_0,accuracy,precision,recall,f1,roc_auc
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
lr,0.6672,0.5696,0.2127,0.3028,0.6776
dt,0.5763,0.3816,0.3714,0.3744,0.5277
svm,0.6554,0.5132,0.0953,0.1542,0.6198
rf,0.6513,0.4981,0.2982,0.3671,0.6455
xgb,0.6369,0.4707,0.3641,0.4052,0.6291
lgbm,0.6361,0.4694,0.3421,0.3872,0.6351
cat,0.6622,0.532,0.3055,0.3753,0.6504


In [201]:
all_trainCV = []
all_test = []

for key in evals.keys():
    df_test = evals[key][1]
    df_test['sampling'] = key
    all_test.append(df_test)

    df_trainCV = evals[key][2]
    df_trainCV['sampling'] = key
    all_trainCV.append(df_trainCV)

all_trainCV = pd.concat(all_trainCV)
all_test = pd.concat(all_test)

In [202]:
all_trainCV_prec = all_trainCV.sort_values('precision', ascending=False).reset_index(drop=False)

all_trainCV_prec.head()

Unnamed: 0,name,accuracy,precision,recall,f1,roc_auc,sampling
0,rf,0.7841,0.7479,0.8546,0.7948,0.898,over
1,xgb,0.7835,0.7456,0.8558,0.7949,0.87,over
2,lgbm,0.7815,0.7412,0.861,0.7946,0.8629,over
3,rf,0.7306,0.7183,0.7828,0.7451,0.8166,smtk
4,cat,0.741,0.7068,0.8224,0.7583,0.8318,over


In [203]:
all_trainCV_prec.groupby('name').first()

Unnamed: 0_level_0,accuracy,precision,recall,f1,roc_auc,sampling
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
cat,0.741,0.7068,0.8224,0.7583,0.8318,over
dt,0.7301,0.683,0.8494,0.7547,0.7299,over
lgbm,0.7815,0.7412,0.861,0.7946,0.8629,over
lr,0.6234,0.6168,0.6528,0.634,0.6542,under
rf,0.7841,0.7479,0.8546,0.7948,0.898,over
svm,0.6414,0.6179,0.7905,0.6931,0.6844,smtk
xgb,0.7835,0.7456,0.8558,0.7949,0.87,over


In [204]:
all_trainCV_prec_best = all_trainCV_prec.groupby('name').first().loc[['lr', 'svm', 'dt', 'rf', 'lgbm', 'xgb', 'cat']].reset_index(drop=False).set_index(['name', 'sampling'])
all_trainCV_prec_best_idx = all_trainCV_prec_best.index

all_trainCV_prec_best

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,precision,recall,f1,roc_auc
name,sampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
lr,under,0.6234,0.6168,0.6528,0.634,0.6542
svm,smtk,0.6414,0.6179,0.7905,0.6931,0.6844
dt,over,0.7301,0.683,0.8494,0.7547,0.7299
rf,over,0.7841,0.7479,0.8546,0.7948,0.898
lgbm,over,0.7815,0.7412,0.861,0.7946,0.8629
xgb,over,0.7835,0.7456,0.8558,0.7949,0.87
cat,over,0.741,0.7068,0.8224,0.7583,0.8318


In [205]:
# test
all_test.reset_index(drop=False).set_index(['name', 'sampling']).loc[all_trainCV_prec_best_idx].drop(columns=['cf_matrix'])

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,precision,recall,f1,roc_auc
name,sampling,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
lr,under,0.6465,0.5196,0.7162,0.6023,0.6951
svm,smtk,0.6162,0.4909,0.7297,0.587,0.6602
dt,over,0.5,0.3134,0.2838,0.2979,0.4564
rf,over,0.6111,0.4727,0.3514,0.4031,0.64
lgbm,over,0.6061,0.4643,0.3514,0.4,0.6223
xgb,over,0.601,0.4603,0.3919,0.4234,0.6164
cat,over,0.6414,0.5231,0.4595,0.4892,0.6538


In [206]:
best_prec_method = all_trainCV_prec_best.sort_values('precision', ascending=False).index[0]
print(best_prec_method)

('rf', 'over')


In [207]:
# all_trainCV_prec_best.sort_values('f1', ascending=False)
best_f1_method = all_trainCV_prec_best.sort_values('f1', ascending=False).index[0]
best_f1_method

('xgb', 'over')

In [208]:
evals['over'][0]['xgb']['model']

## 최종 모델 선정 및 포트폴리오 수익률 계산

In [209]:
# train, test 결합

X_train, y_train = X_TRAIN, y_TRAIN

X = pd.concat([X_train, X_test], axis=0)
y = pd.concat([y_train, y_test], axis=0)
X.shape, y.shape, X.index.equals(y.index)

((1385, 10), (1385, 1), True)

In [210]:
info = pd.read_csv('../data/6_데이터셋_불필요컬럼제거.csv', dtype={'거래소코드':'object'})[['회사명', '거래소코드', '회계년도', 'label']].set_index('거래소코드')

info.head()

Unnamed: 0_level_0,회사명,회계년도,label
거래소코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
40,케이알모터스(주),2012/12,0.0
50,(주)경방,2012/12,1.0
70,(주)삼양홀딩스,2013/12,1.0
150,(주)두산,2015/12,1.0
180,성창기업지주(주),2012/12,0.0


In [211]:
from pykrx import stock

fname='m1_2019_2021_resampling'

best_methods = {'f1' : best_f1_method, 'prec' : best_prec_method}
best_models = {key: evals[method[1]][0][method[0]]['model'] for key, method in best_methods.items()}

# 포트폴리오 생성
best_portfolios = {}
dname = '../../quant/portfolios'
for key, model in best_models.items():
    pf = y.copy()
    
    y_pred = model.predict(X)
    y_pred_proba = model.predict_proba(X)[:, 1]
    
    pf['pred'] = y_pred
    pf['proba1'] = y_pred_proba

    pf = pd.merge(
        info[['회사명', '회계년도']], pf,
        left_index=True, right_index=True,
        how='left'
    )

    best_portfolios[key] = pf

    # 파일로 저장
    pf.to_csv(dname+'/'+fname+'_'+key+'_'+ best_methods[key][0] + '_' + best_methods[key][1]+'.csv')

best_rtns = {}
dname = '../../quant/rtns'
for key, pf in best_portfolios.items():
    print(key)
    
    # 1로 예측된 기업만 투자
    pf = pf[pf['pred']==1]
    pf['year'] = pd.to_datetime(pf['회계년도']).dt.year
    
    rtn = []
    for year in pf['year'].unique():
        tickers = pf[pf['year']==year].index

        print(f'{year}년, 투자종목수:{len(tickers)}')

        df_price = pd.DataFrame()

        # 개별 종목 종가 로드 및 결합
        for ticker in tickers:
            price = stock.get_market_ohlcv(f'{year+1}0401', f'{year+2}0331', ticker)[['종가']]
            price.rename(columns = {'종가' : ticker}, inplace=True)

            # index : datetime
            df_price = pd.merge(
                df_price, price,
                left_index=True, right_index=True,
                how='outer'
            )

        # 포트폴리오 종가 계산
        df_price['sum'] = df_price.sum(axis=1)
        try:
            df_price['수익률'] = df_price['sum'].pct_change().fillna(0)
            df_price['year'] = year
            rtn.append(df_price[['year', 'sum', '수익률']])
        except:
            print(f'{year}년도는 투자 X')
            continue

    rtn = pd.concat(rtn, axis=0)
    rtn = rtn.sort_index()
    best_rtns[key] = rtn

    # 파일로 저장
    rtn.to_csv(dname+'/'+fname+'_'+key+'_'+ best_methods[key][0] + '_' + best_methods[key][1] +'.csv')


f1
2012년, 투자종목수:133


2013년, 투자종목수:61
2015년, 투자종목수:41
2021년, 투자종목수:27
2020년, 투자종목수:47
2017년, 투자종목수:33
2019년, 투자종목수:40
2014년, 투자종목수:55
2018년, 투자종목수:48
2016년, 투자종목수:32
prec
2012년, 투자종목수:131
2013년, 투자종목수:61
2015년, 투자종목수:43
2021년, 투자종목수:24
2020년, 투자종목수:47
2017년, 투자종목수:33
2019년, 투자종목수:42
2014년, 투자종목수:57
2018년, 투자종목수:48
2016년, 투자종목수:32


In [212]:
dname + '/' + fname + '_' + key + '_' + best_methods[key][0] + '_' + best_methods[key][1]+'.csv'

'../../quant/rtns/m1_2019_2021_resampling_prec_rf_over.csv'

In [213]:
X_TRAIN.shape, y_TRAIN.value_counts()

((1187, 10),
 label 
 0.0000    778
 1.0000    409
 Name: count, dtype: int64)

In [214]:
X_rus.shape, y_rus.value_counts()

((818, 10),
 label 
 0.0000    409
 1.0000    409
 Name: count, dtype: int64)

In [215]:
X_ros.shape, y_ros.value_counts()

((1556, 10),
 label 
 0.0000    778
 1.0000    778
 Name: count, dtype: int64)

In [216]:
X_smote.shape, y_smote.value_counts()

((1556, 10),
 label 
 0.0000    778
 1.0000    778
 Name: count, dtype: int64)

In [217]:
X_ada.shape, y_ada.value_counts()

((1600, 10),
 label 
 1.0000    822
 0.0000    778
 Name: count, dtype: int64)

In [219]:
822 / 1600

0.51375

In [220]:
778 / 1600

0.48625

In [222]:
0.514 + 0.486

1.0

In [218]:
X_smtk.shape, y_smtk.value_counts()

((1514, 10),
 label 
 1.0000    778
 0.0000    736
 Name: count, dtype: int64)

In [223]:
778 / 1514

0.5138705416116248

In [224]:
736 / 1514

0.4861294583883752

In [225]:
51.4 + 49.6

101.0