### 1. 필요한 라이브러리 호출 및 시각화 설정

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE

# 경고창 제거
import warnings
warnings.filterwarnings('ignore')

# 시각화 설정
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)
%config InlineBackend.figure_format = 'retina'


### 2. 데이터셋 불러오기

In [2]:
train_df = pd.read_csv('../DATA/half_features.csv', index_col=0)
test_df = pd.read_csv("../DATA/test_features.csv", index_col=0)

### 3. 데이터셋 분리

In [3]:
# train/test셋 분리
X_train = train_df.drop(columns='VKOSPI_Label',axis=1)
y_train = train_df[['VKOSPI_Label']]
X_val = test_df.drop(columns='VKOSPI_Label',axis=1)
y_val = test_df[['VKOSPI_Label']]

### 4. 피처셀렉
* 도메인 + 상관관계
* Feature importance

In [5]:
# 도메인 지식과 상관관계에서 셀렉된 피처
X_train = X_train[['이론베이시스','시장베이시스','VIX 당일변화량', 'VIX 전날변동율', 'NAS 야간변동율', 'S&P 당일변화량', 'CALL_vol_change(%)','CALL_vol_fluc','KS200 전날변동율','KOSPI 전날변동율']]

# Feature_Importances
rfc = RandomForestClassifier(random_state=42)

rfc.fit(X_train, y_train)
rfc.feature_importances_

ftr_importances = pd.Series(rfc.feature_importances_, index=X_train.columns)
sorted_feature_importance = ftr_importances.sort_values(ascending=True)

data = sorted_feature_importance
importance = pd.DataFrame(data, columns=['feature importances'])
importance

Unnamed: 0,feature importances
CALL_vol_fluc,0.071446
시장베이시스,0.079136
KS200 전날변동율,0.080207
VIX 전날변동율,0.093128
CALL_vol_change(%),0.097792
NAS 야간변동율,0.101134
S&P 당일변화량,0.11275
KOSPI 전날변동율,0.114873
이론베이시스,0.117002
VIX 당일변화량,0.132531


* Feature importance를 통해 
1. 'VIX 당일변화량'과 'VIX 전날변동율' 중 중요도 낮은 피처 삭제 -> VIX 전날변동율 제거
2. '이론베이시스'와 '시장베이시스' 중 중요도 낮은 피처 삭제 -> 시장베이시스 제거
3. 중요도 값이 낮은 CALL_vol_fluc도 함께 삭제하기로 함.

selected_feature = ['이론베이시스','VIX 당일변화량', 'NAS 야간변동율', 'S&P 당일변화량','CALL_vol_change(%)','KS200 전날변동율','KOSPI 전날변동율']

In [9]:
X_train = X_train[['이론베이시스','VIX 당일변화량', 'NAS 야간변동율', 'S&P 당일변화량','CALL_vol_change(%)','KS200 전날변동율','KOSPI 전날변동율']]
X_val = X_val[['이론베이시스','VIX 당일변화량', 'NAS 야간변동율', 'S&P 당일변화량','CALL_vol_change(%)','KS200 전날변동율','KOSPI 전날변동율']]

### 5. 오버샘플링 및 모델링

### 모델링

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# SMOTE를 사용하여 데이터 오버샘플링
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train.values.ravel())

# 모델 초기화
decision_tree_model = DecisionTreeClassifier(random_state=42)
random_forest_model = RandomForestClassifier(random_state=42)
xgb_model = XGBClassifier(random_state=42)
lgbm_model = LGBMClassifier(random_state=42)

# 모델 목록
models = {
    'Decision Tree': decision_tree_model,
    'Random Forest': random_forest_model,
    'XGBoost': xgb_model,
    'LightGBM': lgbm_model
}

# 결과를 저장할 리스트 초기화
results = []

# 각 모델에 대해 학습 및 평가
for name, model in models.items():
    # 모델 학습
    model.fit(X_train_res, y_train_res)
    
    # 검증 데이터셋에 대한 예측
    predictions = model.predict(X_val)
    
    # 성능 지표 계산
    accuracy = accuracy_score(y_val, predictions)
    precision = precision_score(y_val, predictions, average='binary')
    recall = recall_score(y_val, predictions, average='binary')
    f1 = f1_score(y_val, predictions, average='binary')
    
    # 결과 저장
    results.append([name, accuracy, precision, recall, f1])
    
    # train 데이터셋에 대한 예측 결과를 저장
    train_df[name + '_pred'] = model.predict(X_train)

# 결과 DataFrame 생성 및 출력
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
print(results_df)

[LightGBM] [Info] Number of positive: 98, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000192 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 453
[LightGBM] [Info] Number of data points in the train set: 196, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
           Model  Accuracy  Precision  Recall  F1 Score
0  Decision Tree  0.578947   0.333333     0.6  0.428571
1  Random Forest  0.578947   0.384615     1.0  0.555556
2        XGBoost  0.684211   0.444444     0.8  0.571429
3       LightGBM  0.578947   0.333333     0.6  0.428571


In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.base import clone

# 데이터 오버샘플링
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train.values.ravel())

# 모델별 하이퍼파라미터 그리드 설정
param_grid_dtree = {
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

param_grid_rforest = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

param_grid_lgbm = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

# 모델 목록
models = {
    'Decision Tree': [DecisionTreeClassifier(random_state=42), param_grid_dtree],
    'Random Forest': [RandomForestClassifier(random_state=42), param_grid_rforest],
    'XGBoost': [XGBClassifier(random_state=42), param_grid_xgb],
    'LightGBM': [LGBMClassifier(random_state=42), param_grid_lgbm]
}

# 결과를 저장할 리스트 초기화
results = []

# 각 모델에 대해 GridSearchCV 실행 및 평가
for name, (model, params) in models.items():
    clf = GridSearchCV(model, params, scoring='accuracy', cv=5)
    clf.fit(X_train_res, y_train_res)
    
    # 최적의 하이퍼파라미터 출력
    print(f"Best parameters for {name}: {clf.best_params_}")
    
    # 최적의 하이퍼파라미터로 모델 생성
    best_model = clone(model)
    best_model.set_params(**clf.best_params_)
    best_model.fit(X_train_res, y_train_res)
    
    # 검증 데이터셋에 대한 예측
    predictions = best_model.predict(X_val)
    
    # 성능 지표 계산
    accuracy = accuracy_score(y_val, predictions)
    precision = precision_score(y_val, predictions, average='binary')
    recall = recall_score(y_val, predictions, average='binary')
    f1 = f1_score(y_val, predictions, average='binary')
    
    # 결과 저장
    results.append([name, accuracy, precision, recall, f1])
    
    # train 데이터셋에 대한 예측 결과를 저장
    train_df[name + '_pred'] = best_model.predict(X_train)

# 결과 DataFrame 생성 및 출력
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
print(results_df)


Best parameters for Decision Tree: {'max_depth': 10, 'min_samples_split': 2}
Best parameters for Random Forest: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
[LightGBM] [Info] Number of positive: 78, number of negative: 78
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000042 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 366
[LightGBM] [Info] Number of data points in the train set: 156, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 79, number of negative: 78
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000045 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 368
[LightGBM] [Info] Number of data points in the 

### 임계값 설정을 위한 혼동행렬

In [13]:
# 성능확인 코드
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score


import warnings

def get_clf_eval(MDA_y_test, pred):
    confusion = confusion_matrix(MDA_y_test, pred)
    accuracy = accuracy_score(MDA_y_test, pred)
    precision = precision_score(MDA_y_test, pred)
    recall = recall_score(MDA_y_test, pred)
    roc_score = roc_auc_score(MDA_y_test, pred)
    pr_score = average_precision_score(MDA_y_test, pred)
    f1 = f1_score(MDA_y_test, pred)
    print('오차행렬')
    print(confusion)
    print('정확도: {0:.4f}, 정밀도 : {1:.4f}, 재현율:{2:.4f},F1 스코어:{3:.4f}'.format(accuracy, precision, recall, f1, roc_score))
    print('ROC 스코어: {0:.4f}, PR score : {1:.4f}'.format(roc_score, pr_score))

In [14]:
# 임계값에 따른 오차행렬및 스코어 -------->#임계값 최적 : 재현율기준(0.1)/f1기준(0.3)
from sklearn.preprocessing import Binarizer
thresholds = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]

def get_eval_by_threshold(MDA_y_test, pred_proba_c1, thresholds):
    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_c1)
        custom_predict = binarizer.transform(pred_proba_c1)
        print("임곗값:", custom_threshold)
        get_clf_eval(MDA_y_test, custom_predict)
        print("---------------------------------------------------------")

In [19]:
decision_tree_model.fit(X_train, y_train)
lda_pred = decision_tree_model.predict(X_val)
lda_pred_proba = decision_tree_model.predict_proba(X_val)

get_clf_eval(y_val, lda_pred)
get_eval_by_threshold(y_val , lda_pred_proba[:,1].reshape(-1,1), thresholds)

오차행렬
[[5 9]
 [1 4]]
정확도: 0.4737, 정밀도 : 0.3077, 재현율:0.8000,F1 스코어:0.4444
ROC 스코어: 0.5786, PR score : 0.2988
임곗값: 0.1
오차행렬
[[5 9]
 [1 4]]
정확도: 0.4737, 정밀도 : 0.3077, 재현율:0.8000,F1 스코어:0.4444
ROC 스코어: 0.5786, PR score : 0.2988
---------------------------------------------------------
임곗값: 0.2
오차행렬
[[5 9]
 [1 4]]
정확도: 0.4737, 정밀도 : 0.3077, 재현율:0.8000,F1 스코어:0.4444
ROC 스코어: 0.5786, PR score : 0.2988
---------------------------------------------------------
임곗값: 0.3
오차행렬
[[5 9]
 [1 4]]
정확도: 0.4737, 정밀도 : 0.3077, 재현율:0.8000,F1 스코어:0.4444
ROC 스코어: 0.5786, PR score : 0.2988
---------------------------------------------------------
임곗값: 0.4
오차행렬
[[5 9]
 [1 4]]
정확도: 0.4737, 정밀도 : 0.3077, 재현율:0.8000,F1 스코어:0.4444
ROC 스코어: 0.5786, PR score : 0.2988
---------------------------------------------------------
임곗값: 0.5
오차행렬
[[5 9]
 [1 4]]
정확도: 0.4737, 정밀도 : 0.3077, 재현율:0.8000,F1 스코어:0.4444
ROC 스코어: 0.5786, PR score : 0.2988
---------------------------------------------------------
임곗값: 0.6
오차행렬
[[5 9]
 [

In [20]:
random_forest_model.fit(X_train, y_train)
lda_pred = random_forest_model.predict(X_val)
lda_pred_proba = random_forest_model.predict_proba(X_val)

get_clf_eval(y_val, lda_pred)
get_eval_by_threshold(y_val , lda_pred_proba[:,1].reshape(-1,1), thresholds)

오차행렬
[[14  0]
 [ 5  0]]
정확도: 0.7368, 정밀도 : 0.0000, 재현율:0.0000,F1 스코어:0.0000
ROC 스코어: 0.5000, PR score : 0.2632
임곗값: 0.1
오차행렬
[[ 0 14]
 [ 0  5]]
정확도: 0.2632, 정밀도 : 0.2632, 재현율:1.0000,F1 스코어:0.4167
ROC 스코어: 0.5000, PR score : 0.2632
---------------------------------------------------------
임곗값: 0.2
오차행렬
[[ 3 11]
 [ 0  5]]
정확도: 0.4211, 정밀도 : 0.3125, 재현율:1.0000,F1 스코어:0.4762
ROC 스코어: 0.6071, PR score : 0.3125
---------------------------------------------------------
임곗값: 0.3
오차행렬
[[7 7]
 [1 4]]
정확도: 0.5789, 정밀도 : 0.3636, 재현율:0.8000,F1 스코어:0.5000
ROC 스코어: 0.6500, PR score : 0.3435
---------------------------------------------------------
임곗값: 0.4
오차행렬
[[10  4]
 [ 5  0]]
정확도: 0.5263, 정밀도 : 0.0000, 재현율:0.0000,F1 스코어:0.0000
ROC 스코어: 0.3571, PR score : 0.2632
---------------------------------------------------------
임곗값: 0.5
오차행렬
[[14  0]
 [ 5  0]]
정확도: 0.7368, 정밀도 : 0.0000, 재현율:0.0000,F1 스코어:0.0000
ROC 스코어: 0.5000, PR score : 0.2632
---------------------------------------------------------
임곗값

In [21]:
lgbm_model.fit(X_train, y_train)
lda_pred = lgbm_model.predict(X_val)
lda_pred_proba = lgbm_model.predict_proba(X_val)

get_clf_eval(y_val, lda_pred)
get_eval_by_threshold(y_val , lda_pred_proba[:,1].reshape(-1,1), thresholds)

[LightGBM] [Info] Number of positive: 23, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000041 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 275
[LightGBM] [Info] Number of data points in the train set: 121, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190083 -> initscore=-1.449473
[LightGBM] [Info] Start training from score -1.449473
오차행렬
[[12  2]
 [ 5  0]]
정확도: 0.6316, 정밀도 : 0.0000, 재현율:0.0000,F1 스코어:0.0000
ROC 스코어: 0.4286, PR score : 0.2632
임곗값: 0.1
오차행렬
[[ 4 10]
 [ 2  3]]
정확도: 0.3684, 정밀도 : 0.2308, 재현율:0.6000,F1 스코어:0.3333
ROC 스코어: 0.4429, PR score : 0.2437
---------------------------------------------------------
임곗값: 0.2
오차행렬
[[8 6]
 [3 2]]
정확도: 0.5263, 정밀도 : 0.2500, 재현율:0.4000,F1 스코어:0.3077
ROC 스코어: 0.4857, PR score : 0.2579
---------------------------------------------------------
임곗값: 0.3
오차행렬
[[10  4]
 [ 3  2]]
정확도: 0.6316, 정밀도 : 0

In [22]:
xgb_model.fit(X_train, y_train)
lda_pred = xgb_model.predict(X_val)
lda_pred_proba = xgb_model.predict_proba(X_val)

get_clf_eval(y_val, lda_pred)
get_eval_by_threshold(y_val , lda_pred_proba[:,1].reshape(-1,1), thresholds)

오차행렬
[[13  1]
 [ 4  1]]
정확도: 0.7368, 정밀도 : 0.5000, 재현율:0.2000,F1 스코어:0.2857
ROC 스코어: 0.5643, PR score : 0.3105
임곗값: 0.1
오차행렬
[[6 8]
 [2 3]]
정확도: 0.4737, 정밀도 : 0.2727, 재현율:0.6000,F1 스코어:0.3750
ROC 스코어: 0.5143, PR score : 0.2689
---------------------------------------------------------
임곗값: 0.2
오차행렬
[[8 6]
 [3 2]]
정확도: 0.5263, 정밀도 : 0.2500, 재현율:0.4000,F1 스코어:0.3077
ROC 스코어: 0.4857, PR score : 0.2579
---------------------------------------------------------
임곗값: 0.3
오차행렬
[[9 5]
 [4 1]]
정확도: 0.5263, 정밀도 : 0.1667, 재현율:0.2000,F1 스코어:0.1818
ROC 스코어: 0.4214, PR score : 0.2439
---------------------------------------------------------
임곗값: 0.4
오차행렬
[[11  3]
 [ 4  1]]
정확도: 0.6316, 정밀도 : 0.2500, 재현율:0.2000,F1 스코어:0.2222
ROC 스코어: 0.4929, PR score : 0.2605
---------------------------------------------------------
임곗값: 0.5
오차행렬
[[13  1]
 [ 4  1]]
정확도: 0.7368, 정밀도 : 0.5000, 재현율:0.2000,F1 스코어:0.2857
ROC 스코어: 0.5643, PR score : 0.3105
---------------------------------------------------------
임곗값: 0.6
오차