### 1. 필요한 라이브러리 호출 및 시각화 설정

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE

# 경고창 제거
import warnings
warnings.filterwarnings('ignore')

# 시각화 설정
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)
%config InlineBackend.figure_format = 'retina'


### 2. 데이터셋 불러오기

In [2]:
train_df = pd.read_csv('../DATA/quater_features.csv', index_col=0)
test_df = pd.read_csv("../DATA/test_features.csv", index_col=0)

### 3. 데이터셋 분리

In [3]:
# train/test셋 분리
X_train = train_df.drop(columns='VKOSPI_Label',axis=1)
y_train = train_df[['VKOSPI_Label']]
X_val = test_df.drop(columns='VKOSPI_Label',axis=1)
y_val = test_df[['VKOSPI_Label']]

### 4. 피처셀렉
* 로지스틱 기반 피처셀렉
* 라쏘

In [4]:
# # 로지스틱 기반 피처 셀렉
# from sklearn.linear_model import LogisticRegression
# from sklearn.feature_selection import SelectFromModel
# import statsmodels.api as sm
# import numpy as np
# lr_clf = LogisticRegression()


# logit = SelectFromModel(LogisticRegression())
# logit.fit(X_train, y_train)
# logit_support = logit.get_support()
# lr_feature = X_train.loc[:,logit_support].columns.tolist()

In [5]:
# print(lr_feature)
# print(len(lr_feature))

In [6]:
# lr_feature=[['NAS 당일변화량', 'S&P 당일변화량']]

In [7]:
# # 라쏘
# import numpy as np
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import r2_score, mean_squared_error
# import warnings
# warnings.simplefilter('ignore')

# lasso_model = LogisticRegression()
# param_grid = {'penalty' : ['l1'], 
#                 'C' : [0.001, 0.01, 0.1, 1, 2, 5, 10],
#                 'solver' : ['liblinear']}

# grid_search = GridSearchCV(lasso_model, param_grid=param_grid, return_train_score=True, cv=5)
# grid_search.fit(X_train, y_train)

# df = pd.DataFrame(grid_search.cv_results_)
# df = df.sort_values(by=['rank_test_score'], ascending=True)
# df[['params', 'mean_train_score', 'mean_test_score', 'rank_test_score']]
# print('GridSearchCV 최적 파라미터:', grid_search.best_params_)
# print('GridSearchCV 최고 정확도:{0:.4f}'.format(grid_search.best_score_))

In [8]:
# lasso_best = LogisticRegression(C=5, penalty='l1', solver='liblinear').fit(X_train, y_train)

# df_lasso = pd.DataFrame()
# df_lasso['feature'] = X_train.columns
# df_lasso['coef'] = lasso_best.coef_[0]
# df_lasso.drop(df_lasso[df_lasso['coef']==0].index, inplace=True)
# df_lasso

In [9]:
# # 라쏘에서 선택된 피처
# lasso = df_lasso['feature'].values.tolist()
# print('Lasso에서 선택된 피처 수 {0:1.0f}'.format(len(df_lasso)), '개')
# lasso

In [10]:
# list_logistic = list(lr_feature)
# list_lasso = list(lasso)
# list_col_all = X_train.columns

In [11]:
# def func_logistic(x):
#     if x in list_logistic:
#         return 1
#     else:
#         return 0

    
# def func_lasso(x):
#     if x in list_lasso:
#         return 1
#     else:
#         return 0

In [12]:
# # 2번 선택된 Feature

# feature_counts = pd.DataFrame()
# feature_counts['Feature'] = list_col_all
# feature_counts['logistic'] = list_col_all.map(func_logistic)
# feature_counts['lasso'] = list_col_all.map(func_lasso)

# feature_counts["total"] = feature_counts["logistic"]+feature_counts['lasso']
# feature_final = feature_counts[feature_counts["total"]>=2]
# list_feature_final = list(feature_final["Feature"])
# print("선택된 피쳐수 :", len(list_feature_final))
# feature_final['Feature'].to_list()

In [13]:
# 스탠다드로 선택된 피처
selected_feature = ['시장베이시스',
 '원위안 야간변동율',
 'KOSPI 전날변동율',
 'KS200 야간변동율',
 'NAS 당일변동율',
 'CALL_vol_change(%)',
 'CALL_vol_fluc',
 'S&P 야간변동율',
 'S&P 당일변화량',
 'VIX 당일변화량',
 'JNIV 종가변동율',
 'JNIV 전날변동율',
 'JNIV 전날변화량',
 'CD 전날변동율']

### 5. 오버샘플링 및 모델링

In [14]:
X_train = X_train[['시장베이시스',
 '원위안 야간변동율',
 'KOSPI 전날변동율',
 'KS200 야간변동율',
 'NAS 당일변동율',
 'CALL_vol_change(%)',
 'CALL_vol_fluc',
 'S&P 야간변동율',
 'S&P 당일변화량',
 'VIX 당일변화량',
 'JNIV 종가변동율',
 'JNIV 전날변동율',
 'JNIV 전날변화량',
 'CD 전날변동율']]
X_val = X_val[['시장베이시스',
 '원위안 야간변동율',
 'KOSPI 전날변동율',
 'KS200 야간변동율',
 'NAS 당일변동율',
 'CALL_vol_change(%)',
 'CALL_vol_fluc',
 'S&P 야간변동율',
 'S&P 당일변화량',
 'VIX 당일변화량',
 'JNIV 종가변동율',
 'JNIV 전날변동율',
 'JNIV 전날변화량',
 'CD 전날변동율']]

### 모델링

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# SMOTE를 사용하여 데이터 오버샘플링
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train.values.ravel())

# 모델 초기화
decision_tree_model = DecisionTreeClassifier(random_state=42)
random_forest_model = RandomForestClassifier(random_state=42)
xgb_model = XGBClassifier(random_state=42)
lgbm_model = LGBMClassifier(random_state=42)

# 모델 목록
models = {
    'Decision Tree': decision_tree_model,
    'Random Forest': random_forest_model,
    'XGBoost': xgb_model,
    'LightGBM': lgbm_model
}

# 결과를 저장할 리스트 초기화
results = []

# 각 모델에 대해 학습 및 평가
for name, model in models.items():
    # 모델 학습
    model.fit(X_train_res, y_train_res)
    
    # 검증 데이터셋에 대한 예측
    predictions = model.predict(X_val)
    
    # 성능 지표 계산
    accuracy = accuracy_score(y_val, predictions)
    precision = precision_score(y_val, predictions, average='binary')
    recall = recall_score(y_val, predictions, average='binary')
    f1 = f1_score(y_val, predictions, average='binary')
    
    # 결과 저장
    results.append([name, accuracy, precision, recall, f1])
    
    # train 데이터셋에 대한 예측 결과를 저장
    train_df[name + '_pred'] = model.predict(X_train)

# 결과 DataFrame 생성 및 출력
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
print(results_df)

[LightGBM] [Info] Number of positive: 55, number of negative: 55
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000266 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 491
[LightGBM] [Info] Number of data points in the train set: 110, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
           Model  Accuracy  Precision  Recall  F1 Score
0  Decision Tree  0.473684   0.333333   0.250  0.285714
1  Random Forest  0.526316   0.333333   0.125  0.181818
2        XGBoost  0.368421   0.000000   0.000  0.000000
3       LightGBM  0.421053   0.200000   0.125  0.153846


In [16]:
train_df

Unnamed: 0,시장베이시스,이론베이시스,괴리율,원위안 야간변동율,원엔 야간변동율,KOSPI 전날변동율,KOSPI 야간변동율,KS200 전날변동율,KS200 야간변동율,NAS 야간변동율,...,JNIV 종가변동율,JNIV 전날변동율,JNIV 전날변화량,CD 전날변동율,CD 전날변화량,VKOSPI_Label,Decision Tree_pred,Random Forest_pred,XGBoost_pred,LightGBM_pred
2023-06-12,2.71,2.38,0.09,0.044,0.003,1.16,0.240,1.18,0.216,0.16,...,5.37,4.137,1.77,0.000,0.00,0.0,0.0,0.0,0,0.0
2023-06-13,2.08,2.25,-0.05,0.050,0.029,-0.45,0.574,-0.64,0.682,1.53,...,-6.78,1.362,0.91,0.000,0.00,0.0,0.0,0.0,0,0.0
2023-06-14,2.01,2.24,-0.07,0.045,0.037,0.33,0.108,0.53,0.029,0.83,...,-4.82,1.935,1.51,0.000,0.00,0.0,0.0,0.0,0,0.0
2023-06-15,1.75,2.18,-0.12,0.062,0.033,-0.72,0.388,-0.50,0.371,0.39,...,4.18,-1.924,1.18,0.000,0.00,0.0,0.0,0.0,0,0.0
2023-06-16,1.96,2.14,-0.05,0.050,0.060,-0.40,0.576,-0.42,0.554,1.15,...,2.71,1.758,1.59,0.000,0.00,0.0,0.0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-26,1.52,1.96,-0.13,0.137,0.090,-0.49,-0.085,-0.13,-0.021,0.45,...,-2.30,0.848,0.62,0.000,0.00,1.0,1.0,1.0,1,1.0
2023-09-27,1.45,2.71,-0.38,0.124,0.112,-1.31,-0.608,-1.41,-0.566,-1.57,...,-1.23,1.616,0.99,0.000,0.00,0.0,0.0,0.0,0,0.0
2023-10-04,2.29,2.67,-0.12,0.075,0.078,0.09,-1.188,-0.06,-1.255,-1.87,...,2.61,-3.772,0.90,0.000,0.00,1.0,1.0,1.0,1,1.0
2023-10-05,0.65,2.38,-0.54,0.059,0.056,-2.41,0.734,-2.22,0.629,1.35,...,7.52,-8.517,1.43,0.261,-0.01,1.0,1.0,1.0,1,1.0


In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.base import clone

# 데이터 오버샘플링
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train.values.ravel())

# 모델별 하이퍼파라미터 그리드 설정
param_grid_dtree = {
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

param_grid_rforest = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

param_grid_lgbm = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

# 모델 목록
models = {
    'Decision Tree': [DecisionTreeClassifier(random_state=42), param_grid_dtree],
    'Random Forest': [RandomForestClassifier(random_state=42), param_grid_rforest],
    'XGBoost': [XGBClassifier(random_state=42), param_grid_xgb],
    'LightGBM': [LGBMClassifier(random_state=42), param_grid_lgbm]
}

# 결과를 저장할 리스트 초기화
results = []

# 각 모델에 대해 GridSearchCV 실행 및 평가
for name, (model, params) in models.items():
    clf = GridSearchCV(model, params, scoring='accuracy', cv=5)
    clf.fit(X_train_res, y_train_res)
    
    # 최적의 하이퍼파라미터 출력
    print(f"Best parameters for {name}: {clf.best_params_}")
    
    # 최적의 하이퍼파라미터로 모델 생성
    best_model = clone(model)
    best_model.set_params(**clf.best_params_)
    best_model.fit(X_train_res, y_train_res)
    
    # 검증 데이터셋에 대한 예측
    predictions = best_model.predict(X_val)
    
    # 성능 지표 계산
    accuracy = accuracy_score(y_val, predictions)
    precision = precision_score(y_val, predictions, average='binary')
    recall = recall_score(y_val, predictions, average='binary')
    f1 = f1_score(y_val, predictions, average='binary')
    
    # 결과 저장
    results.append([name, accuracy, precision, recall, f1])
    
    # train 데이터셋에 대한 예측 결과를 저장
    train_df[name + '_pred'] = best_model.predict(X_train)

# 결과 DataFrame 생성 및 출력
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
print(results_df)


Best parameters for Decision Tree: {'max_depth': 10, 'min_samples_split': 2}
Best parameters for Random Forest: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}
Best parameters for XGBoost: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 300}
[LightGBM] [Info] Number of positive: 44, number of negative: 44
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000052 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 407
[LightGBM] [Info] Number of data points in the train set: 88, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 44, number of negative: 44
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000050 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 407
[LightGBM] [Info] Number of data points in the 

In [18]:
train_df

Unnamed: 0,시장베이시스,이론베이시스,괴리율,원위안 야간변동율,원엔 야간변동율,KOSPI 전날변동율,KOSPI 야간변동율,KS200 전날변동율,KS200 야간변동율,NAS 야간변동율,...,JNIV 종가변동율,JNIV 전날변동율,JNIV 전날변화량,CD 전날변동율,CD 전날변화량,VKOSPI_Label,Decision Tree_pred,Random Forest_pred,XGBoost_pred,LightGBM_pred
2023-06-12,2.71,2.38,0.09,0.044,0.003,1.16,0.240,1.18,0.216,0.16,...,5.37,4.137,1.77,0.000,0.00,0.0,0.0,0.0,0,0.0
2023-06-13,2.08,2.25,-0.05,0.050,0.029,-0.45,0.574,-0.64,0.682,1.53,...,-6.78,1.362,0.91,0.000,0.00,0.0,0.0,0.0,0,0.0
2023-06-14,2.01,2.24,-0.07,0.045,0.037,0.33,0.108,0.53,0.029,0.83,...,-4.82,1.935,1.51,0.000,0.00,0.0,0.0,0.0,0,0.0
2023-06-15,1.75,2.18,-0.12,0.062,0.033,-0.72,0.388,-0.50,0.371,0.39,...,4.18,-1.924,1.18,0.000,0.00,0.0,0.0,0.0,0,0.0
2023-06-16,1.96,2.14,-0.05,0.050,0.060,-0.40,0.576,-0.42,0.554,1.15,...,2.71,1.758,1.59,0.000,0.00,0.0,0.0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-26,1.52,1.96,-0.13,0.137,0.090,-0.49,-0.085,-0.13,-0.021,0.45,...,-2.30,0.848,0.62,0.000,0.00,1.0,1.0,1.0,1,1.0
2023-09-27,1.45,2.71,-0.38,0.124,0.112,-1.31,-0.608,-1.41,-0.566,-1.57,...,-1.23,1.616,0.99,0.000,0.00,0.0,0.0,0.0,0,0.0
2023-10-04,2.29,2.67,-0.12,0.075,0.078,0.09,-1.188,-0.06,-1.255,-1.87,...,2.61,-3.772,0.90,0.000,0.00,1.0,1.0,1.0,1,1.0
2023-10-05,0.65,2.38,-0.54,0.059,0.056,-2.41,0.734,-2.22,0.629,1.35,...,7.52,-8.517,1.43,0.261,-0.01,1.0,1.0,1.0,1,1.0
