### 1. 필요한 라이브러리 호출 및 시각화 설정

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE

# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings raised by later
# cells (e.g. from scikit-learn) will not be shown either.
import warnings
warnings.filterwarnings('ignore')

# Plot settings
# 'Malgun Gothic' is presumably chosen so Korean labels render; the font must be
# installed on the machine running the notebook -- TODO confirm.
plt.rc('font', family='Malgun Gothic')
# Turn off matplotlib's Unicode minus sign for axis tick labels.
plt.rc('axes', unicode_minus=False)
%config InlineBackend.figure_format = 'retina'


### 2. 데이터셋 불러오기

In [2]:
# Read the training and hold-out feature tables; in both files the first CSV
# column is used as the row index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../DATA/half_features.csv', "../DATA/test_features.csv")
)

### 3. 스케일링 및 데이터셋 분리

In [3]:
# Split each table into predictors (X) and the 'VKOSPI_Label' target (y).
# y is kept as a single-column DataFrame, exactly as before.
X_train, y_train = train_df.drop(columns='VKOSPI_Label'), train_df[['VKOSPI_Label']]
X_val, y_val = test_df.drop(columns='VKOSPI_Label'), test_df[['VKOSPI_Label']]

In [4]:
# Standardization
# Columns that are scaled and carried forward as model inputs.
features_to_standardize = ['시장베이시스', '이론베이시스', '괴리율', '원위안 야간변동율', '원엔 야간변동율', 'KOSPI 전날변동율',
       'KOSPI 야간변동율', 'KS200 전날변동율', 'KS200 야간변동율', 'NAS 야간변동율', 'NAS 당일변동율',
       'NAS 당일변화량', 'P/C Ratio', 'CALL_vol_change(%)', 'PUT_vol_change(%)',
       'CALL_vol_fluc', 'PUT_vol_fluc', 'S&P 야간변동율', 'S&P 당일변동율', 'S&P 당일변화량',
       '원달러 야간변동율', 'VIX 전날변동율', 'VIX 당일변동율', 'VIX 당일변화량', 'JNIV 종가변동율',
       'JNIV 전날변동율', 'JNIV 전날변화량', 'CD 전날변동율', 'CD 전날변화량']
scaler_standardize = StandardScaler()

# The scaler is fitted on the training rows only and then applied unchanged to
# the validation rows. StandardScaler returns bare arrays, so each frame is
# rebuilt with its original column names AND its original row index; without
# the index the rows would be relabelled 0..n-1 and no longer line up with
# y_train / y_val / train_df.
X_train = pd.DataFrame(
    scaler_standardize.fit_transform(X_train[features_to_standardize]),
    columns=features_to_standardize,
    index=X_train.index,
)
X_val = pd.DataFrame(
    scaler_standardize.transform(X_val[features_to_standardize]),
    columns=features_to_standardize,
    index=X_val.index,
)

### 4. 피처 셀렉션
* 로지스틱 회귀 기반 피처 셀렉션 (`SelectFromModel`)
* 라쏘 (L1 규제 로지스틱 회귀)

In [5]:
# Feature selection driven by a logistic-regression model
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
import statsmodels.api as sm
import numpy as np

lr_clf = LogisticRegression()

# SelectFromModel fits its own clone of the estimator it is given, so lr_clf
# itself is left unfitted. With no explicit threshold the selector applies its
# default importance cut-off.
logit = SelectFromModel(lr_clf)
# y_train is a one-column DataFrame; flatten it so the estimator receives the
# 1-D target it expects instead of a column vector.
logit.fit(X_train, y_train.values.ravel())

# Boolean mask over X_train's columns -> names of the retained features.
logit_support = logit.get_support()
lr_feature = X_train.columns[logit_support].tolist()

In [6]:
# Show the selected feature names and, on the next line, how many there are.
print(lr_feature, len(lr_feature), sep='\n')

['원위안 야간변동율', 'NAS 야간변동율', 'NAS 당일변동율', 'CALL_vol_change(%)', 'PUT_vol_change(%)', 'CALL_vol_fluc', 'S&P 야간변동율', 'S&P 당일변동율', 'S&P 당일변화량', '원달러 야간변동율', 'VIX 당일변동율', 'CD 전날변화량']
12


In [7]:
# Hard-coded list of 12 feature names; it overrides whatever `lr_feature` held
# before this cell ran.
# NOTE(review): presumably pinned from an earlier selection run for
# reproducibility -- confirm.
lr_feature = [
    '원위안 야간변동율',
    'NAS 야간변동율',
    'NAS 당일변동율',
    'CALL_vol_change(%)',
    'PUT_vol_change(%)',
    'CALL_vol_fluc',
    'S&P 야간변동율',
    'S&P 당일변동율',
    'S&P 당일변화량',
    '원달러 야간변동율',
    'VIX 당일변동율',
    'CD 전날변화량',
]

In [8]:
# Lasso: grid search over the strength of an L1-penalised logistic regression
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import warnings
warnings.simplefilter('ignore')

lasso_model = LogisticRegression()
# Only C varies; penalty and solver are fixed to the L1 / liblinear pair.
param_grid = {'penalty' : ['l1'],
                'C' : [0.001, 0.01, 0.1, 1, 2, 5, 10],
                'solver' : ['liblinear']}

grid_search = GridSearchCV(lasso_model, param_grid=param_grid, return_train_score=True, cv=5)
# y_train is a one-column DataFrame; pass it flattened to 1-D.
grid_search.fit(X_train, y_train.values.ravel())

# Per-candidate CV results, best-ranked first.
df = pd.DataFrame(grid_search.cv_results_)
df = df.sort_values(by=['rank_test_score'], ascending=True)

print('GridSearchCV 최적 파라미터:', grid_search.best_params_)
print('GridSearchCV 최고 정확도:{0:.4f}'.format(grid_search.best_score_))

# Kept as the LAST expression of the cell so the notebook actually renders the
# comparison table (a bare expression in the middle of a cell shows nothing).
df[['params', 'mean_train_score', 'mean_test_score', 'rank_test_score']]

GridSearchCV 최적 파라미터: {'C': 0.001, 'penalty': 'l1', 'solver': 'liblinear'}
GridSearchCV 최고 정확도:0.7107


In [9]:
# NOTE(review): C=5 is hard-coded here, while the grid search output shown
# above reports C=0.001 as the best value. Presumably a weaker penalty was
# chosen on purpose so that some coefficients stay non-zero -- confirm.
# y_train is a one-column DataFrame; pass it flattened to 1-D.
lasso_best = LogisticRegression(C=5, penalty='l1', solver='liblinear').fit(X_train, y_train.values.ravel())

# One row per input column with its fitted coefficient (coef_[0] is the single
# coefficient row used by the original code).
df_lasso = pd.DataFrame({'feature': X_train.columns, 'coef': lasso_best.coef_[0]})
# Keep only features whose coefficient was not shrunk to exactly zero. A
# boolean filter builds a new frame instead of mutating in place, and the
# original positional row labels are retained.
df_lasso = df_lasso[df_lasso['coef'] != 0]
df_lasso

Unnamed: 0,feature,coef
0,시장베이시스,0.293376
2,괴리율,-0.182068
3,원위안 야간변동율,-0.965486
4,원엔 야간변동율,-0.256034
5,KOSPI 전날변동율,-0.067949
7,KS200 전날변동율,0.145523
8,KS200 야간변동율,0.178374
9,NAS 야간변동율,-0.058951
11,NAS 당일변화량,0.153084
12,P/C Ratio,0.17199


In [10]:
# Names of the features the lasso kept (rows remaining in df_lasso)
lasso = df_lasso['feature'].tolist()
print(
    'Lasso에서 선택된 피처 수 {0:1.0f}'.format(len(df_lasso)),
    '개',
)
lasso

Lasso에서 선택된 피처 수 25 개


['시장베이시스',
 '괴리율',
 '원위안 야간변동율',
 '원엔 야간변동율',
 'KOSPI 전날변동율',
 'KS200 전날변동율',
 'KS200 야간변동율',
 'NAS 야간변동율',
 'NAS 당일변화량',
 'P/C Ratio',
 'CALL_vol_change(%)',
 'PUT_vol_change(%)',
 'CALL_vol_fluc',
 'PUT_vol_fluc',
 'S&P 야간변동율',
 'S&P 당일변동율',
 'S&P 당일변화량',
 '원달러 야간변동율',
 'VIX 전날변동율',
 'VIX 당일변동율',
 'VIX 당일변화량',
 'JNIV 종가변동율',
 'JNIV 전날변동율',
 'JNIV 전날변화량',
 'CD 전날변화량']

In [11]:
# Every candidate column, plus list copies of the two selectors' picks.
list_col_all = X_train.columns
list_logistic, list_lasso = list(lr_feature), list(lasso)

In [12]:
def func_logistic(x):
    """Return 1 if ``x`` is in the module-level ``list_logistic``, else 0."""
    return 1 if x in list_logistic else 0

    
def func_lasso(x):
    """Return 1 if ``x`` is in the module-level ``list_lasso``, else 0."""
    return 1 if x in list_lasso else 0

In [13]:
# Features picked twice, i.e. by both the logistic selector and the lasso

# One row per candidate column with a 0/1 vote from each selector.
feature_counts = pd.DataFrame({'Feature': list_col_all})
feature_counts['logistic'] = [func_logistic(col) for col in list_col_all]
feature_counts['lasso'] = [func_lasso(col) for col in list_col_all]
feature_counts["total"] = feature_counts[["logistic", "lasso"]].sum(axis=1)

# Keep the columns that collected at least two votes.
feature_final = feature_counts.loc[feature_counts["total"] >= 2]
list_feature_final = feature_final["Feature"].tolist()
print("선택된 피쳐수 :", len(list_feature_final))
feature_final['Feature'].to_list()

선택된 피쳐수 : 11


['원위안 야간변동율',
 'NAS 야간변동율',
 'CALL_vol_change(%)',
 'PUT_vol_change(%)',
 'CALL_vol_fluc',
 'S&P 야간변동율',
 'S&P 당일변동율',
 'S&P 당일변화량',
 '원달러 야간변동율',
 'VIX 당일변동율',
 'CD 전날변화량']

In [14]:
# Hard-coded list of the 11 feature names used for modelling.
# NOTE(review): this looks like a pinned copy of the list displayed by the
# previous cell -- confirm it is meant to stay fixed.
selected_feature = [
    '원위안 야간변동율', 'NAS 야간변동율',
    'CALL_vol_change(%)', 'PUT_vol_change(%)', 'CALL_vol_fluc',
    'S&P 야간변동율', 'S&P 당일변동율', 'S&P 당일변화량',
    '원달러 야간변동율', 'VIX 당일변동율', 'CD 전날변화량',
]

### 5. 오버샘플링 및 모델링

In [15]:
# Restrict both design matrices to the chosen features. The names come from
# `selected_feature` (defined in the preceding cell) rather than from two more
# pasted copies of the same 11-item list, so the train and validation columns
# cannot drift apart from each other or from that list.
X_train = X_train[selected_feature]
X_val = X_val[selected_feature]

#### 5-1. SMOTE 오버샘플링 및 모델 학습·평가

In [16]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Oversample the training data with SMOTE; y_train is flattened to a 1-D array
# first. The validation data is not resampled.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train.values.ravel())

# Two classifiers with default hyper-parameters, keyed by display name.
svc_model = SVC()
logreg_model = LogisticRegression()
models = {'SVC': svc_model, 'Logistic Regression': logreg_model}

# One [name, accuracy, precision, recall, f1] row per model.
results = []

for name, model in models.items():
    # Train on the oversampled data, score on the untouched validation set.
    model.fit(X_train_res, y_train_res)
    predictions = model.predict(X_val)

    accuracy, precision, recall, f1 = (
        accuracy_score(y_val, predictions),
        precision_score(y_val, predictions, average='binary'),
        recall_score(y_val, predictions, average='binary'),
        f1_score(y_val, predictions, average='binary'),
    )
    results.append([name, accuracy, precision, recall, f1])

    # Store this model's predictions for the original (non-resampled) training
    # rows as a new '<name>_pred' column on train_df.
    train_df[name + '_pred'] = model.predict(X_train)

# Summary table of the validation metrics
results_df = pd.DataFrame(
    results,
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'],
)
print(results_df)


                 Model  Accuracy  Precision  Recall  F1 Score
0                  SVC  0.684211       0.75   0.375  0.500000
1  Logistic Regression  0.631579       1.00   0.125  0.222222


In [17]:
# Display train_df; the modelling loop in the previous cell added one
# '<model name>_pred' column per model holding its training-set predictions.
train_df

Unnamed: 0,시장베이시스,이론베이시스,괴리율,원위안 야간변동율,원엔 야간변동율,KOSPI 전날변동율,KOSPI 야간변동율,KS200 전날변동율,KS200 야간변동율,NAS 야간변동율,...,VIX 당일변동율,VIX 당일변화량,JNIV 종가변동율,JNIV 전날변동율,JNIV 전날변화량,CD 전날변동율,CD 전날변화량,VKOSPI_Label,SVC_pred,Logistic Regression_pred
2023-04-10,1.49,1.97,-0.15,0.037,0.029,1.27,0.051,1.74,-0.028,0.76,...,0.000,0.00,0.57,1.257,0.46,-1.397,0.05,1.0,1.0,0.0
2023-04-11,1.57,1.86,-0.09,0.042,0.038,0.87,0.448,0.93,0.496,-0.03,...,-2.166,1.12,-2.37,0.698,0.47,-0.567,0.02,0.0,0.0,1.0
2023-04-12,1.28,1.83,-0.17,0.036,0.051,1.42,-0.059,1.38,0.003,-0.43,...,0.105,0.72,-0.92,2.758,0.74,-0.855,0.03,1.0,1.0,1.0
2023-04-13,1.36,1.79,-0.13,0.062,0.044,0.11,-0.625,0.37,-0.611,-0.85,...,-1.496,1.73,-1.40,-0.118,0.27,-1.437,0.05,0.0,0.0,0.0
2023-04-14,1.21,1.75,-0.16,0.058,0.054,0.43,0.720,0.30,0.700,1.99,...,-5.470,1.29,-0.77,-1.411,0.65,0.000,0.00,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-26,1.52,1.96,-0.13,0.137,0.090,-0.49,-0.085,-0.13,-0.021,0.45,...,-2.029,1.62,-2.30,0.848,0.62,0.000,0.00,1.0,0.0,0.0
2023-09-27,1.45,2.71,-0.38,0.124,0.112,-1.31,-0.608,-1.41,-0.566,-1.57,...,5.047,2.33,-1.23,1.616,0.99,0.000,0.00,0.0,0.0,0.0
2023-10-04,2.29,2.67,-0.12,0.075,0.078,0.09,-1.188,-0.06,-1.255,-1.87,...,11.061,2.96,2.61,-3.772,0.90,0.000,0.00,1.0,1.0,1.0
2023-10-05,0.65,2.38,-0.54,0.059,0.056,-2.41,0.734,-2.22,0.629,1.35,...,-10.328,2.58,7.52,-8.517,1.43,0.261,-0.01,1.0,0.0,0.0


In [18]:
from sklearn.base import clone
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Oversample the training data with SMOTE; y_train is flattened to a 1-D array
# first.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train.values.ravel())

# NOTE(review): the grid search below cross-validates on data that has ALREADY
# been oversampled, so synthetic points generated from a sample can end up in a
# different fold than that sample and the CV accuracy is likely optimistic.
# Running SMOTE inside an imblearn Pipeline would avoid this; it is left as-is
# here so the reported numbers stay comparable -- confirm whether to change.

# Hyper-parameter grids per model
param_grid_svc = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.1, 1, 'scale']
}

param_grid_logreg = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

# Display name -> [unfitted estimator, its parameter grid]
models = {
    'SVC': [SVC(), param_grid_svc],
    'Logistic Regression': [LogisticRegression(), param_grid_logreg]
}

# One [name, accuracy, precision, recall, f1] row per model.
results = []

# Tune each model with 5-fold GridSearchCV, then evaluate on the validation set
for name, (model, params) in models.items():
    clf = GridSearchCV(model, params, scoring='accuracy', cv=5)
    clf.fit(X_train_res, y_train_res)

    # Report the winning hyper-parameters
    print(f"Best parameters for {name}: {clf.best_params_}")

    # GridSearchCV refits the best parameter set on the full data passed to
    # fit() by default (refit=True), so the fitted winner is available directly
    # and does not have to be cloned and trained a second time.
    best_model = clf.best_estimator_

    # Predict the validation set
    predictions = best_model.predict(X_val)

    # Validation metrics. zero_division=0 states explicitly that an undefined
    # score (e.g. precision when no positive is predicted) is reported as 0.0;
    # this is the value the default setting yields too, but without depending
    # on a warning that the notebook suppresses globally.
    accuracy = accuracy_score(y_val, predictions)
    precision = precision_score(y_val, predictions, average='binary', zero_division=0)
    recall = recall_score(y_val, predictions, average='binary', zero_division=0)
    f1 = f1_score(y_val, predictions, average='binary', zero_division=0)

    # Collect the row
    results.append([name, accuracy, precision, recall, f1])

    # Store the tuned model's predictions for the original (non-resampled)
    # training rows in the '<name>_pred' column of train_df.
    train_df[name + '_pred'] = best_model.predict(X_train)

# Summary table of the validation metrics
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
print(results_df)


Best parameters for SVC: {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
Best parameters for Logistic Regression: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
                 Model  Accuracy  Precision  Recall  F1 Score
0                  SVC  0.578947        0.0   0.000  0.000000
1  Logistic Regression  0.631579        1.0   0.125  0.222222


In [19]:
# Rebuild the metrics table from the `results` rows and print it.
results_df = pd.DataFrame(
    data=results,
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'],
)
print(results_df)

                 Model  Accuracy  Precision  Recall  F1 Score
0                  SVC  0.578947        0.0   0.000  0.000000
1  Logistic Regression  0.631579        1.0   0.125  0.222222


In [20]:
# Display train_df; the tuning loop in the earlier cell wrote each tuned
# model's training-set predictions into its '<model name>_pred' column.
train_df

Unnamed: 0,시장베이시스,이론베이시스,괴리율,원위안 야간변동율,원엔 야간변동율,KOSPI 전날변동율,KOSPI 야간변동율,KS200 전날변동율,KS200 야간변동율,NAS 야간변동율,...,VIX 당일변동율,VIX 당일변화량,JNIV 종가변동율,JNIV 전날변동율,JNIV 전날변화량,CD 전날변동율,CD 전날변화량,VKOSPI_Label,SVC_pred,Logistic Regression_pred
2023-04-10,1.49,1.97,-0.15,0.037,0.029,1.27,0.051,1.74,-0.028,0.76,...,0.000,0.00,0.57,1.257,0.46,-1.397,0.05,1.0,1.0,0.0
2023-04-11,1.57,1.86,-0.09,0.042,0.038,0.87,0.448,0.93,0.496,-0.03,...,-2.166,1.12,-2.37,0.698,0.47,-0.567,0.02,0.0,0.0,1.0
2023-04-12,1.28,1.83,-0.17,0.036,0.051,1.42,-0.059,1.38,0.003,-0.43,...,0.105,0.72,-0.92,2.758,0.74,-0.855,0.03,1.0,1.0,1.0
2023-04-13,1.36,1.79,-0.13,0.062,0.044,0.11,-0.625,0.37,-0.611,-0.85,...,-1.496,1.73,-1.40,-0.118,0.27,-1.437,0.05,0.0,0.0,0.0
2023-04-14,1.21,1.75,-0.16,0.058,0.054,0.43,0.720,0.30,0.700,1.99,...,-5.470,1.29,-0.77,-1.411,0.65,0.000,0.00,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-26,1.52,1.96,-0.13,0.137,0.090,-0.49,-0.085,-0.13,-0.021,0.45,...,-2.029,1.62,-2.30,0.848,0.62,0.000,0.00,1.0,1.0,0.0
2023-09-27,1.45,2.71,-0.38,0.124,0.112,-1.31,-0.608,-1.41,-0.566,-1.57,...,5.047,2.33,-1.23,1.616,0.99,0.000,0.00,0.0,0.0,0.0
2023-10-04,2.29,2.67,-0.12,0.075,0.078,0.09,-1.188,-0.06,-1.255,-1.87,...,11.061,2.96,2.61,-3.772,0.90,0.000,0.00,1.0,1.0,1.0
2023-10-05,0.65,2.38,-0.54,0.059,0.056,-2.41,0.734,-2.22,0.629,1.35,...,-10.328,2.58,7.52,-8.517,1.43,0.261,-0.01,1.0,1.0,0.0
