### 1. 필요한 라이브러리 호출 및 시각화 설정

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE

# 경고창 제거
import warnings
warnings.filterwarnings('ignore')

# 시각화 설정
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)
%config InlineBackend.figure_format = 'retina'


### 2. 데이터셋 불러오기

In [2]:
train_df = pd.read_csv('../Data/quarter_features.csv', index_col=0)
test_df = pd.read_csv("../Data/test_features.csv", index_col=0)

### 3. 데이터셋 분리

In [3]:
# train/test셋 분리
X_train = train_df.drop(columns='VKOSPI_Label',axis=1)
y_train = train_df[['VKOSPI_Label']]
X_val = test_df.drop(columns='VKOSPI_Label',axis=1)
y_val = test_df[['VKOSPI_Label']]

* Feature importance를 통해 
1. 'VIX 당일변화량'과 'VIX 전날변동율' 중 중요도 낮은 피처 삭제 -> VIX 전날변동율 제거
2. '이론베이시스'와 '시장베이시스' 중 중요도 낮은 피처 삭제 -> 시장베이시스 제거
3. 중요도 값이 낮은 CALL_vol_fluc도 함께 삭제하기로 함.

selected_feature = ['이론베이시스','VIX 당일변화량', 'NAS 야간변동율', 'S&P 당일변화량','CALL_vol_change(%)','KS200 전날변동율','KOSPI 전날변동율']

In [4]:
X_train = X_train[['이론베이시스','VIX 당일변화량', 'NAS 야간변동율', 'S&P 당일변화량','CALL_vol_change(%)','KS200 전날변동율','KOSPI 전날변동율']]
X_val = X_val[['이론베이시스','VIX 당일변화량', 'NAS 야간변동율', 'S&P 당일변화량','CALL_vol_change(%)','KS200 전날변동율','KOSPI 전날변동율']]

### 5. 오버샘플링 및 모델링

### 모델링
1. 트리기반

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# SMOTE를 사용하여 데이터 오버샘플링
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train.values.ravel())

# 모델 초기화
decision_tree_model = DecisionTreeClassifier(random_state=42)
random_forest_model = RandomForestClassifier(random_state=42)
xgb_model = XGBClassifier(random_state=42)
lgbm_model = LGBMClassifier(random_state=42)

# 모델 목록
models = {
    'Decision Tree': decision_tree_model,
    'Random Forest': random_forest_model,
    'XGBoost': xgb_model,
    'LightGBM': lgbm_model
}

# 결과를 저장할 리스트 초기화
results = []

# 각 모델에 대해 학습 및 평가
for name, model in models.items():
    # 모델 학습
    model.fit(X_train_res, y_train_res)
    
    # 검증 데이터셋에 대한 예측
    predictions = model.predict(X_val)
    
    # 성능 지표 계산
    accuracy = accuracy_score(y_val, predictions)
    precision = precision_score(y_val, predictions, average='binary')
    recall = recall_score(y_val, predictions, average='binary')
    f1 = f1_score(y_val, predictions, average='binary')
    
    # 결과 저장
    results.append([name, accuracy, precision, recall, f1])
    
    # train 데이터셋에 대한 예측 결과를 저장
    train_df[name + '_pred'] = model.predict(X_train)

# 결과 DataFrame 생성 및 출력
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
print(results_df)

[LightGBM] [Info] Number of positive: 62, number of negative: 62
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000522 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 294
[LightGBM] [Info] Number of data points in the train set: 124, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
           Model  Accuracy  Precision  Recall  F1 Score
0  Decision Tree  0.684211   0.428571     0.6  0.500000
1  Random Forest  0.684211   0.400000     0.4  0.400000
2        XGBoost  0.684211   0.400000     0.4  0.400000
3       LightGBM  0.631579   0.375000     0.6  0.461538


In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.base import clone

# 데이터 오버샘플링
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train.values.ravel())

# 모델별 하이퍼파라미터 그리드 설정
param_grid_dtree = {
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

param_grid_rforest = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

param_grid_lgbm = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

# 모델 목록
models = {
    'Decision Tree': [DecisionTreeClassifier(random_state=42), param_grid_dtree],
    'Random Forest': [RandomForestClassifier(random_state=42), param_grid_rforest],
    'XGBoost': [XGBClassifier(random_state=42), param_grid_xgb],
    'LightGBM': [LGBMClassifier(random_state=42), param_grid_lgbm]
}

# 결과를 저장할 리스트 초기화
results = []

# 각 모델에 대해 GridSearchCV 실행 및 평가
for name, (model, params) in models.items():
    clf = GridSearchCV(model, params, scoring='accuracy', cv=5)
    clf.fit(X_train_res, y_train_res)
    
    # 최적의 하이퍼파라미터 출력
    print(f"Best parameters for {name}: {clf.best_params_}")
    
    # 최적의 하이퍼파라미터로 모델 생성
    best_model = clone(model)
    best_model.set_params(**clf.best_params_)
    best_model.fit(X_train_res, y_train_res)
    
    # 검증 데이터셋에 대한 예측
    predictions = best_model.predict(X_val)
    
    # 성능 지표 계산
    accuracy = accuracy_score(y_val, predictions)
    precision = precision_score(y_val, predictions, average='binary')
    recall = recall_score(y_val, predictions, average='binary')
    f1 = f1_score(y_val, predictions, average='binary')
    
    # 결과 저장
    results.append([name, accuracy, precision, recall, f1])
    
    # train 데이터셋에 대한 예측 결과를 저장
    train_df[name + '_pred'] = best_model.predict(X_train)

# 결과 DataFrame 생성 및 출력
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
print(results_df)
print(clf.best_params_)


Best parameters for Decision Tree: {'max_depth': 10, 'min_samples_split': 2}
Best parameters for Random Forest: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 300}
Best parameters for XGBoost: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200}
[LightGBM] [Info] Number of positive: 50, number of negative: 49
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000044 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 237
[LightGBM] [Info] Number of data points in the train set: 99, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505051 -> initscore=0.020203
[LightGBM] [Info] Start training from score 0.020203
[LightGBM] [Info] Number of positive: 50, number of negative: 49
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000030 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins

In [7]:
import matplotlib.pyplot as plt
import pandas as pd


# Plotting the scatter plot
plt.figure(figsize=(10, 6))  # Set the figure size (optional)

# Loop through the unique labels and plot each subset of the data.
for label in y_train_res['VKOSPI_Label'].unique():
    # Filter the data based on the label
    subset = y_train_res[y_train_res['VKOSPI_Label'] == label]
    
    # Plot the data points for this label
    plt.scatter(subset['x_column'], subset['y_column'], label=label)

# Add a legend to the plot
plt.legend(title='Labels')

# Add titles and labels
plt.title('Scatter Plot of VKOSPI_Label')
plt.xlabel('X Axis Label')
plt.ylabel('Y Axis Label')

# Display the plot
plt.show()


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

<Figure size 1000x600 with 0 Axes>

### 임계값 설정을 위한 혼동행렬

In [None]:
# 성능확인 코드
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score


import warnings

def get_clf_eval(MDA_y_test, pred):
    confusion = confusion_matrix(MDA_y_test, pred)
    accuracy = accuracy_score(MDA_y_test, pred)
    precision = precision_score(MDA_y_test, pred)
    recall = recall_score(MDA_y_test, pred)
    roc_score = roc_auc_score(MDA_y_test, pred)
    pr_score = average_precision_score(MDA_y_test, pred)
    f1 = f1_score(MDA_y_test, pred)
    print('오차행렬')
    print(confusion)
    print('정확도: {0:.4f}, 정밀도 : {1:.4f}, 재현율:{2:.4f},F1 스코어:{3:.4f}'.format(accuracy, precision, recall, f1, roc_score))
    print('ROC 스코어: {0:.4f}, PR score : {1:.4f}'.format(roc_score, pr_score))

In [None]:
# 임계값에 따른 오차행렬및 스코어 -------->#임계값 최적 : 재현율기준(0.1)/f1기준(0.3)
from sklearn.preprocessing import Binarizer
thresholds = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]

def get_eval_by_threshold(MDA_y_test, pred_proba_c1, thresholds):
    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_c1)
        custom_predict = binarizer.transform(pred_proba_c1)
        print("임곗값:", custom_threshold)
        get_clf_eval(MDA_y_test, custom_predict)
        print("---------------------------------------------------------")

In [None]:
decision_tree_model.fit(X_train, y_train)
lda_pred = decision_tree_model.predict(X_val)
lda_pred_proba = decision_tree_model.predict_proba(X_val)

get_clf_eval(y_val, lda_pred)
get_eval_by_threshold(y_val , lda_pred_proba[:,1].reshape(-1,1), thresholds)

In [None]:
random_forest_model.fit(X_train, y_train)
lda_pred = random_forest_model.predict(X_val)
lda_pred_proba = random_forest_model.predict_proba(X_val)

get_clf_eval(y_val, lda_pred)
get_eval_by_threshold(y_val , lda_pred_proba[:,1].reshape(-1,1), thresholds)

In [None]:
lgbm_model.fit(X_train, y_train)
lda_pred = lgbm_model.predict(X_val)
lda_pred_proba = lgbm_model.predict_proba(X_val)

get_clf_eval(y_val, lda_pred)
get_eval_by_threshold(y_val , lda_pred_proba[:,1].reshape(-1,1), thresholds)

In [None]:
xgb_model.fit(X_train, y_train)
lda_pred = xgb_model.predict(X_val)
lda_pred_proba = xgb_model.predict_proba(X_val)

get_clf_eval(y_val, lda_pred)
get_eval_by_threshold(y_val , lda_pred_proba[:,1].reshape(-1,1), thresholds)

2. 선형기반 모델링

In [None]:
from sklearn.model_selection import GridSearchCV

def model_basic_with_gridsearch(X_train, y_train, X_val, y_val):
    # 모델 및 해당 하이퍼파라미터 그리드 정의
    models_and_grids = {
        LogisticRegression(): {
            'penalty': ['l1', 'l2'],
            'C': [0.01, 0.1, 1, 10]
        },
        SVC(): {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        }
    }

    rdict = {'model': [], 'best_params': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1_score': []}

    for model, grid in models_and_grids.items():
        clf = GridSearchCV(model, grid, cv=5)
        clf.fit(X_train, y_train)
        best_model = clf.best_estimator_

        pred = best_model.predict(X_val)
        results = (round(accuracy_score(y_val, pred), 2),
                   round(precision_score(y_val, pred, average='weighted'), 2),
                   round(recall_score(y_val, pred, average='weighted'), 2),
                   round(f1_score(y_val, pred, average='weighted'), 2))

        rdict['model'].append(best_model.__class__.__name__)
        rdict['best_params'].append(clf.best_params_)
        rdict['accuracy'].append(results[0])
        rdict['precision'].append(results[1])
        rdict['recall'].append(results[2])
        rdict['f1_score'].append(results[3])

    rdf = pd.DataFrame(data=rdict)
    return rdf


In [None]:
model_basic_with_gridsearch(X_train, y_train, X_val, y_val)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE  # Assuming ADASYN was a mistake since SMOTE is used in the code
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

def model_basic(X_train, y_train, X_test, y_test):
    # 오버샘플링을 적용합니다.
    smote = SMOTE(random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train.values.ravel())

    # 모델들을 초기화합니다.
    logistic_regression_model = LogisticRegression(C=0.1, penalty='l2', solver='liblinear')
    svc_model = SVC(C=10, gamma=1, kernel='rbf')

    # 모델 리스트를 생성합니다.
    models = [logistic_regression_model, svc_model]

    # 결과를 저장할 딕셔너리를 초기화합니다.
    results_dict = {'model': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1_score': []}
    
    # 각 모델에 대해 훈련 및 평가를 실행합니다.
    for model in models:
        model.fit(X_train_res, y_train_res)  # 오버샘플링된 데이터로 훈련
        pred = model.predict(X_test)
        results = (round(accuracy_score(y_test, pred), 2),
                   round(precision_score(y_test, pred), 2),
                   round(recall_score(y_test, pred), 2),
                   round(f1_score(y_test, pred), 2))
        results_dict['model'].append(model.__class__.__name__)  # 모델의 이름을 저장
        results_dict['accuracy'].append(results[0])
        results_dict['precision'].append(results[1])
        results_dict['recall'].append(results[2])
        results_dict['f1_score'].append(results[3])

    # 결과 데이터프레임을 생성 및 반환합니다.
    results_df = pd.DataFrame(data=results_dict)
    return results_df


In [None]:
model_basic(X_train, y_train, X_val, y_val)

In [None]:
# SVC 모델을 생성
svc = SVC(C= 0.1, kernel ='linear',probability=True)  # probability=True로 설정하여 예측 확률을 얻을 수 있도록 함
svc.fit(X_train, y_train)

# 예측 및 예측 확률 계산
svc_pred = svc.predict(X_val)
svc_pred_proba = svc.predict_proba(X_val)

# 정확도 평가
get_clf_eval(y_val, svc_pred)
get_eval_by_threshold(y_val, lda_pred_proba[:,1].reshape(-1,1),thresholds)

In [None]:
lr = LogisticRegression(C = 0.01, penalty= 'l2')
lr.fit(X_train, y_train)
lda_pred = lr.predict(X_val)
lda_pred_proba = lr.predict_proba(X_val)

get_clf_eval(y_val, lda_pred)
get_eval_by_threshold(y_val , lda_pred_proba[:,1].reshape(-1,1), thresholds)