# Feature Selection
1. Wrapper
2. Filter
3. Embedded

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

import matplotlib.font_manager as fm

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

In [3]:
# 라이브러리 호출
import pandas as pd
from sklearn.model_selection import train_test_split

# 경고창 제거
import warnings
warnings.filterwarnings('ignore')

In [4]:
# plt.rc('font', family='Malgun Gothic') # 폰트 지정
# plt.rc('axes', unicode_minus=False) # 마이너스 폰트 설정
# %config InlineBackend.figure_format='retina' # 그래프 글씨 뚜렷

In [5]:
train_df = pd.read_csv('../../DATA/quater_features.csv',index_col=0)
test_df = pd.read_csv("../../DATA/test_features.csv",index_col=0)

In [6]:
X_train = train_df.drop(columns='VKOSPI_Label',axis=1)
y_train = train_df[['VKOSPI_Label']]
X_val = test_df.drop(columns='VKOSPI_Label',axis=1)
y_val = test_df[['VKOSPI_Label']]

In [7]:
# val_index = X_val.index

In [8]:
# 표준화
features_to_standardize = ['KOSPI 전날변동율','KS200 전날변동율','NAS 야간변동율','CALL_vol_change(%)','CALL_vol_fluc', 'S&P 당일변화량','VIX 전날변동율', 'VIX 당일변화량']
scaler_standardize = StandardScaler()
X_train = scaler_standardize.fit_transform(X_train[features_to_standardize])
X_val = scaler_standardize.transform(X_val[features_to_standardize])

X_train = pd.DataFrame(X_train, columns=features_to_standardize)
X_val = pd.DataFrame(X_val, columns=features_to_standardize)

In [9]:
# Feature_Importances
rfc = RandomForestClassifier(random_state=42)

rfc.fit(X_train, y_train)

rfc.feature_importances_

ftr_importances = pd.Series(rfc.feature_importances_, index=X_train.columns)
sorted_feature_importance = ftr_importances.sort_values(ascending=True)

data = sorted_feature_importance
importance = pd.DataFrame(data, columns=['feature importances'])
importance

Unnamed: 0,feature importances
S&P 당일변화량,0.075232
VIX 당일변화량,0.076815
VIX 전날변동율,0.085707
NAS 야간변동율,0.109699
CALL_vol_change(%),0.110727
CALL_vol_fluc,0.118055
KOSPI 전날변동율,0.208312
KS200 전날변동율,0.215453


In [10]:
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import ADASYN
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

In [11]:
# model_with_adasyn(X_train, y_train, X_val, y_val)

In [12]:
def model_basic(X_train, y_train, X_test, y_test):

    # ADASYN 오버샘플링을 적용합니다.
    ada = ADASYN(sampling_strategy='minority', random_state=42)
    X_train, y_train = ada.fit_resample(X_train, y_train)

    models = [
        LogisticRegression(),
        DecisionTreeClassifier(),
        SVC(),
        RandomForestClassifier(),
        XGBClassifier(),
        LGBMClassifier()
    ]

    rdict={'model':[],'accuracy':[],'precision':[],'recall':[],'f1_score':[]}

    
    for clf in models:
        clf = clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        results = (round(accuracy_score(y_test, pred),2),
                    round(precision_score(y_test, pred),2),
                    round(recall_score(y_test, pred),2),
                    round(f1_score(y_test, pred),2))
        rdict['model'].append(clf); 
        rdict['accuracy'].append(results[0])
        rdict['precision'].append(results[1])
        rdict['recall'].append(results[2])
        rdict['f1_score'].append(results[3])

        # print(results)

    rdf = pd.DataFrame(data=rdict)
    return rdf 

In [13]:
model_basic(X_train, y_train, X_val, y_val)

[LightGBM] [Info] Number of positive: 58, number of negative: 56
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000071 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 312
[LightGBM] [Info] Number of data points in the train set: 114, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.508772 -> initscore=0.035091
[LightGBM] [Info] Start training from score 0.035091


Unnamed: 0,model,accuracy,precision,recall,f1_score
0,LogisticRegression(),0.37,0.38,0.3,0.33
1,DecisionTreeClassifier(),0.47,0.5,0.2,0.29
2,SVC(),0.53,0.6,0.3,0.4
3,"(DecisionTreeClassifier(max_features='sqrt', r...",0.42,0.4,0.2,0.27
4,"XGBClassifier(base_score=None, booster=None, c...",0.37,0.33,0.2,0.25
5,LGBMClassifier(),0.47,0.5,0.3,0.37


In [14]:
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pandas as pd

def model_with_smote(X_train, y_train, X_val, y_val):
    # SMOTE 오버샘플링을 적용합니다.
    smote = SMOTE(sampling_strategy='minority', random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # 모델 및 해당 하이퍼파라미터 그리드 정의
    models_and_grids = {
        LogisticRegression(): {
            'penalty': ['l1', 'l2'],
            'C': [0.01, 0.1, 1, 10]
        },
        DecisionTreeClassifier(): {
            'max_depth': [1,3,7,10],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        },
        SVC(): {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        },
        RandomForestClassifier(): {
            'n_estimators': [5,10,15,20],
            'max_depth': [1,3,7,10],
            'min_samples_split': [2, 5, 10]
        },
        XGBClassifier(): {
            'n_estimators': [5,10,15,20,50,100],
            'learning_rate': [0.01, 0.1, 0.5]
        },
        LGBMClassifier(): {
            'n_estimators': [5,10,15,20,50,100],
            'learning_rate': [0.01, 0.1, 0.5]
        }
    }

    rdict = {'model': [], 'best_params': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1_score': [], 'confusion_matrix': []}

    for model, grid in models_and_grids.items():
        clf = GridSearchCV(model, grid, cv=5)
        clf.fit(X_train_resampled, y_train_resampled)
        best_model = clf.best_estimator_

        pred = best_model.predict(X_val)
        results = (round(accuracy_score(y_val, pred), 2),
                   round(precision_score(y_val, pred, average='weighted'), 2),
                   round(recall_score(y_val, pred, average='weighted'), 2),
                   round(f1_score(y_val, pred, average='weighted'), 2))
        
        # 오차 행렬 계산
        cm = confusion_matrix(y_val, pred)
        
        rdict['model'].append(best_model.__class__.__name__)
        rdict['best_params'].append(clf.best_params_)
        rdict['accuracy'].append(results[0])
        rdict['precision'].append(results[1])
        rdict['recall'].append(results[2])
        rdict['f1_score'].append(results[3])
        rdict['confusion_matrix'].append(cm)

    rdf = pd.DataFrame(data=rdict)
    return rdf

In [15]:
model_with_smote(X_train, y_train, X_val, y_val)

[LightGBM] [Info] Number of positive: 45, number of negative: 44
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000040 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 248
[LightGBM] [Info] Number of data points in the train set: 89, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505618 -> initscore=0.022473
[LightGBM] [Info] Start training from score 0.022473
[LightGBM] [Info] Number of positive: 44, number of negative: 45
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000038 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 247
[LightGBM] [Info] Number of data points in the train set: 89, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494382 -> initscore=-0.022473
[LightGBM] [Info] Start training from score -0.022473
[LightGBM] [Info] Number of positive

Unnamed: 0,model,best_params,accuracy,precision,recall,f1_score,confusion_matrix
0,LogisticRegression,"{'C': 1, 'penalty': 'l2'}",0.37,0.37,0.37,0.36,"[[4, 5], [7, 3]]"
1,DecisionTreeClassifier,"{'max_depth': 3, 'min_samples_leaf': 1, 'min_s...",0.58,0.58,0.58,0.58,"[[5, 4], [4, 6]]"
2,SVC,"{'C': 10, 'kernel': 'rbf'}",0.53,0.59,0.53,0.47,"[[8, 1], [8, 2]]"
3,RandomForestClassifier,"{'max_depth': 7, 'min_samples_split': 2, 'n_es...",0.32,0.31,0.32,0.3,"[[4, 5], [8, 2]]"
4,XGBClassifier,"{'learning_rate': 0.1, 'n_estimators': 20}",0.32,0.31,0.32,0.3,"[[4, 5], [8, 2]]"
5,LGBMClassifier,"{'learning_rate': 0.1, 'n_estimators': 100}",0.37,0.37,0.37,0.36,"[[4, 5], [7, 3]]"
