# Feature Selection
1. Wrapper
2. Filter
3. Embedded

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

import matplotlib.font_manager as fm

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

In [3]:
# 라이브러리 호출
import pandas as pd
from sklearn.model_selection import train_test_split

# 경고창 제거
import warnings
warnings.filterwarnings('ignore')

In [4]:
# plt.rc('font', family='Malgun Gothic') # 폰트 지정
# plt.rc('axes', unicode_minus=False) # 마이너스 폰트 설정
# %config InlineBackend.figure_format='retina' # 그래프 글씨 뚜렷

In [6]:
train_df = pd.read_csv('../DATA/half_features.csv',index_col=0)
test_df = pd.read_csv('../DATA/test_features.csv',index_col=0)
train_df.columns

Index(['시장베이시스', '이론베이시스', '괴리율', '원위안 야간변동율', '원엔 야간변동율', 'KOSPI 전날변동율',
       'KOSPI 야간변동율', 'KS200 전날변동율', 'KS200 야간변동율', 'NAS 야간변동율', 'NAS 당일변동율',
       'NAS 당일변화량', 'P/C Ratio', 'CALL_vol_change(%)', 'PUT_vol_change(%)',
       'CALL_vol_fluc', 'PUT_vol_fluc', 'S&P 야간변동율', 'S&P 당일변동율', 'S&P 당일변화량',
       '원달러 야간변동율', 'VIX 전날변동율', 'VIX 당일변동율', 'VIX 당일변화량', 'JNIV 종가변동율',
       'JNIV 전날변동율', 'JNIV 전날변화량', 'CD 전날변동율', 'CD 전날변화량', 'VKOSPI_Label'],
      dtype='object')

In [5]:
X_train = train_df.drop(columns='VKOSPI_Label',axis=1)
y_train = train_df[['VKOSPI_Label']]
X_val = test_df.drop(columns='VKOSPI_Label',axis=1)
y_val = test_df[['VKOSPI_Label']]

In [8]:
# 표준화
features_to_standardize = ['시장베이시스', '이론베이시스','KOSPI 전날변동율','KS200 전날변동율','NAS 야간변동율','CALL_vol_change(%)','CALL_vol_fluc', 'S&P 당일변화량','VIX 전날변동율', 'VIX 당일변화량']
scaler_standardize = StandardScaler()
X_train = scaler_standardize.fit_transform(X_train[features_to_standardize])
X_val = scaler_standardize.transform(X_val[features_to_standardize])

X_train = pd.DataFrame(X_train, columns=features_to_standardize)
X_val = pd.DataFrame(X_val, columns=features_to_standardize)

## RF 기반 feature importance

In [9]:
# Feature_Importances
rfc = RandomForestClassifier(random_state=42)

rfc.fit(X_train, y_train)

rfc.feature_importances_

ftr_importances = pd.Series(rfc.feature_importances_, index=X_train.columns)
sorted_feature_importance = ftr_importances.sort_values(ascending=True)

data = sorted_feature_importance
importance = pd.DataFrame(data, columns=['feature importances'])
importance

Unnamed: 0,feature importances
시장베이시스,0.055714
VIX 당일변화량,0.063004
VIX 전날변동율,0.066598
S&P 당일변화량,0.071605
이론베이시스,0.072061
NAS 야간변동율,0.072941
CALL_vol_fluc,0.123761
CALL_vol_change(%),0.137667
KS200 전날변동율,0.158996
KOSPI 전날변동율,0.177653


# Logit기반 피쳐 갯수 선정

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
import statsmodels.api as sm
import numpy as np
lr_clf = LogisticRegression()


logit = SelectFromModel(LogisticRegression())
logit.fit(X_train, y_train)
logit_support = logit.get_support()
lr_feature = X_train.loc[:,logit_support].columns.tolist()

In [11]:
print(lr_feature)
print(len(lr_feature))

['KOSPI 전날변동율', 'CALL_vol_change(%)', 'S&P 당일변화량', 'VIX 당일변화량']
4


## 1. Wrapper : 모델링을 돌리면서 변수를 선택하는 방법

- Forward Selection(전진 선택) : 변수가 없는 상태로 시작하며 반복할 때마다 가장 중요한 변수를 추가하여 더 이상 성능의 향상이 없을 때까지 변수를 추가한다.
- Backward Elimination(후방 제거) : 모든 변수를 가지고 시작하며, 가장 덜 중요한 변수를 하나씩 제거하면서 모델의 성능을 향상시킨다. 더 이상 성능의 향상이 없을 때까지 반복한다.
- Stepwise Selection(단계별 선택): Foward Selection 과 Backward Elimination 을 결합하여 사용하는 방식으로, 모든  변수를 가지고 시작하여 가장 도움이 되지 않는 변수를 삭제하거나, 모델에서 빠져있는 변수 중에서 가장 중요한 변수를 추가하는 방법이다. 이와 같이 변수를 추가 또는 삭제를 반복한다. 반대로 아무것도 없는 모델에서 출발해 변수를 추가, 삭제를 반복할 수도 있다.

In [12]:
# Forward Selection
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Forward feature selection 수행
selected_features = []
best_score = 0

while len(selected_features) < train_df.shape[1]:
    best_feature = None
    best_model = None
    best_score_local = 0

    for feature in X_train.columns:
        if feature not in selected_features:
            features = selected_features + [feature]
            X_train_selected = X_train[features]
            X_val_selected = X_val[features]

            model = LogisticRegression()
            model.fit(X_train_selected, y_train)
            score = model.score(X_val_selected, y_val)

            if score > best_score_local:
                best_score_local = score
                best_feature = feature
                best_model = model

    if best_score_local > best_score:
        selected_features.append(best_feature)
        best_score = best_score_local
        print(f"Selected feature: {best_feature}, Accuracy: {best_score:.4f}")

    else:
        break

print("\nForward selected features:")
Forward = selected_features


Selected feature: CALL_vol_fluc, Accuracy: 0.5789
Selected feature: NAS 야간변동율, Accuracy: 0.6316

Forward selected features:


In [15]:
# Backward Elimination
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Backward feature selection 수행
selected_features = X_train.columns.tolist()
best_score = 0

while len(selected_features) > 0:
    worst_feature = None
    best_model = None
    best_score_local = 0

    for feature in selected_features:
        features = selected_features.copy()
        features.remove(feature)

        X_train_selected = X_train[features]
        X_val_selected = X_val[features]

        model = LogisticRegression()
        model.fit(X_train_selected, y_train)
        score = model.score(X_val_selected, y_val)

        if score > best_score_local:
            best_score_local = score
            worst_feature = feature
            best_model = model

    if best_score_local > best_score:
        selected_features.remove(worst_feature)
        best_score = best_score_local
        print(f"Removed feature: {worst_feature}, Accuracy: {best_score:.4f}")

    else:
        break

print("\nFinal selected features:")
Backward = selected_features


Removed feature: KOSPI 전날변동율, Accuracy: 0.4737
Removed feature: KS200 전날변동율, Accuracy: 0.5263
Removed feature: 시장베이시스, Accuracy: 0.5789

Final selected features:


In [13]:
# Stepwise Selection
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Stepwise feature selection 수행
selected_features = []
best_score = 0

# Forward step
while len(selected_features) < train_df.shape[1]:
    best_feature = None
    best_model = None
    best_score_local = 0

    for feature in X_train.columns:
        if feature not in selected_features:
            features = selected_features + [feature]
            X_train_selected = X_train[features]
            X_val_selected = X_val[features]

            model = LogisticRegression()
            model.fit(X_train_selected, y_train)
            score = model.score(X_val_selected, y_val)

            if score > best_score_local:
                best_score_local = score
                best_feature = feature
                best_model = model

    if best_score_local > best_score:
        selected_features.append(best_feature)
        best_score = best_score_local
        print(f"Selected feature: {best_feature}, Accuracy: {best_score:.4f}")

    else:
        break

# Backward step
while len(selected_features) > 0:
    worst_feature = None
    best_model = None
    best_score_local = 0

    for feature in selected_features:
        features = selected_features.copy()
        features.remove(feature)

        X_train_selected = X_train[features]
        X_val_selected = X_val[features]

        model = LogisticRegression()
        model.fit(X_train_selected, y_train)
        score = model.score(X_val_selected, y_val)

        if score > best_score_local:
            best_score_local = score
            worst_feature = feature
            best_model = model

    if best_score_local > best_score:
        selected_features.remove(worst_feature)
        best_score = best_score_local
        print(f"Removed feature: {worst_feature}, Accuracy: {best_score:.4f}")

    else:
        break

print("\nFinal selected features:")
Stepwise = selected_features


Selected feature: CALL_vol_fluc, Accuracy: 0.5789
Selected feature: NAS 야간변동율, Accuracy: 0.6316

Final selected features:


## 2. Filter
- 통계기법 사용하여 변수를 선택하는 방법
- 전처리단에서 주로 사용할 만 하며 통계기법을 사용하여 상관관계가 높은 변수나, 성능이 높은 변수를 추출하는 방법

In [18]:
# H가 1인 경우 이분산성 / H가 0인 경우 등분산
from scipy.stats import bartlett
def bartlett_test(col, p_value = 0.05, H = 1):
    list= []
    for i in col:
        T, p_val =bartlett(train_df[train_df['VKOSPI_Label']==1][i], train_df[train_df['VKOSPI_Label']==0][i]) 
        list.append([i, p_val])

    list = pd.DataFrame(list, columns = ['변수', 'p_value'])
    if H == 1:
        a = list[(list['p_value'] < p_value)][['변수', 'p_value']].sort_values('p_value')
        return a
    else:
        a = list[(list['p_value'] >= p_value)][['변수', 'p_value']].sort_values('p_value')
        return a

In [19]:
# 이분산성 변수
x_hetero = bartlett_test(train_df.columns, H = 1)
# 등분산성 변수
x_homo = bartlett_test(train_df.columns, H = 0)

In [24]:
# t-test

import scipy.stats as stats
def t_test(col, col_h0, col_h1, p_value = 0.05):
    list= []
    for i in col:
        if (col_h0['변수']==i).any():
            t_stat, p_val = stats.ttest_ind(train_df[train_df['VKOSPI_Label']==1][i], train_df[train_df['VKOSPI_Label']==0][i], equal_var=True) # 등분산성 : wald t-test
            list.append([i, p_val])
        elif (col_h1['변수']==i).any():
            t_stat, p_val = stats.ttest_ind(train_df[train_df['VKOSPI_Label']==1][i], train_df[train_df['VKOSPI_Label']==0][i], equal_var=False) # 이분산성 : welch’s t-test
            list.append([i, p_val])

    list = pd.DataFrame(list, columns = ['변수', 'p_value'])
    a = list[(list['p_value'] < p_value)][['변수', 'p_value']].sort_values('p_value')
    return a

In [25]:
# 2) t_test 결과 p_value < 0.05보다 작은 유의한 변수 가져오기
x_ttest = t_test(train_df.columns, x_homo, x_hetero, p_value=0.1)
print("유의한 피쳐 수 :", len(x_ttest))
x_ttest.sort_values(by="변수", ascending=True)

유의한 피쳐 수 : 13


Unnamed: 0,변수,p_value
13,CALL_vol_change(%),0.000454
15,CALL_vol_fluc,0.007152
5,KOSPI 전날변동율,0.000108
7,KS200 전날변동율,0.000459
11,NAS 당일변화량,0.06215
9,NAS 야간변동율,0.004014
14,PUT_vol_change(%),0.000395
16,PUT_vol_fluc,0.002436
18,S&P 당일변동율,0.034863
17,S&P 야간변동율,0.001775


## 3. Embedded
- Lasso, Ridge, Elastic Net 등 내장함수 사용하여 변수를 선택하는 방법

In [26]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import warnings
warnings.simplefilter('ignore')

lasso_model = LogisticRegression()
param_grid = {'penalty' : ['l1'], 
                'C' : [0.001, 0.01, 0.1, 1, 2, 5, 10],
                'solver' : ['liblinear']}

grid_search = GridSearchCV(lasso_model, param_grid=param_grid, return_train_score=True, cv=5)
grid_search.fit(X_train, y_train)

df = pd.DataFrame(grid_search.cv_results_)
df = df.sort_values(by=['rank_test_score'], ascending=True)
df[['params', 'mean_train_score', 'mean_test_score', 'rank_test_score']]
print('GridSearchCV 최적 파라미터:', grid_search.best_params_)
print('GridSearchCV 최고 정확도:{0:.4f}'.format(grid_search.best_score_))

GridSearchCV 최적 파라미터: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
GridSearchCV 최고 정확도:0.7687


In [27]:
lasso_best = LogisticRegression(C=1, penalty='l1', solver='liblinear').fit(X_train, y_train)

df_lasso = pd.DataFrame()
df_lasso['feature'] = X_train.columns
df_lasso['coef'] = lasso_best.coef_[0]
df_lasso.drop(df_lasso[df_lasso['coef']==0].index, inplace=True)
df_lasso

Unnamed: 0,feature,coef
1,이론베이시스,0.062837
2,KOSPI 전날변동율,-0.805535
4,NAS 야간변동율,-0.293096
5,CALL_vol_change(%),0.698586
7,S&P 당일변화량,-0.286733
8,VIX 전날변동율,0.085042
9,VIX 당일변화량,0.320272


In [28]:
# 라쏘에서 선택된 피처
lasso = df_lasso['feature'].values.tolist()
print('Lasso에서 선택된 피처 수 {0:1.0f}'.format(len(df_lasso)), '개')
lasso

Lasso에서 선택된 피처 수 7 개


['이론베이시스',
 'KOSPI 전날변동율',
 'NAS 야간변동율',
 'CALL_vol_change(%)',
 'S&P 당일변화량',
 'VIX 전날변동율',
 'VIX 당일변화량']

In [29]:
list_Forward = list(Forward)
list_Backward = list(Backward)
list_Stepwise = list(Stepwise)
list_lasso = list(lasso)
list_col_all = X_train.columns

In [30]:
def func_Forward(x):
    if x in list_Forward:
        return 1
    else:
        return 0
    
def func_Backward(x):
    if x in list_Backward:
        return 1
    else:
        return 0


def func_Stepwise(x):
    if x in list_Stepwise:
        return 1
    else:
        return 0

    
def func_lasso(x):
    if x in list_lasso:
        return 1
    else:
        return 0

In [32]:
# 2번 선택된 Feature

feature_counts = pd.DataFrame()
feature_counts['Feature'] = list_col_all
feature_counts['Forward'] = list_col_all.map(func_Forward)
feature_counts['Backward'] = list_col_all.map(func_Backward)
feature_counts['Stepwise'] = list_col_all.map(func_Stepwise)
feature_counts['lasso'] = list_col_all.map(func_lasso)

feature_counts["total"] = feature_counts["Forward"]+feature_counts['Backward']+feature_counts['Stepwise']+feature_counts["lasso"]
feature_final = feature_counts[feature_counts["total"]>=2]
list_feature_final = list(feature_final["Feature"])
print("선택된 피쳐수 :", len(list_feature_final))
feature_final

선택된 피쳐수 : 7


Unnamed: 0,Feature,Forward,Backward,Stepwise,lasso,total
1,이론베이시스,0,1,0,1,2
4,NAS 야간변동율,1,1,1,1,4
5,CALL_vol_change(%),0,1,0,1,2
6,CALL_vol_fluc,1,1,1,0,3
7,S&P 당일변화량,0,1,0,1,2
8,VIX 전날변동율,0,1,0,1,2
9,VIX 당일변화량,0,1,0,1,2


In [33]:
np.array(feature_final[['Feature']])


array([['이론베이시스'],
       ['NAS 야간변동율'],
       ['CALL_vol_change(%)'],
       ['CALL_vol_fluc'],
       ['S&P 당일변화량'],
       ['VIX 전날변동율'],
       ['VIX 당일변화량']], dtype=object)

In [34]:
# 3번 선택된 Feature

feature_counts = pd.DataFrame()
feature_counts['Feature'] = list_col_all
feature_counts['Forward'] = list_col_all.map(func_Forward)
feature_counts['Backward'] = list_col_all.map(func_Backward)
feature_counts['Stepwise'] = list_col_all.map(func_Stepwise)
feature_counts['lasso'] = list_col_all.map(func_lasso)

feature_counts["total"] = feature_counts["Forward"]+feature_counts['Backward']+feature_counts['Stepwise']+feature_counts["lasso"]
feature_final = feature_counts[feature_counts["total"]>=3]
list_feature_final = list(feature_final["Feature"])
print("선택된 피쳐수 :", len(list_feature_final))
feature_final

선택된 피쳐수 : 2


Unnamed: 0,Feature,Forward,Backward,Stepwise,lasso,total
4,NAS 야간변동율,1,1,1,1,4
6,CALL_vol_fluc,1,1,1,0,3


In [36]:
X_train

Unnamed: 0,시장베이시스,이론베이시스,KOSPI 전날변동율,KS200 전날변동율,NAS 야간변동율,CALL_vol_change(%),CALL_vol_fluc,S&P 당일변화량,VIX 전날변동율,VIX 당일변화량
0,0.566404,0.773694,1.661789,2.160427,0.669069,0.697575,0.664353,-0.071221,-0.045017,-1.860177
1,0.664680,0.629081,1.144982,1.151933,-0.111934,1.434295,1.525999,-0.106271,0.527964,-0.147601
2,0.308431,0.589641,1.855591,1.712207,-0.507378,0.755972,1.262660,-1.118097,0.082518,-0.759235
3,0.406706,0.537055,0.163050,0.454702,-0.922595,-0.046177,0.199849,0.586797,-0.054258,0.785141
4,0.222440,0.484469,0.576495,0.367548,1.885061,-0.488326,-0.579619,0.813631,-1.294485,0.112344
...,...,...,...,...,...,...,...,...,...,...
116,0.603257,0.760547,-0.612159,-0.167825,0.362600,-0.100931,0.034904,-0.181662,-0.366625,0.616942
117,0.517266,1.746542,-1.671612,-1.761495,-1.634395,-0.317033,-0.092544,0.560344,2.185913,1.702593
118,1.549160,1.693956,0.137210,-0.080671,-1.930979,0.844460,0.570437,1.728903,2.232121,2.665917
119,-0.465489,1.312704,-3.092830,-2.769989,1.252350,0.383877,0.394884,0.625815,-1.166951,2.084864


In [37]:
from sklearn.model_selection import GridSearchCV

def model_basic_with_gridsearch(X_train, y_train, X_val, y_val):
    # 모델 및 해당 하이퍼파라미터 그리드 정의
    models_and_grids = {
        LogisticRegression(): {
            'penalty': ['l1', 'l2'],
            'C': [0.01, 0.1, 1, 10]
        },
        DecisionTreeClassifier(): {
            'max_depth': [1,3,7,10],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        },
        SVC(): {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        },
        RandomForestClassifier(): {
            'n_estimators': [5,10,15,20],
            'max_depth': [1,3,7,10],
            'min_samples_split': [2, 5, 10]
        },
        XGBClassifier(): {
            'n_estimators': [5,10,15,20],
            'learning_rate': [0.01, 0.1, 0.5]
        },
        LGBMClassifier(): {
            'n_estimators': [5,10,15,20],
            'learning_rate': [0.01, 0.1, 0.5]
        }
    }

    rdict = {'model': [], 'best_params': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1_score': []}

    for model, grid in models_and_grids.items():
        clf = GridSearchCV(model, grid, cv=5)
        clf.fit(X_train, y_train)
        best_model = clf.best_estimator_

        pred = best_model.predict(X_val)
        results = (round(accuracy_score(y_val, pred), 2),
                   round(precision_score(y_val, pred, average='weighted'), 2),
                   round(recall_score(y_val, pred, average='weighted'), 2),
                   round(f1_score(y_val, pred, average='weighted'), 2))

        rdict['model'].append(best_model.__class__.__name__)
        rdict['best_params'].append(clf.best_params_)
        rdict['accuracy'].append(results[0])
        rdict['precision'].append(results[1])
        rdict['recall'].append(results[2])
        rdict['f1_score'].append(results[3])

    rdf = pd.DataFrame(data=rdict)
    return rdf


In [38]:
model_basic_with_gridsearch(X_train, y_train, X_val, y_val)

[LightGBM] [Info] Number of positive: 27, number of negative: 69
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000070 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 323
[LightGBM] [Info] Number of data points in the train set: 96, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.281250 -> initscore=-0.938270
[LightGBM] [Info] Start training from score -0.938270
[LightGBM] [Info] Number of positive: 28, number of negative: 69
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000051 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 329
[LightGBM] [Info] Number of data points in the train set: 97, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.288660 -> initscore=-0.901902
[LightGBM] [Info] Start training from score -0.901902
[LightGBM] [Info] Number of posi

Unnamed: 0,model,best_params,accuracy,precision,recall,f1_score
0,LogisticRegression,"{'C': 0.1, 'penalty': 'l2'}",0.42,0.41,0.42,0.39
1,DecisionTreeClassifier,"{'max_depth': 3, 'min_samples_leaf': 2, 'min_s...",0.37,0.37,0.37,0.36
2,SVC,"{'C': 1, 'kernel': 'rbf'}",0.47,0.48,0.47,0.46
3,RandomForestClassifier,"{'max_depth': 7, 'min_samples_split': 2, 'n_es...",0.37,0.36,0.37,0.35
4,XGBClassifier,"{'learning_rate': 0.5, 'n_estimators': 20}",0.32,0.31,0.32,0.3
5,LGBMClassifier,"{'learning_rate': 0.1, 'n_estimators': 15}",0.37,0.36,0.37,0.35
