# Feature Selection
1. Wrapper
2. Filter
3. Embedded

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

import matplotlib.font_manager as fm

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

In [3]:
# 라이브러리 호출
import pandas as pd
from sklearn.model_selection import train_test_split

# 경고창 제거
import warnings
warnings.filterwarnings('ignore')

In [4]:
# plt.rc('font', family='Malgun Gothic') # 폰트 지정
# plt.rc('axes', unicode_minus=False) # 마이너스 폰트 설정
# %config InlineBackend.figure_format='retina' # 그래프 글씨 뚜렷

In [5]:
train_df = pd.read_csv('../../DATA/quater_features.csv',index_col=0)
test_df = pd.read_csv("../../DATA/test_features.csv",index_col=0)

In [6]:
X_train = train_df.drop(columns='VKOSPI_Label',axis=1)
y_train = train_df[['VKOSPI_Label']]
X_val = test_df.drop(columns='VKOSPI_Label',axis=1)
y_val = test_df[['VKOSPI_Label']]

In [7]:
# 표준화
features_to_standardize = ['KOSPI 전날변동율','KS200 전날변동율','NAS 야간변동율','CALL_vol_change(%)','CALL_vol_fluc', 'S&P 당일변화량','VIX 전날변동율', 'VIX 당일변화량']
scaler_standardize = StandardScaler()
X_train = scaler_standardize.fit_transform(X_train[features_to_standardize])
X_val = scaler_standardize.transform(X_val[features_to_standardize])

X_train = pd.DataFrame(X_train, columns=features_to_standardize)
X_val = pd.DataFrame(X_val, columns=features_to_standardize)

In [8]:
from imblearn.under_sampling import OneSidedSelection
from imblearn.under_sampling import RandomUnderSampler

# OneSidedSelection 적용
undersampler = OneSidedSelection(sampling_strategy='majority', random_state=42)
X_train, y_train = undersampler.fit_resample(X_train, y_train)

# 1:1 비율로 데이터셋을 다시 조정합니다.
custom_undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X_train, y_train = custom_undersampler.fit_resample(X_train, y_train)

In [9]:
X_train=pd.concat([X_train,y_train],axis=1,ignore_index=True)

In [10]:
X_val = X_val.set_index(y_val.index)

X_val = pd.concat([X_val,y_val],axis=1,ignore_index=True)

In [12]:
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import ADASYN
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

#### 하이퍼파라미터 조정해서 출력해보기

In [13]:
from sklearn.model_selection import GridSearchCV

def get_model_predictions(model, grid, X_train, y_train, X_test):
    clf = GridSearchCV(model, grid, cv=5)
    clf.fit(X_train, y_train)
    best_model = clf.best_estimator_
    pred = best_model.predict(X_test)
    return pred

def model_basic_with_gridsearch(X_train, y_train, X_test, y_test):
    # 모델 및 해당 하이퍼파라미터 그리드 정의
    models_and_grids = {
        LogisticRegression(): {
            'penalty': ['l1', 'l2'],
            'C': [0.01, 0.1, 1, 10]
        },
        DecisionTreeClassifier(): {
            'max_depth': [1,3,7,10],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        },
        SVC(): {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        },
        RandomForestClassifier(): {
            'n_estimators': [5,10,15,20],
            'max_depth': [1,3,7,10],
            'min_samples_split': [2, 5, 10]
        },
        XGBClassifier(): {
            'n_estimators': [5,10,15,20,50,100],
            'learning_rate': [0.01, 0.1, 0.5]
        },
        LGBMClassifier(): {
            'n_estimators': [5,10,15,20,50,100],
            'learning_rate': [0.01, 0.1, 0.5]
        }
    }

    model_predictions = {}  # 각 모델의 예측값을 저장하는 딕셔너리

    for model, grid in models_and_grids.items():
        pred = get_model_predictions(model, grid, X_train, y_train, X_test)
        model_predictions[model.__class__.__name__] = pred

    return model_predictions

In [14]:
result = model_basic_with_gridsearch(X_train, y_train, X_val, y_val)

[LightGBM] [Info] Number of positive: 19, number of negative: 19
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 38, number of used features: 0
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 19, number of negative: 19
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 38, number of used features: 0
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 19, number of negative: 19
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 38, number of used features: 0
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 20, number of negative: 19
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 39, number of used features: 0
[LightGBM] [Info] [bina

In [15]:
# 각 모델의 예측 결과를 데이터프레임으로 변환
dfs = [pd.DataFrame({model_name: predictions}) for model_name, predictions in result.items()]

# 데이터프레임들을 하나로 합치기
result_df = pd.concat(dfs, axis=1)

# 결과 데이터프레임 출력
print(result_df)

    LogisticRegression  DecisionTreeClassifier  SVC  RandomForestClassifier  \
0                  1.0                     1.0  1.0                     1.0   
1                  1.0                     1.0  1.0                     1.0   
2                  1.0                     1.0  1.0                     1.0   
3                  0.0                     0.0  0.0                     0.0   
4                  0.0                     0.0  0.0                     0.0   
5                  0.0                     0.0  0.0                     0.0   
6                  1.0                     1.0  1.0                     1.0   
7                  0.0                     0.0  0.0                     0.0   
8                  0.0                     0.0  0.0                     1.0   
9                  1.0                     1.0  1.0                     1.0   
10                 1.0                     1.0  1.0                     1.0   
11                 0.0                     0.0  0.0 

In [16]:
# result_df = result_df.set_index(test_df.index)

# result_2month= pd.concat([test_df,result_df],axis=1)

# import os
# os.chdir("C:/Users/LGPC/Desktop/onproject/Project1/final_modeling/result_modeling")
# result_2month.to_csv("C:/Users/LGPC/Desktop/oncoding/Project1/dataset/result_2month.to_csv.csv")

In [17]:
result_df['DecisionTreeClassifier'].value_counts()

DecisionTreeClassifier
1.0    10
0.0     9
Name: count, dtype: int64