## 다양한 피처선정 방법으로 피처 선택하기
- 상관관계 높았던 Top 10개 기준으로 차원 축소(Wrapper 방식) 진행

## 차원축소 기법(Feature Selection): Wrapper
- 전진 선택(Forward selection)
- 후진 제거(Backward elimination)
- Stepwise selection

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [3]:
# 파일 경로 설정
file_path = "data.csv"

# CSV 파일을 DataFrame으로 불러오기
data = pd.read_csv(file_path)

In [4]:
# 독립변수 종속변수 정리
X = data.drop(columns=['Bankrupt?']) 
y= data['Bankrupt?']

In [5]:
X

Unnamed: 0,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,Continuous interest rate (after tax),...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,0.370594,0.424389,0.405750,0.601457,0.601457,0.998969,0.796887,0.808809,0.302646,0.780985,...,0.716845,0.009219,0.622879,0.601453,0.827890,0.290202,0.026601,0.564050,1,0.016469
1,0.464291,0.538214,0.516730,0.610235,0.610235,0.998946,0.797380,0.809301,0.303556,0.781506,...,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1,0.020794
2,0.426071,0.499019,0.472295,0.601450,0.601364,0.998857,0.796403,0.808388,0.302035,0.780284,...,0.774670,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1,0.016474
3,0.399844,0.451265,0.457733,0.583541,0.583541,0.998700,0.796967,0.808966,0.303350,0.781241,...,0.739555,0.003252,0.622929,0.583538,0.834697,0.281721,0.026697,0.564663,1,0.023982
4,0.465022,0.538432,0.522298,0.598783,0.598783,0.998973,0.797366,0.809304,0.303475,0.781550,...,0.795016,0.003878,0.623521,0.598782,0.839973,0.278514,0.024752,0.575617,1,0.035490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6814,0.493687,0.539468,0.543230,0.604455,0.604462,0.998992,0.797409,0.809331,0.303510,0.781588,...,0.799927,0.000466,0.623620,0.604455,0.840359,0.279606,0.027064,0.566193,1,0.029890
6815,0.475162,0.538269,0.524172,0.598308,0.598308,0.998992,0.797414,0.809327,0.303520,0.781586,...,0.799748,0.001959,0.623931,0.598306,0.840306,0.278132,0.027009,0.566018,1,0.038284
6816,0.472725,0.533744,0.520638,0.610444,0.610213,0.998984,0.797401,0.809317,0.303512,0.781546,...,0.797778,0.002840,0.624156,0.610441,0.840138,0.275789,0.026791,0.565158,1,0.097649
6817,0.506264,0.559911,0.554045,0.607850,0.607850,0.999074,0.797500,0.809399,0.303498,0.781663,...,0.811808,0.002837,0.623957,0.607846,0.841084,0.277547,0.026822,0.565302,1,0.044009


In [None]:
# 훈련 데이터셋으로 분할
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=111)

# 로지스틱회귀모델 가지고오기
model=LogisticRegression(max_iter = 5000)


# 각선택법을 가지고 오는 방법
# SFS(모델, 피처선정 어떻게 할건지? best 기타 등등, forward , floating , scoring, cv)
sfs_forward=SFS(model, k_features= 'best', forward= True, floating=False, scoring='accuracy', cv=3)
sfs_backward=SFS(model, k_features= 'best', forward= False, floating=False, scoring='accuracy', cv=3)
sfs_stepwise=SFS(model, k_features= 'best', forward= True, floating=True, scoring='accuracy', cv=3)

# 각 방법을 3개 모두 다 학습 간단하게 fit
sfs_forward =sfs_forward.fit(X_train, y_train)
sfs_backward =sfs_backward.fit(X_train, y_train)
sfs_stepwise =sfs_stepwise.fit(X_train, y_train)

#선택된 특성 출력
#어떤 피처들이 선정되는지?
print('sfs_forward 선택된 특성')
print(sfs_forward.k_feature_names_)


print('sfs_backward 선택된 특성')
print(sfs_backward.k_feature_names_)


print('sfs_stepwise 선택된 특성')
print(sfs_stepwise.k_feature_names_)


#시각화로 실제 어떤 식으로 변화하는지 체크해 보자!
fig, ax = plt.subplots(1,3, figsize=(18,6))
ax[0].plot(range(1, len(sfs_forward.subsets_)+1),[sfs_forward.subsets_[i]['avg_score'] for i in sfs_forward.subsets_], marker='o')
ax[0].set_title('sfs_forward')

ax[1].plot(range(1, len(sfs_backward.subsets_)+1),[sfs_backward.subsets_[i]['avg_score'] for i in sfs_backward.subsets_], marker='o')
ax[1].set_title('sfs_backward')

ax[2].plot(range(1, len(sfs_stepwise.subsets_)+1),[sfs_stepwise.subsets_[i]['avg_score'] for i in sfs_stepwise.subsets_], marker='o')
ax[2].set_title('sfs_stepwise')

plt.tight_layout()
plt.show()

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
ABNORMAL_TERMINATION_IN_L

## RFE, RFECV
<b> RFE </b> : 반복적으로 훈련시키고, 가장 중요도가 낮은 특성을 하나씩 제거하는 방식, 몇 개 제거할지 등을 지정할 수 있다.


- RFE

In [6]:
X = data.drop(columns=['Bankrupt?'])
y = data['Bankrupt?']

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

# RandomForest 모델
model_rf = RandomForestClassifier(n_estimators=100, random_state=111)

# RFE 모델 생성 및 학습
selector_rf = RFE(estimator=model_rf, n_features_to_select=15)
selector_rf = selector_rf.fit(X, y)

# 선택된 특성
selected_features_rf = [X.columns[i] for i in range(len(selector_rf.support_)) if selector_rf.support_[i]]
print('RandomForest RFE 선택된 특성:', selected_features_rf)

RandomForest RFE 선택된 특성: [' Non-industry income and expenditure/revenue', ' Interest-bearing debt interest rate', ' Net Value Per Share (B)', ' Persistent EPS in the Last Four Seasons', ' Net Value Growth Rate', ' Quick Ratio', ' Interest Expense Ratio', ' Total debt/Total net worth', ' Borrowing dependency', ' Net profit before tax/Paid-in capital', ' Accounts Receivable Turnover', ' Cash/Total Assets', ' Working Capital/Equity', " Net Income to Stockholder's Equity", ' Degree of Financial Leverage (DFL)']


- RFECV

<b> RFECV </b>: 확장된 버전 -> 교차검증 통해 최적의 특성 수를 자동으로 결정, 반복에서 특성 제거하면서 교차검증으로 수행하여 모델 평가 -> 특성 수 찾기

In [8]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

In [9]:
model= RandomForestClassifier()
cv= StratifiedKFold(3)

In [10]:
#RFECV 모델 만들기
selector =RFECV(estimator=model, step=1, cv=cv)
selector=selector.fit(X,y)

In [11]:
# 최적의 특성 수 , 선택된 특성 출력

print('최적의 피처 수 ', selector.n_features_)
print('최적의 피처 수 ', selector.support_)

최적의 피처 수  41
최적의 피처 수  [ True False  True False False False False False  True  True  True False
 False  True False  True  True  True  True False False False  True False
 False False False False False  True  True False False  True  True False
  True  True False  True False False  True False False  True  True  True
  True False  True  True  True  True False False  True  True  True False
 False  True False False  True False False False  True False False False
 False False False False False False False False False False False False
 False  True  True  True False  True  True  True  True False  True]


In [12]:
# 선택된 Feature 이름 출력
selected_features = [name for name, support in zip(data, selector.support_) if support]

print('선택된 Feature:', selected_features)

선택된 Feature: ['Bankrupt?', ' ROA(A) before interest and % after tax', ' After-tax net Interest Rate', ' Non-industry income and expenditure/revenue', ' Continuous interest rate (after tax)', ' Cash flow rate', ' Tax rate (A)', ' Net Value Per Share (B)', ' Net Value Per Share (A)', ' Net Value Per Share (C)', ' Operating Profit Per Share (Yuan ¥)', ' Total Asset Growth Rate', ' Net Value Growth Rate', ' Current Ratio', ' Quick Ratio', ' Total debt/Total net worth', ' Debt ratio %', ' Long-term fund suitability ratio (A)', ' Operating profit/Paid-in capital', ' Total Asset Turnover', ' Accounts Receivable Turnover', ' Average Collection Days', ' Inventory Turnover Rate (times)', ' Net Worth Turnover Rate (times)', ' Revenue per person', ' Operating profit per person', ' Allocation rate per person', ' Current Assets/Total Assets', ' Cash/Total Assets', ' Quick Assets/Current Liability', ' Operating Funds to Liability', ' Current Liabilities/Liability', ' Retained Earnings to Total Assets

## 중간 정리
- <b> 상관관계로 선택된 Feature:  </b>
'Bankrupt?', ' Net Income to Total Assets', ' ROA(A) before interest and % after tax',
' ROA(B) before interest and depreciation after tax', ' ROA(C) before interest and depreciation before interest', ' Net worth/Assets',
' Debt ratio %', ' Persistent EPS in the Last Four Seasons', ' Retained Earnings to Total Assets', ' Net profit before tax/Paid-in capital'
,' Per Share Net profit before tax (Yuan ¥)'

- <b> RFE로 선택된 Feature: </b>
' Non-industry income and expenditure/revenue', ' Interest-bearing debt interest rate', ' Net Value Per Share (B)', ' Persistent EPS in the Last Four Seasons', ' Net Value Growth Rate', ' Quick Ratio', ' Interest Expense Ratio', ' Total debt/Total net worth', ' Borrowing dependency', ' Net profit before tax/Paid-in capital', ' Accounts Receivable Turnover', ' Cash/Total Assets', ' Working Capital/Equity', " Net Income to Stockholder's Equity", ' Degree of Financial Leverage (DFL)'
    
- <b> RFECV 로 선택된 Feature: </b>
'Bankrupt?', ' ROA(A) before interest and % after tax', ' After-tax net Interest Rate', ' Non-industry income and expenditure/revenue', ' Continuous interest rate (after tax)', ' Cash flow rate', ' Tax rate (A)', ' Net Value Per Share (B)', ' Net Value Per Share (A)', ' Net Value Per Share (C)', ' Operating Profit Per Share (Yuan ¥)', ' Total Asset Growth Rate', ' Net Value Growth Rate', ' Current Ratio', ' Quick Ratio', ' Total debt/Total net worth', ' Debt ratio %', ' Long-term fund suitability ratio (A)', ' Operating profit/Paid-in capital', ' Total Asset Turnover', ' Accounts Receivable Turnover', ' Average Collection Days', ' Inventory Turnover Rate (times)', ' Net Worth Turnover Rate (times)', ' Revenue per person', ' Operating profit per person', ' Allocation rate per person', ' Current Assets/Total Assets', ' Cash/Total Assets', ' Quick Assets/Current Liability', ' Operating Funds to Liability', ' Current Liabilities/Liability', ' Retained Earnings to Total Assets', ' Liability-Assets Flag', ' Net Income to Total Assets', ' Total assets to GNP price', ' Gross Profit to Sales', " Net Income to Stockholder's Equity", ' Liability to Equity', ' Degree of Financial Leverage (DFL)', ' Net Income Flag'

In [None]:
# 상관관계로 선택된 Feature
correlation_features = {'Bankrupt?', 'Net Income to Total Assets', 'ROA(A) before interest and % after tax', 
                        'ROA(B) before interest and depreciation after tax', 'ROA(C) before interest and depreciation before interest', 
                        'Net worth/Assets', 'Debt ratio %', 'Persistent EPS in the Last Four Seasons', 
                        'Retained Earnings to Total Assets', 'Net profit before tax/Paid-in capital', ' Per Share Net profit before tax (Yuan ¥)'}

# RFE로 선택된 Feature
rfe_features = {' Non-industry income and expenditure/revenue', ' Interest-bearing debt interest rate', ' Net Value Per Share (B)',
                ' Persistent EPS in the Last Four Seasons', ' Net Value Growth Rate', ' Quick Ratio', ' Interest Expense Ratio',
                ' Total debt/Total net worth', ' Borrowing dependency', ' Net profit before tax/Paid-in capital',
                ' Accounts Receivable Turnover', ' Cash/Total Assets', ' Working Capital/Equity', " Net Income to Stockholder's Equity",
                ' Degree of Financial Leverage (DFL)'}

# RFECV로 선택된 Feature
rfecv_features = {'Bankrupt?', ' ROA(A) before interest and % after tax', ' After-tax net Interest Rate',
                  ' Non-industry income and expenditure/revenue', ' Continuous interest rate (after tax)', ' Cash flow rate',
                  ' Tax rate (A)', ' Net Value Per Share (B)', ' Net Value Per Share (A)', ' Net Value Per Share (C)',
                  ' Operating Profit Per Share (Yuan ¥)', ' Total Asset Growth Rate', ' Net Value Growth Rate', ' Current Ratio',
                  ' Quick Ratio', ' Total debt/Total net worth', ' Debt ratio %', ' Long-term fund suitability ratio (A)',
                  ' Operating profit/Paid-in capital', ' Total Asset Turnover', ' Accounts Receivable Turnover', ' Average Collection Days',
                  ' Inventory Turnover Rate (times)', ' Net Worth Turnover Rate (times)', ' Revenue per person', ' Operating profit per person',
                  ' Allocation rate per person', ' Current Assets/Total Assets', ' Cash/Total Assets', ' Quick Assets/Current Liability',
                  ' Operating Funds to Liability', ' Current Liabilities/Liability', ' Retained Earnings to Total Assets', ' Liability-Assets Flag',
                  ' Net Income to Total Assets', ' Total assets to GNP price', ' Gross Profit to Sales', " Net Income to Stockholder's Equity",
                  ' Liability to Equity', ' Degree of Financial Leverage (DFL)', ' Net Income Flag'}

# 세 방법 모두에서 선택된 Feature 찾기
common_features = correlation_features & rfe_features & rfecv_features

# 두 방법에서 겹치는 Feature 찾기
correlation_rfe_common = correlation_features & rfe_features
correlation_rfecv_common = correlation_features & rfecv_features
rfe_rfecv_common = rfe_features & rfecv_features

# 결과 출력
print('세 방법 모두에서 선택된 Feature:', common_features)
print('상관관계와 RFE에서 선택된 Feature:', correlation_rfe_common)
print('상관관계와 RFECV에서 선택된 Feature:', correlation_rfecv_common)
print('RFE와 RFECV에서 선택된 Feature:', rfe_rfecv_common)

select_feture = correlation_rfe_common | correlation_rfecv_common | rfe_rfecv_common
print('2개 이상 겹치는 Feature:', select_feture )


- <b> 2개 이상 겹치는 Feature: </b>
{'ROA(A) before interest and % after tax', 'Interest Expense Ratio', 'Net profit before tax/Paid-in capital', "Net Income to Stockholder's Equity", 'Persistent EPS in the Last Four Seasons', 'Bankrupt?', 'Total debt/Total net worth', 'Debt ratio %', 'Net Income to Total Assets', 'Degree of Financial Leverage (DFL)', 'Cash/Total Assets', 'Retained Earnings to Total Assets'}