# 특징 선택(RFE) 예제 – scikit-learn 사용
scikit-learn 파이썬 기계 학습 라이브러리는 RFE(Recursive Feature Elimination, 재귀적 특징 제거) 구현을 제공합니다. RFE를 사용하려면 먼저 `estimator` 인수로 사용할 알고리즘을, `n_features_to_select` 인수로 선택할 특징의 개수를 지정하여 클래스를 구성합니다. 다음 예제는 10개의 입력 특징 중 5개는 유의미한(informative) 특징이고 5개는 중복(redundant) 특징인 합성 분류 데이터 세트를 정의합니다. 그런 다음 의사 결정 트리 알고리즘을 추정기로 하는 RFE를 사용하여 5개의 특징을 선택합니다.

## report which features were selected by RFE

In [1]:
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

In [12]:
# Build a synthetic classification dataset: 1000 samples with 10 features,
# of which 5 are informative and 5 are redundant (fixed seed).
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_redundant=5,
    random_state=42,
)

In [13]:
X

array([[-0.05935747, -0.81075922, -1.46291185, ..., -0.71333492,
        -0.38779067,  0.30975536],
       [ 1.35040799,  4.22073174,  1.56416182, ..., -1.35261676,
         0.56164546,  3.05553349],
       [ 1.06995768,  3.02959431, -2.68448393, ...,  1.67460575,
         1.4897756 ,  0.03734801],
       ...,
       [ 1.32117879,  3.00727694,  0.242322  , ..., -0.802865  ,
         0.67092995,  2.27887143],
       [-0.52716176,  0.70840414, -1.69716553, ..., -0.58513067,
        -0.57348277, -0.17506291],
       [-1.72169116, -0.78642113, -1.12858852, ...,  2.46859264,
        -0.00922643, -2.81226403]])

In [4]:
X.shape

(1000, 10)

In [7]:
y # classification result (target labels)

array([1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,

In [5]:
y.shape

(1000,)

In [6]:
y.ndim

1

In [8]:
# Configure recursive feature elimination with a decision tree as the
# estimator, keeping 5 features.
rfe = RFE(
    estimator=DecisionTreeClassifier(),
    n_features_to_select=5,
)

In [9]:
rfe

RFE(estimator=DecisionTreeClassifier(), n_features_to_select=5)

In [10]:
# Run the feature elimination on the synthetic dataset.
rfe.fit(X, y)

RFE(estimator=DecisionTreeClassifier(), n_features_to_select=5)

In [11]:
# Report, for every column index, whether RFE kept it and the rank it received.
for idx, (kept, rank) in enumerate(zip(rfe.support_, rfe.ranking_)):
    print('Column: %d, Selected=%s, Rank: %d' % (idx, kept, rank))

Column: 0, Selected=False, Rank: 2
Column: 1, Selected=True, Rank: 1
Column: 2, Selected=False, Rank: 3
Column: 3, Selected=True, Rank: 1
Column: 4, Selected=True, Rank: 1
Column: 5, Selected=True, Rank: 1
Column: 6, Selected=False, Rank: 5
Column: 7, Selected=True, Rank: 1
Column: 8, Selected=False, Rank: 6
Column: 9, Selected=False, Rank: 4


---

자료 입력하여 실습

In [2]:
import pandas as pd

In [3]:
# Load the credit-card dataset from a CSV file in the working directory.
data = pd.read_csv('credit_cards_dataset.csv')

In [4]:
# Rename the target column to 'DEFAULT_NEXT_MONTH'.
# NOTE(review): the following cells and their recorded output refer to the
# target as 'default.payment.next.month', so the key used here appears not to
# match any column and this rename would be a silent no-op — confirm against
# the CSV header.
data = data.rename(columns={'default payment next month': 'DEFAULT_NEXT_MONTH'})

In [5]:
# Target column name, followed by the predictor column names: every column
# of `data` except the target itself, 'ID' and 'Y_Value'.
y = 'default.payment.next.month'
X = [
    column
    for column in data.columns
    if column not in (y, 'ID', 'Y_Value')
]
print('y =', y)
print('X =', X)

y = default.payment.next.month
X = ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']


In [6]:
# Rebind X from the list of predictor column names to the DataFrame holding
# those columns.
X = data[X]

In [7]:
X

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,20000.0,2,2,1,24,2,2,-1,-1,-2,...,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0
1,120000.0,2,2,2,26,-1,2,0,0,0,...,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0
2,90000.0,2,2,2,34,0,0,0,0,0,...,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0
3,50000.0,2,2,1,37,0,0,0,0,0,...,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0
4,50000.0,1,2,1,57,-1,0,-1,0,0,...,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,220000.0,1,3,1,39,0,0,0,0,0,...,208365.0,88004.0,31237.0,15980.0,8500.0,20000.0,5003.0,3047.0,5000.0,1000.0
29996,150000.0,1,3,2,43,-1,-1,-1,-1,0,...,3502.0,8979.0,5190.0,0.0,1837.0,3526.0,8998.0,129.0,0.0,0.0
29997,30000.0,1,2,2,37,4,3,2,-1,0,...,2758.0,20878.0,20582.0,19357.0,0.0,0.0,22000.0,4200.0,2000.0,3100.0
29998,80000.0,1,3,1,41,1,-1,0,0,0,...,76304.0,52774.0,11855.0,48944.0,85900.0,3409.0,1178.0,1926.0,52964.0,1804.0


In [8]:
# Rebind y from the target column name to the target column itself.
y = data['default.payment.next.month'] 

In [9]:
y

0        1
1        1
2        0
3        0
4        0
        ..
29995    0
29996    0
29997    1
29998    1
29999    1
Name: default.payment.next.month, Length: 30000, dtype: int64

In [10]:
X.shape

(30000, 23)

In [11]:
y.shape

(30000,)

In [12]:
y.ndim

1

In [13]:
# Configure recursive feature elimination with a decision tree as the
# estimator, keeping 5 of the credit-card predictor columns.
rfe = RFE(
    estimator=DecisionTreeClassifier(),
    n_features_to_select=5,
)

In [14]:
# Run the feature elimination on the credit-card data.
rfe.fit(X, y)

RFE(estimator=DecisionTreeClassifier(), n_features_to_select=5)

In [15]:
# Report, for every column position in X, whether RFE kept it and the rank
# it received.
for idx, (kept, rank) in enumerate(zip(rfe.support_, rfe.ranking_)):
    print('Column: %d, Selected=%s, Rank: %d' % (idx, kept, rank))

Column: 0, Selected=False, Rank: 7
Column: 1, Selected=False, Rank: 16
Column: 2, Selected=False, Rank: 13
Column: 3, Selected=False, Rank: 17
Column: 4, Selected=False, Rank: 2
Column: 5, Selected=True, Rank: 1
Column: 6, Selected=False, Rank: 12
Column: 7, Selected=False, Rank: 14
Column: 8, Selected=False, Rank: 19
Column: 9, Selected=False, Rank: 18
Column: 10, Selected=False, Rank: 15
Column: 11, Selected=True, Rank: 1
Column: 12, Selected=True, Rank: 1
Column: 13, Selected=False, Rank: 11
Column: 14, Selected=True, Rank: 1
Column: 15, Selected=False, Rank: 10
Column: 16, Selected=True, Rank: 1
Column: 17, Selected=False, Rank: 5
Column: 18, Selected=False, Rank: 3
Column: 19, Selected=False, Rank: 6
Column: 20, Selected=False, Rank: 9
Column: 21, Selected=False, Rank: 8
Column: 22, Selected=False, Rank: 4
