# 특징 선택

In [None]:
import warnings

# Silence every warning category for the rest of the session so that library
# warnings do not clutter the recorded cell outputs below.
warnings.simplefilter('ignore')

In [1]:
%%time
import numpy as np
from sklearn.datasets import fetch_rcv1

# Fetch the predefined train / test splits of the RCV1 corpus.
rcv_train = fetch_rcv1(subset='train')
rcv_test = fetch_rcv1(subset='test')

# Feature matrices and raw (indicator-matrix) targets for each split.
X_train, y_train = rcv_train.data, rcv_train.target
X_test, y_test = rcv_test.data, rcv_test.target

# Collapse each indicator row into one integer by taking its dot product with
# the column indices 0..n_classes-1.
# NOTE(review): per the scikit-learn docs the RCV1 target is a multilabel
# indicator matrix, so a row with several active categories yields the SUM of
# their indices rather than a single class id -- confirm this simplification
# is intended.
classes = np.arange(rcv_train.target.shape[1])
y_train = y_train.dot(classes)
y_test = y_test.dot(classes)

print(X_train.shape)

(23149, 47236)
Wall time: 28.1 s


# 분산에 의한 선택

In [14]:
from sklearn.feature_selection import VarianceThreshold
# Variance-based selection: the selector learns per-feature variances from the
# training matrix and removes features whose variance is below the threshold.
selector = VarianceThreshold(threshold=1e-5)
selector.fit(X_train)

# Apply the mask learned on the training split to both splits.
X_train_sel = selector.transform(X_train)
X_test_sel = selector.transform(X_test)

# Last expression of the cell: shows the reduced training shape.
X_train_sel.shape

(23149, 14330)

In [16]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score

# Baseline: Bernoulli naive Bayes trained on X_train as it stands at this
# point (no feature selection applied).
model = BernoulliNB().fit(X_train, y_train)

# Score the training split first, then the held-out split.
train_acc = accuracy_score(y_train, model.predict(X_train))
print("train accuracy:{:5.3f}".format(train_acc))
test_acc = accuracy_score(y_test, model.predict(X_test))
print("test accuracy :{:5.3f}".format(test_acc))

train accuracy:0.381
test accuracy :0.324


In [17]:
# Same classifier as the baseline, now fitted on the reduced matrices held in
# X_train_sel / X_test_sel.
model = BernoulliNB().fit(X_train_sel, y_train)

# Score the training split first, then the held-out split.
train_acc = accuracy_score(y_train, model.predict(X_train_sel))
print("train accuracy:{:5.3f}".format(train_acc))
test_acc = accuracy_score(y_test, model.predict(X_test_sel))
print("test accuracy :{:5.3f}".format(test_acc))

train accuracy:0.529
test accuracy :0.441


# 단일 변수 선택

단일 변수 선택법은 독립변수를 하나씩만 사용한 예측모형의 성능을 이용하여, 분류 성능 혹은 상관관계가 가장 높은 변수만 선택하는 방법이다. 사이킷런 패키지의 feature_selection 서브패키지는 다음과 같은 성능지표를 제공한다.

- chi2: 카이제곱 검정 통계값

- f_classif: 분산분석(ANOVA) F검정 통계값

- mutual_info_classif: 상호정보량(mutual information)

In [21]:
from sklearn.feature_selection import chi2, SelectKBest

selector = SelectKBest(chi2, k=14330)
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)

In [23]:
model = BernoulliNB()
model.fit(X_train, y_train)
print("train_accuracy:{:5.3f}".format(accuracy_score(y_train, model.predict(X_train))))
print("test_accuracy:{:5.3f}".format(accuracy_score(y_test, model.predict(X_test))))

train_accuracy:0.505
test_accuracy:0.438


# 다른 모형을 이용한 특성 중요도 계산

In [25]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier

In [28]:
# Fit the importance model on a random subsample of the training rows.
n_sample = 10000

# Seeded generator so the subsample and the forest are reproducible.
rng = np.random.RandomState(0)

# Draw DISTINCT row indices: the default replace=True would repeat rows, so
# fewer than n_sample different documents would actually be used. The size is
# clamped because sampling without replacement cannot exceed the row count.
idx = rng.choice(len(y_train), size=min(n_sample, len(y_train)), replace=False)

# Extremely randomized trees; the fitted model is consumed by the next cell.
model = ExtraTreesClassifier(n_estimators=50, random_state=rng).fit(X_train[idx, :], y_train[idx])

In [29]:
# Wrap the already-fitted estimator (prefit=True, so it is not refitted here)
# and use it to pick features.
# NOTE(review): with the default threshold, max_features is only an upper
# bound on the number of kept features; the scikit-learn docs say to pass
# threshold=-np.inf to select purely by max_features -- confirm which is
# intended.
selector = SelectFromModel(estimator=model, prefit=True, max_features=14330)

# Apply the same feature mask to the train split, then the test split.
X_train_sel, X_test_sel = (selector.transform(split) for split in (X_train, X_test))

In [30]:
%%time
# Refit Bernoulli naive Bayes on the matrices produced by the preceding
# selector (X_train_sel / X_test_sel). This rebinds `model`.
model = BernoulliNB().fit(X_train_sel, y_train)

# Score the training split first, then the held-out split.
train_acc = accuracy_score(y_train, model.predict(X_train_sel))
print("train accuracy:{:5.3f}".format(train_acc))
test_acc = accuracy_score(y_test, model.predict(X_test_sel))
print("test accuracy :{:5.3f}".format(test_acc))

train accuracy:0.630
test accuracy :0.511
Wall time: 10.3 s
