# Installing Libraries

In [None]:
!pip install -q scikit-learn==1.4.1.post1 numpy pandas altair statsmodels

이번 실습에서는 새로운 라이브러리를 추가적으로 설치한다. 바로, Python에서 다양한 통계적 분석을 할 수 있게 해주는 [statsmodels](https://www.statsmodels.org/stable/index.html)이다.

# Data Preparation

In [None]:
from sklearn.datasets import load_breast_cancer
import pandas as pd


# Numeric dataset: scikit-learn's built-in breast cancer data, requested as
# pandas objects so the feature table and the target can be joined column-wise.
data = load_breast_cancer(as_frame=True)
BC_NUM = pd.concat(
    objs=[data.data, data.target],
    axis='columns',
)

# Categorical dataset: a CSV downloaded from a fixed remote file id.
BC_CAT = pd.read_csv(
    'https://drive.usercontent.google.com/download?export=download&confirm=t&id=1VJU3unefbDhqFB5Y0axHUCAoErPCaqm9'
)

이번 실습에는 두 가지의 데이터를 사용해보자. 하나는 지난 실습에도 사용했었던, breast cancer 데이터셋이다.

In [None]:
# Display the numeric table (feature columns followed by the `target` column).
BC_NUM

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


또 다른 데이터셋은 역시 breast cancer이지만, 모든 특성값이 범주형으로 구성된 데이터셋이다.

In [None]:
# Display the categorical table loaded from the CSV.
BC_CAT

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,class
0,40-49,premeno,15-19,0-2,yes,3,right,left_up,no,recurrence-events
1,50-59,ge40,15-19,0-2,no,1,right,central,no,no-recurrence-events
2,50-59,ge40,35-39,0-2,no,2,left,left_low,no,recurrence-events
3,40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,no-recurrence-events
4,40-49,premeno,30-34,3-5,yes,2,left,right_up,no,recurrence-events
...,...,...,...,...,...,...,...,...,...,...
281,50-59,ge40,30-34,6-8,yes,2,left,left_low,no,no-recurrence-events
282,50-59,premeno,25-29,3-5,yes,2,left,left_low,yes,no-recurrence-events
283,30-39,premeno,30-34,6-8,yes,2,right,right_up,no,no-recurrence-events
284,50-59,premeno,15-19,0-2,no,2,right,left_low,no,no-recurrence-events


# Unsupervised Feature Selection

## Quality-based Method
이 방법은 넘어가기로 하자. 현재 준비한 데이터는 정상적으로 수집되었다고 가정하자.

## Variance-based Method
scikit-learn에서는 특성값 선택을 위한 다양한 함수 및 클래스를 [sklearn.feature_selection](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection) 패키지 안에 제공하고 있다. 오늘 실습할 많은 특성값 선택 방법들도 해당 패키지 내의 함수들을 주로 활용할 것이다.

먼저, 낮은 분산을 가진 데이터를 제거해보자.

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold


# Fit the variance filter on the feature columns only (the label is excluded).
X = BC_NUM.drop(columns='target')
selector = VarianceThreshold(threshold=0.8)  # features with variance lower than 0.8 are removed
selector.fit(X)

# Tabulate the per-feature variance learned during fitting, rounded for display.
pd.DataFrame({
    'feature': selector.feature_names_in_,
    'variance': selector.variances_.round(3),
})

Unnamed: 0,feature,variance
0,mean radius,12.397
1,mean texture,18.466
2,mean perimeter,589.403
3,mean area,123625.903
4,mean smoothness,0.0
5,mean compactness,0.003
6,mean concavity,0.006
7,mean concave points,0.002
8,mean symmetry,0.001
9,mean fractal dimension,0.0


위에서 출력한 분산값들을 보면 **mean smoothness, mean compactness** 등의 특성값들이 0.8보다 낮은 분산을 가지고 있다. 물론, 이를 참조해서 일일이 제거해도 되겠지만 보다 간단하게는 다음과 같이 할 수 있다.

In [None]:
import pandas as pd


# Apply the fitted filter, then re-attach the names of the surviving columns
# so the result is shown as a labelled table.
kept_values = selector.transform(X)
kept_names = selector.get_feature_names_out()
pd.DataFrame(kept_values, columns=kept_names)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,perimeter error,area error,worst radius,worst texture,worst perimeter,worst area
0,17.99,10.38,122.80,1001.0,8.589,153.40,25.380,17.33,184.60,2019.0
1,20.57,17.77,132.90,1326.0,3.398,74.08,24.990,23.41,158.80,1956.0
2,19.69,21.25,130.00,1203.0,4.585,94.03,23.570,25.53,152.50,1709.0
3,11.42,20.38,77.58,386.1,3.445,27.23,14.910,26.50,98.87,567.7
4,20.29,14.34,135.10,1297.0,5.438,94.44,22.540,16.67,152.20,1575.0
...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,7.673,158.70,25.450,26.40,166.10,2027.0
565,20.13,28.25,131.20,1261.0,5.203,99.04,23.690,38.25,155.00,1731.0
566,16.60,28.08,108.30,858.1,3.425,48.55,18.980,34.12,126.70,1124.0
567,20.60,29.33,140.10,1265.0,5.772,86.22,25.740,39.42,184.60,1821.0


0.8보다 낮은 분산을 지닌 특성값들은 깔끔하게 제거된 것을 알 수 있다.

## Multicollinearity-based Method
이번엔 단일 특성값의 분산이 아니라, 한 특성값과 다른 특성값이 가지는 상관 관계를 고려해서 **불필요한** 특성값을 제거해보자.


### Pairwise Correlation
한 특성값이 다른 특성값과 가지는 상관 관계를 보는 가장 단순한 방법은, Pandas Dataframe이 제공하는 상관 관계 함수를 사용하는 것이다.

In [None]:
# Pearson correlation matrix over all feature columns (label excluded).
X = BC_NUM.drop(columns='target')
X.corr(method='pearson')

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
mean radius,1.0,0.323782,0.997855,0.987357,0.170581,0.506124,0.676764,0.822529,0.147741,-0.311631,...,0.969539,0.297008,0.965137,0.941082,0.119616,0.413463,0.526911,0.744214,0.163953,0.007066
mean texture,0.323782,1.0,0.329533,0.321086,-0.023389,0.236702,0.302418,0.293464,0.071401,-0.076437,...,0.352573,0.912045,0.35804,0.343546,0.077503,0.27783,0.301025,0.295316,0.105008,0.119205
mean perimeter,0.997855,0.329533,1.0,0.986507,0.207278,0.556936,0.716136,0.850977,0.183027,-0.261477,...,0.969476,0.303038,0.970387,0.94155,0.150549,0.455774,0.563879,0.771241,0.189115,0.051019
mean area,0.987357,0.321086,0.986507,1.0,0.177028,0.498502,0.685983,0.823269,0.151293,-0.28311,...,0.962746,0.287489,0.95912,0.959213,0.123523,0.39041,0.512606,0.722017,0.14357,0.003738
mean smoothness,0.170581,-0.023389,0.207278,0.177028,1.0,0.659123,0.521984,0.553695,0.557775,0.584792,...,0.21312,0.036072,0.238853,0.206718,0.805324,0.472468,0.434926,0.503053,0.394309,0.499316
mean compactness,0.506124,0.236702,0.556936,0.498502,0.659123,1.0,0.883121,0.831135,0.602641,0.565369,...,0.535315,0.248133,0.59021,0.509604,0.565541,0.865809,0.816275,0.815573,0.510223,0.687382
mean concavity,0.676764,0.302418,0.716136,0.685983,0.521984,0.883121,1.0,0.921391,0.500667,0.336783,...,0.688236,0.299879,0.729565,0.675987,0.448822,0.754968,0.884103,0.861323,0.409464,0.51493
mean concave points,0.822529,0.293464,0.850977,0.823269,0.553695,0.831135,0.921391,1.0,0.462497,0.166917,...,0.830318,0.292752,0.855923,0.80963,0.452753,0.667454,0.752399,0.910155,0.375744,0.368661
mean symmetry,0.147741,0.071401,0.183027,0.151293,0.557775,0.602641,0.500667,0.462497,1.0,0.479921,...,0.185728,0.090651,0.219169,0.177193,0.426675,0.4732,0.433721,0.430297,0.699826,0.438413
mean fractal dimension,-0.311631,-0.076437,-0.261477,-0.28311,0.584792,0.565369,0.336783,0.166917,0.479921,1.0,...,-0.253691,-0.051269,-0.205151,-0.231854,0.504942,0.458798,0.346234,0.175325,0.334019,0.767297


물론, 단순하다고 했지 보기 편하다고 하진 않았다. 통계적 유의확률과 함께 상관 관계를 보자.

In [None]:
from itertools import combinations
import scipy.stats as st


X = BC_NUM.drop(columns='target')

# Visit every unordered pair of two distinct feature columns and report the
# Pearson coefficient together with its p-value.
for a, b in combinations(X.columns, r=2):
    res = st.pearsonr(X[a], X[b])
    r = res.statistic
    p = res.pvalue
    print(f'{a} - {b}: {r:.2f} (p-value = {p:.2f})')

mean radius - mean texture: 0.32 (p-value = 0.00)
mean radius - mean perimeter: 1.00 (p-value = 0.00)
mean radius - mean area: 0.99 (p-value = 0.00)
mean radius - mean smoothness: 0.17 (p-value = 0.00)
mean radius - mean compactness: 0.51 (p-value = 0.00)
mean radius - mean concavity: 0.68 (p-value = 0.00)
mean radius - mean concave points: 0.82 (p-value = 0.00)
mean radius - mean symmetry: 0.15 (p-value = 0.00)
mean radius - mean fractal dimension: -0.31 (p-value = 0.00)
mean radius - radius error: 0.68 (p-value = 0.00)
mean radius - texture error: -0.10 (p-value = 0.02)
mean radius - perimeter error: 0.67 (p-value = 0.00)
mean radius - area error: 0.74 (p-value = 0.00)
mean radius - smoothness error: -0.22 (p-value = 0.00)
mean radius - compactness error: 0.21 (p-value = 0.00)
mean radius - concavity error: 0.19 (p-value = 0.00)
mean radius - concave points error: 0.38 (p-value = 0.00)
mean radius - symmetry error: -0.10 (p-value = 0.01)
mean radius - fractal dimension error: -0.04 (

역시나 엄청나게 많은 값들이 출력되었다. 이 대신에 다음과 같은 기준을 만족하는 특성값을 제거하자: 상관 관계는 0.7 이상, 유의 확률은 0.05 미만

In [None]:
from itertools import combinations
import scipy.stats as st


# Features that are strongly correlated with at least one later column are
# collected here so they can be dropped afterwards.
removed = set()
X = BC_NUM.drop('target', axis=1)
for a, b in combinations(X.columns, 2):  # every unordered pair of distinct features
    res = st.pearsonr(X[a], X[b])
    r, p = res.statistic, res.pvalue
    # Compare the magnitude of r: a strong negative correlation makes a
    # feature just as redundant as a strong positive one.
    if abs(r) >= 0.7 and p < 0.05:
        print(f'{a} - {b}: {r:.2f} (p-value = {p:.2f})')
        # Only the first member of the pair is marked, so the last column of
        # each correlated group is the one that survives.
        removed.add(a)

mean radius - mean perimeter: 1.00 (p-value = 0.00)
mean radius - mean area: 0.99 (p-value = 0.00)
mean radius - mean concave points: 0.82 (p-value = 0.00)
mean radius - area error: 0.74 (p-value = 0.00)
mean radius - worst radius: 0.97 (p-value = 0.00)
mean radius - worst perimeter: 0.97 (p-value = 0.00)
mean radius - worst area: 0.94 (p-value = 0.00)
mean radius - worst concave points: 0.74 (p-value = 0.00)
mean texture - worst texture: 0.91 (p-value = 0.00)
mean perimeter - mean area: 0.99 (p-value = 0.00)
mean perimeter - mean concavity: 0.72 (p-value = 0.00)
mean perimeter - mean concave points: 0.85 (p-value = 0.00)
mean perimeter - area error: 0.74 (p-value = 0.00)
mean perimeter - worst radius: 0.97 (p-value = 0.00)
mean perimeter - worst perimeter: 0.97 (p-value = 0.00)
mean perimeter - worst area: 0.94 (p-value = 0.00)
mean perimeter - worst concave points: 0.77 (p-value = 0.00)
mean area - mean concave points: 0.82 (p-value = 0.00)
mean area - radius error: 0.73 (p-value = 0

역시나 많지만, 우리가 제거할 것을 따로 저장해두었다.

In [None]:
# Display the set of feature names marked for removal.
removed

{'area error',
 'compactness error',
 'concavity error',
 'mean area',
 'mean compactness',
 'mean concave points',
 'mean concavity',
 'mean fractal dimension',
 'mean perimeter',
 'mean radius',
 'mean smoothness',
 'mean texture',
 'perimeter error',
 'radius error',
 'worst area',
 'worst compactness',
 'worst concavity',
 'worst perimeter',
 'worst radius'}

이 특성값들을 제거하기만 하면 된다.

In [None]:
# Show the feature table without the columns marked for removal.
X.drop(columns=list(removed))

Unnamed: 0,mean symmetry,texture error,smoothness error,concave points error,symmetry error,fractal dimension error,worst texture,worst smoothness,worst concave points,worst symmetry,worst fractal dimension
0,0.2419,0.9053,0.006399,0.01587,0.03003,0.006193,17.33,0.16220,0.2654,0.4601,0.11890
1,0.1812,0.7339,0.005225,0.01340,0.01389,0.003532,23.41,0.12380,0.1860,0.2750,0.08902
2,0.2069,0.7869,0.006150,0.02058,0.02250,0.004571,25.53,0.14440,0.2430,0.3613,0.08758
3,0.2597,1.1560,0.009110,0.01867,0.05963,0.009208,26.50,0.20980,0.2575,0.6638,0.17300
4,0.1809,0.7813,0.011490,0.01885,0.01756,0.005115,16.67,0.13740,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...
564,0.1726,1.2560,0.010300,0.02454,0.01114,0.004239,26.40,0.14100,0.2216,0.2060,0.07115
565,0.1752,2.4630,0.005769,0.01678,0.01898,0.002498,38.25,0.11660,0.1628,0.2572,0.06637
566,0.1590,1.0750,0.005903,0.01557,0.01318,0.003892,34.12,0.11390,0.1418,0.2218,0.07820
567,0.2397,1.5950,0.006522,0.01664,0.02324,0.006185,39.42,0.16500,0.2650,0.4087,0.12400


### Variance Inflation Factor
이번에는 한 특성값이 다른 복수의 특성값들과 가지는 관계를 확인해보자. 분산 팽창 요인Variance Inflation Factor는 오늘 새롭게 설치한 [statsmodels.stats.outliers_influence.variance_inflation_factor](https://www.statsmodels.org/stable/generated/statsmodels.stats.outliers_influence.variance_inflation_factor.html)에 구현되어 있다.

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor


X = BC_NUM.drop(columns='target')

# variance_inflation_factor() is evaluated for one column position at a time,
# so walk over the column positions of the (unscaled) feature table.
for i in range(X.shape[1]):
    vif = variance_inflation_factor(X, i)
    print(f'{X.columns[i]}: {vif:.2f}')

mean radius: 63306.17
mean texture: 251.05
mean perimeter: 58123.59
mean area: 1287.26
mean smoothness: 393.40
mean compactness: 200.98
mean concavity: 157.86
mean concave points: 154.24
mean symmetry: 184.43
mean fractal dimension: 629.68
radius error: 236.67
texture error: 24.68
perimeter error: 211.40
area error: 72.47
smoothness error: 26.17
compactness error: 44.92
concavity error: 33.24
concave points error: 53.70
symmetry error: 37.18
fractal dimension error: 27.53
worst radius: 9674.74
worst texture: 343.00
worst perimeter: 4487.78
worst area: 1138.76
worst smoothness: 375.60
worst compactness: 132.88
worst concavity: 86.31
worst concave points: 148.67
worst symmetry: 218.92
worst fractal dimension: 423.40


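모든 값이 10보다 큰 것을 알 수 있다. 그렇다면, 이 데이터는 다중공선성 문제로 뒤범벅인 데이터일까? 꼭 그렇지만은 않다. VIF는 한 특성값을 다른 특성값들의 선형 조합으로 예측하는 **선형 모델**을 만들고, 그 결정 계수 $R^2$로부터 $1 / (1 - R^2)$을 계산한다. 그런데 statsmodels의 `variance_inflation_factor`는 이 선형 모델에 **절편Intercept**을 자동으로 추가하지 않기 때문에, 평균이 0이 아닌 특성값을 그대로 넣으면 VIF가 실제보다 크게 부풀려진다. 따라서 VIF를 계산하기 전에, 표준화를 해서 각 특성값의 평균을 0으로 맞춰주자(또는 `statsmodels.api.add_constant`로 상수항을 추가해도 된다). 그리고 10 이상의 VIF를 가진 특성값들을 제거하자.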

In [None]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler


X = BC_NUM.drop(columns='target')

# Standardize the features first and keep the column labels, so the VIF loop
# below can still report feature names.
scaler = StandardScaler()
scaler.fit(X)
X_std = pd.DataFrame(scaler.transform(X), columns=scaler.get_feature_names_out())

# Print the VIF of every standardized feature and remember those at or above 10.
removed = set()
for i in range(X_std.shape[1]):
    vif = variance_inflation_factor(X_std, i)
    print(f'{X_std.columns[i]}: {vif:.2f}')
    if vif >= 10:
        removed.add(X_std.columns[i])

mean radius: 3806.12
mean texture: 11.88
mean perimeter: 3786.40
mean area: 347.88
mean smoothness: 8.19
mean compactness: 50.51
mean concavity: 70.77
mean concave points: 60.04
mean symmetry: 4.22
mean fractal dimension: 15.76
radius error: 75.46
texture error: 4.21
perimeter error: 70.36
area error: 41.16
smoothness error: 4.03
compactness error: 15.37
concavity error: 15.69
concave points error: 11.52
symmetry error: 5.18
fractal dimension error: 9.72
worst radius: 799.11
worst texture: 18.57
worst perimeter: 405.02
worst area: 337.22
worst smoothness: 10.92
worst compactness: 36.98
worst concavity: 31.97
worst concave points: 36.76
worst symmetry: 9.52
worst fractal dimension: 18.86


In [None]:
# Show the unscaled feature table without the columns marked for removal.
X.drop(columns=list(removed))

Unnamed: 0,mean smoothness,mean symmetry,texture error,smoothness error,symmetry error,fractal dimension error,worst symmetry
0,0.11840,0.2419,0.9053,0.006399,0.03003,0.006193,0.4601
1,0.08474,0.1812,0.7339,0.005225,0.01389,0.003532,0.2750
2,0.10960,0.2069,0.7869,0.006150,0.02250,0.004571,0.3613
3,0.14250,0.2597,1.1560,0.009110,0.05963,0.009208,0.6638
4,0.10030,0.1809,0.7813,0.011490,0.01756,0.005115,0.2364
...,...,...,...,...,...,...,...
564,0.11100,0.1726,1.2560,0.010300,0.01114,0.004239,0.2060
565,0.09780,0.1752,2.4630,0.005769,0.01898,0.002498,0.2572
566,0.08455,0.1590,1.0750,0.005903,0.01318,0.003892,0.2218
567,0.11780,0.2397,1.5950,0.006522,0.02324,0.006185,0.4087


무려 23개의 특성값이 제거되었다.

# Supervised Feature Selection
이번엔 레이블 값을 고려해서 특성값을 제거해보자.

## Filter Method
먼저 레이블 값과 특성값 간의 통계적 분석 등을 통해서 유의미한 특성값을 선택하는 Filter Method를 실습한다.

### Analysis of Variance, or ANOVA
먼저, 분산 분석을 해보자. 통계 분석이라서 복잡할 것 같지만(?) scikit-learn에서는 간단하게 분류Classification의 경우 [sklearn.feature_selection.f_classif](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_classif.html#sklearn.feature_selection.f_classif), 회귀Regression의 경우, [sklearn.feature_selection.f_regression](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html#sklearn.feature_selection.f_regression) 함수를 활용하면 된다.



In [None]:
import numpy as np  # needed for np.round below; previously relied on an earlier cell's import
import pandas as pd
from sklearn.feature_selection import f_classif


X, y = BC_NUM.drop('target', axis=1), BC_NUM['target']

# f_classif returns one F statistic and one p-value per feature column.
fstat, pvalue = f_classif(X,  y)
pd.DataFrame({
    'feature': X.columns,
    'fstat': fstat,
    'pvalue': np.round(pvalue, 3)  # rounded for display only
})

Unnamed: 0,feature,fstat,pvalue
0,mean radius,646.981021,0.0
1,mean texture,118.096059,0.0
2,mean perimeter,697.235272,0.0
3,mean area,573.060747,0.0
4,mean smoothness,83.651123,0.0
5,mean compactness,313.233079,0.0
6,mean concavity,533.793126,0.0
7,mean concave points,861.67602,0.0
8,mean symmetry,69.527444,0.0
9,mean fractal dimension,0.093459,0.76


* F 검정 통계량(fstat): 특성값과 레이블 값 간의 관계를 의미하는 값으로, 이 값이 커질수록 둘 사이의 관련성이 크다고 보면 된다.
* 유의 확률(pvalue): 특성값과 레이블 값 간에 관계가 없다는 귀무 가설Null Hypothesis이 참이라고 가정했을 때, 지금 관측된 것과 같거나 더 극단적인 검정 통계량이 나올 확률이다(귀무 가설이 참일 확률이 아니다). 이 값이 낮을수록 특성값과 레이블 값 간의 관련성이 통계적으로 유의미하게 있다고 볼 수 있다. 보통, 0.05 미만이라면 특성값과 레이블 값 간에 통계적으로 유의미한 관계가 있다고 본다.

위 결과를 참조해서, 통계적으로 유의미한 (pvalue < 0.05) 특성값을 선택할건지, 검정 통계량의 값이 높은 특성값을 선택할건지, 아니면 두 기준을 모두 고려할건지는 어디까지나 여러분의 선택에 달렸다.

### Pearson's Correlation
만약, 특성값과 레이블값 간의 상관 계수를 보고 싶다면 [sklearn.feature_selection.r_regression](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.r_regression.html#sklearn.feature_selection.r_regression)를 활용하면 된다. 단, 함수의 이름 그대로 회귀에서만 사용할 수 있으니 주의하도록 하자. 본 실습에서 활용하는 데이터는 분류 과업을 위한 것이므로 넘어가도록 한다.

### Mutual Information
Mutual Information은 두 변수간의 상호 의존성Mutual Dependence를 측정하는 방법으로, 0일시 두 변수가 독립적임을 의미하며, 이 값이 클 수록 서로 관련성이 크다는 것을 의미한다. 역시, scikit-learn에서 분류의 경우 [sklearn.feature_selection.mutual_info_classif](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html#sklearn.feature_selection.mutual_info_classif), 회귀의 경우 [sklearn.feature_selection.mutual_info_regression](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_regression.html#sklearn.feature_selection.mutual_info_regression)을 사용하면 간단하게 해볼 수 있다.

In [None]:
from sklearn.feature_selection import mutual_info_classif
import pandas as pd
import numpy as np


X = BC_NUM.drop(columns='target')
y = BC_NUM['target']

# Estimate the mutual information between every feature and the label.
mi = mutual_info_classif(
    X,
    y,
    discrete_features=False,  # True for categorical features; keep False for numeric ones
    random_state=42,
)

# One row per feature, with the score rounded for display.
pd.DataFrame({'feature': X.columns, 'mi': mi.round(3)})

Unnamed: 0,feature,mi
0,mean radius,0.362
1,mean texture,0.097
2,mean perimeter,0.402
3,mean area,0.36
4,mean smoothness,0.08
5,mean compactness,0.213
6,mean concavity,0.375
7,mean concave points,0.439
8,mean symmetry,0.066
9,mean fractal dimension,0.006


### $\chi^2$ Test
범주형 특성값과 범주형 레이블 값 간의 통계적 분석을 해보자. 이를 위해, 범주형 특성값으로만 구성된 breast cancer 데이터를 써볼 것이다.

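그 전에, 현재 문자열로 구성된 범주형 특성값과 범주형 레이블 값을 숫자로 바꿀 필요가 있다. 이전에도 언급했듯이, 대부분의 기계 학습 모델 및 관련 함수들은 문자열을 직접적으로 다룰 수 없기 때문이다. 이번 실습에서는 간단하게 One-hot encoding 등을 하지 않고, 임의의 숫자로만 바꿔서 진행한다. 다만, scikit-learn의 `chi2`는 특성값을 빈도Frequency와 같은 음이 아닌 값으로 취급하므로, 어떤 범주에 어떤 숫자가 배정되는지에 따라 검정 통계량이 달라질 수 있다는 점은 알아두자. 보다 엄밀한 독립성 검정이 필요하다면 One-hot encoding을 한 뒤에 `chi2`를 적용하거나, `scipy.stats.chi2_contingency`로 분할표Contingency Table를 직접 검정하면 된다.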

In [None]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder


X = BC_CAT.drop(columns='class')
y = BC_CAT['class']

# Encode the feature table column by column into numeric codes, keeping the
# fitted encoder available for later use.
encoder_X = OrdinalEncoder()
X_codes = encoder_X.fit_transform(X)
X_enc = pd.DataFrame(X_codes, columns=encoder_X.get_feature_names_out())

# Encode the one-dimensional label the same way with its own encoder.
encoder_y = LabelEncoder()
y_codes = encoder_y.fit_transform(y)
y_enc = pd.Series(y_codes, name='class')

In [None]:
# Display the numerically encoded feature table.
X_enc

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,2.0,2.0,2.0,0.0,1.0,2.0,1.0,2.0,0.0
1,3.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0
2,3.0,0.0,6.0,0.0,0.0,1.0,0.0,1.0,0.0
3,2.0,2.0,6.0,0.0,1.0,2.0,1.0,1.0,1.0
4,2.0,2.0,5.0,4.0,1.0,1.0,0.0,4.0,0.0
...,...,...,...,...,...,...,...,...,...
281,3.0,0.0,5.0,5.0,1.0,1.0,0.0,1.0,0.0
282,3.0,2.0,4.0,4.0,1.0,1.0,0.0,1.0,1.0
283,1.0,2.0,5.0,5.0,1.0,1.0,1.0,4.0,0.0
284,3.0,2.0,2.0,0.0,0.0,1.0,1.0,1.0,0.0


In [None]:
# Display the numerically encoded label series.
y_enc

0      1
1      0
2      1
3      0
4      1
      ..
281    0
282    0
283    0
284    0
285    0
Name: class, Length: 286, dtype: int64

[sklearn.preprocessing.OrdinalEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html#sklearn.preprocessing.OrdinalEncoder)는 다차원 범주형 특성값을 숫자로, [sklearn.preprocessing.LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html#sklearn.preprocessing.LabelEncoder)는 1차원 레이블 값을 숫자로 바꾸는데 활용한다.

이제, $\chi^2$ 검정을 해보자. [sklearn.feature_selection.chi2](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html#sklearn.feature_selection.chi2)를 사용하면 간단하게 해볼 수 있다.

In [None]:
from sklearn.feature_selection import chi2
import pandas as pd
import numpy as np


# chi2 returns one test statistic and one p-value per encoded feature column.
chistat, pvalue = chi2(X_enc, y_enc)

summary = {
    'feature': X_enc.columns,
    'chi-squared': chistat,
    'pvalue': pvalue.round(3),  # rounded for display only
}
pd.DataFrame(summary)

Unnamed: 0,feature,chi-squared,pvalue
0,age,0.563284,0.453
1,menopause,0.712378,0.399
2,tumor-size,5.534357,0.019
3,inv-nodes,84.840985,0.0
4,node-caps,34.33426,0.0
5,deg-malig,13.272755,0.0
6,breast,0.522775,0.47
7,breast-quad,0.848473,0.357
8,irradiat,8.197246,0.004


* $\chi^2$ 검정 통계량(chi-squared): 이 값이 클 수록 특성값과 레이블값의 관계가 커진다고 볼 수 있다.
* 유의 확률(pvalue): 특성값과 레이블 값 간에 관계가 없다는 귀무 가설Null Hypothesis이 참이라고 가정했을 때, 지금 관측된 것과 같거나 더 극단적인 검정 통계량이 나올 확률이다(귀무 가설이 참일 확률이 아니다). 이 값이 낮을수록 특성값과 레이블 값 간의 관련성이 통계적으로 유의미하게 있다고 볼 수 있다. 보통, 0.05 미만이라면 특성값과 레이블 값 간에 통계적으로 유의미한 관계가 있다고 본다.

분산 분석과 마찬가지로, 통계적으로 유의미한 (pvalue < 0.05) 특성값을 선택할건지, 검정 통계량의 값이 높은 특성값을 선택할건지, 아니면 두 기준을 모두 고려할건지는 어디까지나 여러분의 선택에 달렸다.

## Wrapper Method
이번엔 특성값 선택을 위해 다른 기계 학습 모델을 활용하는 Wrapper Method를 활용해보자. 이 실습에서는 두 가지의 학습 모델을 활용해보겠다:
* Random Forest ([sklearn.ensemble.RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier))
* Logistic Regression ([sklearn.linear_model.LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression))



### Select Features from a Model

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler


X = BC_NUM.drop(columns='target')
y = BC_NUM['target']

# Standardized copy of the features, keeping the column labels.
scaler = StandardScaler().fit(X)
X_std = pd.DataFrame(scaler.transform(X), columns=scaler.get_feature_names_out())

# The random forest is trained on the raw features, the logistic regression
# on the standardized ones.
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X, y)
model_lr = LogisticRegression(random_state=42)
model_lr.fit(X_std, y)

# Side-by-side view: forest importances and flattened regression coefficients.
pd.DataFrame({
    'feature': X.columns,
    'rf': model_rf.feature_importances_,
    'lr': model_lr.coef_.ravel(),
})

Unnamed: 0,feature,rf,lr
0,mean radius,0.034843,-0.375457
1,mean texture,0.015225,-0.382031
2,mean perimeter,0.06799,-0.360971
3,mean area,0.060462,-0.439469
4,mean smoothness,0.007958,-0.167376
5,mean compactness,0.011597,0.560707
6,mean concavity,0.066917,-0.855295
7,mean concave points,0.107046,-0.962849
8,mean symmetry,0.003423,0.076312
9,mean fractal dimension,0.002615,0.328458


위 결과에서 볼 수 있듯이, scikit-learn의 기계 학습 모델 구현체에서는 특성값의 중요도를 **feature_importances_** 또는 **coef_** 속성으로 가져올 수 있다.

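**feature_importances_**의 경우 기계 학습 모델마다 계산하는 방식이 다르다. 예를 들어, Random Forest의 경우에는 각 특성값이 Weak Learner들을 학습시킬 때 불순도Impurity를 얼마나 낮췄는지를 보여준다. XGBoost의 경우 scikit-learn 호환 API의 **feature_importances_**는 (트리 기반 모델에서 기본적으로) 각 특성값이 Weak Learner의 분할Split 기준으로 사용되었을 때 얻은 평균 이득Gain을 의미하며, `get_score`나 `plot_importance`의 기본값은 분할 기준으로 사용된 횟수Weight를 의미한다. 어찌되었든, 높을수록 해당 기계 학습 모델이 중요하게 여긴다는 것은 일맥상통한다.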

**coef_**의 경우는 말 그대로 특성값에 대한 계수Coefficient를 나타낸다. 이 값이 양수라면 결과에 양의 영향(예. 레이블 값을 1로 예측하는데 기여)을 의미하며, 음수라면 그 반대를 의미한다. 양수이든 음수이든 특정한 레이블 값의 예측에 기여하는 것은 마찬가지므로, 보통 이 값의 절대값을 특성값의 중요도로 생각하면 된다.

아무튼, 이 값들을 바탕으로 특성값을 선택하면 된다. 하지만, 이게 귀찮다면 보다 간단하게 하는 방법이 있다. 바로, scikit-learn의 [sklearn.feature_selection.SelectFromModel](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html#sklearn.feature_selection.SelectFromModel)을 활용하는 것이다.

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel

X = BC_NUM.drop(columns='target')
y = BC_NUM['target']

# Standardized copy of the features, keeping the column labels.
scaler = StandardScaler().fit(X)
X_std = pd.DataFrame(scaler.transform(X), columns=scaler.get_feature_names_out())

# threshold='mean': take the absolute importances and keep only the features
# whose importance is at least the mean.
selector_rf = SelectFromModel(
    RandomForestClassifier(random_state=42),
    threshold='mean',
)
selector_rf.fit(X, y)

selector_lr = SelectFromModel(
    LogisticRegression(random_state=42),
    threshold='mean',
)
selector_lr.fit(X_std, y)

# Both tables are built from the unscaled X, so the selected columns are
# shown with their original values (the LR selector was fitted on X_std).
rf_values = selector_rf.transform(X)
X_rf = pd.DataFrame(rf_values, columns=selector_rf.get_feature_names_out())

lr_values = selector_lr.transform(X)
X_lr = pd.DataFrame(lr_values, columns=selector_lr.get_feature_names_out())

In [None]:
# Display the columns kept by the random-forest-based selector.
X_rf

Unnamed: 0,mean radius,mean perimeter,mean area,mean concavity,mean concave points,worst radius,worst perimeter,worst area,worst concavity,worst concave points
0,17.99,122.80,1001.0,0.30010,0.14710,25.380,184.60,2019.0,0.7119,0.2654
1,20.57,132.90,1326.0,0.08690,0.07017,24.990,158.80,1956.0,0.2416,0.1860
2,19.69,130.00,1203.0,0.19740,0.12790,23.570,152.50,1709.0,0.4504,0.2430
3,11.42,77.58,386.1,0.24140,0.10520,14.910,98.87,567.7,0.6869,0.2575
4,20.29,135.10,1297.0,0.19800,0.10430,22.540,152.20,1575.0,0.4000,0.1625
...,...,...,...,...,...,...,...,...,...,...
564,21.56,142.00,1479.0,0.24390,0.13890,25.450,166.10,2027.0,0.4107,0.2216
565,20.13,131.20,1261.0,0.14400,0.09791,23.690,155.00,1731.0,0.3215,0.1628
566,16.60,108.30,858.1,0.09251,0.05302,18.980,126.70,1124.0,0.3403,0.1418
567,20.60,140.10,1265.0,0.35140,0.15200,25.740,184.60,1821.0,0.9387,0.2650


In [None]:
# Display the columns kept by the logistic-regression-based selector.
X_lr

Unnamed: 0,mean concavity,mean concave points,radius error,perimeter error,area error,compactness error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst concavity,worst concave points,worst symmetry
0,0.30010,0.14710,1.0950,8.589,153.40,0.04904,0.006193,25.380,17.33,184.60,2019.0,0.16220,0.7119,0.2654,0.4601
1,0.08690,0.07017,0.5435,3.398,74.08,0.01308,0.003532,24.990,23.41,158.80,1956.0,0.12380,0.2416,0.1860,0.2750
2,0.19740,0.12790,0.7456,4.585,94.03,0.04006,0.004571,23.570,25.53,152.50,1709.0,0.14440,0.4504,0.2430,0.3613
3,0.24140,0.10520,0.4956,3.445,27.23,0.07458,0.009208,14.910,26.50,98.87,567.7,0.20980,0.6869,0.2575,0.6638
4,0.19800,0.10430,0.7572,5.438,94.44,0.02461,0.005115,22.540,16.67,152.20,1575.0,0.13740,0.4000,0.1625,0.2364
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,0.24390,0.13890,1.1760,7.673,158.70,0.02891,0.004239,25.450,26.40,166.10,2027.0,0.14100,0.4107,0.2216,0.2060
565,0.14400,0.09791,0.7655,5.203,99.04,0.02423,0.002498,23.690,38.25,155.00,1731.0,0.11660,0.3215,0.1628,0.2572
566,0.09251,0.05302,0.4564,3.425,48.55,0.03731,0.003892,18.980,34.12,126.70,1124.0,0.11390,0.3403,0.1418,0.2218
567,0.35140,0.15200,0.7260,5.772,86.22,0.06158,0.006185,25.740,39.42,184.60,1821.0,0.16500,0.9387,0.2650,0.4087


이처럼, 한 번에 선택된 특성값들로 구성된 데이터를 출력할 수 있다.

### Permutation Importance
기계 학습 모델이 갖고 있는 특성값 중요도는 (1) 학습 모델에 의존적이며, (2) 실제 성능에 미치는 영향을 알 수 없다는 단점이 있다. 이 대신에, 각 특성값들을 무작위로 섞은 후에 성능 측정치가 어떻게 변하는지를 기준으로 삼는 Permutation Importance를 사용해보자. 이 방법은 sklearn.feature_selection 패키지 안에 속하지 않고, [sklearn.inspection.permutation_importance](https://scikit-learn.org/stable/modules/generated/sklearn.inspection.permutation_importance.html#sklearn.inspection.permutation_importance)에 구현되어 있다.

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance


# Define features and labels locally so this cell does not depend on whichever
# X / y an earlier cell happened to leave behind.
X, y = BC_NUM.drop('target', axis=1), BC_NUM['target']

scaler = StandardScaler().fit(X)
X_std = pd.DataFrame(
    scaler.transform(X),
    columns=scaler.get_feature_names_out()
)

# The random forest is trained on raw features and the logistic regression on
# standardized ones; each model must be scored on the same representation it
# was trained on.
model_rf = RandomForestClassifier(random_state=42).fit(X, y)
model_lr = LogisticRegression(random_state=42).fit(X_std, y)

result_rf = permutation_importance(
    estimator=model_rf,
    X=X,
    y=y,
    scoring='accuracy', # accuracy is the reference performance metric
    n_repeats=5, # how many times each feature is shuffled and re-scored
    random_state=42
)

result_lr = permutation_importance(
    estimator=model_lr,
    X=X_std, # must be the standardized data the model was fitted on, not raw X
    y=y,
    scoring='accuracy', # accuracy is the reference performance metric
    n_repeats=5, # how many times each feature is shuffled and re-scored
    random_state=42
)

# Mean importance over the repeats, one row per feature.
pd.DataFrame(dict(
    feature=X.columns,
    rf=result_rf.importances_mean,
    lr=result_lr.importances_mean
))

Unnamed: 0,feature,rf,lr
0,mean radius,0.0,0.0
1,mean texture,0.002812,0.0
2,mean perimeter,0.0,0.0
3,mean area,0.0,0.0
4,mean smoothness,0.0,0.0
5,mean compactness,0.0,0.0
6,mean concavity,0.000351,0.0
7,mean concave points,0.002812,0.0
8,mean symmetry,0.0,0.0
9,mean fractal dimension,0.0,0.0


초매개변수 **scoring**은 기준으로 삼을 성능 측정치를 의미하며, 이 [링크](https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter)에서 가능한 성능 측정치를 확인할 수 있다.


### Recursive Feature Elimination
이번엔 주어진 수만큼의 특성값이 남을때까지 (1) 기계 학습 모델 훈련, (2) 특성값 중요도가 낮은 특성값 제거를 반복하는 Recursive Feature Elimination을 해보자. scikit-learn에서는 [sklearn.feature_selection.RFE](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html#sklearn.feature_selection.RFE)를 활용하면 된다.

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE

X = BC_NUM.drop(columns='target')
y = BC_NUM['target']

# Standardized copy of the features, keeping the column labels.
scaler = StandardScaler().fit(X)
X_std = pd.DataFrame(scaler.transform(X), columns=scaler.get_feature_names_out())

# n_features_to_select=10: stop once 10 features are left.
# step=1: drop only the single least important feature per round.
selector_rf = RFE(
    RandomForestClassifier(random_state=42),
    n_features_to_select=10,
    step=1,
)
selector_rf.fit(X, y)

selector_lr = RFE(
    LogisticRegression(random_state=42),
    n_features_to_select=10,
    step=1,
)
selector_lr.fit(X_std, y)

# Both tables are built from the unscaled X, so the selected columns are
# shown with their original values (the LR selector was fitted on X_std).
rf_values = selector_rf.transform(X)
X_rf = pd.DataFrame(rf_values, columns=selector_rf.get_feature_names_out())

lr_values = selector_lr.transform(X)
X_lr = pd.DataFrame(lr_values, columns=selector_lr.get_feature_names_out())

In [None]:
# Display the columns kept by the random-forest-based selector.
X_rf

Unnamed: 0,mean perimeter,mean area,mean concavity,mean concave points,worst radius,worst texture,worst perimeter,worst area,worst concavity,worst concave points
0,122.80,1001.0,0.30010,0.14710,25.380,17.33,184.60,2019.0,0.7119,0.2654
1,132.90,1326.0,0.08690,0.07017,24.990,23.41,158.80,1956.0,0.2416,0.1860
2,130.00,1203.0,0.19740,0.12790,23.570,25.53,152.50,1709.0,0.4504,0.2430
3,77.58,386.1,0.24140,0.10520,14.910,26.50,98.87,567.7,0.6869,0.2575
4,135.10,1297.0,0.19800,0.10430,22.540,16.67,152.20,1575.0,0.4000,0.1625
...,...,...,...,...,...,...,...,...,...,...
564,142.00,1479.0,0.24390,0.13890,25.450,26.40,166.10,2027.0,0.4107,0.2216
565,131.20,1261.0,0.14400,0.09791,23.690,38.25,155.00,1731.0,0.3215,0.1628
566,108.30,858.1,0.09251,0.05302,18.980,34.12,126.70,1124.0,0.3403,0.1418
567,140.10,1265.0,0.35140,0.15200,25.740,39.42,184.60,1821.0,0.9387,0.2650


In [None]:
# Display the columns kept by the logistic-regression-based selector.
X_lr

Unnamed: 0,mean concave points,radius error,area error,compactness error,worst radius,worst texture,worst perimeter,worst area,worst concavity,worst concave points
0,0.14710,1.0950,153.40,0.04904,25.380,17.33,184.60,2019.0,0.7119,0.2654
1,0.07017,0.5435,74.08,0.01308,24.990,23.41,158.80,1956.0,0.2416,0.1860
2,0.12790,0.7456,94.03,0.04006,23.570,25.53,152.50,1709.0,0.4504,0.2430
3,0.10520,0.4956,27.23,0.07458,14.910,26.50,98.87,567.7,0.6869,0.2575
4,0.10430,0.7572,94.44,0.02461,22.540,16.67,152.20,1575.0,0.4000,0.1625
...,...,...,...,...,...,...,...,...,...,...
564,0.13890,1.1760,158.70,0.02891,25.450,26.40,166.10,2027.0,0.4107,0.2216
565,0.09791,0.7655,99.04,0.02423,23.690,38.25,155.00,1731.0,0.3215,0.1628
566,0.05302,0.4564,48.55,0.03731,18.980,34.12,126.70,1124.0,0.3403,0.1418
567,0.15200,0.7260,86.22,0.06158,25.740,39.42,184.60,1821.0,0.9387,0.2650


### Sequential Feature Selection
이번엔 특성값 중요도 대신에 성능에 미치는 영향을 기준으로 특성값 선택을 해보자. 복잡하게 구현할 필요없이 scikit-learn의 [sklearn.feature_selection.SequentialFeatureSelector](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SequentialFeatureSelector.html#sklearn.feature_selection.SequentialFeatureSelector)를 사용하면 된다.

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector


X = BC_NUM.drop(columns='target')
y = BC_NUM['target']

# Standardized copy of the features, keeping the column labels.
scaler = StandardScaler().fit(X)
X_std = pd.DataFrame(scaler.transform(X), columns=scaler.get_feature_names_out())

# scoring='accuracy': accuracy is the reference performance metric.
# n_features_to_select=10: end up with 10 features.
# direction='forward': start from zero features and add one at a time;
# 'backward' would start from all features and remove one at a time.
selector_rf = SequentialFeatureSelector(
    RandomForestClassifier(random_state=42),
    scoring='accuracy',
    n_features_to_select=10,
    direction='forward',
)
selector_rf.fit(X, y)

selector_lr = SequentialFeatureSelector(
    LogisticRegression(random_state=42),
    scoring='accuracy',
    n_features_to_select=10,
    direction='forward',
)
selector_lr.fit(X_std, y)

# Both tables are built from the unscaled X, so the selected columns are
# shown with their original values (the LR selector was fitted on X_std).
rf_values = selector_rf.transform(X)
X_rf = pd.DataFrame(rf_values, columns=selector_rf.get_feature_names_out())

lr_values = selector_lr.transform(X)
X_lr = pd.DataFrame(lr_values, columns=selector_lr.get_feature_names_out())

In [None]:
# Display the columns kept by the random-forest-based selector.
X_rf

Unnamed: 0,mean texture,mean area,smoothness error,symmetry error,worst radius,worst texture,worst perimeter,worst smoothness,worst concavity,worst fractal dimension
0,10.38,1001.0,0.006399,0.03003,25.380,17.33,184.60,0.16220,0.7119,0.11890
1,17.77,1326.0,0.005225,0.01389,24.990,23.41,158.80,0.12380,0.2416,0.08902
2,21.25,1203.0,0.006150,0.02250,23.570,25.53,152.50,0.14440,0.4504,0.08758
3,20.38,386.1,0.009110,0.05963,14.910,26.50,98.87,0.20980,0.6869,0.17300
4,14.34,1297.0,0.011490,0.01756,22.540,16.67,152.20,0.13740,0.4000,0.07678
...,...,...,...,...,...,...,...,...,...,...
564,22.39,1479.0,0.010300,0.01114,25.450,26.40,166.10,0.14100,0.4107,0.07115
565,28.25,1261.0,0.005769,0.01898,23.690,38.25,155.00,0.11660,0.3215,0.06637
566,28.08,858.1,0.005903,0.01318,18.980,34.12,126.70,0.11390,0.3403,0.07820
567,29.33,1265.0,0.006522,0.02324,25.740,39.42,184.60,0.16500,0.9387,0.12400


In [None]:
# Display the columns kept by the logistic-regression-based selector.
X_lr

Unnamed: 0,mean radius,mean concavity,mean symmetry,mean fractal dimension,concavity error,worst radius,worst texture,worst perimeter,worst smoothness,worst symmetry
0,17.99,0.30010,0.2419,0.07871,0.05373,25.380,17.33,184.60,0.16220,0.4601
1,20.57,0.08690,0.1812,0.05667,0.01860,24.990,23.41,158.80,0.12380,0.2750
2,19.69,0.19740,0.2069,0.05999,0.03832,23.570,25.53,152.50,0.14440,0.3613
3,11.42,0.24140,0.2597,0.09744,0.05661,14.910,26.50,98.87,0.20980,0.6638
4,20.29,0.19800,0.1809,0.05883,0.05688,22.540,16.67,152.20,0.13740,0.2364
...,...,...,...,...,...,...,...,...,...,...
564,21.56,0.24390,0.1726,0.05623,0.05198,25.450,26.40,166.10,0.14100,0.2060
565,20.13,0.14400,0.1752,0.05533,0.03950,23.690,38.25,155.00,0.11660,0.2572
566,16.60,0.09251,0.1590,0.05648,0.04730,18.980,34.12,126.70,0.11390,0.2218
567,20.60,0.35140,0.2397,0.07016,0.07117,25.740,39.42,184.60,0.16500,0.4087


## Embedded Method
정규화 선형 모델이나 Gradient Boosting에서 다룬 알고리즘들은 기계 학습 모델의 훈련 과정에서 특성값 선택이 포함되어 있다. 이미 이전 실습 시간에 해봤으니 넘어가도록 하자.

## Ensemble Method
지금까지의 과정을 보면, 각 방법별로 중요하다고 여겨지는 특성값들이 조금씩 다른걸 알 수가 있다. 여러가지 방법에서 공통적으로 중요하다고 여겨지는 특성값을 선택하는 것도 좋은 전략 중에 하나다. 위에서 실습한 방법들의 결과를 하나로 합치는 것은 직접 해보도록 하자.