## 今天的範例，帶著大家一起學習如何找到好特徵

In [1]:
# library
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
import math
import statistics
import seaborn as sns
from IPython.display import display
import sklearn
print(sklearn.__version__)
#如果只有 0.19 記得要更新至 最新版本
%matplotlib inline

# 特徵選取會用到的函數
from sklearn.feature_selection import VarianceThreshold
from sklearn import preprocessing



0.23.2


## 產生一組資料集

In [2]:
# Build a small toy dataset: 20 people described by sex, insomnia status,
# age, height and weight.
# The raw column dict gets its own name so that `data` only ever refers to
# the DataFrame (it is never a plain dict, even temporarily).
raw_columns = {
    'sex': ['Male', 'Male', 'Male', 'Male', 'Male',
            'Female', 'Female', 'Female', 'Female', 'Female',
            'Male', 'Male', 'Male', 'Male', 'Male',
            'Female', 'Female', 'Female', 'Female', 'Female'],
    'insomnia': ['Y', 'N', 'N', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'N',
                 'Y', 'N', 'N', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'N'],
    'age': [23, 40, 5, 30, 1, 40, 16, 27, 43, 8,
            23, 39, 5, 29, 1, 42, 13, 29, 41, 10],
    'height': [180, 170, 100, 176, 70, 160, 170, 166, 155, 35,
               170, 168, 101, 175, 72, 163, 169, 163, 151, 40],
    'weight': [100, 68, 20, 70, 10, 45, 50, 58, 58, 17,
               101, 65, 22, 79, 12, 40, 53, 52, 56, 14],
}

# Convert the column dict into a DataFrame and show it.
data = pd.DataFrame(raw_columns)
display(data)

# DataFrame.info() prints the summary itself and returns None, so it must not
# be wrapped in print() (that would emit an extra "None" line).
data.info()

Unnamed: 0,sex,insomnia,age,height,weight
0,Male,Y,23,180,100
1,Male,N,40,170,68
2,Male,N,5,100,20
3,Male,N,30,176,70
4,Male,N,1,70,10
5,Female,N,40,160,45
6,Female,Y,16,170,50
7,Female,Y,27,166,58
8,Female,Y,43,155,58
9,Female,N,8,35,17


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sex       20 non-null     object
 1   insomnia  20 non-null     object
 2   age       20 non-null     int64 
 3   height    20 non-null     int64 
 4   weight    20 non-null     int64 
dtypes: int64(3), object(2)
memory usage: 928.0+ bytes
None


In [3]:
# Names of the numeric columns (dtype int64 or float64), in column order.
num_features = [
    column
    for column, column_dtype in data.dtypes.items()
    if column_dtype == 'int64' or column_dtype == 'float64'
]
print(f'{len(num_features)} Numeric Features : {num_features}\n')

3 Numeric Features : ['age', 'height', 'weight']



In [4]:
# Names of the categorical columns (dtype object), in column order.
cat_features = [
    column
    for column, column_dtype in data.dtypes.items()
    if column_dtype == 'object'
]
print(f'{len(cat_features)} category Features : {cat_features}\n')

2 category Features : ['sex', 'insomnia']



---
### 特徵選取的三大方法
* 過濾法(Filter) 
* 包裝法(Wrapper)
* 嵌入法(Embedded)

* [1.13. Feature selection 語法](https://scikit-learn.org/stable/modules/feature_selection.html#variance-threshold)

---
## 過濾法(Filter): 移除低變異數的特徵
* [VarianceThreshold 語法](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html)


過濾法是先訂定篩選特徵的標準，再依據這些標準挑選出具變化性、且與目標變數有中高度相關的特徵，方法包含:
* 移除低變異數的特徵，適用於數值型態資料
* 單變量特徵選取（Univariate Feature Selection）
   * 目標變數為離散型，採用卡方檢定(chi2)
   * 目標變數為連續型，可採用 f_regression

In [11]:
# Variance filter on the raw numeric features, with the cut-off set to 100.
filter1 = VarianceThreshold(threshold=100).fit(data[num_features])
data_filter1 = filter1.transform(data[num_features])
display(data_filter1)

# Per-feature variances computed during fit.
print(f"Variances is {filter1.variances_}")
# Column indices of the features that passed the filter.
print(f"The support is {filter1.get_support(indices=True)}")

array([[ 23, 180, 100],
       [ 40, 170,  68],
       [  5, 100,  20],
       [ 30, 176,  70],
       [  1,  70,  10],
       [ 40, 160,  45],
       [ 16, 170,  50],
       [ 27, 166,  58],
       [ 43, 155,  58],
       [  8,  35,  17],
       [ 23, 170, 101],
       [ 39, 168,  65],
       [  5, 101,  22],
       [ 29, 175,  79],
       [  1,  72,  12],
       [ 42, 163,  40],
       [ 13, 169,  53],
       [ 29, 163,  52],
       [ 41, 151,  56],
       [ 10,  40,  14]])

Variances is [ 210.6875 2224.51    715.05  ]
The support is [0 1 2]


### 以 變異數為 100 為臨界值，超過的特徵留下，小於的特徵去除。
這個例子中，'age', 'height', 'weight' 都會留下。
大家可以試著調整不同的臨界值，看會有甚麼變化。

In [13]:
# Same variance filter as before, but with a stricter cut-off of 500.
filter1 = VarianceThreshold(threshold=500).fit(data[num_features])
data_filter1 = filter1.transform(data[num_features])
display(data_filter1)

# Per-feature variances computed during fit.
print(f"Variances is {filter1.variances_}")
# Column indices of the features that passed the filter.
print(f"The support is {filter1.get_support(indices=True)}")

array([[180, 100],
       [170,  68],
       [100,  20],
       [176,  70],
       [ 70,  10],
       [160,  45],
       [170,  50],
       [166,  58],
       [155,  58],
       [ 35,  17],
       [170, 101],
       [168,  65],
       [101,  22],
       [175,  79],
       [ 72,  12],
       [163,  40],
       [169,  53],
       [163,  52],
       [151,  56],
       [ 40,  14]])

Variances is [ 210.6875 2224.51    715.05  ]
The support is [1 2]


## 觀察: 會發現不一樣的資料變動範圍，有著不同的變異數，所以很難設定統一的過濾標準
建議先把數值型資料建置統一的標準化動作

In [14]:
# Put every numeric feature on a common scale first, so that a single variance
# threshold can be applied to all of them.
# StandardScaler (z = (x - u) / s) is deliberately NOT used here: it would force
# every feature's variance to 1, leaving nothing for the filter to tell apart.
# MinMaxScaler is used instead.
std = preprocessing.MinMaxScaler()
std.fit(data[num_features])
data_std = std.transform(data[num_features])

# The result is a NumPy array, no longer a DataFrame.
display(data_std)

array([[0.52380952, 1.        , 0.98901099],
       [0.92857143, 0.93103448, 0.63736264],
       [0.0952381 , 0.44827586, 0.10989011],
       [0.69047619, 0.97241379, 0.65934066],
       [0.        , 0.24137931, 0.        ],
       [0.92857143, 0.86206897, 0.38461538],
       [0.35714286, 0.93103448, 0.43956044],
       [0.61904762, 0.90344828, 0.52747253],
       [1.        , 0.82758621, 0.52747253],
       [0.16666667, 0.        , 0.07692308],
       [0.52380952, 0.93103448, 1.        ],
       [0.9047619 , 0.91724138, 0.6043956 ],
       [0.0952381 , 0.45517241, 0.13186813],
       [0.66666667, 0.96551724, 0.75824176],
       [0.        , 0.25517241, 0.02197802],
       [0.97619048, 0.88275862, 0.32967033],
       [0.28571429, 0.92413793, 0.47252747],
       [0.66666667, 0.88275862, 0.46153846],
       [0.95238095, 0.8       , 0.50549451],
       [0.21428571, 0.03448276, 0.04395604]])

In [16]:
# This is only a first-pass screen, so the threshold is kept lenient.
filter1 = VarianceThreshold(threshold=0.05).fit(data_std)
data_filter1 = filter1.transform(data_std)
display(data_filter1)

# The attributes and methods below are only available once the filter is fitted.
print(f"Variances is {filter1.variances_}")
print(f"After transform is {filter1.transform(data_std)}")
# Column indices of the features that passed the filter.
print(f"The support is {filter1.get_support(indices=True)}")
print(f"After reverse transform is {filter1.inverse_transform(filter1.transform(data_std))}")

array([[0.52380952, 1.        , 0.98901099],
       [0.92857143, 0.93103448, 0.63736264],
       [0.0952381 , 0.44827586, 0.10989011],
       [0.69047619, 0.97241379, 0.65934066],
       [0.        , 0.24137931, 0.        ],
       [0.92857143, 0.86206897, 0.38461538],
       [0.35714286, 0.93103448, 0.43956044],
       [0.61904762, 0.90344828, 0.52747253],
       [1.        , 0.82758621, 0.52747253],
       [0.16666667, 0.        , 0.07692308],
       [0.52380952, 0.93103448, 1.        ],
       [0.9047619 , 0.91724138, 0.6043956 ],
       [0.0952381 , 0.45517241, 0.13186813],
       [0.66666667, 0.96551724, 0.75824176],
       [0.        , 0.25517241, 0.02197802],
       [0.97619048, 0.88275862, 0.32967033],
       [0.28571429, 0.92413793, 0.47252747],
       [0.66666667, 0.88275862, 0.46153846],
       [0.95238095, 0.8       , 0.50549451],
       [0.21428571, 0.03448276, 0.04395604]])

Variances is [0.11943736 0.10580309 0.08634827]
After transform is [[0.52380952 1.         0.98901099]
 [0.92857143 0.93103448 0.63736264]
 [0.0952381  0.44827586 0.10989011]
 [0.69047619 0.97241379 0.65934066]
 [0.         0.24137931 0.        ]
 [0.92857143 0.86206897 0.38461538]
 [0.35714286 0.93103448 0.43956044]
 [0.61904762 0.90344828 0.52747253]
 [1.         0.82758621 0.52747253]
 [0.16666667 0.         0.07692308]
 [0.52380952 0.93103448 1.        ]
 [0.9047619  0.91724138 0.6043956 ]
 [0.0952381  0.45517241 0.13186813]
 [0.66666667 0.96551724 0.75824176]
 [0.         0.25517241 0.02197802]
 [0.97619048 0.88275862 0.32967033]
 [0.28571429 0.92413793 0.47252747]
 [0.66666667 0.88275862 0.46153846]
 [0.95238095 0.8        0.50549451]
 [0.21428571 0.03448276 0.04395604]]
The support is [0 1 2]
After reverse transform is [[0.52380952 1.         0.98901099]
 [0.92857143 0.93103448 0.63736264]
 [0.0952381  0.44827586 0.10989011]
 [0.69047619 0.97241379 0.65934066]
 [0.         0.241

---
## 單變量特徵選取（Univariate Feature Selection）
* 離散型和數值型資料皆可
* y: 離散型: 
    * 卡方:Compute chi-squared stats between each non-negative feature and class.
* y: 數值型: regression: 
    * The correlation between each regressor and the target is computed, that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) * std(y)).

## 執行單變量特徵選取（Univariate Feature Selection）步驟:
* Step1: 根據目標變量是連續或離散，來決定判斷的準則，
    * 離散:卡方檢定 ( chi2 )
    * 連續: 單變量線性迴歸的 F 檢定 ( f_regression )
* Step2: 根據準則大小，依序選取單變量個數
    * [SelectKBest](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html#sklearn.feature_selection.SelectKBest) : 選取 K 個最好的特徵, k 為參數，代表你想選擇多少特徵。
    * [SelectPercentile](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html#sklearn.feature_selection.SelectPercentile) : 選取多少百分比的特徵，percentile 為參數，代表百分比，用 10 代表 10%。


In [18]:
# Mixed categorical + continuous features, categorical target (insomnia).
from sklearn.feature_selection import SelectKBest, chi2

# The string column must be encoded as numbers first; feeding it in directly
# raises "ValueError: could not convert string to float: 'Male'".
sex_mapping = dict(Male=1, Female=0)
data['sex1'] = data['sex'].map(sex_mapping)

y = data['insomnia']
x = data[['sex1', 'age', 'height', 'weight']]
display(x)

# Keep the 2 features with the highest chi-squared score against the target.
x_new = SelectKBest(score_func=chi2, k=2).fit(x, y).transform(x)
display(x_new.shape, x_new)

Unnamed: 0,sex1,age,height,weight
0,1,23,180,100
1,1,40,170,68
2,1,5,100,20
3,1,30,176,70
4,1,1,70,10
5,0,40,160,45
6,0,16,170,50
7,0,27,166,58
8,0,43,155,58
9,0,8,35,17


(20, 2)

array([[180, 100],
       [170,  68],
       [100,  20],
       [176,  70],
       [ 70,  10],
       [160,  45],
       [170,  50],
       [166,  58],
       [155,  58],
       [ 35,  17],
       [170, 101],
       [168,  65],
       [101,  22],
       [175,  79],
       [ 72,  12],
       [163,  40],
       [169,  53],
       [163,  52],
       [151,  56],
       [ 40,  14]])

In [19]:
# Mixed categorical + continuous features, categorical target (insomnia).
from sklearn.feature_selection import SelectPercentile, chi2

# The string column must be encoded as numbers first; feeding it in directly
# raises "ValueError: could not convert string to float: 'Male'".
sex_mapping = dict(Male=1, Female=0)
data['sex1'] = data['sex'].map(sex_mapping)

y = data['insomnia']
x = data[['sex1', 'age', 'height', 'weight']]
display(x)

# Keep the top 50% of the features, ranked by chi-squared score against the target.
x_new = SelectPercentile(score_func=chi2, percentile=50).fit(x, y).transform(x)
display(x_new.shape, x_new)

Unnamed: 0,sex1,age,height,weight
0,1,23,180,100
1,1,40,170,68
2,1,5,100,20
3,1,30,176,70
4,1,1,70,10
5,0,40,160,45
6,0,16,170,50
7,0,27,166,58
8,0,43,155,58
9,0,8,35,17


(20, 2)

array([[180, 100],
       [170,  68],
       [100,  20],
       [176,  70],
       [ 70,  10],
       [160,  45],
       [170,  50],
       [166,  58],
       [155,  58],
       [ 35,  17],
       [170, 101],
       [168,  65],
       [101,  22],
       [175,  79],
       [ 72,  12],
       [163,  40],
       [169,  53],
       [163,  52],
       [151,  56],
       [ 40,  14]])

* 以卡方檢定 (chi2) 的結果來看，和失眠狀態最相關的變數為 height, weight
---

In [20]:
# Suppose we want to predict weight from sex, age, height and insomnia status.
# Weight is a continuous variable, so this is a regression problem.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

# String columns must be encoded as numbers first; feeding them in directly
# raises "ValueError: could not convert string to float: 'Male'".
insomnia_mapping = {
           'Y': 1,
           'N': 0
}
data['insomnia1'] = data['insomnia'].map(insomnia_mapping)

sex_mapping = {
           'Male': 1,
           'Female': 0
}
data['sex1'] = data['sex'].map(sex_mapping)

x = data[['sex1','age', 'height','insomnia1']]
display(x)
y = data['weight']

# The target is continuous, so score each feature with f_regression (univariate
# linear-regression F-test). chi2 is meant for categorical targets: it would
# treat every distinct weight value as its own class.
# k=2 keeps the two best-scoring features (half of the four candidates).
x_new = SelectKBest(f_regression, k=2).fit_transform(x, y)
display(x_new.shape)
display(x_new)

Unnamed: 0,sex1,age,height,insomnia1
0,1,23,180,1
1,1,40,170,0
2,1,5,100,0
3,1,30,176,0
4,1,1,70,0
5,0,40,160,0
6,0,16,170,1
7,0,27,166,1
8,0,43,155,1
9,0,8,35,0


(20, 2)

array([[ 23, 180],
       [ 40, 170],
       [  5, 100],
       [ 30, 176],
       [  1,  70],
       [ 40, 160],
       [ 16, 170],
       [ 27, 166],
       [ 43, 155],
       [  8,  35],
       [ 23, 170],
       [ 39, 168],
       [  5, 101],
       [ 29, 175],
       [  1,  72],
       [ 42, 163],
       [ 13, 169],
       [ 29, 163],
       [ 41, 151],
       [ 10,  40]])

* 以迴歸係數來看，預測體重最佳的變數為 age, height。

---
## 包裝法
包裝法將特徵選擇看作是搜索問題，根據某一種評量標準，每次選擇某些特徵或排除某些特徵，方法包含:
* 遞歸特徵消除 (RFE)
    * 最常用的方法
Feature ranking with recursive feature elimination.
* [語法](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html#sklearn.feature_selection.RFE)
    * step corresponds to the (integer) number of features to remove at each iteration
    * A supervised learning estimator with a fit method that provides information about feature importance either through a coef_ attribute or through a feature_importances_ attribute.

## 以課程案例來看，預測失眠狀態，為離散分類問題
* 所以評估器 (estimator) 採用 SVC(kernel='linear')
* [語法](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html?highlight=svc#sklearn.svm.SVC)

In [21]:
# NOTE: make_friedman1 is not used in this cell; the import is kept as-is in
# case other cells rely on it.
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

# String columns must be encoded as numbers first; feeding them in directly
# raises "ValueError: could not convert string to float: 'Male'".
sex_mapping = {
           'Male': 1,
           'Female': 0
}
data['sex1'] = data['sex'].map(sex_mapping)

x = data[['sex1','age', 'height', 'weight']]
display(x)

# Encode the target inside this cell instead of relying on the 'insomnia1'
# column happening to exist from an earlier cell.
insomnia_mapping = {
           'Y': 1,
           'N': 0
}
data['insomnia1'] = data['insomnia'].map(insomnia_mapping)

y = data['insomnia1']

# Recursive feature elimination: a linear SVC is the estimator, one feature is
# dropped per iteration (step=1) until two features remain.
estimator = SVC(kernel="linear")
selector = RFE(estimator, n_features_to_select=2, step=1)
selector = selector.fit(x, y)

# True = selected feature
print('Support:', selector.support_)

# Selected (i.e., estimated best) features are assigned rank 1.
ranking = selector.ranking_
print('Ranking:', ranking)

# Column names of the selected features.
rfe_feature = x.loc[:, selector.support_].columns.tolist()
print('RFE_feature:', rfe_feature)

Unnamed: 0,sex1,age,height,weight
0,1,23,180,100
1,1,40,170,68
2,1,5,100,20
3,1,30,176,70
4,1,1,70,10
5,0,40,160,45
6,0,16,170,50
7,0,27,166,58
8,0,43,155,58
9,0,8,35,17


Support: [ True False False  True]
Ranking: [1 2 3 1]
RFE_feature: ['sex1', 'weight']


* 採用 包裝法 時，找出最能分辨出失眠狀態的特徵 ['sex', 'weight']