In [1]:
from sklearn import feature_selection as fs
from sklearn import datasets

# Removing features with low variance
## 設定變異數門檻值，剔除變異過低的特徵
## $$Var(X)=E[(X-\mu)^2]$$
## Bernoulli distribution: $$Var(X)=p(1-p)$$

In [3]:
# Ivy: toy matrix with six samples (rows) and three binary features (columns).
# The first column is 1 in only one of the six rows, so its variance is quite low.
X = [
    [0, 0, 1],
    [0, 1, 0],
    [1, 0, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
]

In [4]:
# Ivy: variance screening. For a 0/1 feature Var(X) = p(1 - p), so the cutoff
# below is the variance of a binary feature with p = 0.8. After screening,
# the first feature is removed and the other two are kept.
sel = fs.VarianceThreshold(
    threshold=0.8 * (1 - 0.8),
)
sel.fit_transform(X)

array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])

# Univariate feature selection
## 透過單獨計算每個特徵的統計值來決定重要特徵
## 1. SelectKBest: 選取排名前K個重要特徵
## 2. SelectPercentile: 選取排名前K%的重要特徵
## For regression:
### f_regression, mutual_info_regression
## For classification:
### chi2, f_classif, mutual_info_classif

In [None]:
#Ivy: basically, these are just our familiar univariate statistical tests

In [5]:
# Load the iris dataset that ships with scikit-learn, then pull out the
# feature matrix (X) and the class labels (y) used by the cells below.
iris = datasets.load_iris()
X, y = iris.data, iris.target

In [18]:
# Ivy's notes and practice
# Step 1: get to know the data first -- container type, number of dimensions, and shape.
# The '\t' and '\n' strings are passed as separate arguments, so print's default
# single-space separator appears around each of them in the output.
print(
    "type(X):", type(X), '\t', "dim(X):", X.ndim, '\t', "X shape:", X.shape, '\n',
    "type(y):", type(y), '\t', "dim(y):", y.ndim, '\t', "y shape:", y.shape, '\n',
)

# Preview the first ten rows of X and the first ten labels in y.
print(
    'X[:10,]', '\n', X[:10,], '\n',
    'y[:10,]', '\n', y[:10,],
)

type(X): <class 'numpy.ndarray'> 	 dim(X): 2 	 X shape: (150, 4) 
 type(y): <class 'numpy.ndarray'> 	 dim(y): 1 	 y shape: (150,) 

X[:10,] 
 [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]] 
 y[:10,] 
 [0 0 0 0 0 0 0 0 0 0]


### 1. SelectKBest: 選取排名前K個重要特徵

In [23]:
# Ivy's notes and practice:
# 1. Use the chi-squared statistic to rank the columns of X against y.
#    Tip: type `fs.` and press the Tab key to pick `chi2` from autocompletion.
#    chi2 is one of the score functions intended for classification targets.
#    Pass X and y to fit_transform to get the reduced matrix X_new;
#    k=3 keeps three of the four columns.
X_new = (
    fs.SelectKBest(score_func=fs.chi2, k=3)
    .fit_transform(X, y)
)
print(X_new[:10])

[[5.1 1.4 0.2]
 [4.9 1.4 0.2]
 [4.7 1.3 0.2]
 [4.6 1.5 0.2]
 [5.  1.4 0.2]
 [5.4 1.7 0.4]
 [4.6 1.4 0.3]
 [5.  1.5 0.2]
 [4.4 1.4 0.2]
 [4.9 1.5 0.1]]


### 2. SelectPercentile: 選取排名前K%的重要特徵

In [26]:
def _mutual_info_seeded(features, labels):
    """Score features by mutual information with a fixed random seed.

    ``mutual_info_classif`` adds small random noise to continuous features,
    so its scores vary slightly from run to run unless ``random_state`` is
    set. Pinning the seed keeps the ranking reproducible under
    Restart Kernel -> Run All.

    Returns the per-feature score array that ``SelectPercentile`` ranks.
    """
    return fs.mutual_info_classif(features, labels, random_state=0)


# Keep the top 50% of the features (2 of the 4 columns) ranked by mutual information.
X_new2 = fs.SelectPercentile(_mutual_info_seeded, percentile=50).fit_transform(X, y)
print(X_new2[:10])

[[1.4 0.2]
 [1.4 0.2]
 [1.3 0.2]
 [1.5 0.2]
 [1.4 0.2]
 [1.7 0.4]
 [1.4 0.3]
 [1.5 0.2]
 [1.4 0.2]
 [1.5 0.1]]
