In [22]:
# Dimensionality Reduction Using Feature Selection
# There are three types of feature selection methods: filter, wrapper, and embedded.
# filter: select the best features by examining their statistical properties
# wrapper: use trial and error to find the subset of features that produces models with the highest-quality predictions
# embedded: select the best feature subset as part of, or as an extension of, a learning algorithm's training process

import warnings

import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.datasets import load_iris, make_regression
from sklearn.feature_selection import (
    RFECV,
    SelectKBest,
    SelectPercentile,
    VarianceThreshold,
    chi2,
    f_classif,
)
from sklearn.preprocessing import StandardScaler


In [5]:
# 10.1 Thresholding numerical feature variance
#
# Variance thresholding is motivated by the idea that features with low
# variance are likely less interesting than features with high variance.
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Features whose variance falls below the 0.5 threshold are removed.
thresholder = VarianceThreshold(threshold=0.5)
thresholder.fit(features)
features_high_variance = thresholder.transform(features)

# Preview the first three rows of the reduced feature matrix.
features_high_variance[:3]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2]])

In [6]:
# Refit the thresholder on the current features, then show the variance it
# computed for each column.
thresholder.fit(features)
thresholder.variances_

array([0.68112222, 0.18871289, 3.09550267, 0.57713289])

In [8]:
# Standardize the features first, then look at the per-column variances.
# After scaling every feature has variance 1, so a variance threshold can no
# longer distinguish between them.
scaler = StandardScaler()
features_std = scaler.fit(features).transform(features)

selector = VarianceThreshold()
selector.fit(features_std)
selector.variances_

array([1., 1., 1., 1.])

In [9]:
# 10.2 Thresholding Binary Feature Variance
#
# Each row is an observation and each column a binary (0/1) feature.
features = [
    [0, 1, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
    [1, 0, 0],
]

# For a binary feature the variance is p * (1 - p), where p is the share of
# observations in one class; the threshold is that expression at p = 0.75.
thresholder = VarianceThreshold(threshold=0.75 * (1 - 0.75))
thresholder.fit_transform(features)

array([[0],
       [1],
       [0],
       [1],
       [0]])

In [11]:
# 10.3 Handling highly correlated features
#
# Small feature matrix in which the first two columns move almost in lockstep.
features = np.array(
    [
        [1, 1, 1],
        [2, 2, 0],
        [3, 3, 1],
        [4, 4, 0],
        [5, 5, 1],
        [6, 6, 0],
        [7, 7, 1],
        [8, 7, 0],
        [9, 7, 1],
    ]
)

dataframe = pd.DataFrame(features)

# Absolute pairwise correlations: only the strength of the relationship
# matters here, not its sign.
corr_matrix = dataframe.corr().abs()

print(dataframe)
print(corr_matrix)


   0  1  2
0  1  1  1
1  2  2  0
2  3  3  1
3  4  4  0
4  5  5  1
5  6  6  0
6  7  7  1
7  8  7  0
8  9  7  1
              0         1             2
0  1.000000e+00  0.976103  9.614813e-18
1  9.761034e-01  1.000000  3.450328e-02
2  9.614813e-18  0.034503  1.000000e+00


In [12]:
# Keep only the strict upper triangle of the correlation matrix (k=1 excludes
# the diagonal) so that each pair of features is examined exactly once; all
# other entries become NaN. The builtin `bool` is used as the mask dtype
# because the `np.bool` alias does not exist in NumPy 1.24-1.26 and raises
# AttributeError there.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Flag every column whose correlation with an earlier column exceeds 0.95.
# NaN entries compare as False, so the masked cells never trigger a drop.
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]
print(to_drop)

[1]


In [13]:
# Show the frame without the highly correlated columns; the result is not
# assigned, so `dataframe` itself is left unchanged.
dataframe.drop(columns=to_drop)

Unnamed: 0,0,2
0,1,1
1,2,0
2,3,1
3,4,0
4,5,1
5,6,0
6,7,1
7,8,0
8,9,1


In [14]:
# Signed pairwise correlations of the full frame, for comparison with the
# absolute values stored in `corr_matrix`.
dataframe.corr()

Unnamed: 0,0,1,2
0,1.0,0.976103,-9.614813e-18
1,0.9761034,1.0,-0.03450328
2,-9.614813e-18,-0.034503,1.0


In [15]:
# Display the masked correlation matrix: only entries above the diagonal
# remain, everything else is NaN.
upper

Unnamed: 0,0,1,2
0,,0.976103,9.614813e-18
1,,,0.03450328
2,,,


In [18]:
# 10.4 Removing irrelevant features for classification
iris = load_iris()
target = iris.target

# Cast the measurements to integers so the chi-squared test can treat them as
# categorical values.
features = iris.data.astype(int)

# Keep the two features with the highest chi-squared scores against the target.
chi2_selector = SelectKBest(score_func=chi2, k=2)
features_kbest = chi2_selector.fit_transform(features, target)

print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])



Original number of features: 4
Reduced number of features: 2


In [19]:
# If the features are quantitative, score each one by its ANOVA F-value
# against the target vector and keep the two best.
# NOTE(review): `features` is still the integer-cast iris data left behind by
# the chi-squared cell -- confirm that is the intended input for the F-test.
fvalue_selector = SelectKBest(score_func=f_classif, k=2)
fvalue_selector.fit(features, target)
features_kbest = fvalue_selector.transform(features)

print("original number of features:", features.shape[1])
print("reduced number of features:", features_kbest.shape[1])


original number of features: 4
reduced number of features: 2


In [21]:
# Instead of selecting a specific number of features, SelectPercentile keeps
# the top n percent of features; here the best 75% by ANOVA F-value.
fvalue_selector = SelectPercentile(f_classif, percentile=75)
features_kbest = fvalue_selector.fit_transform(features, target)

print("original number of features:", features.shape[1])
print("reduced number of features:", features_kbest.shape[1])


original number of features: 4
reduced number of features: 3


<img src="assets/chi-square.png">

In [24]:
# 10.5 Recursively eliminating features
#
# Recursive feature elimination with cross-validation (RFECV).

# Silence warnings raised from scipy whose message starts with "internal gelsd".
warnings.filterwarnings(
    action="ignore",
    message="^internal gelsd",
    module="scipy",
)

# Synthetic regression problem: 10,000 samples and 100 features, of which only
# 2 are informative; the fixed random_state keeps the data reproducible.
features, target = make_regression(
    n_samples=10_000,
    n_features=100,
    n_informative=2,
    random_state=1,
)

ols = linear_model.LinearRegression()

# Recursively eliminate features one at a time (step=1), scoring each
# candidate subset by negative mean squared error.
rfecv = RFECV(estimator=ols, step=1, scoring="neg_mean_squared_error")
rfecv.fit(features, target)
rfecv.transform(features)

array([[ 0.00850799,  0.7031277 ,  0.12861379],
       [-1.07500204,  2.56148527,  0.47594291],
       [ 1.37940721, -1.77039484, -0.495306  ],
       ...,
       [-0.80331656, -1.60648007, -0.71695412],
       [ 0.39508844, -1.34564911, -0.13436769],
       [-0.55383035,  0.82880112,  2.16645571]], shape=(10000, 3))

In [25]:
# Number of features RFECV decided to keep.
rfecv.n_features_

np.int64(3)

In [26]:
# Boolean mask over the original columns: True marks a feature that was kept.
rfecv.support_

array([False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [27]:
# Rank of every feature; the kept features are the ones ranked 1.
rfecv.ranking_

array([77,  8, 40, 10, 12,  1, 42, 45, 11, 47, 33, 28, 91, 53, 90, 62, 56,
       31, 25, 85, 68, 78, 32, 20, 84, 79, 35, 86, 57,  4, 16, 92,  6, 60,
       50, 22, 74, 46, 73,  1, 41, 17, 94, 23,  3, 39, 48, 80, 72,  5,  2,
       93, 65, 81, 75, 87, 38, 30, 63, 19, 70, 61, 15,  9, 98, 18, 66, 21,
       64, 26, 44, 24,  1, 27,  7, 54, 37, 34, 13, 52, 89, 43, 71, 14, 58,
       36, 76, 49, 83, 88, 69, 59, 97, 55, 82, 67, 96, 29, 51, 95])