In [1]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

# Standard Scale

#More info: https://medium.com/greyatom/why-how-and-when-to-scale-your-features-4b30ab09db5e


In [2]:
# Toy data set: five samples (rows) with two features (columns) whose
# magnitudes differ a lot, which is exactly when scaling matters.
X = np.array([
    [1, 65],
    [3, 130],
    [2, 80],
    [2, 70],
    [1, 50],
])

# Standard scaling (z-score), column by column: subtract the column mean and
# divide by the column standard deviation (numpy's default, ddof=0).
column_mean = X.mean(axis=0)
column_std = X.std(axis=0)
X_t = (X - column_mean) / column_std

print(X_t)

# Sanity check: every scaled column should have mean ~0 and std 1.
scaled_mean = X_t.mean(axis=0)
scaled_std = X_t.std(axis=0)
print("mean={}, std{}".format(scaled_mean, scaled_std))


[[-1.06904497 -0.5132649 ]
 [ 1.60356745  1.86975072]
 [ 0.26726124  0.03666178]
 [ 0.26726124 -0.32995601]
 [-1.06904497 -1.06319158]]
mean=[-8.8817842e-17  0.0000000e+00], std[1. 1.]


======================================================================================================

# Feature Selection methods

# Removing features with low variance
More info: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html

In [21]:
from sklearn.feature_selection import VarianceThreshold

# Only the feature matrix is needed here; variance filtering ignores labels.
iris = load_iris()
X = iris.data

# Summarise every feature (column) so the variances can be compared with the
# threshold used below.
for i in range(X.shape[1]):
    column = X[:, i]
    # var = mean(abs(x - x.mean())**2)
    print(' min:', column.min(), ' max:', column.max(), ' mean:', column.mean(), ' var:', column.var())
    print('-------------------')

# Shape before selection.
print(X.shape)

# complete this part =================================================

# Drop every feature whose variance is below the threshold.
variance_selector = VarianceThreshold(threshold=0.25)
X = variance_selector.fit_transform(X)

# Shape after selection.
print(X.shape)

 min: 4.3  max: 7.9  mean: 5.843333333333334  var: 0.6811222222222223
-------------------
 min: 2.0  max: 4.4  mean: 3.0573333333333337  var: 0.1887128888888889
-------------------
 min: 1.0  max: 6.9  mean: 3.7580000000000005  var: 3.0955026666666665
-------------------
 min: 0.1  max: 2.5  mean: 1.1993333333333336  var: 0.5771328888888888
-------------------
(150, 4)
(150, 3)


# Univariate feature selection

### SelectKBest
More info: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html

In [10]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

iris = load_iris()
X = iris.data
y = iris.target

# Score every feature against the class labels with the chi-squared test and
# keep the three best ones.
# NOTE: despite its name, X_new is the fitted selector object, not data.
X_new = SelectKBest(score_func=chi2, k=3)
X_new.fit(X, y)

# complete this part =================================================

# Apply the fitted selector to obtain the reduced feature matrix.
features = X_new.transform(X)

# Chi-squared score of each original feature, then the first five reduced rows.
print(X_new.scores_)
print(features[:5])

[ 10.81782088   3.7107283  116.31261309  67.0483602 ]
[[5.1 1.4 0.2]
 [4.9 1.4 0.2]
 [4.7 1.3 0.2]
 [4.6 1.5 0.2]
 [5.  1.4 0.2]]


### SelectPercentile
More info: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html

In [7]:
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2

data_experiment = load_iris()
X = data_experiment.data
y = data_experiment.target
print(X.shape)

# Keep the top 60 % of the features, ranked by their chi-squared score
# against the class labels.
percentile_selector = SelectPercentile(score_func=chi2, percentile=60)
X_new = percentile_selector.fit_transform(X, y)

print(X_new.shape)

(150, 4)
(150, 2)


# Recursive Feature Elimination
More info: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html

In [19]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVR


data_experiment = load_iris()
X = data_experiment.data
y = data_experiment.target

print(X.shape)

# RFE repeatedly fits the estimator and discards the weakest features until
# only n_features_to_select remain.
estimator = SVR(kernel="linear")
selector = RFE(
    estimator=estimator,
    n_features_to_select=2,
    # step >= 1 is the (integer) number of features removed per iteration.
    step=2,
    # Report each fitting round on stdout.
    verbose=True,
)

# fit() returns the selector itself, so no re-assignment is required.
selector.fit(X, y)
X_new = selector.transform(X)

print(X_new.shape)

# Boolean mask of the kept features, then the rank of every feature
# (rank 1 = selected).
print(selector.support_)
print(selector.ranking_)

(150, 4)
Fitting estimator with 4 features.
(150, 2)
[False False  True  True]
[2 2 1 1]


### Recursive feature elimination with cross-validation
More info: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html

In [17]:
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestClassifier

data_experiment = load_iris()
X, y = data_experiment.data, data_experiment.target
print(X.shape)

# Feature extraction: recursive elimination where the number of features to
# keep is chosen by 3-fold cross-validation (never fewer than two).
# Alternative estimator to try: RandomForestClassifier(n_estimators=10)
estimator = SVR(kernel="linear")
rfe = RFECV(estimator=estimator, min_features_to_select=2, cv=3)

# fit() returns the fitted selector, so `fit` and `rfe` are the same object.
fit = rfe.fit(X, y)

# Number of features chosen by cross-validation.
print(fit.n_features_)
# Boolean mask of the selected features.
print(fit.support_)
# Ranking of every feature (rank 1 = selected).
print(fit.ranking_)



(150, 4)
2
[False False  True  True]
[2 3 1 1]


# Feature selection using SelectFromModel
More info: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html

In [7]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

data_experiment = load_iris()
X, y = data_experiment.data, data_experiment.target
print(X.shape)

# Extra-trees are randomised: without a fixed seed the importances (and, with
# a hard threshold, possibly the set of selected features) change from run to
# run. Seeding the estimator makes the cell reproducible.
clf = ExtraTreesClassifier(n_estimators=50, random_state=0)
clf = clf.fit(X, y)
print('Feature importance: ', clf.feature_importances_)

# prefit=True: reuse the already-fitted classifier instead of refitting it.
# Only features whose importance reaches the threshold are kept.
model = SelectFromModel(clf, prefit=True, threshold=0.3)
X_new = model.transform(X)

print(X_new.shape)
# X_new is a plain array and has no `support_`; the boolean mask of the kept
# features is available from the selector via model.get_support().

(150, 4)
Feature importance:  [0.09139317 0.06591985 0.38572401 0.45696297]
(150, 2)


# PCA

In [20]:
# Load the data set once; load_breast_cancer() could be swapped in here.
data_experiment = load_iris()
# Keep the second name bound for any code that still refers to it.
experiment_data = data_experiment
X, y = data_experiment.data, data_experiment.target

# Project the features onto the two principal components.
pca = PCA(n_components=2)

# fit() returns the fitted PCA object itself.
pca_fit = pca.fit(X)
# The original cell had a bare `explained_variance_`, which raises NameError;
# the attribute lives on the fitted PCA object.
print(pca_fit.explained_variance_)

X = pca_fit.transform(X)
