<a href="https://colab.research.google.com/github/jadhav-rakesh/ML/blob/main/ds10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

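# Dimensionality Reduction Using Feature Selection
- Three types of feature selection methods:
    * Filter: select the best features by examining their statistical properties
    * Wrapper: use trial and error to find the subset of features that produces the best predictions
    * Embedded: select the best feature subset as part of the learning algorithm's own training process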

In [1]:
import numpy as np
import pandas as pd

# Thresholding Numerical Feature Variance

In [5]:
# Goal: given a set of numerical features, filter out those with low variance.
# Technique: variance thresholding (VT).
# Caveat: VT will not work when feature sets contain different units.

from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold

iris = datasets.load_iris()
features, target = iris.data, iris.target

# Keep only the features whose variance is above 0.5.
thresholder = VarianceThreshold(threshold=0.5)
features_high_variance = thresholder.fit(features).transform(features)

# Preview the first five rows of the reduced feature matrix.
features_high_variance[:5]


array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2],
       [4.6, 1.5, 0.2],
       [5. , 1.4, 0.2]])

In [3]:
# Refit the thresholder on the same features and show the per-feature
# variances it compares against the threshold.
thresholder.fit(features).variances_

array([0.68112222, 0.18871289, 3.09550267, 0.57713289])

In [4]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature before inspecting variances.
scaler = StandardScaler()
features_std = scaler.fit(features).transform(features)

# After standardization every feature has the same variance, so variance
# thresholding can no longer tell the features apart.
selector = VarianceThreshold()
selector.fit(features_std)
selector.variances_

array([1., 1., 1., 1.])

# Thresholding Binary Feature Variance

In [6]:
# Goal: given a set of binary categorical features, filter out those with
# low variance.

from sklearn.feature_selection import VarianceThreshold

features = [
    [0, 1, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
    [1, 0, 0],
]

# The variance of a binary (Bernoulli) feature is p * (1 - p); the cutoff
# below is that expression evaluated at p = 0.75.
thresholder = VarianceThreshold(threshold=0.75 * (1 - 0.75))
thresholder.fit(features).transform(features)

array([[0],
       [1],
       [0],
       [1],
       [0]])

# Handling Highly Correlated Features


In [9]:
# Toy feature matrix used to demonstrate removing one feature out of a
# highly correlated pair.
features = np.array([[1, 1, 1],
                     [2, 2, 0],
                     [3, 3, 1],
                     [4, 4, 0],
                     [5, 5, 1],
                     [6, 6, 0],
                     [7, 7, 1],
                     [8, 7, 0],
                     [9, 7, 1]])

df = pd.DataFrame(features)

# Absolute pairwise correlations: the sign does not matter for redundancy.
corr_matrix = df.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so every
# feature pair is examined once and no feature is compared with itself.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape),
                                  k=1).astype(bool))

# Column *labels* that correlate above 0.95 with an earlier column.
# NaN entries (the masked lower triangle) compare as False.
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

# Drop by label. `to_drop` holds labels, not positions, so indexing
# `df.columns[to_drop]` is only correct when labels happen to equal positions.
df.drop(columns=to_drop).head(3)

Unnamed: 0,0,2
0,1,1
1,2,0
2,3,1


In [10]:
# Signed pairwise correlation matrix of the toy frame (before taking
# absolute values).
df.corr()

Unnamed: 0,0,1,2
0,1.0,0.976103,0.0
1,0.976103,1.0,-0.034503
2,0.0,-0.034503,1.0


In [12]:
# Upper triangle of the absolute correlation matrix; the diagonal and the
# lower triangle are masked out as NaN.
upper

Unnamed: 0,0,1,2
0,,0.976103,0.0
1,,,0.034503
2,,,


# Removing Irrelevant Features for Classification

In [14]:
# Goal: with a categorical target vector, remove uninformative features by
# computing a chi-square statistic between each feature and the target.

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2, f_classif

iris = load_iris()
target = iris.target

# Cast the measurements to integers so they can be treated as categorical
# values for the chi-square test (this truncates the decimals).
features = iris.data.astype(int)

# Keep the two features with the highest chi-square statistics.
chi2_selctor = SelectKBest(chi2, k=2)
features_kbest = chi2_selctor.fit(features, target).transform(features)

# Number of features before and after selection, one per line.
print(features.shape[1], features_kbest.shape[1], sep="\n")

4
2


In [15]:
# If the features are quantitative, compute the ANOVA F-value between each
# feature and the target vector.
#
# Score the original continuous measurements (iris.data) here: `features`
# was truncated to integers for the chi-square example, and feeding that
# truncated copy to the F-test throws away information it can use.

fvalue_selector = SelectKBest(f_classif, k=2)
features_kbest = fvalue_selector.fit_transform(iris.data, target)

# Number of features before and after selection.
print(iris.data.shape[1])
print(features_kbest.shape[1])


4
2


* Chi-square statistics examine the independence of two categorical vectors.
* Chi-square statistics can be calculated only between two categorical vectors, which is why the numerical features were converted to integers above.
* All values need to be nonnegative.

In [17]:
from sklearn.feature_selection import SelectPercentile

# Keep the top 75% of features ranked by ANOVA F-value.
# The F-test is meant for quantitative features, so score the original
# continuous measurements (iris.data) rather than `features`, which was
# truncated to integers for the chi-square example.
fvalue_selctor = SelectPercentile(f_classif, percentile=75)
features_kbest = fvalue_selctor.fit_transform(iris.data, target)

# Number of features before and after selection.
print(iris.data.shape[1])
print(features_kbest.shape[1])

4
3


# Recursively Eliminating Features

In [18]:
# Recursive feature elimination (RFE) using cross-validation (CV).

import warnings
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model

# Silence one specific warning: messages starting with "internal gelsd"
# raised from scipy modules.
warnings.filterwarnings("ignore", message="^internal gelsd", module="scipy")

# Synthetic regression problem: 1000 samples and 100 features, of which only
# 2 are informative. The fixed seed keeps the data reproducible.
features, target = make_regression(
    n_samples=1000,
    n_features=100,
    n_informative=2,
    random_state=42,
)

ols = linear_model.LinearRegression()

# Remove one feature per iteration (step=1), scoring each candidate subset
# by negative mean squared error.
rfecv = RFECV(
    estimator=ols,
    step=1,
    scoring="neg_mean_squared_error",
).fit(features, target)

# Feature matrix reduced to the columns RFECV selected.
rfecv.transform(features)

array([[ 1.37382298, -0.32207406,  0.18867157],
       [ 0.86912795, -0.38138486,  0.51317873],
       [ 0.04637371, -0.64292497,  0.44547373],
       ...,
       [-0.16201496, -2.95233167,  0.92284027],
       [-0.20327896, -0.44189588,  1.48351033],
       [-0.49078909, -0.20870738, -0.23787474]])

In [19]:
# Number of features the fitted RFECV decided to keep.
rfecv.n_features_

3

In [20]:
# Boolean mask over the input features: True marks a feature that was kept.
rfecv.support_

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [22]:
# Ranking of every input feature; the kept features are the ones ranked 1.
rfecv.ranking_

array([61, 62, 93, 48, 59, 77, 72, 43, 96, 46, 76, 26,  9, 45, 28,  6, 98,
       40, 30, 17, 38,  8,  1, 81, 15, 73, 66, 92, 35, 89, 20, 94, 88, 34,
       60, 86, 52, 39, 67,  1, 74,  4, 32, 33, 90,  3, 78, 85,  2,  7, 83,
       22, 10, 53, 54, 71, 36, 31, 23, 55, 14, 63, 68, 27, 80, 58, 75, 37,
       11, 91, 21, 24, 25, 82, 70, 64, 65, 97, 41, 44, 57, 18,  1,  5, 16,
       84, 87, 50, 19, 79, 42, 56, 29, 12, 49, 69, 13, 51, 47, 95])