## Feature Selection and Feature Extraction Techniques

In [15]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification # to generate calassification data
from sklearn.feature_selection import SelectKBest, f_classif # to select top features using ANOVA F-test
from sklearn.decomposition import PCA  # to reduce dimensions based on variance
from sklearn.preprocessing import StandardScaler, MinMaxScaler # for feature scaling
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel # for feature importance selection

### Generating Synthetic Dataset

In [7]:
# Build a reproducible synthetic classification problem.
# The 10 columns are fully accounted for (5 informative + 3 redundant + 2 repeated),
# so no pure-noise columns are generated.
x, y = make_classification(
    n_samples=500,      # number of rows
    n_features=10,      # number of columns
    n_informative=5,    # features that actually carry class signal
    n_redundant=3,      # linear combinations of the informative features
    n_repeated=2,       # exact duplicates of other features
    random_state=42,    # fixed seed so the dataset is the same on every run
)

x

array([[ 0.36539251,  2.17589077, -1.90154126, ...,  0.36539251,
        -0.57014359,  1.17088601],
       [ 0.27033394,  2.03222652,  1.85923573, ...,  0.27033394,
         0.57891323,  0.46071376],
       [-0.3114802 ,  1.82441488, -2.56662739, ..., -0.3114802 ,
         1.23175453,  0.55573185],
       ...,
       [-3.05191416, -2.67389894,  1.25297661, ..., -3.05191416,
        -2.62845792,  3.14126435],
       [ 2.58014668, -1.48541993, -0.23554459, ...,  2.58014668,
         2.50058851, -1.80960905],
       [ 0.29159801,  1.35256752,  1.30818297, ...,  0.29159801,
         1.06354347,  0.2754444 ]])

In [8]:
# Display the class labels returned by make_classification (one label per row of x).
y

array([0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0,

In [10]:
# Wrap the feature matrix in a labelled DataFrame and attach the class labels.

# Column names are 1-based: Feature_1 ... Feature_<n_columns>
feature_names = [f"Feature_{idx}" for idx in range(1, x.shape[1] + 1)]

# Build the frame and append the target column in a single expression
data = pd.DataFrame(x, columns=feature_names).assign(Target=y)
data

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,Target
0,0.365393,2.175891,-1.901541,1.155482,-0.937128,0.365393,0.637000,0.365393,-0.570144,1.170886,0
1,0.270334,2.032227,1.859236,0.978039,-0.829560,0.270334,3.076069,0.270334,0.578913,0.460714,0
2,-0.311480,1.824415,-2.566627,2.065977,-1.791858,-0.311480,-0.097573,-0.311480,1.231755,0.555732,0
3,1.031703,-1.789377,0.971821,0.215974,0.620152,1.031703,-2.551701,1.031703,2.013369,-1.874659,1
4,0.745957,-1.738867,1.242226,-1.243438,1.625984,0.745957,-1.573781,0.745957,-0.260604,-1.067344,1
...,...,...,...,...,...,...,...,...,...,...,...
495,-1.453426,1.698842,-0.309161,0.082497,0.137802,-1.453426,2.051811,-1.453426,-0.119020,0.331507,1
496,-1.030117,0.768269,-0.679555,1.547205,-2.386968,-1.030117,1.358546,-1.030117,1.368355,0.711368,0
497,-3.051914,-2.673899,1.252977,-1.769336,-3.193075,-3.051914,3.892805,-3.051914,-2.628458,3.141264,1
498,2.580147,-1.485420,-0.235545,1.376543,-0.090310,2.580147,-4.219712,2.580147,2.500589,-1.809609,1


### Feature Selection: ANOVA F-Test

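Select the 5 most relevant features from the dataset using the ANOVA F-test. For each feature, the F-test compares the variance between the class means with the variance within each class of the target `y`; a higher F-score means the feature separates the classes better.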

In [12]:
# Score every feature against the target with the ANOVA F-test and keep the best 5.
selector = SelectKBest(score_func=f_classif, k=5)
selector.fit(x, y)
x_selected = selector.transform(x)

# Translate the positions of the retained columns back into feature names.
selected_features = np.asarray(feature_names)[selector.get_support(indices=True)]

print(f"Selected Features (ANOVA): {selected_features}")

Selected Features (ANOVA): ['Feature_2' 'Feature_4' 'Feature_5' 'Feature_6' 'Feature_8']


### Feature Extraction: PCA

In [19]:
# Feature Extraction
# Standardise first: PCA is driven by variance, so features on larger scales
# would otherwise dominate the principal components.
scaler = StandardScaler()
scaler.fit(x)
x_scaled = scaler.transform(x)

# Project the standardised data onto its first two principal components.
pca = PCA(n_components=2)
x_pca = pca.fit_transform(x_scaled)

print(f"PCA Components Shape: {x_pca.shape}")

PCA Components Shape: (500, 2)


### Using Feature Importance with Random Forest

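This selects features based on their importance in a tree-based model (a model-based selection method). A random forest is fitted on the data, and `SelectFromModel` keeps the features whose importance is at or above a threshold; by default, this is the mean importance across all features.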

In [21]:
# Fit a seeded random forest; its feature importances drive the selection below.
model = RandomForestClassifier(random_state=42).fit(x, y)

# prefit=True: reuse the already-fitted forest instead of refitting inside the selector.
sfm = SelectFromModel(model, prefit=True)
x_rfc_selected = sfm.transform(x)

# Translate the positions of the retained columns back into feature names.
important_features = np.array(feature_names)[sfm.get_support(indices=True)]
print(f"Important Features (Random Forest): {important_features}")

Important Features (Random Forest): ['Feature_4' 'Feature_5' 'Feature_10']
