Aim: Apply feature selection techniques such as
variance thresholding, correlation analysis, and
univariate selection (SelectKBest), using pandas and
Python's scikit-learn library, to reduce dimensionality in a dataset.


In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
# Fetch the iris bunch and pull out the feature matrix and the target vector.
iris = load_iris()
X, y = iris.data, iris.target

# Wrap the features in a DataFrame so every column carries its feature name.
df = pd.DataFrame(data=X, columns=iris.feature_names)

# Preview the first rows of the untouched data.
print("Original Dataset:")
print(df.head())


Original Dataset:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2


In [2]:
from sklearn.feature_selection import VarianceThreshold

# Fit a variance filter (threshold 0.2) on the raw feature matrix and keep
# only the columns that pass it.
selector = VarianceThreshold(threshold=0.2)
X_high_variance = selector.fit_transform(X)

# fit_transform returns a bare array here, so recover the names of the
# surviving columns from the selector's support indices before displaying.
kept_column_names = df.columns[selector.get_support(indices=True)]
high_variance_preview = pd.DataFrame(X_high_variance, columns=kept_column_names)

print("\nDataset after Variance Thresholding:")
print(high_variance_preview.head())



Dataset after Variance Thresholding:
   sepal length (cm)  petal length (cm)  petal width (cm)
0                5.1                1.4               0.2
1                4.9                1.4               0.2
2                4.7                1.3               0.2
3                4.6                1.5               0.2
4                5.0                1.4               0.2


In [3]:
# Build the labelled, variance-filtered frame once and reuse it below.
kept_column_names = df.columns[selector.get_support(indices=True)]
high_variance_df = pd.DataFrame(X_high_variance, columns=kept_column_names)

# Absolute pairwise correlations between the remaining features.
corr_matrix = high_variance_df.corr().abs()

# Keep only the entries strictly above the diagonal so each feature pair is
# looked at exactly once; everything else becomes NaN.
strict_upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(strict_upper_mask)

# A column is flagged when its correlation with any earlier column exceeds
# 0.95 (NaN entries compare as False, so masked cells never trigger a drop).
to_drop = upper.columns[(upper > 0.95).any(axis=0)].tolist()

# Remove the flagged columns; the result is a new frame.
X_low_correlation = high_variance_df.drop(columns=to_drop)

print("\nDataset after removing highly correlated features:")
print(X_low_correlation.head())



Dataset after removing highly correlated features:
   sepal length (cm)  petal length (cm)
0                5.1                1.4
1                4.9                1.4
2                4.7                1.3
3                4.6                1.5
4                5.0                1.4


In [4]:
from sklearn.feature_selection import SelectKBest, f_classif

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_low_correlation,
    y,
    test_size=0.2,
    random_state=42,
)

# Number of top-scoring features to retain.
# NOTE(review): the preceding output shows X_low_correlation with only two
# columns, so k=2 appears to keep every feature - confirm this is intended.
k_best_features = 2
selector_kbest = SelectKBest(score_func=f_classif, k=k_best_features)

# Score features on the training split only, then apply the same column
# choice to the test split so held-out rows do not influence the selection.
X_train_kbest = selector_kbest.fit_transform(X_train, y_train)
X_test_kbest = selector_kbest.transform(X_test)

# Re-attach the selected column names for display.
selected_column_names = X_train.columns[selector_kbest.get_support(indices=True)]
train_kbest_preview = pd.DataFrame(X_train_kbest, columns=selected_column_names)

print(f"\nDataset after SelectKBest Feature Selection (Top {k_best_features} features):")
print(train_kbest_preview.head())



Dataset after SelectKBest Feature Selection (Top 2 features):
   sepal length (cm)  petal length (cm)
0                4.6                1.0
1                5.7                1.5
2                6.7                4.4
3                4.8                1.6
4                4.4                1.3
