In [1]:
# Use statistical tests to select the best features based on univariate analysis.

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif

# Fetch the Iris bunch and unpack its feature matrix and class labels
iris = load_iris()
X, y = iris.data, iris.target

# Score every feature with f_classif (ANOVA F-value), then keep only the
# two columns with the highest scores
selector = SelectKBest(f_classif, k=2)
selector.fit(X, y)
X_selected = selector.transform(X)

print("Selected Features (Top 2):\n", X_selected)


Selected Features (Top 2):
 [[1.4 0.2]
 [1.4 0.2]
 [1.3 0.2]
 [1.5 0.2]
 [1.4 0.2]
 [1.7 0.4]
 [1.4 0.3]
 [1.5 0.2]
 [1.4 0.2]
 [1.5 0.1]
 [1.5 0.2]
 [1.6 0.2]
 [1.4 0.1]
 [1.1 0.1]
 [1.2 0.2]
 [1.5 0.4]
 [1.3 0.4]
 [1.4 0.3]
 [1.7 0.3]
 [1.5 0.3]
 [1.7 0.2]
 [1.5 0.4]
 [1.  0.2]
 [1.7 0.5]
 [1.9 0.2]
 [1.6 0.2]
 [1.6 0.4]
 [1.5 0.2]
 [1.4 0.2]
 [1.6 0.2]
 [1.6 0.2]
 [1.5 0.4]
 [1.5 0.1]
 [1.4 0.2]
 [1.5 0.2]
 [1.2 0.2]
 [1.3 0.2]
 [1.4 0.1]
 [1.3 0.2]
 [1.5 0.2]
 [1.3 0.3]
 [1.3 0.3]
 [1.3 0.2]
 [1.6 0.6]
 [1.9 0.4]
 [1.4 0.3]
 [1.6 0.2]
 [1.4 0.2]
 [1.5 0.2]
 [1.4 0.2]
 [4.7 1.4]
 [4.5 1.5]
 [4.9 1.5]
 [4.  1.3]
 [4.6 1.5]
 [4.5 1.3]
 [4.7 1.6]
 [3.3 1. ]
 [4.6 1.3]
 [3.9 1.4]
 [3.5 1. ]
 [4.2 1.5]
 [4.  1. ]
 [4.7 1.4]
 [3.6 1.3]
 [4.4 1.4]
 [4.5 1.5]
 [4.1 1. ]
 [4.5 1.5]
 [3.9 1.1]
 [4.8 1.8]
 [4.  1.3]
 [4.9 1.5]
 [4.7 1.2]
 [4.3 1.3]
 [4.4 1.4]
 [4.8 1.4]
 [5.  1.7]
 [4.5 1.5]
 [3.5 1. ]
 [3.8 1.1]
 [3.7 1. ]
 [3.9 1.2]
 [5.1 1.6]
 [4.5 1.5]
 [4.5 1.6]
 [4.7 1.5]
 [4.4 1.3]
 [4.

In [None]:
# Recursively remove less important features to find the best feature subset.

from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# Load the Diabetes regression dataset.
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so importing it fails on current releases. load_diabetes ships with
# scikit-learn (no download needed) and is also a regression problem.
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target

# Create a linear regression model; RFE uses its coefficients to decide
# which feature to discard at each elimination step
model = LinearRegression()

# Recursive feature elimination to select top 3 features
selector = RFE(model, n_features_to_select=3)
X_rfe = selector.fit_transform(X, y)

# support_ is a boolean mask over the input columns; use it to report
# which named features survived the elimination
selected_feature_names = [
    name for name, kept in zip(diabetes.feature_names, selector.support_) if kept
]
print("Selected feature names:", selected_feature_names)

print("Selected Features by RFE:\n", X_rfe)

In [None]:
# Use a Random Forest model to rank features by importance.

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Load Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Train a Random Forest Classifier (fixed random_state so the importances
# are reproducible across runs)
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest.fit(X, y)

# Get feature importance (one value per input column, in dataset column order)
feature_importances = forest.feature_importances_
print("Feature Importances:\n", feature_importances)

# Actually rank the features: pair each importance with its column name
# and sort from most to least important, so the output is interpretable
# without looking up the dataset's column order.
ranked_features = sorted(
    zip(iris.feature_names, feature_importances),
    key=lambda pair: pair[1],
    reverse=True,
)
print("Features ranked by importance:")
for rank, (name, importance) in enumerate(ranked_features, start=1):
    print(f"  {rank}. {name}: {importance:.4f}")

In [None]:
# Use a correlation matrix to identify and remove highly correlated features.

import pandas as pd
import numpy as np

# Example dataset
data = {'Feature1': [1, 2, 3, 4, 5],
        'Feature2': [2, 4, 6, 8, 10],  # Highly correlated with Feature1
        'Feature3': [5, 3, 4, 2, 1]}
df = pd.DataFrame(data)

# Calculate correlation matrix
correlation_matrix = df.corr()
print("Correlation Matrix:\n", correlation_matrix)

# Drop features that are highly correlated (correlation > 0.8).
# The columns to drop are derived from the correlation matrix instead of
# being hard-coded, so the cell keeps working if the data changes.
# The comparison is on the signed coefficient, so strongly *negative*
# correlations (e.g. Feature3 vs Feature1 here) are not treated as
# redundant; compare abs(...) instead if those should be dropped too.
CORRELATION_THRESHOLD = 0.8

to_drop = []
column_names = list(correlation_matrix.columns)
for position, candidate in enumerate(column_names):
    # Only compare against earlier columns that are still being kept, so
    # one member of every correlated group always survives.
    for earlier in column_names[:position]:
        if earlier in to_drop:
            continue
        if correlation_matrix.loc[earlier, candidate] > CORRELATION_THRESHOLD:
            to_drop.append(candidate)
            break

df_reduced = df.drop(to_drop, axis=1)
print("Data after Dropping Highly Correlated Feature:\n", df_reduced)