# 1. Import Libraries


In [10]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Lasso
import numpy as np


# 2. Load and Preprocess the Data

In [11]:
# Load the dataset from a CSV file
df = pd.read_csv('Iris.csv')  # Replace 'iris.csv' with your actual CSV file path

# Preview the first few rows of the dataset
print(df.head())

# Convert 'Species' column to a categorical variable
df['Species'] = pd.factorize(df['Species'])[0]  # Convert categories to numerical values

# Define the features (X) and target (y)
X = df.drop(columns=['Species'])  # Features: all columns except 'Species'
y = df['Species']  # Target: 'Species'

# Split the data into training (70%) and testing (30%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Print dataset shapes
print(f"Training set size: {X_train.shape}, Testing set size: {X_test.shape}")


   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa
Training set size: (105, 5), Testing set size: (45, 5)


# 3. Feature Selection

In [12]:
# Apply SelectKBest (ANOVA F-value) to select the top 2 features
select_k_best = SelectKBest(score_func=f_classif, k=2)
X_train_selected = select_k_best.fit_transform(X_train_scaled, y_train)
X_test_selected = select_k_best.transform(X_test_scaled)

# Get selected feature indices
selected_features = select_k_best.get_support(indices=True)
print(f"Selected feature indices: {selected_features}")

# Get F-value scores for all features
f_scores = select_k_best.scores_
print(f"Feature scores: {f_scores}")

# Print selected feature names (optional, if column names exist)
feature_names = X.columns[selected_features]
print(f"Selected features: {feature_names.tolist()}")


Selected feature indices: [3 4]
Feature scores: [343.23576754  74.7572012   31.68871543 712.3739871  526.52691616]
Selected features: ['PetalLengthCm', 'PetalWidthCm']


# Wrapper Method (Recursive Feature Elimination - RFE)

In [13]:
# Apply RFE with Logistic Regression to select the top 2 features
rfe = RFE(estimator=LogisticRegression(max_iter=10000), n_features_to_select=2)
X_train_rfe = rfe.fit_transform(X_train_scaled, y_train)  # Fit and transform training data
X_test_rfe = rfe.transform(X_test_scaled)  # Transform test data

# Get selected feature indices
selected_features_rfe = rfe.get_support(indices=True)
print(f"Selected feature indices (RFE): {selected_features_rfe}")

# Print selected feature names (optional, if column names exist)
feature_names_rfe = X.columns[selected_features_rfe]
print(f"Selected features (RFE): {feature_names_rfe.tolist()}")


Selected feature indices (RFE): [0 3]
Selected features (RFE): ['Id', 'PetalLengthCm']


# Embedded Method (Lasso Regression)

In [15]:
# Apply Lasso Regression for feature selection
lasso = Lasso(alpha=0.01)
lasso.fit(X_train_scaled, y_train)

# Get the coefficients
lasso_coefficients = lasso.coef_
print(f"Lasso Coefficients: {lasso_coefficients}")

# Select features with non-zero coefficients
selected_features_lasso = np.where(lasso_coefficients != 0)[0]
print(f"Selected feature indices (Lasso): {selected_features_lasso}")

# Print selected feature names (optional, if column names exist)
feature_names_lasso = X.columns[selected_features_lasso]
print(f"Selected features (Lasso): {feature_names_lasso.tolist()}")


Lasso Coefficients: [ 0.30344144  0.         -0.02792404  0.22302779  0.2628675 ]
Selected feature indices (Lasso): [0 2 3 4]
Selected features (Lasso): ['Id', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']


# 4. Model Training and Evaluation

In [16]:
# Initialize Logistic Regression model
model = LogisticRegression(max_iter=10000)

# --- For SelectKBest ---
# Train the model using the selected features from SelectKBest
model.fit(X_train_selected, y_train)
y_pred_kbest = model.predict(X_test_selected)
accuracy_kbest = accuracy_score(y_test, y_pred_kbest)
print(f"Accuracy using SelectKBest: {accuracy_kbest:.4f}")

# --- For RFE ---
# Train the model using the selected features from RFE
model.fit(X_train_rfe, y_train)
y_pred_rfe = model.predict(X_test_rfe)
accuracy_rfe = accuracy_score(y_test, y_pred_rfe)
print(f"Accuracy using RFE: {accuracy_rfe:.4f}")

# --- For Lasso ---
# Select features based on Lasso coefficients (non-zero coefficients)
lasso_selected_features = np.where(lasso.coef_ != 0)[0]

# Transform the training and test sets based on the Lasso selected features
X_train_lasso = X_train_scaled[:, lasso_selected_features]
X_test_lasso = X_test_scaled[:, lasso_selected_features]

# Train the model using the selected features from Lasso
model.fit(X_train_lasso, y_train)
y_pred_lasso = model.predict(X_test_lasso)
accuracy_lasso = accuracy_score(y_test, y_pred_lasso)
print(f"Accuracy using Lasso: {accuracy_lasso:.4f}")


Accuracy using SelectKBest: 1.0000
Accuracy using RFE: 1.0000
Accuracy using Lasso: 1.0000


# 5. Comparison

In [17]:
# Store the results in a DataFrame
results = pd.DataFrame({
    'Method': ['Filter (ANOVA F-value)', 'Wrapper (RFE)', 'Embedded (Lasso)'],
    'Accuracy': [accuracy_kbest, accuracy_rfe, accuracy_lasso]
})

# Display the results
print(results)


                   Method  Accuracy
0  Filter (ANOVA F-value)       1.0
1           Wrapper (RFE)       1.0
2        Embedded (Lasso)       1.0


Explanation of Code:
Import Libraries:

We begin by importing necessary libraries such as pandas, sklearn, and numpy. These libraries help us manipulate data, create machine learning models, and perform feature selection.
Load and Preprocess the Data:

We load the Iris dataset from a CSV file and convert the 'Species' column into numerical values. This is important because machine learning models require numerical data.
We then split the data into training (70%) and testing (30%) sets.
StandardScaler is applied to standardize the features. This step ensures that the data is scaled so that all features have zero mean and unit variance, which is important for certain machine learning models.
Feature Selection:

We use three different feature selection methods to select the most important features for classification:

Filter Method (ANOVA F-value): This method selects the top 2 features based on their statistical significance (F-value).
Wrapper Method (RFE): This method recursively eliminates the least important features by fitting a model at each step (using logistic regression).
Embedded Method (Lasso Regression): Lasso applies L1 regularization to the logistic regression model, forcing some feature coefficients to become zero, thus eliminating less important features.
After selecting features using each method, we print out the selected features.

Model Training and Evaluation:

We use Logistic Regression to train a model with the features selected by each method.
For each method, we make predictions on the test set and calculate the accuracy of the model.
The accuracy score tells us how well the model performs with the selected features.

Comparison of Methods:

We compare the accuracy scores from each method (SelectKBest, RFE, and Lasso) by storing the results in a DataFrame and printing it. This allows us to see which feature selection method worked best in terms of model accuracy.
Key Points to Explain:
Feature Selection helps to reduce the number of features while keeping the most important ones. This leads to faster model training and sometimes better performance.
Different Methods: We applied three different methods: Filter, Wrapper, and Embedded, each with its strengths.
Accuracy Comparison: After training the model with each method, we compare how well they performed based on the accuracy score.
This approach helps in choosing the best features for a model and can lead to more efficient and accurate predictions.