<a href="https://colab.research.google.com/github/adi22s/DS-Exp/blob/main/exp_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Apply filter feature selection techniques (Part 1) using a dataset obtained from the UCI ML repository: Information Gain, Pearson’s Correlation, ANOVA, Chi-square**

In [1]:
import pandas as pd


In [2]:
# Read the Iris CSV (path is relative to the notebook's working directory)
# into the DataFrame that the following cells start from.
iris_data = pd.read_csv(filepath_or_buffer="Iris.csv")

**VARIANCE THRESHOLD**

In [4]:
from sklearn.feature_selection import VarianceThreshold

# 'Id' is excluded up front: it is treated as irrelevant for feature selection.
iris_data_cleaned = iris_data.drop('Id', axis=1)

# Split the cleaned frame into the target (Species) and the feature matrix
# (every remaining column).
y = iris_data_cleaned['Species']
X = iris_data_cleaned.drop('Species', axis=1)

# Variance-based filter. With no arguments the selector only discards
# features whose variance is 0, i.e. columns that are constant.
variance_thresh = VarianceThreshold()
X_variance_filtered = variance_thresh.fit(X).transform(X)

# Show the filtered shape next to the original feature names so the
# before/after column counts can be compared.
(X_variance_filtered.shape, X.columns)


((150, 4),
 Index(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'], dtype='object'))

**INFORMATION GAIN AND CHI-SQUARE**

In [5]:
from sklearn.feature_selection import mutual_info_classif, chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest

# Seed for the mutual-information estimator. Pinning it keeps the scores (and
# therefore the ranking displayed below) identical on every Restart & Run All.
MI_RANDOM_STATE = 42

# Encoding the target variable (Species) into numerical values for feature selection techniques
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Information Gain (Mutual Information)
# mutual_info_classif is randomized (it adds small noise to continuous
# features), so without random_state the scores differ from run to run.
mutual_info = mutual_info_classif(X, y_encoded, random_state=MI_RANDOM_STATE)

# Chi-Square Test (requires non-negative values for chi2, which is satisfied by our dataset)
chi2_scores, p_values = chi2(X, y_encoded)

# Combine results into a DataFrame for easy comparison:
# one row per feature, one column per score.
feature_selection_df = pd.DataFrame({
    'Feature': X.columns,
    'Mutual Information': mutual_info,
    'Chi-Square Score': chi2_scores,
    'p-value': p_values
})

# Display features ranked from most to least informative by mutual information.
feature_selection_df.sort_values(by='Mutual Information', ascending=False)


Unnamed: 0,Feature,Mutual Information,Chi-Square Score,p-value
3,PetalWidthCm,0.987331,67.244828,2.50018e-15
2,PetalLengthCm,0.979959,116.169847,5.943444e-26
0,SepalLengthCm,0.503875,10.817821,0.004476515
1,SepalWidthCm,0.243438,3.594499,0.1657542


**ANOVA**

In [6]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import LabelEncoder

# Load the dataset
iris_data = pd.read_csv('Iris.csv')

# Drop the 'Id' column if it exists (it is not useful for analysis).
# errors='ignore' makes this a no-op instead of a KeyError when the column
# is absent, so the code matches the "if it exists" intent.
iris_data = iris_data.drop(columns=['Id'], errors='ignore')

# Separate features and target variable
X = iris_data.drop(columns=['Species'])
y = iris_data['Species']

# Encode the target variable (Species) into numerical values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Perform ANOVA using f_classif (ANOVA F-statistic between features and target).
# The call returns one F-statistic and one p-value per feature column.
f_values, p_values = f_classif(X, y_encoded)

# Create a DataFrame to display the F-statistic and p-values
anova_results = pd.DataFrame({
    'Feature': X.columns,
    'F-Statistic': f_values,
    'p-value': p_values
})

# Sort the features by F-Statistic in descending order
anova_results.sort_values(by='F-Statistic', ascending=False)



Unnamed: 0,Feature,F-Statistic,p-value
2,PetalLengthCm,1179.034328,3.0519759999999997e-91
3,PetalWidthCm,959.324406,4.3769569999999994e-85
0,SepalLengthCm,119.264502,1.6696690000000001e-31
1,SepalWidthCm,47.364461,1.327917e-16


**Pearson's Correlation Coefficient**

In [12]:
# Import necessary libraries
import pandas as pd

# Reload the dataset from disk.
iris_data = pd.read_csv('Iris.csv')

# Keep only the numeric measurement columns: 'Id' is dropped along with
# 'Species', which is categorical and cannot enter a Pearson correlation.
iris_data_numeric = iris_data.drop(['Id', 'Species'], axis=1)

# Pairwise Pearson correlation between every pair of numeric features.
correlation_matrix = iris_data_numeric.corr(method='pearson')

# Print the matrix as plain text.
print(correlation_matrix)




               SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
SepalLengthCm       1.000000     -0.109369       0.871754      0.817954
SepalWidthCm       -0.109369      1.000000      -0.420516     -0.356544
PetalLengthCm       0.871754     -0.420516       1.000000      0.962757
PetalWidthCm        0.817954     -0.356544       0.962757      1.000000
