# Feature selection

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

def variance_threshold_selection(X, threshold=0.01):
    """Return the column names of ``X`` that pass a variance filter.

    A scikit-learn ``VarianceThreshold`` selector is fitted on ``X`` and its
    boolean support mask is used to pick the surviving column labels.

    Args:
        X: Feature DataFrame; ``X.columns`` supplies the feature names.
        threshold: Value handed to ``VarianceThreshold``.

    Returns:
        List of retained column labels, in the order they appear in ``X``.
    """
    keep_mask = VarianceThreshold(threshold=threshold).fit(X).get_support()
    return X.columns[keep_mask].tolist()

def correlation_analysis(X, threshold=0.8):
    """Drop the later column of every highly correlated pair.

    The absolute correlation matrix of ``X`` is computed and only its strict
    upper triangle is inspected, so each pair is looked at once and a column
    is only ever compared against the columns that precede it.

    Args:
        X: Feature DataFrame.
        threshold: A column is dropped when its absolute correlation with
            any earlier column is strictly greater than this value.

    Returns:
        List of the column labels of ``X`` that were not dropped, in their
        original order.
    """
    abs_corr = X.corr().abs()

    # Boolean mask selecting entries strictly above the diagonal.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper_tri = abs_corr.where(above_diagonal)

    redundant = []
    for name in upper_tri.columns:
        # NaN entries (masked cells) compare as False and never trigger a drop.
        if any(upper_tri[name] > threshold):
            redundant.append(name)

    return [name for name in X.columns if name not in redundant]

def mutual_information_selection(X, y, n_features=20, random_state=42):
    """Return the ``n_features`` columns with the highest mutual information.

    Mutual information between each column of ``X`` and the target ``y`` is
    estimated with scikit-learn's ``mutual_info_classif``. That estimator is
    stochastic, so the seed is fixed by default to make the selected feature
    list reproducible between runs (the same seed value is used for the
    random forest in ``random_forest_importance``).

    Args:
        X: Feature DataFrame; ``X.columns`` supplies the feature names.
        y: Class labels aligned with the rows of ``X``.
        n_features: Maximum number of feature names to return.
        random_state: Seed forwarded to ``mutual_info_classif``. Pass
            ``None`` to get the unseeded behaviour.

    Returns:
        List of column labels ordered from highest to lowest score.
    """
    scores = mutual_info_classif(X, y, random_state=random_state)
    ranked = pd.Series(scores, index=X.columns).sort_values(ascending=False)
    return ranked.head(n_features).index.tolist()

def random_forest_importance(X, y, n_features=20):
    """Return the ``n_features`` columns a random forest ranks as most important.

    A ``RandomForestClassifier`` with 100 trees and a fixed seed of 42 is
    fitted on ``(X, y)``; its ``feature_importances_`` are then sorted in
    descending order.

    Args:
        X: Feature DataFrame; ``X.columns`` supplies the feature names.
        y: Class labels aligned with the rows of ``X``.
        n_features: Maximum number of feature names to return.

    Returns:
        List of column labels ordered from most to least important.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X, y)

    ranking = pd.Series(forest.feature_importances_, index=X.columns)
    top = ranking.sort_values(ascending=False).iloc[:n_features]
    return top.index.tolist()

def plot_feature_importances(importances, title):
    """Show a bar chart of per-feature scores.

    Args:
        importances: pandas Series of scores; it is drawn with its own
            ``plot(kind='bar')`` method.
        title: Text used as the figure title.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    importances.plot(kind='bar', ax=ax)

    # Labels are applied after plotting so they are the ones finally shown.
    ax.set_title(title)
    ax.set_xlabel('Features')
    ax.set_ylabel('Importance')

    fig.tight_layout()
    plt.show()

# Feature-selection pipeline: variance filter -> correlation filter ->
# mutual-information and random-forest rankings -> union of both rankings,
# written to 'feature_selected_data.csv' together with the target column.

# Load your preprocessed data
df = pd.read_csv('fully_processed_data.csv')

# Separate features and target
X = df.drop('Class', axis=1)
y = df['Class']

# Encode target variable
le = LabelEncoder()
y = le.fit_transform(y)

print("Original number of features:", X.shape[1])

# 1. Variance Threshold
var_selected = variance_threshold_selection(X)
print("Features after variance threshold:", len(var_selected))

# 2. Correlation Analysis (applied only to the variance-filtered columns)
corr_selected = correlation_analysis(X[var_selected])
print("Features after correlation analysis:", len(corr_selected))

# Both ranking steps below work on the same reduced matrix; slice it once.
X_reduced = X[corr_selected]

# 3. Mutual Information
mi_selected = mutual_information_selection(X_reduced, y)
print("Top 20 features by mutual information:", mi_selected)

# Plot mutual information scores
# NOTE: the scores are re-estimated here for plotting only. The estimator is
# stochastic, so the plotted ranking can differ slightly from `mi_selected`.
mi_scores = mutual_info_classif(X_reduced, y)
mi_scores = pd.Series(mi_scores, index=X_reduced.columns).sort_values(ascending=False)
plot_feature_importances(mi_scores, 'Mutual Information Scores')

# 4. Random Forest Feature Importance
rf_selected = random_forest_importance(X_reduced, y)
print("Top 20 features by random forest importance:", rf_selected)

# Plot random forest feature importances
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_reduced, y)
importances = pd.Series(rf.feature_importances_, index=X_reduced.columns).sort_values(ascending=False)
plot_feature_importances(importances, 'Random Forest Feature Importances')

# Combine selected features
# dict.fromkeys de-duplicates while keeping first-seen order, so the column
# order of the saved file is the same on every run. A plain set() would order
# string labels differently between interpreter runs (hash randomization).
combined_features = list(dict.fromkeys(mi_selected + rf_selected))
print("Number of features after combining MI and RF:", len(combined_features))

# Create final dataset with selected features
final_df = df[combined_features + ['Class']]
final_df.to_csv('feature_selected_data.csv', index=False)

print("Feature selection completed. Selected features saved to 'feature_selected_data.csv'")

# Print final selected features
print("\nFinal selected features:")
print(combined_features)