### Feature Selection

### Classification models

In [7]:
import os

In [8]:
# use LASSO to eliminate some of the features

import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# load the data
from pathlib import Path
from src.settings import DATA_DIR

# Read the text file into a dataframe
X = pd.read_csv(os.path.join(DATA_DIR, 'x_train.txt'), sep=' ', header=None).to_numpy()
y = pd.read_csv(os.path.join(DATA_DIR, 'y_train.txt'), header=None).to_numpy().T[0]
X_test = pd.read_csv(os.path.join(DATA_DIR, 'x_test.txt'), sep=' ', header=None).to_numpy()

In [78]:
# Use LASSO to extract the most important features

from sklearn.linear_model import ElasticNetCV

scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

# use lasso
lasso = ElasticNetCV(cv=5)
lasso.fit(X, y)
# get the most important features
lasso_important_features = np.where(lasso.coef_ != 0)[0]
len(lasso_important_features)

29

In [82]:
from src.custom_feature_selectors.manual_feature_selector import ManualFeatureSelector
from src.experiment import Experiment
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


experiment_config = [
     Experiment(
        classifier=SVC,
        classifier_config={
            "probability": True,
        },
        feature_selector=ManualFeatureSelector,
        feature_selector_config={
            "indices": lasso_important_features
        },
    )
    # Experiment(
    #     classifier=RandomForestClassifier,
    #     classifier_config={
    #         "n_estimators": 100
    #     },
    #     feature_selector=ManualFeatureSelector,
    #     feature_selector_config={
    #         "indices": lasso_important_features
    #     },
    # )
]

In [83]:
from src.experiment_utils import perform_experiments

scores, indices = perform_experiments(X, y, experiment_config)
print(scores)

Experiment exp_svc_mfs_287d81 in progress...
Using 29 features, we properly classified 115/200 clients.
Using 29 features, we properly classified 100/200 clients.
Using 29 features, we properly classified 115/200 clients.
Using 29 features, we properly classified 112/200 clients.
Using 29 features, we properly classified 116/200 clients.
{'exp_svc_mfs_287d81': -220}


In [72]:
indices

{'exp_rfc_mfs_77d097': [array([ 64, 155, 220, 285, 323, 335, 403, 498], dtype=int64),
  array([ 64, 155, 220, 285, 323, 335, 403, 498], dtype=int64),
  array([ 64, 155, 220, 285, 323, 335, 403, 498], dtype=int64),
  array([ 64, 155, 220, 285, 323, 335, 403, 498], dtype=int64),
  array([ 64, 155, 220, 285, 323, 335, 403, 498], dtype=int64)]}

In [25]:
indices

{'exp_gnb_mfs_b27b05': [array([ 13,  20,  21,  24,  29,  34,  35,  47,  57,  60,  64,  66,  67,
          75,  80,  84,  98, 105, 139, 148, 152, 155, 175, 178, 184, 191,
         198, 199, 200, 203, 204, 206, 214, 215, 220, 225, 227, 240, 249,
         252, 253, 266, 273, 277, 281, 285, 288, 296, 303, 306, 309, 316,
         321, 322, 323, 324, 327, 328, 335, 339, 351, 357, 359, 360, 380,
         403, 409, 412, 415, 445, 458, 462, 466, 470, 471, 481, 489, 495,
         498], dtype=int64),
  array([ 13,  20,  21,  24,  29,  34,  35,  47,  57,  60,  64,  66,  67,
          75,  80,  84,  98, 105, 139, 148, 152, 155, 175, 178, 184, 191,
         198, 199, 200, 203, 204, 206, 214, 215, 220, 225, 227, 240, 249,
         252, 253, 266, 273, 277, 281, 285, 288, 296, 303, 306, 309, 316,
         321, 322, 323, 324, 327, 328, 335, 339, 351, 357, 359, 360, 380,
         403, 409, 412, 415, 445, 458, 462, 466, 470, 471, 481, 489, 495,
         498], dtype=int64),
  array([ 13,  20,  21,  24,  29

In [16]:
from src.experiment_utils import calculate_score

ImportError: cannot import name 'calculate_score' from 'src.experiment_utils' (c:\Users\barto\OneDrive\Dokumenty\DS2024L\AML\Projects\Project 2\AML-2024L-Offer-Acceptance-Prediction\src\experiment_utils.py)

In [None]:
lasso_important_features

array([ 13,  20,  21,  24,  29,  34,  35,  47,  57,  60,  64,  66,  67,
        75,  80,  84,  98, 105, 139, 148, 152, 155, 175, 178, 184, 191,
       198, 199, 200, 203, 204, 206, 214, 215, 220, 225, 227, 240, 249,
       252, 253, 266, 273, 277, 281, 285, 288, 296, 303, 306, 309, 316,
       321, 322, 323, 324, 327, 328, 335, 339, 351, 357, 359, 360, 380,
       403, 409, 412, 415, 445, 458, 462, 466, 470, 471, 481, 489, 495,
       498], dtype=int64)

PCA

In [85]:
# PCA to select the most important features
from sklearn.decomposition import PCA

pca = PCA(n_components=100)
pca.fit(X)
pca_important_features = np.argsort(pca.explained_variance_)[::-1][:10]
len(pca_important_features)

experiment_config = [
    Experiment(
        classifier=SVC,
        classifier_config={
            "probability": True,
        },
        feature_selector=ManualFeatureSelector,
        feature_selector_config={
            "indices": pca_important_features
        },
    )
]

scores, indices = perform_experiments(X, y, experiment_config)
print(scores)

Experiment exp_svc_mfs_5e75ee in progress...
Using 10 features, we properly classified 151/200 clients.
Using 10 features, we properly classified 141/200 clients.
Using 10 features, we properly classified 145/200 clients.
Using 10 features, we properly classified 152/200 clients.
Using 10 features, we properly classified 149/200 clients.
{'exp_svc_mfs_5e75ee': 5380}


In [86]:
# PCA to select the most important features
from sklearn.decomposition import PCA

pca = PCA(n_components=100)
pca.fit(X)
pca_important_features = np.argsort(pca.explained_variance_)[::-1][:7]
len(pca_important_features)

experiment_config = [
    Experiment(
        classifier=SVC,
        classifier_config={
            "probability": True,
        },
        feature_selector=ManualFeatureSelector,
        feature_selector_config={
            "indices": pca_important_features
        },
    )
]

scores, indices = perform_experiments(X, y, experiment_config)
print(scores)

Experiment exp_svc_mfs_54150a in progress...
Using 7 features, we properly classified 149/200 clients.
Using 7 features, we properly classified 138/200 clients.
Using 7 features, we properly classified 148/200 clients.
Using 7 features, we properly classified 147/200 clients.
Using 7 features, we properly classified 144/200 clients.
{'exp_svc_mfs_54150a': 5860}


In [87]:
# PCA to select the most important features
from sklearn.decomposition import PCA

pca = PCA(n_components=100)
pca.fit(X)
pca_important_features = np.argsort(pca.explained_variance_)[::-1][:5]
len(pca_important_features)

experiment_config = [
    Experiment(
        classifier=SVC,
        classifier_config={
            "probability": True,
        },
        feature_selector=ManualFeatureSelector,
        feature_selector_config={
            "indices": pca_important_features
        },
    )
]

scores, indices = perform_experiments(X, y, experiment_config)
print(scores)

Experiment exp_svc_mfs_7b2914 in progress...
Using 5 features, we properly classified 146/200 clients.
Using 5 features, we properly classified 136/200 clients.
Using 5 features, we properly classified 142/200 clients.
Using 5 features, we properly classified 138/200 clients.
Using 5 features, we properly classified 132/200 clients.
{'exp_svc_mfs_7b2914': 5940}


In [88]:
# PCA to select the most important features
from sklearn.decomposition import PCA

pca = PCA(n_components=100)
pca.fit(X)
pca_important_features = np.argsort(pca.explained_variance_)[::-1][:4]
len(pca_important_features)

experiment_config = [
    Experiment(
        classifier=SVC,
        classifier_config={
            "probability": True,
        },
        feature_selector=ManualFeatureSelector,
        feature_selector_config={
            "indices": pca_important_features
        },
    )
]

scores, indices = perform_experiments(X, y, experiment_config)
print(scores)

Experiment exp_svc_mfs_36dea4 in progress...
Using 4 features, we properly classified 142/200 clients.
Using 4 features, we properly classified 134/200 clients.
Using 4 features, we properly classified 133/200 clients.
Using 4 features, we properly classified 137/200 clients.
Using 4 features, we properly classified 127/200 clients.
{'exp_svc_mfs_36dea4': 5930}


In [90]:
# PCA to select the most important features
from sklearn.decomposition import PCA

pca = PCA(n_components=100)
pca.fit(X)
pca_important_features = np.argsort(pca.explained_variance_)[::-1][:6]
len(pca_important_features)

experiment_config = [
    Experiment(
        classifier=RandomForestClassifier,
        classifier_config={
            "n_estimators": 100,
            "max_depth": 10
        },
        feature_selector=ManualFeatureSelector,
        feature_selector_config={
            "indices": pca_important_features
        },
    )
]

scores, indices = perform_experiments(X, y, experiment_config)
print(scores)

Experiment exp_rfc_mfs_4a3d5e in progress...
Using 6 features, we properly classified 145/200 clients.
Using 6 features, we properly classified 129/200 clients.
Using 6 features, we properly classified 139/200 clients.
Using 6 features, we properly classified 134/200 clients.
Using 6 features, we properly classified 132/200 clients.
{'exp_rfc_mfs_4a3d5e': 5590}


In [91]:
# PCA to select the most important features
from sklearn.decomposition import PCA

pca = PCA(n_components=100)
pca.fit(X)
pca_important_features = np.argsort(pca.explained_variance_)[::-1][:6]
len(pca_important_features)

experiment_config = [
    Experiment(
        classifier=RandomForestClassifier,
        classifier_config={
            "n_estimators": 100
        },
        feature_selector=ManualFeatureSelector,
        feature_selector_config={
            "indices": pca_important_features
        },
    )
]

scores, indices = perform_experiments(X, y, experiment_config)
print(scores)

Experiment exp_rfc_mfs_4a467c in progress...
Using 6 features, we properly classified 145/200 clients.
Using 6 features, we properly classified 126/200 clients.
Using 6 features, we properly classified 134/200 clients.
Using 6 features, we properly classified 136/200 clients.
Using 6 features, we properly classified 130/200 clients.
{'exp_rfc_mfs_4a467c': 5510}


In [93]:
# ICA to select the most important features
from sklearn.decomposition import FastICA

ica = FastICA(n_components=100)
ica.fit(X)
ica_important_features = np.argsort(np.abs(ica.components_).sum(axis=0))[::-1][:6]
len(ica_important_features)

experiment_config = [
    Experiment(
        classifier=RandomForestClassifier,
        classifier_config={
            "n_estimators": 100
        },
        feature_selector=ManualFeatureSelector,
        feature_selector_config={
            "indices": ica_important_features
        },
    )
]

scores, indices = perform_experiments(X, y, experiment_config)
print(scores)



Experiment exp_rfc_mfs_bc244c in progress...
Using 6 features, we properly classified 143/200 clients.
Using 6 features, we properly classified 141/200 clients.
Using 6 features, we properly classified 148/200 clients.
Using 6 features, we properly classified 142/200 clients.
Using 6 features, we properly classified 147/200 clients.
{'exp_rfc_mfs_bc244c': 6010}


In [94]:
# ICA to select the most important features
from sklearn.decomposition import FastICA

ica = FastICA(n_components=100)
ica.fit(X)
ica_important_features = np.argsort(np.abs(ica.components_).sum(axis=0))[::-1][:5]
len(ica_important_features)

experiment_config = [
    Experiment(
        classifier=RandomForestClassifier,
        classifier_config={
            "n_estimators": 100
        },
        feature_selector=ManualFeatureSelector,
        feature_selector_config={
            "indices": ica_important_features
        },
    )
]

scores, indices = perform_experiments(X, y, experiment_config)
print(scores)



Experiment exp_rfc_mfs_7d6d1a in progress...
Using 5 features, we properly classified 140/200 clients.
Using 5 features, we properly classified 144/200 clients.
Using 5 features, we properly classified 142/200 clients.
Using 5 features, we properly classified 133/200 clients.
Using 5 features, we properly classified 143/200 clients.
{'exp_rfc_mfs_7d6d1a': 6020}


In [96]:
# ICA to select the most important features
from sklearn.decomposition import FastICA

ica = FastICA(n_components=100)
ica.fit(X)
ica_important_features = np.argsort(np.abs(ica.components_).sum(axis=0))[::-1][:4]
len(ica_important_features)

experiment_config = [
    Experiment(
        classifier=RandomForestClassifier,
        classifier_config={
            "n_estimators": 100
        },
        feature_selector=ManualFeatureSelector,
        feature_selector_config={
            "indices": ica_important_features
        },
    )
]

scores, indices = perform_experiments(X, y, experiment_config)
print(scores)



Experiment exp_rfc_mfs_21265a in progress...
Using 4 features, we properly classified 134/200 clients.
Using 4 features, we properly classified 137/200 clients.
Using 4 features, we properly classified 128/200 clients.
Using 4 features, we properly classified 140/200 clients.
Using 4 features, we properly classified 136/200 clients.
{'exp_rfc_mfs_21265a': 5950}


In [101]:
# t-SNE to select the most important features
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2)
tsne.fit(X)
tsne_important_features = np.argsort(np.abs(tsne.embedding_).sum(axis=0))[::-1][:10]
len(tsne_important_features)

experiment_config = [
    Experiment(
        classifier=RandomForestClassifier,
        classifier_config={
            "n_estimators": 100
        },
        feature_selector=ManualFeatureSelector,
        feature_selector_config={
            "indices": tsne_important_features
        },
    )
]

scores, indices = perform_experiments(X, y, experiment_config)

Experiment exp_rfc_mfs_f74492 in progress...
Using 2 features, we properly classified 111/200 clients.
Using 2 features, we properly classified 107/200 clients.
Using 2 features, we properly classified 113/200 clients.
Using 2 features, we properly classified 115/200 clients.
Using 2 features, we properly classified 105/200 clients.


In [106]:
# use autoencoder to select the most important features
from sklearn.neural_network import MLPRegressor

autoencoder = MLPRegressor(hidden_layer_sizes=(200, 100, 50, 50, 50), max_iter=1000)
autoencoder.fit(X, X)
autoencoder_important_features = np.argsort(np.abs(autoencoder.coefs_[0]).sum(axis=1))[::-1][:7]
len(autoencoder_important_features)


experiment_config = [
    Experiment(
        classifier=RandomForestClassifier,
        classifier_config={
            "n_estimators": 100
        },
        feature_selector=ManualFeatureSelector,
        feature_selector_config={
            "indices": autoencoder_important_features
        },
    )
]

scores, indices = perform_experiments(X, y, experiment_config)
print(scores)

Experiment exp_rfc_mfs_ebea7b in progress...
Using 7 features, we properly classified 150/200 clients.
Using 7 features, we properly classified 143/200 clients.
Using 7 features, we properly classified 141/200 clients.
Using 7 features, we properly classified 141/200 clients.
Using 7 features, we properly classified 151/200 clients.
{'exp_rfc_mfs_ebea7b': 5860}


In [107]:
# use autoencoder to select the most important features
from sklearn.neural_network import MLPRegressor

autoencoder = MLPRegressor(hidden_layer_sizes=(200, 100, 100, 50, 50), max_iter=1000)
autoencoder.fit(X, X)
autoencoder_important_features = np.argsort(np.abs(autoencoder.coefs_[0]).sum(axis=1))[::-1][:5]
len(autoencoder_important_features)


experiment_config = [
    Experiment(
        classifier=RandomForestClassifier,
        classifier_config={
            "n_estimators": 100
        },
        feature_selector=ManualFeatureSelector,
        feature_selector_config={
            "indices": autoencoder_important_features
        },
    )
]

scores, indices = perform_experiments(X, y, experiment_config)
print(scores)

Experiment exp_rfc_mfs_ba9a53 in progress...
Using 5 features, we properly classified 134/200 clients.
Using 5 features, we properly classified 138/200 clients.
Using 5 features, we properly classified 133/200 clients.
Using 5 features, we properly classified 133/200 clients.
Using 5 features, we properly classified 128/200 clients.
{'exp_rfc_mfs_ba9a53': 5660}


In [108]:
# use autoencoder to select the most important features
from sklearn.neural_network import MLPRegressor

autoencoder = MLPRegressor(hidden_layer_sizes=(100, 100, 50, 50, 25), max_iter=1000)
autoencoder.fit(X, X)
autoencoder_important_features = np.argsort(np.abs(autoencoder.coefs_[0]).sum(axis=1))[::-1][:6]
len(autoencoder_important_features)


experiment_config = [
    Experiment(
        classifier=RandomForestClassifier,
        classifier_config={
            "n_estimators": 100
        },
        feature_selector=ManualFeatureSelector,
        feature_selector_config={
            "indices": autoencoder_important_features
        },
    )
]

scores, indices = perform_experiments(X, y, experiment_config)
print(scores)

Experiment exp_rfc_mfs_6b4f28 in progress...
Using 6 features, we properly classified 148/200 clients.
Using 6 features, we properly classified 147/200 clients.
Using 6 features, we properly classified 136/200 clients.
Using 6 features, we properly classified 134/200 clients.
Using 6 features, we properly classified 140/200 clients.
{'exp_rfc_mfs_6b4f28': 5850}


In [109]:
# use autoencoder to select the most important features
from sklearn.neural_network import MLPRegressor

autoencoder = MLPRegressor(hidden_layer_sizes=(200, 100, 100, 50, 10), max_iter=1000)
autoencoder.fit(X, X)
autoencoder_important_features = np.argsort(np.abs(autoencoder.coefs_[0]).sum(axis=1))[::-1][:5]
len(autoencoder_important_features)


experiment_config = [
    Experiment(
        classifier=RandomForestClassifier,
        classifier_config={
            "n_estimators": 100
        },
        feature_selector=ManualFeatureSelector,
        feature_selector_config={
            "indices": autoencoder_important_features
        },
    )
]

scores, indices = perform_experiments(X, y, experiment_config)
print(scores)

Experiment exp_rfc_mfs_12db62 in progress...
Using 5 features, we properly classified 127/200 clients.
Using 5 features, we properly classified 123/200 clients.
Using 5 features, we properly classified 132/200 clients.
Using 5 features, we properly classified 120/200 clients.
Using 5 features, we properly classified 123/200 clients.
{'exp_rfc_mfs_12db62': 5250}


In [110]:
# use autoencoder to select the most important features
# and QDA for classification
from sklearn.neural_network import MLPRegressor
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

autoencoder = MLPRegressor(hidden_layer_sizes=(200, 100, 50, 50, 50), max_iter=1000)
autoencoder.fit(X, X)
autoencoder_important_features = np.argsort(np.abs(autoencoder.coefs_[0]).sum(axis=1))[::-1][:7]
len(autoencoder_important_features)

experiment_config = [
    Experiment(
        classifier=QuadraticDiscriminantAnalysis,
        classifier_config={
            "reg_param": 0.1
        },
        feature_selector=ManualFeatureSelector,
        feature_selector_config={
            "indices": autoencoder_important_features
        },
    )
]

scores, indices = perform_experiments(X, y, experiment_config)
print(scores)

Experiment exp_qda_mfs_8c2bab in progress...
Using 7 features, we properly classified 159/200 clients.
Using 7 features, we properly classified 152/200 clients.
Using 7 features, we properly classified 152/200 clients.
Using 7 features, we properly classified 155/200 clients.
Using 7 features, we properly classified 155/200 clients.
{'exp_qda_mfs_8c2bab': 6330}


In [113]:
# use autoencoder to select the most important features
# and QDA for classification
from sklearn.neural_network import MLPRegressor
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

autoencoder = MLPRegressor(hidden_layer_sizes=(200, 100, 50, 50, 50), max_iter=1000)
autoencoder.fit(X, X)
autoencoder_important_features = np.argsort(np.abs(autoencoder.coefs_[0]).sum(axis=1))[::-1][:5]
len(autoencoder_important_features)

experiment_config = [
    Experiment(
        classifier=QuadraticDiscriminantAnalysis,
        classifier_config={
            "reg_param": 0.18
        },
        feature_selector=ManualFeatureSelector,
        feature_selector_config={
            "indices": autoencoder_important_features
        },
    )
]

scores, indices = perform_experiments(X, y, experiment_config)
print(scores)

Experiment exp_qda_mfs_fb5817 in progress...
Using 5 features, we properly classified 149/200 clients.
Using 5 features, we properly classified 151/200 clients.
Using 5 features, we properly classified 147/200 clients.
Using 5 features, we properly classified 148/200 clients.
Using 5 features, we properly classified 151/200 clients.
{'exp_qda_mfs_fb5817': 6460}


In [115]:
# use autoencoder to select the most important features
# and QDA for classification
from sklearn.neural_network import MLPRegressor
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

autoencoder = MLPRegressor(hidden_layer_sizes=(200, 100, 50, 50, 50), max_iter=1000)
autoencoder.fit(X, X)
autoencoder_important_features = np.argsort(np.abs(autoencoder.coefs_[0]).sum(axis=1))[::-1][:6]
len(autoencoder_important_features)

experiment_config = [
    Experiment(
        classifier=QuadraticDiscriminantAnalysis,
        classifier_config={
            "reg_param": 0.05
        },
        feature_selector=ManualFeatureSelector,
        feature_selector_config={
            "indices": autoencoder_important_features
        },
    )
]

scores, indices = perform_experiments(X, y, experiment_config)
print(scores)

Experiment exp_qda_mfs_1210a2 in progress...
Using 6 features, we properly classified 149/200 clients.
Using 6 features, we properly classified 146/200 clients.
Using 6 features, we properly classified 147/200 clients.
Using 6 features, we properly classified 149/200 clients.
Using 6 features, we properly classified 152/200 clients.
{'exp_qda_mfs_1210a2': 6230}


In [116]:
indices

{'exp_qda_mfs_1210a2': [array([100, 109, 106, 103, 102, 464], dtype=int64),
  array([100, 109, 106, 103, 102, 464], dtype=int64),
  array([100, 109, 106, 103, 102, 464], dtype=int64),
  array([100, 109, 106, 103, 102, 464], dtype=int64),
  array([100, 109, 106, 103, 102, 464], dtype=int64)]}

In [130]:
selected_features = [0, 2, 21, 27]

# use autoencoder to select the most important features
# and QDA for classification
from sklearn.neural_network import MLPRegressor
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

experiment_config = [
    Experiment(
        classifier=QuadraticDiscriminantAnalysis,
        classifier_config={
            "reg_param": 0
        },
        feature_selector=ManualFeatureSelector,
        feature_selector_config={
            "indices": selected_features
        },
    )
]

scores, indices = perform_experiments(X, y, experiment_config)
print(scores)

Experiment exp_qda_mfs_18217d in progress...
Using 4 features, we properly classified 124/200 clients.
Using 4 features, we properly classified 121/200 clients.
Using 4 features, we properly classified 126/200 clients.
Using 4 features, we properly classified 124/200 clients.
Using 4 features, we properly classified 105/200 clients.
{'exp_qda_mfs_18217d': 5200}


# Custom Feature Selectors

In [4]:
from numpy import ndarray
from src.feature_selector import BaseFeatureSelector

class MLPFeatureSelector(BaseFeatureSelector):

    def __init__(self) -> None:
        self.model = MLPFeatureSelector(hidden_layer_sizes=(200, 100, 50, 50, 50), max_iter=1000)
        self.top_k = 5
        self._fitted = False

    def fit(self, X, y):
        self.X = X
        self.model.fit(X, y)
        self._fitted = True
    
    def get_support(self, indices: bool = True) -> ndarray:
        if not self._fitted:
            raise ValueError("The model is not fitted")
        if indices:
            return np.argsort(np.abs(self.model.coefs_[0]).sum(axis=1))[::-1][:self.top_k]
        mask = np.zeros(self.X.shape[1], dtype=bool)
        mask[self.get_support(indices=True)] = True
        return mask

    def transform(self, X) -> ndarray:
        return X[:, self.get_support()]

In [19]:
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.ensemble import RandomForestClassifier
from src.feature_selector import BaseFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

class CombinedFeatureSelector(BaseFeatureSelector):

    def __init__(self, top_k=10, alpha=0.01, n_features_to_select=10) -> None:
        self.top_k = top_k
        self.alpha = alpha
        self.n_features_to_select = n_features_to_select
        self._fitted = False

    def fit(self, X, y):
        # Step 1: Univariate feature selection (Filter method)
        self.select_k_best = SelectKBest(score_func=f_classif, k=self.top_k)
        self.select_k_best.fit(X, y)
        self.k_best_indices = self.select_k_best.get_support(indices=True)

        # model = KNeighborsClassifier(n_neighbors=3)
        model = GradientBoostingClassifier()
        # Step 2: Recursive Feature Elimination (Wrapper method)
        self.rfe_model = RFE(estimator=model, n_features_to_select=self.n_features_to_select)
        self.rfe_model.fit(X[:, self.k_best_indices], y)
        self.rfe_indices = self.k_best_indices[self.rfe_model.get_support(indices=True)]

        # Step 3: Lasso feature selection (Embedded method)
        self.lasso = Lasso(alpha=self.alpha)
        self.lasso.fit(X[:, self.rfe_indices], y)
        self.lasso_indices = self.rfe_indices[np.abs(self.lasso.coef_) > 0]

        self.selected_features_ = self.lasso_indices
        self._fitted = True
    
    def get_support(self, indices: bool = True) -> np.ndarray:
        if not self._fitted:
            raise ValueError("The model is not fitted")
        if indices:
            return self.selected_features_
        mask = np.zeros(X.shape[1], dtype=bool)
        mask[self.selected_features_] = True
        return mask

    def transform(self, X) -> np.ndarray:
        return X[:, self.get_support()]

# Example usage:
# selector = CombinedFeatureSelector(top_k=20, alpha=0.01, n_features_to_select=10)
# selector.fit(X_train, y_train)
# X_train_selected = selector.transform(X_train)
# X_test_selected = selector.transform(X_test)


In [20]:
# perform experiments with the combined feature selector
from src.experiment import Experiment
from src.experiment_utils import perform_experiments
from sklearn.svm import SVC


experiment_config = [
    Experiment(
        classifier=SVC,
        classifier_config={
            "probability": True,
        },
        feature_selector=CombinedFeatureSelector,
        feature_selector_config={
            "top_k": 50,
            "alpha": 0.003,
            "n_features_to_select": 8
        },
    )
]

scores, indices = perform_experiments(X, y, experiment_config)
print(scores)

Experiment exp_svc_cfs_b2918f in progress...
Using 8 features, we properly classified 91/200 clients.
Using 8 features, we properly classified 98/200 clients.
Using 8 features, we properly classified 102/200 clients.
Using 8 features, we properly classified 95/200 clients.
Using 8 features, we properly classified 119/200 clients.
{'exp_svc_cfs_b2918f': 3450}


In [None]:
# choose sklearn different implementations of feature selection

feature_selectors = [
    ManualFeatureSelector,
    PCA,
    FastICA,
    TSNE,
    MLPRegressor
]