In [32]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from tqdm import tqdm
from scipy.io import loadmat
import matplotlib.pyplot as plt
import seaborn as sns

from src.data import generate_data, discretize_dataset
from src.feature_selection import CMIM, JMIM, IGFS, wrapper_criterion, l1_selection, find_relevant_features

In [2]:
# Synthetic dataset used by the selector demos in the following cells.
X, y = generate_data(betas=[6, 5, 4, 3, 2, 1], dataset_variant=2)

# Discretised copy of X: every column is replaced by the index (0-9) of the
# equal-width bin, out of 10, that each value falls into.
X_discr = np.copy(X)
for col_idx, column in enumerate(X.T):
    X_discr[:, col_idx] = pd.cut(column, bins=10, labels=False)

In [3]:
# Wrapper-based feature selection on the un-binned X with the 'aic' criterion
# argument (presumably the Akaike information criterion — confirm in
# src.feature_selection). The bare name on the last line displays the result.
best_predictors = wrapper_criterion(X, y, 'aic')
best_predictors

[0, 1, 2, 3, 4, 6, 9]

In [4]:
# Same wrapper-based selection on the un-binned X, this time with the 'bic'
# criterion argument (presumably the Bayesian information criterion — confirm
# in src.feature_selection). NOTE: rebinds best_predictors.
best_predictors = wrapper_criterion(X, y, 'bic')
best_predictors

[0, 1, 2, 3, 4, 6, 9]

In [5]:
# CMIM selector (presumably Conditional Mutual Information Maximisation —
# confirm in src.feature_selection) applied to the binned features X_discr.
# NOTE: rebinds best_predictors.
best_predictors = CMIM(X_discr, y)
best_predictors

[0, 1, 2, 3, 5]

In [6]:
# JMIM selector (presumably Joint Mutual Information Maximisation — confirm in
# src.feature_selection) applied to the binned features X_discr.
# NOTE: rebinds best_predictors.
best_predictors = JMIM(X_discr, y)
best_predictors

[0, 1, 2, 5, 9]

In [7]:
# IGFS selector from src.feature_selection applied to the binned features
# X_discr (acronym not expanded anywhere in this notebook — TODO document).
# NOTE: rebinds best_predictors.
best_predictors = IGFS(X_discr, y)
best_predictors

[0, 1, 2, 5, 9]

In [8]:
# Registry of benchmark datasets keyed by name; each entry added below is a
# dict with "X_orig", "X_discr" and "y".
data = dict()

# Divorce dataset, source https://www.kaggle.com/datasets/rabieelkharoua/split-or-stay-divorce-predictor-dataset
# divorce = pd.read_csv("data/divorce.csv", sep=";")

# data["divorce"] = {}
# data["divorce"]["X_orig"] = divorce.drop("Class", axis=1).to_numpy()
# data["divorce"]["X_discr"] = divorce.drop("Class", axis=1).to_numpy()
# data["divorce"]["y"] = divorce["Class"].to_numpy()


# AIDS classification dataset, source: https://www.kaggle.com/datasets/aadarshvelu/aids-virus-infection-prediction
aids = pd.read_csv("data/aids.csv")
# Every column except the "infected" target becomes a feature.
X_aids = aids.drop(columns="infected").to_numpy()

data["aids"] = {
    "X_orig": X_aids,
    "X_discr": discretize_dataset(X_aids),
    "y": aids["infected"].to_numpy(),
}


# LOL Diamond FF15 dataset, source: https://www.kaggle.com/datasets/jakejoeanderson/league-of-legends-diamond-matches-ff15
# lol = pd.read_csv("data/lol.csv")
# X_lol = lol.drop(["match_id", "blue_Win"], axis=1).to_numpy()

# data["lol"] = {}
# data["lol"]["X_orig"] = X_lol
# data["lol"]["X_discr"] = discretize_dataset(X_lol)
# data["lol"]["y"] = lol["blue_Win"].to_numpy()


# Cancer dataset, source: https://www.kaggle.com/datasets/erdemtaha/cancer-data
cancer = pd.read_csv("data/cancer.csv")

# Encode the target with a single explicit mapping (M -> 0, B -> 1) instead of
# writing integers into the string column one label at a time: that produced a
# mixed-dtype column and is rejected by pandas' dedicated string dtype.
diagnosis_codes = cancer["diagnosis"].map({"M": 0, "B": 1})
if diagnosis_codes.isna().any():
    # Fail loudly here rather than with an opaque cast error further down.
    unknown = sorted(cancer.loc[diagnosis_codes.isna(), "diagnosis"].astype(str).unique())
    raise ValueError(f"Unexpected diagnosis labels in data/cancer.csv: {unknown}")
cancer["diagnosis"] = diagnosis_codes.astype(int)

# "id" is an identifier, "diagnosis" is the target and "Unnamed: 32" is dropped
# as a non-feature column; everything else is a feature.
X_cancer = cancer.drop(["id", "diagnosis", "Unnamed: 32"], axis=1).to_numpy()

data["cancer"] = {}
data["cancer"]["X_orig"] = X_cancer
data["cancer"]["X_discr"] = discretize_dataset(X_cancer)
data["cancer"]["y"] = cancer["diagnosis"].to_numpy().astype(int)

# Gait classification, source: https://archive.ics.uci.edu/dataset/604/gait+classification
# gait = loadmat("data/gait.mat")

# X_gait = gait["X"]
# y_gait = gait["Y"].T[0]

# inds_to_del = []
# for i in range(X_gait.shape[0]):
#     if np.sum(np.isnan(X_gait[i, :])) != 0:
#         inds_to_del.append(i)
# X_gait = np.delete(X_gait, (inds_to_del), axis=0)
# y_gait = np.delete(y_gait, (inds_to_del), axis=0)

# data["gait"] = {}
# data["gait"]["X_orig"] = X_gait
# data["gait"]["X_discr"] = discretize_dataset(X_gait)
# data["gait"]["y"] = pd.cut(y_gait, 3, labels=False)

# Generated data
X_gen, y_gen = generate_data(betas=[0, 1, 1, 1, 1, 1], dataset_variant=0, n_classes=2)

data["generated"] = {
    "X_orig": X_gen,
    "X_discr": discretize_dataset(X_gen),
    "y": y_gen,
}

In [9]:
# Run find_relevant_features on the discretised version of every registered
# dataset; results are stored per dataset name.
datasets = list(data)
relevant_features = {}

for name in datasets:
    print(f"Finding relevant features for {name} dataset...")
    entry = data[name]
    relevant_features[name] = find_relevant_features(entry["X_discr"], entry["y"])

Finding relevant features for aids dataset...
Calculations completed!
Finding relevant features for cancer dataset...
Calculations completed!
Finding relevant features for generated dataset...
Calculations completed!


In [12]:
# 3-fold cross-validated SVC scores for every dataset and every feature subset.
# accuracy_scores[dataset][method] holds the per-fold score array returned by
# cross_val_score; the extra "full_data" entry is the baseline fitted on all
# original (non-discretised) features.
accuracy_scores = {}
for dataset, selected_by_method in relevant_features.items():
    # Local names on purpose: the notebook-level X / y created for the
    # synthetic experiment must not be overwritten by this loop, otherwise
    # re-running the earlier selector cells would silently use other data.
    X_full = data[dataset]["X_orig"]
    y_target = data[dataset]["y"]

    accuracy_scores[dataset] = {}
    for method, feature_idx in tqdm(selected_by_method.items(), desc=f"Processing dataset {dataset}"):
        # Keep only the columns chosen by this selection method.
        accuracy_scores[dataset][method] = cross_val_score(
            SVC(), X_full[:, feature_idx], y_target, cv=3
        )

    accuracy_scores[dataset]["full_data"] = cross_val_score(SVC(), X_full, y_target, cv=3)

Processing dataset aids: 100%|██████████| 5/5 [00:01<00:00,  4.25it/s]
Processing dataset cancer: 100%|██████████| 5/5 [00:00<00:00, 58.14it/s]
Processing dataset generated: 100%|██████████| 5/5 [00:00<00:00, 21.27it/s]


In [37]:
def plot_results(df, title=''):
    """Draw one boxplot of ``scores`` for every category in ``names``.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with a ``names`` column (x-axis groups, e.g. feature
        selection methods) and a ``scores`` column (y-axis values).
    title : str, optional
        Title of the axes. Defaults to an empty string, which matches the
        behaviour of the version without this parameter.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(6, 6))

    sns.boxplot(
        data=df,
        x="names",
        y="scores",
        fill=True,
        gap=.1,
        color=(0.125, 0.125, 0.875),
        ax=ax,
    )

    # No artist/line bookkeeping here: nothing consumed it, and the integer
    # division by the patch count raised ZeroDivisionError when the axes
    # contained no patches.
    ax.set_title(title)
    plt.show()

In [38]:
# Flatten the per-fold scores of the "aids" dataset into long format:
# one row per (method, fold score) pair.
names, scores = [], []
for method, acc in accuracy_scores["aids"].items():
    names.extend([method] * acc.shape[0])
    scores.extend(acc)

df = pd.DataFrame(dict(scores=scores, names=names))

In [None]:
# Show the boxplots of the scores collected in df, one box per entry in "names".
plot_results(df)