In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from tqdm import tqdm

from src.data import generate_data, discretize_dataset
from src.feature_selection import CMIM, JMIM, IGFS, wrapper_criterion, l1_selection, find_relevant_features

In [2]:
# Draw the synthetic dataset (variant 2) from the project generator.
X, y = generate_data(betas=[6, 5, 4, 3, 2, 1], dataset_variant=2)

# Binned twin of X: every column is replaced by the index (labels=False) of
# the pd.cut bin it falls into, using 10 bins per column. Values are written
# into a copy, so X itself stays un-binned and X_discr keeps X's dtype.
X_discr = np.copy(X)
for col_idx, column in enumerate(X.T):
    X_discr[:, col_idx] = pd.cut(column, bins=10, labels=False)

In [3]:
# Run wrapper_criterion on the un-binned X with the 'aic' option
# (presumably the Akaike information criterion — confirm in src.feature_selection).
# The trailing bare name makes the notebook display the returned selection.
# NOTE: `best_predictors` is reassigned by each of the following selector cells.
best_predictors = wrapper_criterion(X, y, 'aic')
best_predictors

[0, 1, 2, 3, 4, 5, 7, 10, 14]

In [4]:
# Same wrapper_criterion call as for 'aic', but with the 'bic' option
# (presumably the Bayesian information criterion — confirm in src.feature_selection).
# Overwrites `best_predictors`; the trailing bare name displays the result.
best_predictors = wrapper_criterion(X, y, 'bic')
best_predictors

[0, 1, 2, 3, 4, 5, 7, 14]

In [5]:
# Run the CMIM selector on the binned matrix X_discr (not the raw X).
# Presumably "Conditional Mutual Information Maximisation" — confirm in
# src.feature_selection. Overwrites `best_predictors` and displays the result.
best_predictors = CMIM(X_discr, y)
best_predictors

[0, 1, 2, 3, 5]

In [6]:
# Run the JMIM selector on the binned matrix X_discr (not the raw X).
# Presumably "Joint Mutual Information Maximisation" — confirm in
# src.feature_selection. Overwrites `best_predictors` and displays the result.
best_predictors = JMIM(X_discr, y)
best_predictors

[0, 1, 2, 5, 6]

In [7]:
# Run the IGFS selector on the binned matrix X_discr (not the raw X).
# NOTE(review): acronym not expanded anywhere in this notebook — see
# src.feature_selection. Overwrites `best_predictors` and displays the result.
best_predictors = IGFS(X_discr, y)
best_predictors

[0, 1, 2, 5, 6]

In [8]:
# Registry of benchmark datasets: name -> {"X_orig", "X_discr", "y"}.
data = {}

# Divorce dataset, source https://www.kaggle.com/datasets/rabieelkharoua/split-or-stay-divorce-predictor-dataset
divorce = pd.read_csv("data/divorce.csv", sep=";")

# No discretize_dataset call for this dataset: "X_discr" holds the same raw
# feature values as "X_orig" (presumably the columns are already discrete —
# TODO confirm). The two entries are built from separate drop() results.
data["divorce"] = {
    "X_orig": divorce.drop(columns="Class").to_numpy(),
    "X_discr": divorce.drop(columns="Class").to_numpy(),
    "y": divorce["Class"].to_numpy(),
}


# AIDS classification dataset, source: https://www.kaggle.com/datasets/aadarshvelu/aids-virus-infection-prediction
aids = pd.read_csv("data/aids.csv")

# Feature matrix = every column except the "infected" target.
X_aids = aids.drop(columns="infected").to_numpy()

# Register the raw features, their discretized counterpart and the target.
data["aids"] = {
    "X_orig": X_aids,
    "X_discr": discretize_dataset(X_aids),
    "y": aids["infected"].to_numpy(),
}


# RT-IoT2022 dataset, source: https://archive.ics.uci.edu/dataset/942/rt-iot2022
# iot = pd.read_csv("data/iot.csv")
# iot.Attack_type = LabelEncoder().fit_transform(iot.Attack_type)
# iot.proto = LabelEncoder().fit_transform(iot.proto).astype(int)
# iot.drop(["service", "Unnamed: 0"], axis=1, inplace=True)
# X_iot = iot.drop("Attack_type", axis=1).to_numpy()

# data["iot"] = {}
# data["iot"]["X_orig"] = X_iot
# data["iot"]["X_discr"] = discretize_dataset(X_iot)
# data["iot"]["y"] = iot["Attack_type"].to_numpy()

# LOL Diamond FF15 dataset, source: https://www.kaggle.com/datasets/jakejoeanderson/league-of-legends-diamond-matches-ff15
lol = pd.read_csv("data/lol.csv")

# Feature matrix = everything except the match identifier and the "blue_Win" target.
# NOTE(review): in the evaluation further down, every feature subset for this
# dataset — including a single-feature one — reaches a cross-validated accuracy
# of exactly 1.0. That suggests at least one retained column encodes the match
# outcome; check the remaining columns for target leakage.
X_lol = lol.drop(columns=["match_id", "blue_Win"]).to_numpy()

data["lol"] = {
    "X_orig": X_lol,
    "X_discr": discretize_dataset(X_lol),
    "y": lol["blue_Win"].to_numpy(),
}


# Cancer dataset, source: https://www.kaggle.com/datasets/erdemtaha/cancer-data
cancer = pd.read_csv("data/cancer.csv")

# Encode the diagnosis labels in one vectorised step ("M" -> 0, "B" -> 1).
# Writing ints into the string column cell-by-cell via .loc leaves a mixed-type
# column between the two assignments and is rejected by strict string dtypes,
# so the whole column is replaced at once instead.
DIAGNOSIS_CODES = {"M": 0, "B": 1}
diagnosis_encoded = cancer["diagnosis"].map(DIAGNOSIS_CODES)

# map() turns any label outside DIAGNOSIS_CODES into NaN; fail loudly rather
# than letting a later integer cast produce garbage targets.
unmapped = diagnosis_encoded.isna()
if unmapped.any():
    unexpected = sorted(cancer.loc[unmapped, "diagnosis"].astype(str).unique())
    raise ValueError(f"Unexpected diagnosis labels in cancer.csv: {unexpected}")

# Keep the encoded labels on the frame, as before (now with an integer dtype).
cancer["diagnosis"] = diagnosis_encoded.astype(int)

# Feature matrix = everything except the id, the target and the
# "Unnamed: 32" column.
X_cancer = cancer.drop(["id", "diagnosis", "Unnamed: 32"], axis=1).to_numpy()

data["cancer"] = {}
data["cancer"]["X_orig"] = X_cancer
data["cancer"]["X_discr"] = discretize_dataset(X_cancer)
data["cancer"]["y"] = cancer["diagnosis"].to_numpy().astype(int)

# Generated data
# Synthetic two-class dataset (variant 0) from the project generator.
# NOTE(review): no seed is passed here, so whether this draw is repeatable
# depends on generate_data itself — confirm.
X_gen, y_gen = generate_data(betas=[6, 5, 4, 3, 2, 1], dataset_variant=0, n_classes=2)

data["generated"] = {
    "X_orig": X_gen,
    "X_discr": discretize_dataset(X_gen),
    "y": y_gen,
}

In [9]:
# Names of all registered datasets, in insertion order.
datasets = list(data)

# For each dataset, run find_relevant_features on the discretized features and
# the target; results are stored under the dataset's name.
relevant_features = {}
for name in datasets:
    print(f"Finding relevant features for {name} dataset...")
    entry = data[name]
    relevant_features[name] = find_relevant_features(entry["X_discr"], entry["y"])

Finding relevant features for divorce dataset...
Calculations completed!
Finding relevant features for aids dataset...
Calculations completed!
Finding relevant features for lol dataset...
Calculations completed!
Finding relevant features for cancer dataset...
Calculations completed!
Finding relevant features for generated dataset...
Calculations completed!


In [10]:
# Display the selected column indices, keyed by dataset and then by method.
relevant_features

{'divorce': {'BIC': [5, 17, 28, 39],
  'CMIM': [0, 10, 16, 39, 51],
  'JMIM': [10, 17, 19, 25, 39],
  'IGFS': [10, 16, 17, 19, 39],
  'L1': [17, 18]},
 'aids': {'BIC': [0, 2, 11, 14, 15, 17, 19, 21],
  'CMIM': [0, 2, 10, 17, 19],
  'JMIM': [0, 10, 17, 18, 19],
  'IGFS': [0, 10, 17, 18, 19],
  'L1': [0, 2, 4, 14, 15, 17, 19]},
 'lol': {'BIC': [43, 82],
  'CMIM': [0, 34, 43, 75, 76],
  'JMIM': [0, 37, 38, 43, 81],
  'IGFS': [37, 38, 43, 81, 82],
  'L1': [43]},
 'cancer': {'BIC': [2, 10, 13, 14, 15, 20, 21, 23, 26, 27, 28],
  'CMIM': [0, 9, 21, 22, 27],
  'JMIM': [7, 20, 21, 22, 27],
  'IGFS': [6, 7, 20, 22, 27],
  'L1': [1, 6, 22, 27]},
 'generated': {'BIC': [0, 1, 2, 3, 4],
  'CMIM': [0, 1, 2, 3, 6],
  'JMIM': [0, 1, 2, 3, 4],
  'IGFS': [0, 1, 2, 3, 27],
  'L1': [0, 1, 2, 3, 4, 14, 33]}}

In [11]:
# Compare selectors by the mean cross-validated accuracy of a gradient-boosting
# classifier trained on (a) each method's selected columns and (b) all columns.
#
# NOTE(review): the feature subsets in `relevant_features` were chosen using the
# complete datasets, so the same rows influence both selection and the held-out
# folds here; the per-method scores are therefore optimistic. Running the
# selection inside each CV fold would give an unbiased comparison.

# Fixed seed so the boosted models, and hence the reported scores, are repeatable.
EVAL_SEED = 0
# Number of cross-validation folds used for every score.
N_FOLDS = 3

accuracy_scores = {}
for dataset, selections in relevant_features.items():
    # Dedicated local names: the notebook-level `X` / `y` (synthetic data used
    # by the earlier selector cells) must not be overwritten here.
    features_all = data[dataset]["X_orig"]
    target = data[dataset]["y"]
    accuracy_scores[dataset] = {}

    # One score per selection method, using only that method's columns of the
    # original (non-discretized) feature matrix.
    for method, val in tqdm(selections.items(), desc=f"Processing dataset {dataset}"):
        clf = GradientBoostingClassifier(random_state=EVAL_SEED)
        accuracy_scores[dataset][method] = np.mean(
            cross_val_score(clf, features_all[:, val], target, cv=N_FOLDS)
        )

    # Baseline: the same model on all columns.
    clf = GradientBoostingClassifier(random_state=EVAL_SEED)
    accuracy_scores[dataset]["full_data"] = np.mean(
        cross_val_score(clf, features_all, target, cv=N_FOLDS)
    )

Processing dataset divorce: 100%|██████████| 5/5 [00:00<00:00,  7.91it/s]
Processing dataset aids: 100%|██████████| 5/5 [00:02<00:00,  2.14it/s]
Processing dataset lol: 100%|██████████| 5/5 [00:06<00:00,  1.34s/it]
Processing dataset cancer: 100%|██████████| 5/5 [00:01<00:00,  3.59it/s]
Processing dataset generated: 100%|██████████| 5/5 [00:01<00:00,  2.57it/s]


In [12]:
# Display the mean 3-fold CV accuracy per dataset and selection method
# ("full_data" is the all-columns baseline).
accuracy_scores

{'divorce': {'BIC': 0.9348370927318296,
  'CMIM': 0.946846282372598,
  'JMIM': 0.9231411862990809,
  'IGFS': 0.9348370927318296,
  'L1': 0.9348370927318296,
  'full_data': 0.9231411862990809},
 'aids': {'BIC': 0.8868630201028518,
  'CMIM': 0.8906030855539973,
  'JMIM': 0.8831229546517063,
  'IGFS': 0.8831229546517063,
  'L1': 0.884992987377279,
  'full_data': 0.8873305282842449},
 'lol': {'BIC': 1.0,
  'CMIM': 1.0,
  'JMIM': 1.0,
  'IGFS': 1.0,
  'L1': 1.0,
  'full_data': 1.0},
 'cancer': {'BIC': 0.9543024227234754,
  'CMIM': 0.9420124385036664,
  'JMIM': 0.9578204771187228,
  'IGFS': 0.9420124385036664,
  'L1': 0.945521210433491,
  'full_data': 0.9595655806182122},
 'generated': {'BIC': 0.9369998741256227,
  'CMIM': 0.9239958521395647,
  'JMIM': 0.9369998741256227,
  'IGFS': 0.9239958521395647,
  'L1': 0.9340058621495748,
  'full_data': 0.9150108192024359}}