In [28]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mutual_info_score as MI
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import (
    SelectFromModel,
    SequentialFeatureSelector
)

from src.data import generate_data
from src.feature_selection import CMIM, JMIM, IGFS, wrapper_criterion

In [2]:
# Draw the synthetic dataset, then build a binned copy of its feature matrix.
X, y = generate_data(betas=[6, 5, 4, 3, 2, 1], dataset_variant=2)

# Every column is replaced by the index (labels=False) of the 10-bin pd.cut
# interval each value falls into; X itself is left unmodified.
X_discr = np.copy(X)
for i, column in enumerate(X.T):
    X_discr[:, i] = pd.cut(column, bins=10, labels=False)

In [3]:
# Run wrapper_criterion on the raw (non-binned) features with the 'aic' option.
# NOTE(review): the displayed result is a list of ints, presumably the indices
# of the selected columns of X — confirm against src.feature_selection.
best_predictors = wrapper_criterion(X, y, 'aic')
best_predictors

[0, 1, 2, 3, 4, 11, 13]

In [4]:
# Same call as for 'aic', but with the 'bic' option; overwrites best_predictors.
# NOTE(review): presumably BIC penalises model size more strongly, which would
# explain a shorter selection — confirm against src.feature_selection.
best_predictors = wrapper_criterion(X, y, 'bic')
best_predictors

[0, 1, 2, 3, 4, 13]

In [5]:
# Run the CMIM selector on the binned features X_discr (not on the raw X).
# NOTE(review): CMIM presumably stands for Conditional Mutual Information
# Maximisation and returns selected column indices — confirm in src.feature_selection.
best_predictors = CMIM(X_discr, y)
best_predictors

[0, 1, 2, 3, 5]

In [6]:
# Run the JMIM selector on the binned features X_discr (not on the raw X).
# NOTE(review): JMIM presumably stands for Joint Mutual Information
# Maximisation and returns selected column indices — confirm in src.feature_selection.
best_predictors = JMIM(X_discr, y)
best_predictors

[0, 1, 2, 3, 5]

In [7]:
# Run the IGFS selector on the binned features X_discr (not on the raw X).
# NOTE(review): IGFS is presumably an interaction-gain based selector returning
# selected column indices — confirm in src.feature_selection.
best_predictors = IGFS(X_discr, y)
best_predictors

[0, 1, 2, 5, 6]

In [12]:
# Forward sequential feature selection wrapped around an L1-penalised
# logistic regression, run on the raw features X.
# NOTE(review): SequentialFeatureSelector works on a clone of the estimator,
# so the pre-fit below should not influence which features are picked; it is
# kept so that sfs_model remains a fitted model after this cell.
sfs_model = LogisticRegression(
    max_iter=1000,
    penalty="l1",
    solver="liblinear",
).fit(X, y)
# Alternative base estimator that was tried:
# sfs_model = RandomForestClassifier(max_depth=3).fit(X, y)
sfs_forward = SequentialFeatureSelector(
    sfs_model,
    direction="forward",
    n_features_to_select="auto",
    tol=0.001,
)
sfs_forward.fit(X, y)
# Integer positions of the columns kept by the selector.
sfs_forward.get_support(indices=True)

array([ 0,  1,  2,  3,  4, 22])

In [228]:
# Registry of benchmark datasets. Each entry is a dict with three keys:
#   "X_orig"  - feature matrix as loaded / generated,
#   "X_discr" - features after discretize_dataset (divorce is the exception),
#   "y"       - target vector.
# NOTE(review): discretize_dataset is not defined or imported in the visible
# cells — it must be in scope before this cell runs.
data = {}

# Divorce dataset, source https://www.kaggle.com/datasets/rabieelkharoua/split-or-stay-divorce-predictor-dataset
divorce = pd.read_csv("data/divorce.csv", sep=";")
X_divorce = divorce.drop("Class", axis=1).to_numpy()

data["divorce"] = {
    "X_orig": X_divorce,
    # NOTE(review): stored without discretize_dataset, unlike every other
    # dataset — presumably the answers are already on a small discrete scale;
    # confirm. A copy is stored so the two entries never alias the same buffer.
    "X_discr": X_divorce.copy(),
    "y": divorce["Class"].to_numpy(),
}


# AIDS classification dataset, source: https://www.kaggle.com/datasets/aadarshvelu/aids-virus-infection-prediction
aids = pd.read_csv("data/aids.csv")
X_aids = aids.drop("infected", axis=1).to_numpy()

data["aids"] = {
    "X_orig": X_aids,
    "X_discr": discretize_dataset(X_aids),
    "y": aids["infected"].to_numpy(),
}


# RT-IoT2022 dataset, source: https://archive.ics.uci.edu/dataset/942/rt-iot2022
# (disabled; kept for reference)
# iot = pd.read_csv("data/iot.csv")
# iot.Attack_type = LabelEncoder().fit_transform(iot.Attack_type)
# iot.proto = LabelEncoder().fit_transform(iot.proto).astype(int)
# iot.drop(["service", "Unnamed: 0"], axis=1, inplace=True)
# X_iot = iot.drop("Attack_type", axis=1).to_numpy()

# data["iot"] = {}
# data["iot"]["X_orig"] = X_iot
# data["iot"]["X_discr"] = discretize_dataset(X_iot)
# data["iot"]["y"] = iot["Attack_type"].to_numpy()

# LOL Diamond FF15 dataset, source: https://www.kaggle.com/datasets/jakejoeanderson/league-of-legends-diamond-matches-ff15
lol = pd.read_csv("data/lol.csv")
X_lol = lol.drop(["match_id", "blue_Win"], axis=1).to_numpy()

data["lol"] = {
    "X_orig": X_lol,
    "X_discr": discretize_dataset(X_lol),
    "y": lol["blue_Win"].to_numpy(),
}


# Cancer dataset, source: https://www.kaggle.com/datasets/erdemtaha/cancer-data
cancer = pd.read_csv("data/cancer.csv")
# Recode the text labels with an explicit mapping ("M" -> 0, "B" -> 1) instead
# of writing ints into the text column through .loc: in-place writes of ints
# depend on the column being object dtype and are rejected by pandas'
# dedicated string dtype.
diagnosis_codes = cancer["diagnosis"].map({"M": 0, "B": 1})
if diagnosis_codes.isna().any():
    # Unmapped labels become NaN; fail loudly rather than cast NaN to int.
    raise ValueError("cancer.csv: 'diagnosis' contains labels other than 'M' and 'B'")
cancer["diagnosis"] = diagnosis_codes.astype(int)
X_cancer = cancer.drop(["id", "diagnosis", "Unnamed: 32"], axis=1).to_numpy()

data["cancer"] = {
    "X_orig": X_cancer,
    "X_discr": discretize_dataset(X_cancer),
    "y": cancer["diagnosis"].to_numpy().astype(int),
}

# Generated data
X_gen, y_gen = generate_data(
    betas=[6, 5, 4, 3, 2, 1],
    dataset_variant=0,
    n_classes=2,
)

data["generated"] = {
    "X_orig": X_gen,
    "X_discr": discretize_dataset(X_gen),
    "y": y_gen,
}

Unnamed: 0,aluminium,ammonia,arsenic,barium,cadmium,chloramine,chromium,copper,flouride,bacteria,...,lead,nitrates,nitrites,mercury,perchlorate,radium,selenium,silver,uranium,is_safe
0,1.65,9.08,0.04,2.85,0.007,0.35,0.83,0.17,0.05,0.20,...,0.054,16.08,1.13,0.007,37.75,6.78,0.08,0.34,0.02,1
1,2.32,21.16,0.01,3.31,0.002,5.28,0.68,0.66,0.90,0.65,...,0.100,2.01,1.93,0.003,32.26,3.21,0.08,0.27,0.05,1
2,1.01,14.02,0.04,0.58,0.008,4.24,0.53,0.02,0.99,0.05,...,0.078,14.16,1.11,0.006,50.28,7.07,0.07,0.44,0.01,0
3,1.36,11.33,0.04,2.96,0.001,7.23,0.03,1.66,1.08,0.71,...,0.016,1.41,1.29,0.004,9.12,1.72,0.02,0.45,0.05,1
4,0.92,24.33,0.03,0.20,0.006,2.67,0.69,0.57,0.61,0.13,...,0.117,6.74,1.11,0.003,16.90,2.41,0.02,0.06,0.02,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7994,0.05,7.78,0.00,1.95,0.040,0.10,0.03,0.03,1.37,0.00,...,0.197,14.29,1.00,0.005,3.57,2.13,0.09,0.06,0.03,1
7995,0.05,24.22,0.02,0.59,0.010,0.45,0.02,0.02,1.48,0.00,...,0.031,10.27,1.00,0.001,1.48,1.11,0.09,0.10,0.08,1
7996,0.09,6.85,0.00,0.61,0.030,0.05,0.05,0.02,0.91,0.00,...,0.182,15.92,1.00,0.000,1.35,4.84,0.00,0.04,0.05,1
7997,0.01,10,0.01,2.00,0.000,2.00,0.00,0.09,0.00,0.00,...,0.000,0.00,0.00,0.000,0.00,0.00,0.00,0.00,0.00,1


In [None]:
# Names of all registered datasets, in registration order.
datasets = list(data)
# Maps dataset name -> whatever find_relevant_features returns for it.
# NOTE(review): a later cell iterates the result with .items(), so it is
# presumably a dict of {method name: selected column indices} — confirm.
relevant_features = {}

for key in datasets:
    print(f"Finding relevant features for {key} dataset...")
    entry = data[key]
    # Selection runs on the discretized features, not on the originals.
    relevant_features[key] = find_relevant_features(entry["X_discr"], entry["y"])

In [42]:
def _mean_cv_score(features, target):
    """Return the mean 3-fold cross_val_score of a fresh GradientBoostingClassifier.

    A new classifier is created per call so no fitted state is shared between
    evaluations; random_state is pinned so repeated runs give the same score.
    """
    clf = GradientBoostingClassifier(random_state=0)
    return np.mean(cross_val_score(clf, features, target, cv=3))


# accuracy_scores[dataset][method] holds the mean CV score obtained when the
# classifier only sees the columns chosen by `method`; the extra "full_data"
# key holds the score on all original columns as a baseline.
# NOTE(review): tqdm is not imported in the visible cells — it must be in
# scope before this cell runs.
accuracy_scores = {}
for dataset, selected_by_method in relevant_features.items():
    # Dedicated names are used on purpose: rebinding X / y here would clobber
    # the notebook-level X and y that the earlier selection cells read.
    X_full = data[dataset]["X_orig"]
    y_true = data[dataset]["y"]

    accuracy_scores[dataset] = {}
    for method, val in tqdm(selected_by_method.items(), desc=f"Processing dataset {dataset}"):
        # `val` is used as a column indexer into the original feature matrix.
        accuracy_scores[dataset][method] = _mean_cv_score(X_full[:, val], y_true)

    accuracy_scores[dataset]["full_data"] = _mean_cv_score(X_full, y_true)

Processing dataset divorce: 100%|██████████| 5/5 [00:00<00:00,  5.36it/s]
Processing dataset aids: 100%|██████████| 5/5 [00:04<00:00,  1.03it/s]
Processing dataset lol: 100%|██████████| 5/5 [00:12<00:00,  2.53s/it]
Processing dataset cancer: 100%|██████████| 5/5 [00:03<00:00,  1.26it/s]
Processing dataset generated: 100%|██████████| 5/5 [00:06<00:00,  1.30s/it]
