In [28]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import statsmodels.api as sm

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mutual_info_score as MI
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import (
    SelectFromModel,
    SequentialFeatureSelector
)

In [5]:
def generate_data(
        n=1000,
        n_rel=5,
        n_irrel=30,
        betas=(1, 2, 3, 4, 5, 6),
        n_classes=10,
        dataset_variant=0,
        seed=None,
    ):
    """Generate a synthetic classification dataset.

    The target is a linear combination of ``n_rel`` features drawn from
    N(0, 1) plus a bias, binned with ``pd.qcut`` into ``n_classes`` quantile
    classes. The relevant features are always the first ``n_rel`` columns of
    ``X``; the remaining columns depend on ``dataset_variant``.

    Args:
        n: number of rows to generate.
        n_rel: number of relevant features drawn from N(0, 1).
        n_irrel: number of irrelevant features drawn from N(0, 1); only used
            when ``dataset_variant == 0``.
        betas: coefficients used to build the target; must have length
            ``n_rel + 1``. The first element is the bias.
        n_classes: number of quantile bins the continuous target is cut into.
        dataset_variant: selects which extra columns are appended after the
            relevant features:
                - 0: ``n_irrel`` independent N(0, 1) features (the target is
                  not built from them),
                - 1: a copy of the relevant features with added gaussian
                  noise (std 0.1),
                - any other value (documented as 2): all pairwise products
                  ``X[:, i] * X[:, j]`` for ``i < j`` of the relevant features.
        seed: optional seed for a private ``np.random.RandomState``. When
            ``None`` (default) the global ``np.random`` state is used, exactly
            as before this parameter existed.

    Returns:
        X: synthetic data, relevant features first, extra columns after.
        y: target classes as returned by ``pd.qcut(..., labels=False)``.

    Raises:
        AssertionError: if ``len(betas) != n_rel + 1``.
    """
    assert len(betas) == (n_rel + 1), 'len of betas must be equal to (n_rel + 1)'

    # Both the global module and RandomState expose `.normal`, so one code
    # path serves the seeded and the unseeded case.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # relevant features from normal distribution
    X1 = rng.normal(0, 1, (n, n_rel))

    # target variable: linear score cut into quantile classes
    y = X1 @ np.array(betas[1:]).T + betas[0]
    y = pd.qcut(y, n_classes, labels=False)

    if dataset_variant == 0:
        # irrelevant features from normal distribution
        extra = rng.normal(0, 1, (n, n_irrel))
    elif dataset_variant == 1:
        # relevant features with noise
        extra = X1 + rng.normal(0, 0.1, (n, n_rel))
    else:
        # second order interactions of relevant features, built in one go
        # instead of re-allocating the matrix with np.append for every pair
        products = [
            X1[:, i] * X1[:, j]
            for i in range(n_rel - 1)
            for j in range(i + 1, n_rel)
        ]
        # n_rel < 2 yields no pairs; keep an (n, 0) block so concatenate works
        extra = np.column_stack(products) if products else np.empty((n, 0), float)

    X = np.concatenate([X1, extra], axis=1)

    return X, y

In [6]:
def CMI(X, Y, Z):
    """Conditional mutual information of X and Y given Z.

    For every distinct value of ``Z`` the mutual information between the
    matching entries of ``X`` and ``Y`` is computed and weighted by the share
    of samples carrying that value; the weighted terms are summed.

    Returns 0 when ``Z`` has no entries.
    """
    n_total = len(Z)
    weighted_terms = []
    for level in np.unique(Z):
        in_level = Z == level
        level_weight = np.count_nonzero(in_level) / n_total
        weighted_terms.append(MI(X[in_level], Y[in_level]) * level_weight)
    return sum(weighted_terms)

def interaction_gain(X1, X2, Y):
    """Interaction gain of X1 and X2 with respect to Y.

    Computed as ``[MI(X2, Y) + CMI(X1, Y, X2)] - [MI(X1, Y) + MI(X2, Y)]``,
    i.e. the joint information of the pair about ``Y`` minus the sum of the
    individual informations.

    Args:
        X1, X2: discrete feature vectors.
        Y: discrete target vector.

    Returns:
        The interaction gain as a float.
    """
    # MI(X2, Y) appears in both terms; evaluate it once and reuse it so the
    # arithmetic (and therefore the result) is unchanged.
    mi_x2_y = MI(X2, Y)
    joint_mi = mi_x2_y + CMI(X1, Y, X2)
    single_mis = MI(X1, Y) + mi_x2_y
    return joint_mi - single_mis

def CMIM(X, y, n_features=5):
    """Greedy feature selection with the CMIM-style criterion used in this notebook.

    In each round every not-yet-chosen column ``i`` is scored as
    ``MI(X_i, y) - max_j [MI(X_i, X_j) - CMI(X_i, X_j, y)]`` where ``j`` runs
    over the already chosen columns; the best-scoring column is added.
    In the first round (nothing chosen yet) the score is just ``MI(X_i, y)``.

    Args:
        X: 2-D array of discrete features (columns are features).
        y: discrete target vector.
        n_features: number of features to select (default 5, the value that
            used to be hard-coded). Capped at the number of columns so the
            result never contains duplicate indices.

    Returns:
        List of selected column indices in order of selection.
    """
    print('Calculating CMIM selection...')
    n_cols = X.shape[1]
    n_to_select = min(n_features, n_cols)
    if n_to_select <= 0:
        return []

    # MI(X_i, y) does not depend on the selection round; compute it once.
    relevance = [MI(X[:, i], y) for i in range(n_cols)]

    chosen_indices = []
    for _ in range(n_to_select):
        j_values = []
        for i in range(n_cols):
            if i in chosen_indices:
                # already selected: can never win the argmax again
                j_values.append(-np.inf)
                continue
            # default=0.0 covers the first round; a large sentinel here would
            # be added to every score and wash out their low-order bits.
            redundancy = max(
                (
                    MI(X[:, i], X[:, j]) - CMI(X[:, i], X[:, j], y)
                    for j in chosen_indices
                ),
                default=0.0,
            )
            j_values.append(relevance[i] - redundancy)
        chosen_indices.append(np.argmax(j_values))

    return chosen_indices

def JMIM(X, y, n_features=5):
    """Greedy feature selection with the JMIM-style criterion used in this notebook.

    The first feature is the column with the highest ``MI(X_i, y)``. In each
    following round every remaining column ``i`` is scored as
    ``min_j [MI(X_j, y) + CMI(X_i, y, X_j)]`` over the already chosen columns
    ``j``, and the column with the largest score is added.

    Args:
        X: 2-D array of discrete features (columns are features).
        y: discrete target vector.
        n_features: total number of features to select (default 5, matching
            the previous hard-coded 1 + 4). Capped at the number of columns so
            the result never contains duplicate indices.

    Returns:
        List of selected column indices in order of selection.
    """
    print('Calculating JMIM selection...')
    n_cols = X.shape[1]
    n_to_select = min(n_features, n_cols)
    if n_to_select <= 0:
        return []

    # MI(X_i, y) is needed for the first pick and, for chosen columns, in
    # every later score; compute each value once.
    relevance = [MI(X[:, i], y) for i in range(n_cols)]

    # np.argmax returns the first maximum, like the strict '>' scan it replaces.
    chosen_indices = [int(np.argmax(relevance))]
    for _ in range(n_to_select - 1):
        j_values = []
        for i in range(n_cols):
            if i in chosen_indices:
                # already selected: can never win the argmax again
                j_values.append(-np.inf)
                continue
            worst_joint = min(
                relevance[j] + CMI(X[:, i], y, X[:, j])
                for j in chosen_indices
            )
            j_values.append(worst_joint)
        chosen_indices.append(np.argmax(j_values))

    return chosen_indices

def IGFS(X, y, n_features=5):
    """Greedy feature selection based on interaction gain.

    In each round every not-yet-chosen column ``i`` is scored as
    ``MI(X_i, y)`` plus the mean of ``interaction_gain(X_i, X_j, y)`` over the
    already chosen columns ``j`` (the mean is 0 while nothing is chosen); the
    best-scoring column is added.

    Args:
        X: 2-D array of discrete features (columns are features).
        y: discrete target vector.
        n_features: number of features to select (default 5, the value that
            used to be hard-coded). Capped at the number of columns so the
            result never contains duplicate indices.

    Returns:
        List of selected column indices in order of selection.
    """
    print('Calculating IGFS selection...')
    n_cols = X.shape[1]
    n_to_select = min(n_features, n_cols)
    if n_to_select <= 0:
        return []

    # MI(X_i, y) does not depend on the selection round; compute it once.
    relevance = [MI(X[:, i], y) for i in range(n_cols)]

    chosen_indices = []
    for _ in range(n_to_select):
        j_values = []
        for i in range(n_cols):
            if i in chosen_indices:
                # already selected: can never win the argmax again
                j_values.append(-np.inf)
                continue
            gains = [
                interaction_gain(X[:, i], X[:, j], y) for j in chosen_indices
            ]
            mean_gain = sum(gains) / len(gains) if gains else 0
            j_values.append(relevance[i] + mean_gain)
        chosen_indices.append(np.argmax(j_values))

    return chosen_indices

def wrapper_criterion(X, y, criterion="bic"):
    """Forward stepwise selection driven by an OLS information criterion.

    Starting from no features, each round fits an OLS model (with constant)
    for every candidate column added to the current set, picks the candidate
    with the lowest criterion value, and keeps it only if it improves on the
    best value seen so far. Selection stops at the first round without
    improvement or when no candidates remain.

    Args:
        X: 2-D array of features (columns are features).
        y: target vector.
        criterion: ``"bic"`` (default) or ``"aic"``; the attribute of the
            fitted statsmodels result that is minimised.

    Returns:
        (model, included): the OLS result refitted on the selected columns and
        the list of selected column indices in order of inclusion.

    Raises:
        ValueError: if ``criterion`` is neither ``"bic"`` nor ``"aic"``.
    """
    # Fail fast: previously an unknown criterion only surfaced as an
    # unbound-variable error deep inside the first fit loop.
    if criterion not in ("bic", "aic"):
        raise ValueError(f"criterion must be 'bic' or 'aic', got {criterion!r}")

    # Report the criterion actually used (identical text for the default).
    print(f'Calculating {criterion.upper()} selection...')
    k = X.shape[1]
    included = []
    best = None
    while True:
        candidates = []
        for i in range(k):
            if i in included:
                continue
            model = sm.OLS(y, sm.add_constant(X[:, included + [i]])).fit()
            candidates.append((getattr(model, criterion), i))
        if not candidates:
            break
        # lowest score wins; ties resolve to the lowest column index
        new_score, new_feature = min(candidates)
        if best is None or new_score < best:
            included.append(new_feature)
            best = new_score
        else:
            break
    model = sm.OLS(y, sm.add_constant(X[:, included])).fit()
    return model, included

def l1_selection(X, y):
    """Forward sequential feature selection scored by an L1-penalised logistic regression.

    A ``SequentialFeatureSelector`` adds features one at a time
    (``direction="forward"``) and, with ``n_features_to_select="auto"`` and
    ``tol=0.001``, stops once the score improvement falls below the tolerance.

    Args:
        X: 2-D array of features (columns are features).
        y: target vector.

    Returns:
        List of selected column indices in ascending order.
    """
    print("Calculating L1 selection...")
    # The selector clones the estimator before every fit, so fitting it here
    # first would be discarded work; pass it unfitted.
    base_estimator = LogisticRegression(max_iter=1000, penalty="l1", solver="liblinear")
    sfs_forward = SequentialFeatureSelector(
        base_estimator, n_features_to_select="auto", tol=0.001, direction="forward"
    ).fit(X, y)
    return list(sfs_forward.get_support(indices=True))

In [7]:
def discretize_dataset(X, bins=10):
    """Return a copy of ``X`` with every column replaced by its bin index.

    Each column is binned independently with ``pd.cut(..., bins=bins,
    labels=False)``. The input array is left untouched; the result keeps the
    dtype of ``X`` because the codes are written into a copy of it.
    """
    binned = np.copy(X)
    for col_idx, column in enumerate(X.T):
        binned[:, col_idx] = pd.cut(column, bins=bins, labels=False)

    return binned

def find_relevant_features(X, y):
    """Run every feature-selection method on (X, y).

    The methods run in the order BIC, CMIM, JMIM, IGFS, L1.

    Returns:
        Dict mapping the method name ("BIC", "CMIM", "JMIM", "IGFS", "L1")
        to the list of column indices that method selected.
    """
    # wrapper_criterion returns (model, indices); only the indices are kept
    _, bic_indices = wrapper_criterion(X, y)
    selected_by_method = {
        "BIC": bic_indices,
        "CMIM": CMIM(X, y),
        "JMIM": JMIM(X, y),
        "IGFS": IGFS(X, y),
        "L1": l1_selection(X, y),
    }

    print("Calculations completed!")
    return selected_by_method

In [23]:
# Registry of datasets: name -> {"X_orig", "X_discr", "y"}
data = {}

# Divorce dataset, source https://www.kaggle.com/datasets/rabieelkharoua/split-or-stay-divorce-predictor-dataset
divorce = pd.read_csv("data/divorce.csv", sep=";")
divorce_features = divorce.drop(columns="Class")

# NOTE(review): unlike the other datasets, X_discr is the raw feature matrix
# here (no binning) — presumably the columns are already discrete; confirm.
data["divorce"] = {
    "X_orig": divorce_features.to_numpy(),
    "X_discr": divorce_features.to_numpy(),
    "y": divorce["Class"].to_numpy(),
}


# AIDS classification dataset, source: https://www.kaggle.com/datasets/aadarshvelu/aids-virus-infection-prediction
aids = pd.read_csv("data/aids.csv")
X_aids = aids.drop(columns="infected").to_numpy()

# Raw features for model evaluation, binned features for the MI-based selectors
data["aids"] = {
    "X_orig": X_aids,
    "X_discr": discretize_dataset(X_aids),
    "y": aids["infected"].to_numpy(),
}


# RT-IoT2022 dataset, source: https://archive.ics.uci.edu/dataset/942/rt-iot2022
# iot = pd.read_csv("data/iot.csv")
# iot.Attack_type = LabelEncoder().fit_transform(iot.Attack_type)
# iot.proto = LabelEncoder().fit_transform(iot.proto).astype(int)
# iot.drop(["service", "Unnamed: 0"], axis=1, inplace=True)
# X_iot = iot.drop("Attack_type", axis=1).to_numpy()

# data["iot"] = {}
# data["iot"]["X_orig"] = X_iot
# data["iot"]["X_discr"] = discretize_dataset(X_iot)
# data["iot"]["y"] = iot["Attack_type"].to_numpy()

# LOL Diamond FF15 dataset, source: https://www.kaggle.com/datasets/jakejoeanderson/league-of-legends-diamond-matches-ff15
lol = pd.read_csv("data/lol.csv")
# match_id is an identifier and blue_Win is the target; neither is a feature
X_lol = lol.drop(columns=["match_id", "blue_Win"]).to_numpy()

data["lol"] = {
    "X_orig": X_lol,
    "X_discr": discretize_dataset(X_lol),
    "y": lol["blue_Win"].to_numpy(),
}


# Cancer dataset, source: https://www.kaggle.com/datasets/erdemtaha/cancer-data
cancer = pd.read_csv("data/cancer.csv")
# Encode the label by replacing the whole column (M -> 0, B -> 1). Writing
# integers into the existing text column through .loc depends on that column
# being object dtype and fails on string-dtype columns.
cancer["diagnosis"] = cancer["diagnosis"].map({"M": 0, "B": 1})
# .map leaves NaN for any label other than M/B; surface that here instead of
# letting the integer cast below produce garbage.
assert cancer["diagnosis"].notna().all(), "unexpected diagnosis label (expected only 'M' or 'B')"
X_cancer = cancer.drop(["id", "diagnosis", "Unnamed: 32"], axis=1).to_numpy()

data["cancer"] = {}
data["cancer"]["X_orig"] = X_cancer
data["cancer"]["X_discr"] = discretize_dataset(X_cancer)
data["cancer"]["y"] = cancer["diagnosis"].to_numpy().astype(int)

# Generated data
# Seed the global NumPy generator so the synthetic dataset (drawn through
# np.random inside generate_data) is the same on every Restart & Run All.
np.random.seed(42)
X_gen, y_gen = generate_data(
    betas=[6, 5, 4, 3, 2, 1],
    dataset_variant=0,
    n_classes=2,
)

data["generated"] = {}
data["generated"]["X_orig"] = X_gen
data["generated"]["X_discr"] = discretize_dataset(X_gen)
data["generated"]["y"] = y_gen

In [None]:
# Run every selector on the discretized version of each registered dataset
datasets = list(data)
relevant_features = {}

for name in datasets:
    print(f"Finding relevant features for {name} dataset...")
    entry = data[name]
    relevant_features[name] = find_relevant_features(entry["X_discr"], entry["y"])

In [42]:
# Score each selection by 3-fold cross-validated accuracy of a gradient
# boosting classifier on the ORIGINAL (non-discretized) columns, plus a
# baseline using all columns.
# The classifier is seeded so the reported accuracies are reproducible and the
# methods are compared under the same tree randomness.
EVAL_SEED = 42
EVAL_CV_FOLDS = 3

accuracy_scores = {}
for dataset, selections in relevant_features.items():
    y = data[dataset]["y"]
    accuracy_scores[dataset] = {}
    for method, val in tqdm(selections.items(), desc=f"Processing dataset {dataset}"):
        X = data[dataset]["X_orig"][:, val]
        clf = GradientBoostingClassifier(random_state=EVAL_SEED)
        accuracy_scores[dataset][method] = np.mean(
            cross_val_score(clf, X, y, cv=EVAL_CV_FOLDS)
        )

    # baseline: no feature selection
    clf = GradientBoostingClassifier(random_state=EVAL_SEED)
    accuracy_scores[dataset]["full_data"] = np.mean(cross_val_score(
        clf,
        data[dataset]["X_orig"],
        y,
        cv=EVAL_CV_FOLDS
    ))

Processing dataset divorce: 100%|██████████| 5/5 [00:00<00:00,  5.36it/s]
Processing dataset aids: 100%|██████████| 5/5 [00:04<00:00,  1.03it/s]
Processing dataset lol: 100%|██████████| 5/5 [00:12<00:00,  2.53s/it]
Processing dataset cancer: 100%|██████████| 5/5 [00:03<00:00,  1.26it/s]
Processing dataset generated: 100%|██████████| 5/5 [00:06<00:00,  1.30s/it]
