In this notebook, we perform feature selection based on feature importance from CatBoost.

Note that CatBoost has been set to run on GPU to speed up training.

In [None]:
# Importing the required libraries
import numpy as np
import pandas as pd

# Libraries for evaluation metrics and cross validation
from sklearn.model_selection import StratifiedGroupKFold

# Libraries for ensemble methods considered in the study
import catboost

In [None]:
# folder holding the train data produced by merge_spectrogram_features_n_train_test_split.ipynb
data_path = 'data/'
# seed shared by the K-fold dataset stratification and the catboost model training (reproducibility)
random_seed = 216

## Load data

In [None]:
# Load the train set from the parquet file inside data_path
train_file = data_path + 'train.parquet'
train = pd.read_parquet(train_file)

In [None]:
# Preview the first rows of the loaded train set
train.head()

In [None]:
# (number of rows, number of columns) of the train set
train.shape

In [None]:
# Names of the per-class vote columns, i.e. every column whose name ends in '_vote'
vote_cols = [col for col in train.columns if col.endswith('_vote')]
vote_cols

In [None]:
# Every column after the first 10 is used as a training feature
FEATURES = list(train.columns[10:])
len(FEATURES)

In [None]:
# sample weights based on total votes: total_votes / 3, capped at 1, so rows with
# 3 or more votes get full weight and rows with fewer votes are down-weighted.
# Vectorized with np.minimum (no Python-level loop); the result is always a float array.
weights_total_vote = np.minimum(train.total_votes.to_numpy() / 3, 1)
weights_total_vote.shape

## Helper functions

In [None]:
def compute_feature_importance(train, features, classes, weights, cv = 5, random_seed = 216):
    """
    Train one CatBoost classifier per CV fold and collect its feature importance.

    Each training row is repeated once per class, labelled with that class index
    and weighted by (vote value of that class) * (row weight), so the model is
    fitted on the vote distribution rather than on a single hard label.

    Input:
    train - Dataset to perform K-fold CV and compute feature importance; must contain
            the `features` and `classes` columns plus 'expert_consensus' and 'patient_id'
    features - name of the features
    classes - names of the vote columns; their order defines the integer class
              labels 0..len(classes)-1 used for training
    weights - per-row sample weights (based on total votes), positionally aligned
              with the rows of `train`
    cv - number of folds to be used in CV
    random_seed - set random number for reproducibility

    Output:
    dataframe: rows corresponding to each of the feature and columns 'CV1'..'CV<cv>'
               holding the importance returned by clf.get_feature_importance when
               given the validation Pool of the kth fold
    """

    # Mapping of expert_consensus values to integer labels (loop-invariant, built once).
    # NOTE(review): assumes `classes` is ordered like this mapping — confirm against caller.
    targets = {'Seizure':0, 'LPD':1, 'GPD':2, 'LRDA':3, 'GRDA':4, 'Other':5}

    # The fold indices below are positional, so index the weights as a plain array
    weights = np.asarray(weights)

    # for storing feature importance in each of the CV folds
    feature_importance = {}

    # StratifiedGroupKFold in order to stratify on the expert consensus and separate patient IDs between k folds
    sgkf = StratifiedGroupKFold(n_splits=cv, shuffle=True, random_state=random_seed)
    for i, (train_index, valid_index) in enumerate(sgkf.split(X = train, y = train.expert_consensus, groups = train.patient_id)):

        # define the training set for the ith fold.
        # split() yields positional row indices, so rows are taken with .iloc;
        # .loc would pick wrong/missing rows whenever the index is not 0..n-1.
        fold_train = train.iloc[train_index]
        X_train = fold_train[features].values
        y_train = fold_train[classes].values
        del fold_train

        # Adapted from https://stackoverflow.com/questions/75762712/how-to-train-xgboost-with-probabilities-instead-of-class
        # to train using the probability values of each class in the objective function instead of expert_consensus
        n_samples, n_classes = y_train.shape
        # row k of X_train becomes rows k*n_classes .. k*n_classes + n_classes - 1
        X_train_upsampled = X_train.repeat(n_classes, axis=0)
        # labels cycle 0..n_classes-1 for every original row
        y_train_direct = np.tile(np.arange(n_classes), n_samples)
        # weight of (row, class) = vote value * row weight; broadcasting replaces the hard-coded 6
        sample_weights = (y_train * weights[train_index][:, np.newaxis]).ravel()

        # Define Catboost classifier
        clf = catboost.CatBoostClassifier(task_type='GPU',
                                          objective='MultiClass',
                                          random_state=random_seed,
                                          verbose=False)

        # fit the model
        clf.fit(X_train_upsampled, y_train_direct, sample_weight=sample_weights)

        # Pool data structure from catboost for the validation data of the ith fold
        fold_valid = train.iloc[valid_index]
        val_pool = catboost.Pool(
            data = fold_valid[features],
            label = fold_valid['expert_consensus'].map(targets),
        )

        # Compute the feature importance using the validation data
        feature_importance['CV'+str(i+1)] = clf.get_feature_importance(val_pool)

        # release the per-fold arrays and model before the next fold allocates its own
        del X_train, y_train, X_train_upsampled, y_train_direct, sample_weights, clf, val_pool, fold_valid

    return pd.DataFrame(feature_importance, index = features)


## CatBoost Feature Selection

In [None]:
# Per-fold feature importance for all features, with default cv and random_seed
df_feature_importance = compute_feature_importance(
    train=train,
    features=FEATURES,
    classes=vote_cols,
    weights=weights_total_vote,
)

In [None]:
# Preview the first rows of the feature importance table
df_feature_importance.head()

In [None]:
# average feature importance across the CV folds.
# Derived columns are excluded first so that re-running this cell (or running it
# after the cumulative column exists) does not fold them into the mean.
fold_scores = df_feature_importance.drop(columns=['mean_feature_imp', 'cumulative_imp'], errors='ignore')
df_feature_importance['mean_feature_imp'] = fold_scores.mean(axis=1)
# sort by feature importance, most important first
df_feature_importance.sort_values(by='mean_feature_imp', inplace=True, ascending=False)

In [None]:
# running total of the mean importance, in the current row order
mean_importance = df_feature_importance['mean_feature_imp']
df_feature_importance['cumulative_imp'] = mean_importance.cumsum()

In [None]:
# write the feature importance table to a gzip-compressed parquet file in data_path
importance_file = data_path + 'feature_importance.parquet'
df_feature_importance.to_parquet(importance_file, compression='gzip')

In [None]:
# keep the features whose cumulative importance has not yet reached 90;
# the feature at which the running total first reaches 90 is not included
reached_cutoff = df_feature_importance['cumulative_imp'] >= 90
selected_features = df_feature_importance.index[(~reached_cutoff).to_numpy()].tolist()

In [None]:
# Load the test set from the parquet file inside data_path
test_file = data_path + 'test.parquet'
test = pd.read_parquet(test_file)

In [None]:
# report the shape of both sets before feature selection
for caption, frame in (("Dimensions of train set: ", train),
                       ("Dimensions of test set: ", test)):
    print(caption, frame.shape)

In [None]:
# For each set, keep its first 10 columns followed by the selected feature columns
train_selected, test_selected = (
    pd.concat([frame.iloc[:, :10], frame[selected_features]], axis = 1)
    for frame in (train, test)
)

In [None]:
# report the shape of both sets after feature selection
for caption, frame in (("Dimensions of train set after feature selection: ", train_selected),
                       ("Dimensions of test set after feature selection: ", test_selected)):
    print(caption, frame.shape)

In [None]:
# write the reduced train and test sets as gzip-compressed parquet files in data_path
for frame, file_name in ((train_selected, 'train_selected_features.parquet'),
                         (test_selected, 'test_selected_features.parquet')):
    frame.to_parquet(data_path + file_name, compression='gzip')