In [1]:
import os
import itertools
import pandas as pd
import numpy as np
import json


import sys
sys.path.insert(1, './')
from BorutaShap import BorutaShap
from sklearn.ensemble import RandomForestRegressor

  from .autonotebook import tqdm as notebook_tqdm


emulating: https://medium.com/analytics-vidhya/is-this-the-best-feature-selection-algorithm-borutashap-8bc238aa1677

In [2]:
seed = 42
n_cores = 20
data_path = '/nobackup/users/hmbaghda/metastatic_potential/'

In [3]:
X_train_val = pd.read_csv(os.path.join(data_path, 'interim', 'X_train_val.csv'), index_col = 0)
y_train_val = pd.read_csv(os.path.join(data_path, 'interim', 'y_train_val.csv'), index_col = 0)

In [41]:
res = {}
pvals = [0.1, 0.05]
percentiles = [0.25, 0.5, 0.75, 1]

combs = list(itertools.product(pvals, percentiles))

for idx, comb in enumerate(combs):
    pval, percentile = comb
    default_rf_model = RandomForestRegressor(n_jobs = n_cores, 
                                             random_state = seed # gives variety per model fit
                                            )

    boruta_selector = BorutaShap(model = default_rf_model,
                                 importance_measure='shap',
                                 classification=False,
                                 percentile = percentile, 
                                pvalue = pval)

    boruta_selector.fit(X=X_train_val, 
                        y=y_train_val, 
                        n_trials=100,
                        random_state=seed,
                        train_or_test='train')
    res[idx] = {'pval': pval, 
           'percentile': percentile, 
           'selected_features': boruta_selector.accepted}
    with open(os.path.join(data_path, 'interim', 'depr_boruta_features.json'), "w") as json_file:
        json.dump(res, json_file, indent=4)  

In [4]:
import json

with open(os.path.join(data_path, 'interim', 'depr_boruta_features.json'), 'r') as file:
    selected_features_dict = json.load(file)


In [15]:
# lected_features = pd.read_csv(os.path.join(data_path, 'interim', 'depr_selected_features.csv'), index_col = 0)
selected_features = pd.read_csv(os.path.join(data_path, 'interim', 'depr_selected_all_features.csv'), index_col = 0)

selected_feature_index = selected_features.index.tolist()
# selected_feature_index = open(os.path.join(data_path, 'interim', 'depre_selected_train.txt')).read().splitlines()


selected_feature_index = ['-'.join(i.split('.')) for i in selected_feature_index] # formatting R --> python

for idx, i in enumerate(selected_feature_index):
    if i == 'X5S_rRNA':
        selected_feature_index[idx] = '5S_rRNA'

In [18]:
for k, res in selected_features_dict.items():
    overlap = len(set(res['selected_features']).intersection(selected_feature_index))
    print('pval: {:.2f}, percentile: {:.2f}, total features: {}, overlap features: {}'.format(res['pval'], res['percentile'], len(res['selected_features']), overlap))


pval: 0.10, percentile: 0.25, total features: 198, overlap features: 18
pval: 0.10, percentile: 0.50, total features: 198, overlap features: 18
pval: 0.10, percentile: 0.75, total features: 198, overlap features: 18
pval: 0.10, percentile: 1.00, total features: 198, overlap features: 18
pval: 0.05, percentile: 0.25, total features: 197, overlap features: 18
pval: 0.05, percentile: 0.50, total features: 197, overlap features: 18
pval: 0.05, percentile: 0.75, total features: 197, overlap features: 18
pval: 0.05, percentile: 1.00, total features: 197, overlap features: 18


18

(0.1, 0.25, 198)