In [None]:
%run default-imports.ipynb

In [None]:
%run run-experiment-cv.ipynb

In [None]:
%run get-feature-rank.ipynb #dict 'feature_rank' is defined here

In [None]:
# Cohort name -> CSV file holding that cohort; looked up via the 'cohort' option.
filenames = dict(
    MIMIC="~/cohorts/hs_mimic.csv",
    SINAI="~/cohorts/hs_sinai_preprocessed.csv",
    DHZB="~/cohorts/hs_dhzb.csv",
)

# File the accumulated experiments are loaded from and stored to.
save_path = './experiments/experiments_rfe_sinai_rank_feat_importance.d'

In [None]:
# Options to run experiments on. Each key maps to a list of candidate values.

# First run: RFE mode uses 'weighted_explanations' and 'feature_importance'
# for both algorithms.
options = dict(
    target=["AKI"],
    cohort=['SINAI'],
    test_size=[0.2],
    imputation_method=[imputers.DEFAULT],
    algorithm=[algorithms.RF, algorithms.GBDT],
    sampling_method=[samplers.SMOTE],
    sampling_strategy=[0.25],
    rfe_mode=['weighted_explanations', 'feature_importance'],
    scale_method=[None],
    optimize_mode=[False],
    num_exps_desired=[50],
    num_features=[100],
    mimic=[Ridge(solver='sag')],  # sag is faster for large datasets
    # Optional, currently disabled:
    # explainers=[[explainers.MIMIC, explainers.SHAP, explainers.FEAT_CONTRIB, explainers.LIME]],
    n_splits=[10],
)

# Second run: the override below is ACTIVE, so RFE runs in 'prefit' mode using
# the feature ranks obtained via the MIMIC models. Comment this statement out
# to get the first-run configuration defined above.
# (Single-rank alternative: feature_rank=[feature_rank[algorithms.RF]].)
options.update(
    rfe_mode=['prefit'],
    feature_rank=[feature_rank[algorithms.RF], feature_rank[algorithms.GBDT]],
)

# Run one experiment per option combination: derive a feature ranking, then for
# every stratified fold train a baseline model plus a series of models on a
# shrinking feature set (recursive feature elimination), and finally aggregate
# the per-fold metrics and store everything under `save_path`.

# Retrieve previously stored experiments, if any, so new results are added to them.
experiments = unpickle(save_path) or {}
n_experiments = 0

with Timer() as t:

    # Iterate over every combination of the option values.
    for combination in product(*options.values()):

        # Parameters of this combination, keyed like `options`.
        params = dict(zip(options.keys(), combination))
        exp_id = str(uuid.uuid1())
        experiment = defaultdict(lambda: {})

        # Load the cohort and split it into train / test.
        data = Load().execute(filename=filenames[params['cohort']])
        train, test = Split().execute(data, test_size=params['test_size'])

        # Obtain the feature ranking. In 'prefit' mode the ranking is supplied
        # as a parameter (e.g. taken from previous mimic models) so we can
        # compare how the metrics change when using those features.
        if params.get('rfe_mode') == 'prefit':
            top_explanations = params.get('feature_rank')
        else:
            # Get the feature rank from a model trained on the complete train set.
            experiment = run_experiment(params, train, test)

            # By default the ranking comes from feature contributions,
            # sorted ascending by value (least important first).
            contributions = experiment['explanations'][params['algorithm']]['FeatContribExplainer']
            top_explanations = sorted(contributions.items(), key=lambda x: x[1])

            # With weighted explanations, override the default ranking.
            if params.get('rfe_mode') == 'weighted_explanations':
                top_explanations = experiment['weighted_explanations']

        # Keep only the last (top) `num_features` entries of the ranking.
        top_explanations = top_explanations[-params['num_features']:]
        params['top_explanations'] = top_explanations

        # NOTE: despite its name this is the smallest feature-set size that is
        # still trained; elimination stops once fewer features remain.
        max_features = 10

        # Remove explainers from this point on; they must not run in the k-folds.
        params.pop('explainers', None)

        # From here on only the train split is used.
        data = train.copy()
        features, labels = data.drop(params['target'], axis=1), data[params['target']]

        # Metrics to summarize and the containers for the per-fold results.
        metrics = ['precision', 'recall', 'f1-score', 'auc', 'dor']

        cv_experiments = []
        cv_performance = defaultdict(lambda: {})
        rfe_experiments = defaultdict(lambda: [])
        rfe_performance = defaultdict(lambda: defaultdict(lambda: {}))

        # For each of the k folds.
        skf = StratifiedKFold(n_splits=params['n_splits'], random_state=None, shuffle=False)
        for train_index, test_index in skf.split(features, labels):

            train_data = data.iloc[train_index]
            test_data = data.iloc[test_index]

            # Baseline run for the cross-validated results. `params` is never
            # touched by the elimination loop below, so every fold's baseline
            # sees the same, unrestricted configuration.
            print('Training model within the K-fold...')
            cv_experiments.append(run_experiment(params, train_data, test_data))

            # Ranking entries are tuples whose first element is the feature name.
            features_to_select = [exp[0] for exp in top_explanations]

            # Train one model per feature-set size, down to `max_features`.
            while len(features_to_select) >= max_features:

                print(f'Training partial model with selected features #: {len(features_to_select)}')

                # Fresh copies per run: previously `k_params` aliased `params`
                # and the feature list was shared, so later deletions leaked
                # into the baseline runs and into the stored parameters.
                k_params = dict(params)
                k_params['features_to_select'] = list(features_to_select)
                k_params['num_features'] = len(features_to_select)
                rfe_experiments[k_params['num_features']].append(run_experiment(k_params, train_data, test_data))

                # Remove the lowest ranked feature (list is sorted least to most important).
                del features_to_select[0]

        # Summarize the results of the cross-validation.
        for metric in metrics:

            # Baseline CV results.
            measurements = [exp['performance']['discrimination'][metric] for exp in cv_experiments]
            cv_performance[metric]['mean'] = np.mean(measurements)
            cv_performance[metric]['std'] = np.std(measurements)
            # Stored as 'ci': two standard deviations of the fold scores
            # (labelled "95% CI" by the author; not an interval of the mean).
            cv_performance[metric]['ci'] = np.std(measurements) * 2

            # RFE results, one summary per feature-set size.
            for num_features in rfe_experiments:
                measurements = [exp['performance']['discrimination'][metric] for exp in rfe_experiments[num_features]]
                rfe_performance[num_features][metric]['mean'] = float(np.mean(measurements))
                rfe_performance[num_features][metric]['std'] = float(np.std(measurements))
                rfe_performance[num_features][metric]['ci'] = float(np.std(measurements) * 2)
                rfe_performance[num_features][metric]['raw'] = measurements

        # Summarize the results of the experiment.
        experiment['parameters'] = params
        experiment['performance']['cv'] = dict(cv_performance)
        experiment['performance']['rfe'] = {k: dict(v) for k, v in rfe_performance.items()}  # remove lambdas for storage

        # In 'prefit' mode `experiment` is still the lambda-backed defaultdict
        # created above; store a plain dict so no lambda ends up in the saved
        # object (standard pickling cannot serialize lambdas).
        if isinstance(experiment, defaultdict):
            experiment = dict(experiment)

        # Add to the previously loaded experiments.
        experiments[exp_id] = experiment

        n_experiments += 1

print(f'Running {n_experiments} experiments took {t.interval:.03f} sec.')

# Store everything.
# NOTE(review): `pickle` here is a project helper taking (object, path), not
# the stdlib module; presumably returns a truthy value on success - confirm.
if pickle(experiments, save_path):
    print('Successfully saved.')

In [None]:
# Print the option grid (the `options` dict the experiment loop iterates over).
print(options)