-
Notifications
You must be signed in to change notification settings - Fork 11
/
boruta.py
170 lines (124 loc) · 7.06 KB
/
boruta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
from time import time
import numpy as np
import pandas as pd
from scipy.stats import binom
try:
from tqdm import tqdm
except:
logging.warning('Boruta requires tqdm, which does not seem installed.')
class Boruta(object):
"""
Implementation of the Boruta feature selection algorithm.
Reference:
"""
def __init__(self, learner_func, path=None):
"""
Constructor.
Parameters
----------
learner_func : func | callable
Function or callable that expects one optional argument :code:`n_vars` and returns an instance of a superviser learner (regressor or classifier) following the scikit-learn convention, and expecting :code:`n_vars` features.
Specifically, the learner should have a :code:`fit(x_train, y_train)` method. The learner should also have a :code:`feature_importances_` property or attribute, which is an array or a list containing feature importances once the model has been trained.
There should be as many importance scores in :code:`feature_importances_` as columns in :code:`x_train`.
"""
self.selected_variables = []
self.learner_func = learner_func
self.path = path
def fit(self, x_df, y_df, n_evaluations=20, pval=0.95, max_duration=None):
"""
Performs a run of the Boruta feature selection algorithm.
Specifically, :code:`n_evaluations` times we randomly shuffle all feature values to define so-called 'shadow features', which we add to the original features.
The random shuffling destroys any association between shadow features and the target. Thus, none of them should be considered important.
We train the learner with both original and shadow features, and calculate the highest feature importance score among shadow features.
We consider an original feature a 'hit' if its feature importance score is higher than the highest feature importance score of shadow features, otherwise it is a miss.
For each original feature, we end up with the number of hits in the random :code:`n_evaluations` runs, and we use the following statistical test to conclude if the original feature is relevant.
The null hypothesis :math:`H_0` is that we do not know if an original feature is relevant for the problem at hand.
Under :math:`H_0`, the probability that a feature will be a hit in one of the random trials above is :math:`p=0.5`, and its number of hits after :code:`n_evaluations` runs follows a binomial distribution with probability :math:`0.5` and total number of trials :math:`n_evaluations`.
We reject :math:`H_0` and accept the alternative hypothesis :math:`H_1` that a feature is relevant when the number of hits observed is higher than the :code:`pval` quantile under :math:`H_0`.
We reject :math:`H_0` and accept the alternative hypothesis :math:`H_2` that a feature is *not* relevant when the number of hits observed is lower than the :code:`1-pval` quantile under :math:`H_0`.
A warning is raised when we were not able to reject :math:`H_0` for some features.
Features that were determined to be relevant are used to train the final learner, which this method returns.
When no feature was determined to be relevant, this method returns :code:`None`.
Parameters
----------
x_df : pd.DataFrame
A dataframe containing all features.
y_df : pd.DataFrame
A dataframe containing the target.
n_evaluations : int (default 20)
Number of trials in the Boruta algorithm.
pval : float (between 0 and 1, default 0.95)
Quantile level above which features are considered relevant.
max_duration : float | None (default)
If not None, then feature elimination will stop after this many seconds.
Attributes
----------
selected_variables : list
The list of features deemed relevant, sorted in decreasing number of hits.
ambiguous_variables : list
The list of features that we could not conclude are either relevant or irrelevant.
hits : dict
Dictionary containing the number of hits for each feature.
Returns
-------
m : sklearn-like model (an instance returned by :code:`learner_func`)
An instance returned by :code:`learner_func` trained with features that were deemed relevant.
"""
columns = [_ for _ in x_df.columns]
y = y_df.values
hits = {col: 0 for col in columns}
start_time = time()
for trial in tqdm(range(n_evaluations)):
# Construct shaddow features
train_x_df = pd.DataFrame(x_df.sample(frac=1).values, \
index=x_df.index, columns=['shaddow_%s' % col for col in columns])
# Add to real features
train_x_df = pd.concat([x_df, train_x_df], axis=1)
all_columns = [_ for _ in train_x_df.columns]
# Fit the model
x = train_x_df[all_columns].values
current_n_vars = len(all_columns)
m = self.learner_func(n_vars=current_n_vars)
m.fit(x, y)
# Increase the hit count
importances = [_ for _ in m.feature_importances_]
assert len(importances) == len(all_columns), \
'The number of importance scores should be %d, received %d' % (len(all_columns), len(importances))
hit_threshold = np.max([np.abs(importances[i]) \
for i in range(current_n_vars) if all_columns[i].startswith('shaddow_')])
for i in range(current_n_vars):
if not all_columns[i].startswith('shaddow_') and np.abs(importances[i]) > hit_threshold:
hits[all_columns[i]] = hits[all_columns[i]] + 1
duration = time()-start_time
if max_duration and duration > max_duration:
logging.warning('We have exceeded the configured maximum duration %.2fs: exiting after %d trials...' % (max_duration, trial+1))
n_evaluations = trial+1
break
# Testing whether the number of hits is statistically significant to conclude the feature is important.
# H0: We can't tell if a feature is important or not. Said differently, there is 50% chance that a feature will be a 'hit' in a round.
# H1: A feature is important.
# H2: A feature is unimportant.
# Under H0, the number of hits after n_evaluations trials should be in [hm, hM] with probability pval, where hm and hM are binomial quantiles.
# We reject H0 and accept H1 when a feature's number of hits exceeds hM.
# We reject H0 and accept H2 when a feature's number of hits is lower than hm.
cdfs = binom.cdf([hits[col] for col in columns], n_evaluations, 0.5)
selected_variables = [columns[i] for i in range(len(columns)) if cdfs[i] > pval]
associated_hits = [hits[col] for col in selected_variables]
self.selected_variables = [col for _, col in sorted(zip(associated_hits, selected_variables), reverse=True)]
self.ambiguous_variables = [columns[i] for i in range(len(columns)) if cdfs[i] <= pval and cdfs[i] >= 1.-pval]
if self.ambiguous_variables:
logging.warning('Features %s have not been determined important or unimportant. Consider increasing n_evaluations.' % \
str(self.ambiguous_variables))
self.hits = hits
if self.selected_variables == []:
logging.warning('No variable/feature was deemed important by the model.')
return None
# Keep only important features and retrain the model.
x = x_df[self.selected_variables].values
n_vars = len(self.selected_variables)
m = self.learner_func(n_vars=n_vars, path=self.path)
m.fit(x, y)
return m