# Boruta test

* https://towardsdatascience.com/boruta-explained-the-way-i-wish-someone-explained-it-to-me-4489d70e154a
* https://github.com/scikit-learn-contrib/boruta_py

In [1]:
!pip install boruta



In [10]:
import pandas as pd
### toy data: three candidate predictors (X) and the income target (y)
X = pd.DataFrame(
    data={
        'age': [25, 32, 47, 51, 62],
        'height': [182, 176, 174, 168, 181],
        'weight': [75, 71, 78, 72, 86],
    }
)
y = pd.Series(data=[20, 32, 45, 55, 61], name='income')

In [4]:
# Display the toy feature DataFrame (columns: age, height, weight).
X

Unnamed: 0,age,height,weight
0,25,182,75
1,32,176,71
2,47,174,78
3,51,168,72
4,62,181,86


In [5]:
# Display the target Series (named 'income').
y

0    20
1    32
2    45
3    55
4    61
Name: income, dtype: int64

In [11]:
# Check the container type of X; it is a pandas DataFrame here and is
# converted with np.array(...) before being handed to Boruta further down.
type(X)

pandas.core.frame.DataFrame

In [24]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor
import numpy as np

### initialize Boruta
# Base learner: a shallow random forest that uses every available core.
forest = RandomForestRegressor(
   n_jobs = -1,
   max_depth = 5
)

# Boruta's result depends on random draws, so without a seed the
# confirmed/rejected sets can change between runs. Fixing random_state makes
# this cell reproducible on Restart & Run All and matches the classifier
# example later in the notebook, which also seeds BorutaPy.
boruta = BorutaPy(
   estimator = forest,
   n_estimators = 10,
   max_iter = 20, # number of trials to perform
   verbose = 1,
   # p-value threshold used when confirming/rejecting features. 0.7 is far
   # looser than the conventional 0.05 -- presumably chosen so that a decision
   # is reached on this 5-row toy set; do not reuse it for real data.
   alpha = 0.7,
   random_state = 1
)

### fit Boruta (it accepts np.array, not pd.DataFrame)
boruta.fit(np.array(X), np.array(y))

### print results
# support_ marks the confirmed features, support_weak_ the tentative ones;
# both are used as masks over the original column labels.
green_area = X.columns[boruta.support_].to_list()
blue_area = X.columns[boruta.support_weak_].to_list()

print('features in the green area:', green_area)
print('features in the blue area:', blue_area)

Iteration: 1 / 20
Iteration: 2 / 20
Iteration: 3 / 20


BorutaPy finished running.

Iteration: 	4 / 20
Confirmed: 	1
Tentative: 	0
Rejected: 	2
features in the green area: ['age']
features in the blue area: []


In [32]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

# Read features and labels from CSV.
# The DataFrame is kept (as X_df) for its column labels, while X and y hold
# plain numpy arrays, since BorutaPy is fed arrays only (hence `.values`).
X_df = pd.read_csv('test_X.csv', index_col=0)
X = X_df.values
# The label file has no header row; flatten the loaded values to a 1-D array.
y = pd.read_csv('test_y.csv', header=None, index_col=0).values.ravel()

In [33]:
# Preview the first rows of the loaded feature table.
X_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,-0.05194,-2.036221,0.613771,0.0,-0.652528,1.0,0.0,-0.895385,0.0
1,1.0,0.582915,1.047493,1.375456,1.0,-0.611177,1.0,1.0,-1.586095,3.0
2,1.0,0.726401,0.750928,1.16757,1.0,-0.527994,1.0,0.0,1.041794,0.0
3,1.0,1.373671,-0.672894,2.16226,1.0,1.353069,0.0,0.0,-0.44896,1.0
4,0.0,-0.059381,0.404213,-0.539591,1.0,-3.221016,0.0,0.0,-0.083078,1.0


In [37]:
# Base learner: a shallow random forest that uses all cores and
# balanced class weights.
rf = RandomForestClassifier(
    n_jobs=-1,
    class_weight='balanced',
    max_depth=5,
)

# Boruta wrapper around the forest. n_estimators='auto' leaves the number of
# trees to BorutaPy, and the fixed random_state keeps the run repeatable.
feat_selector = BorutaPy(
    estimator=rf,
    n_estimators='auto',
    verbose=1,
    random_state=1,
)

# Search for all relevant features (5 features are expected to be selected).
feat_selector.fit(X, y)

# Per-feature ranking; rank 1 marks the selected features.
print(feat_selector.ranking_)

# Reduce X to the selected feature columns only.
X_filtered = feat_selector.transform(X)

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100
Iteration: 9 / 100
Iteration: 10 / 100
Iteration: 11 / 100
Iteration: 12 / 100
Iteration: 13 / 100
Iteration: 14 / 100
Iteration: 15 / 100
Iteration: 16 / 100
Iteration: 17 / 100
Iteration: 18 / 100
Iteration: 19 / 100
Iteration: 20 / 100
Iteration: 21 / 100
Iteration: 22 / 100


BorutaPy finished running.

Iteration: 	23 / 100
Confirmed: 	5
Tentative: 	0
Rejected: 	5
[1 1 1 1 1 2 3 5 2 4]


In [38]:
### report which columns Boruta confirmed ("green") or left tentative ("blue")
confirmed_mask = feat_selector.support_
tentative_mask = feat_selector.support_weak_

# The masks index the DataFrame's column labels, not the raw numpy array.
green_area = X_df.columns[confirmed_mask].to_list()
blue_area = X_df.columns[tentative_mask].to_list()

print('features in the green area:', green_area)
print('features in the blue area:', blue_area)

features in the green area: ['0', '1', '2', '3', '4']
features in the blue area: []
