# Boruta

In [1]:
# 명령 결과 모두 보기
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# 버전 경고 무시
import warnings
warnings.filterwarnings('ignore')

# 판다스와 넘피
import pandas as pd
import numpy as np

# 그림 그리기
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

https://towardsdatascience.com/boruta-explained-the-way-i-wish-someone-explained-it-to-me-4489d70e154a

## 1. It all starts with X and y

In [2]:
X = pd.DataFrame({'age': [25, 32, 47, 51, 62],
                  'height': [182, 176, 174, 168, 181],
                  'weight': [75, 71, 78, 72, 86]})
y = pd.Series([20, 32, 45, 55, 61], name = 'income')
X
y

Unnamed: 0,age,height,weight
0,25,182,75
1,32,176,71
2,47,174,78
3,51,168,72
4,62,181,86


0    20
1    32
2    45
3    55
4    61
Name: income, dtype: int64

## 2. Why Boruta?
### 2.1 The first idea: shadow features
- In Boruta, features do not compete among themselves. 
- Instead — and this is the first brilliant idea — they compete with a randomized version of them.

#### make X_shadow by randomly permuting each column of X

In [3]:
np.random.seed(42)
X_shadow = X.apply(np.random.permutation)
X_shadow.columns = ['shadow_' + feat for feat in X.columns]

X
X_shadow

Unnamed: 0,age,height,weight
0,25,182,75
1,32,176,71
2,47,174,78
3,51,168,72
4,62,181,86


Unnamed: 0,shadow_age,shadow_height,shadow_weight
0,32,168,71
1,62,176,75
2,47,174,72
3,25,182,86
4,51,181,78


#### make X_boruta by appending X_shadow to X

In [4]:
X_boruta = pd.concat([X, X_shadow], axis = 1)
X_boruta

Unnamed: 0,age,height,weight,shadow_age,shadow_height,shadow_weight
0,25,182,75,32,168,71
1,32,176,71,62,176,75
2,47,174,78,47,174,72
3,51,168,72,25,182,86
4,62,181,86,51,181,78


#### fit a random forest (suggested max_depth between 3 and 7)

In [5]:
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(max_depth = 5, random_state = 42)
forest.fit(X_boruta, y)

RandomForestRegressor(max_depth=5, random_state=42)

#### store feature importances

In [6]:
X_boruta.columns
X.columns

Index(['age', 'height', 'weight', 'shadow_age', 'shadow_height',
       'shadow_weight'],
      dtype='object')

Index(['age', 'height', 'weight'], dtype='object')

In [7]:
forest.feature_importances_
feat_imp_X = forest.feature_importances_[:len(X.columns)]
feat_imp_shadow = forest.feature_importances_[len(X.columns):]
feat_imp_X
feat_imp_shadow

array([0.28112381, 0.15808892, 0.0925464 , 0.10162149, 0.21182001,
       0.15479937])

array([0.28112381, 0.15808892, 0.0925464 ])

array([0.10162149, 0.21182001, 0.15479937])

In [8]:
data = pd.DataFrame(np.c_[X_boruta.columns, forest.feature_importances_], columns = ['feature', 'model based importance'])
data

Unnamed: 0,feature,model based importance
0,age,0.281124
1,height,0.158089
2,weight,0.092546
3,shadow_age,0.101621
4,shadow_height,0.21182
5,shadow_weight,0.154799


#### compute hits

In [9]:
feat_imp_X                 # 원래 변수의 중요도
feat_imp_shadow.max()      # shadow 변수의 max 중요도

array([0.28112381, 0.15808892, 0.0925464 ])

0.21182001118946994

In [10]:
hits = feat_imp_X > feat_imp_shadow.max()
hits

array([ True, False, False])

In [11]:
data = pd.DataFrame((np.hstack([feat_imp_X, feat_imp_shadow]))).T
data.columns = X_boruta.columns
data.loc[1, X.columns] = hits
data

Unnamed: 0,age,height,weight,shadow_age,shadow_height,shadow_weight
0,0.281124,0.158089,0.092546,0.101621,0.21182,0.154799
1,True,False,False,,,


### 2.2 The second idea: binomial distribution
- 랜덤포레스트는 비모수 통계논리를 이용하기 때문에 100회 실시해서 중요한지 안한지를 확인

#### initialize hits counter

In [12]:
hits = np.zeros((len(X.columns)))
hits

array([0., 0., 0.])

#### repeat 20 times

In [13]:
for iter_ in range(3):
    np.random.seed(iter_)
    X.head(2)
    X.apply(np.random.permutation).head(2)
    print('*'*100)

Unnamed: 0,age,height,weight
0,25,182,75
1,32,176,71


Unnamed: 0,age,height,weight
0,47,182,71
1,25,174,72


****************************************************************************************************


Unnamed: 0,age,height,weight
0,25,182,75
1,32,176,71


Unnamed: 0,age,height,weight
0,47,182,78
1,32,174,72


****************************************************************************************************


Unnamed: 0,age,height,weight
0,25,182,75
1,32,176,71


Unnamed: 0,age,height,weight
0,47,181,75
1,62,176,86


****************************************************************************************************


In [14]:
for iter_ in range(20):
    ### make X_shadow by randomly permuting each column of X
    np.random.seed(iter_)
    X_show = X.apply(np.random.permutation)
    X_boruta = pd.concat([X, X_shadow], axis = 1)
    
    ### fit a random forest (suggested max_depth between 3 and 7)
    forest = RandomForestRegressor(max_depth = 5, random_state = 42)
    forest.fit(X_boruta, y)

    ### store feature importance
    feat_imp_X = forest.feature_importances_[:len(X.columns)]
    feat_imp_shadow = forest.feature_importances_[len(X.columns):]

    ### compute hits for this trial and add to counter
    hits += (feat_imp_X > feat_imp_shadow.max())
    hits

RandomForestRegressor(max_depth=5, random_state=42)

array([1., 0., 0.])

RandomForestRegressor(max_depth=5, random_state=42)

array([2., 0., 0.])

RandomForestRegressor(max_depth=5, random_state=42)

array([3., 0., 0.])

RandomForestRegressor(max_depth=5, random_state=42)

array([4., 0., 0.])

RandomForestRegressor(max_depth=5, random_state=42)

array([5., 0., 0.])

RandomForestRegressor(max_depth=5, random_state=42)

array([6., 0., 0.])

RandomForestRegressor(max_depth=5, random_state=42)

array([7., 0., 0.])

RandomForestRegressor(max_depth=5, random_state=42)

array([8., 0., 0.])

RandomForestRegressor(max_depth=5, random_state=42)

array([9., 0., 0.])

RandomForestRegressor(max_depth=5, random_state=42)

array([10.,  0.,  0.])

RandomForestRegressor(max_depth=5, random_state=42)

array([11.,  0.,  0.])

RandomForestRegressor(max_depth=5, random_state=42)

array([12.,  0.,  0.])

RandomForestRegressor(max_depth=5, random_state=42)

array([13.,  0.,  0.])

RandomForestRegressor(max_depth=5, random_state=42)

array([14.,  0.,  0.])

RandomForestRegressor(max_depth=5, random_state=42)

array([15.,  0.,  0.])

RandomForestRegressor(max_depth=5, random_state=42)

array([16.,  0.,  0.])

RandomForestRegressor(max_depth=5, random_state=42)

array([17.,  0.,  0.])

RandomForestRegressor(max_depth=5, random_state=42)

array([18.,  0.,  0.])

RandomForestRegressor(max_depth=5, random_state=42)

array([19.,  0.,  0.])

RandomForestRegressor(max_depth=5, random_state=42)

array([20.,  0.,  0.])

In [15]:
hits

array([20.,  0.,  0.])

In [16]:
hits = np.zeros((len(X.columns)))
original =[]
shadow = []
hits_list =[]

for iter_ in range(20):
    ### make X_shadow by randomly permuting each column of X
    np.random.seed(iter_)
    X_show = X.apply(np.random.permutation)
    X_boruta = pd.concat([X, X_shadow], axis = 1)
    
    ### fit a random forest (suggested max_depth between 3 and 7)
    forest = RandomForestRegressor(max_depth = 5, random_state = 42)
    forest.fit(X_boruta, y)

    ### store feature importance
    feat_imp_X = forest.feature_importances_[:len(X.columns)]
    feat_imp_shadow = forest.feature_importances_[len(X.columns):]

    feat_imp_X 
    feat_imp_shadow
    
    ### compute hits for this trial and add to counter
    hits += (feat_imp_X > feat_imp_shadow.max())
    hits
    print('*'*100)
    
    
    original.append(feat_imp_X)
    shadow.append(feat_imp_shadow)
    hits_list.append(hits)

RandomForestRegressor(max_depth=5, random_state=42)

array([0.28112381, 0.15808892, 0.0925464 ])

array([0.10162149, 0.21182001, 0.15479937])

array([1., 0., 0.])

****************************************************************************************************


RandomForestRegressor(max_depth=5, random_state=42)

array([0.28112381, 0.15808892, 0.0925464 ])

array([0.10162149, 0.21182001, 0.15479937])

array([2., 0., 0.])

****************************************************************************************************


RandomForestRegressor(max_depth=5, random_state=42)

array([0.28112381, 0.15808892, 0.0925464 ])

array([0.10162149, 0.21182001, 0.15479937])

array([3., 0., 0.])

****************************************************************************************************


RandomForestRegressor(max_depth=5, random_state=42)

array([0.28112381, 0.15808892, 0.0925464 ])

array([0.10162149, 0.21182001, 0.15479937])

array([4., 0., 0.])

****************************************************************************************************


RandomForestRegressor(max_depth=5, random_state=42)

array([0.28112381, 0.15808892, 0.0925464 ])

array([0.10162149, 0.21182001, 0.15479937])

array([5., 0., 0.])

****************************************************************************************************


RandomForestRegressor(max_depth=5, random_state=42)

array([0.28112381, 0.15808892, 0.0925464 ])

array([0.10162149, 0.21182001, 0.15479937])

array([6., 0., 0.])

****************************************************************************************************


RandomForestRegressor(max_depth=5, random_state=42)

array([0.28112381, 0.15808892, 0.0925464 ])

array([0.10162149, 0.21182001, 0.15479937])

array([7., 0., 0.])

****************************************************************************************************


RandomForestRegressor(max_depth=5, random_state=42)

array([0.28112381, 0.15808892, 0.0925464 ])

array([0.10162149, 0.21182001, 0.15479937])

array([8., 0., 0.])

****************************************************************************************************


RandomForestRegressor(max_depth=5, random_state=42)

array([0.28112381, 0.15808892, 0.0925464 ])

array([0.10162149, 0.21182001, 0.15479937])

array([9., 0., 0.])

****************************************************************************************************


RandomForestRegressor(max_depth=5, random_state=42)

array([0.28112381, 0.15808892, 0.0925464 ])

array([0.10162149, 0.21182001, 0.15479937])

array([10.,  0.,  0.])

****************************************************************************************************


RandomForestRegressor(max_depth=5, random_state=42)

array([0.28112381, 0.15808892, 0.0925464 ])

array([0.10162149, 0.21182001, 0.15479937])

array([11.,  0.,  0.])

****************************************************************************************************


RandomForestRegressor(max_depth=5, random_state=42)

array([0.28112381, 0.15808892, 0.0925464 ])

array([0.10162149, 0.21182001, 0.15479937])

array([12.,  0.,  0.])

****************************************************************************************************


RandomForestRegressor(max_depth=5, random_state=42)

array([0.28112381, 0.15808892, 0.0925464 ])

array([0.10162149, 0.21182001, 0.15479937])

array([13.,  0.,  0.])

****************************************************************************************************


RandomForestRegressor(max_depth=5, random_state=42)

array([0.28112381, 0.15808892, 0.0925464 ])

array([0.10162149, 0.21182001, 0.15479937])

array([14.,  0.,  0.])

****************************************************************************************************


RandomForestRegressor(max_depth=5, random_state=42)

array([0.28112381, 0.15808892, 0.0925464 ])

array([0.10162149, 0.21182001, 0.15479937])

array([15.,  0.,  0.])

****************************************************************************************************


RandomForestRegressor(max_depth=5, random_state=42)

array([0.28112381, 0.15808892, 0.0925464 ])

array([0.10162149, 0.21182001, 0.15479937])

array([16.,  0.,  0.])

****************************************************************************************************


RandomForestRegressor(max_depth=5, random_state=42)

array([0.28112381, 0.15808892, 0.0925464 ])

array([0.10162149, 0.21182001, 0.15479937])

array([17.,  0.,  0.])

****************************************************************************************************


RandomForestRegressor(max_depth=5, random_state=42)

array([0.28112381, 0.15808892, 0.0925464 ])

array([0.10162149, 0.21182001, 0.15479937])

array([18.,  0.,  0.])

****************************************************************************************************


RandomForestRegressor(max_depth=5, random_state=42)

array([0.28112381, 0.15808892, 0.0925464 ])

array([0.10162149, 0.21182001, 0.15479937])

array([19.,  0.,  0.])

****************************************************************************************************


RandomForestRegressor(max_depth=5, random_state=42)

array([0.28112381, 0.15808892, 0.0925464 ])

array([0.10162149, 0.21182001, 0.15479937])

array([20.,  0.,  0.])

****************************************************************************************************


In [17]:
original
shadow
hits_list

[array([0.28112381, 0.15808892, 0.0925464 ]),
 array([0.28112381, 0.15808892, 0.0925464 ]),
 array([0.28112381, 0.15808892, 0.0925464 ]),
 array([0.28112381, 0.15808892, 0.0925464 ]),
 array([0.28112381, 0.15808892, 0.0925464 ]),
 array([0.28112381, 0.15808892, 0.0925464 ]),
 array([0.28112381, 0.15808892, 0.0925464 ]),
 array([0.28112381, 0.15808892, 0.0925464 ]),
 array([0.28112381, 0.15808892, 0.0925464 ]),
 array([0.28112381, 0.15808892, 0.0925464 ]),
 array([0.28112381, 0.15808892, 0.0925464 ]),
 array([0.28112381, 0.15808892, 0.0925464 ]),
 array([0.28112381, 0.15808892, 0.0925464 ]),
 array([0.28112381, 0.15808892, 0.0925464 ]),
 array([0.28112381, 0.15808892, 0.0925464 ]),
 array([0.28112381, 0.15808892, 0.0925464 ]),
 array([0.28112381, 0.15808892, 0.0925464 ]),
 array([0.28112381, 0.15808892, 0.0925464 ]),
 array([0.28112381, 0.15808892, 0.0925464 ]),
 array([0.28112381, 0.15808892, 0.0925464 ])]

[array([0.10162149, 0.21182001, 0.15479937]),
 array([0.10162149, 0.21182001, 0.15479937]),
 array([0.10162149, 0.21182001, 0.15479937]),
 array([0.10162149, 0.21182001, 0.15479937]),
 array([0.10162149, 0.21182001, 0.15479937]),
 array([0.10162149, 0.21182001, 0.15479937]),
 array([0.10162149, 0.21182001, 0.15479937]),
 array([0.10162149, 0.21182001, 0.15479937]),
 array([0.10162149, 0.21182001, 0.15479937]),
 array([0.10162149, 0.21182001, 0.15479937]),
 array([0.10162149, 0.21182001, 0.15479937]),
 array([0.10162149, 0.21182001, 0.15479937]),
 array([0.10162149, 0.21182001, 0.15479937]),
 array([0.10162149, 0.21182001, 0.15479937]),
 array([0.10162149, 0.21182001, 0.15479937]),
 array([0.10162149, 0.21182001, 0.15479937]),
 array([0.10162149, 0.21182001, 0.15479937]),
 array([0.10162149, 0.21182001, 0.15479937]),
 array([0.10162149, 0.21182001, 0.15479937]),
 array([0.10162149, 0.21182001, 0.15479937])]

[array([20.,  0.,  0.]),
 array([20.,  0.,  0.]),
 array([20.,  0.,  0.]),
 array([20.,  0.,  0.]),
 array([20.,  0.,  0.]),
 array([20.,  0.,  0.]),
 array([20.,  0.,  0.]),
 array([20.,  0.,  0.]),
 array([20.,  0.,  0.]),
 array([20.,  0.,  0.]),
 array([20.,  0.,  0.]),
 array([20.,  0.,  0.]),
 array([20.,  0.,  0.]),
 array([20.,  0.,  0.]),
 array([20.,  0.,  0.]),
 array([20.,  0.,  0.]),
 array([20.,  0.,  0.]),
 array([20.,  0.,  0.]),
 array([20.,  0.,  0.]),
 array([20.,  0.,  0.])]

In [18]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor
###initialize Boruta
forest = RandomForestRegressor(n_jobs = -1, max_depth = 5)

boruta = BorutaPy(estimator = forest, 
                  n_estimators = 'auto',
                  max_iter = 100 # number of trials to perform
                 )
### fit Boruta (it accepts np.array, not pd.DataFrame)
boruta.fit(np.array(X), np.array(y))
### print results
green_area = X.columns[boruta.support_].to_list()
blue_area = X.columns[boruta.support_weak_].to_list()
print('features in the green area:', green_area)
print('features in the blue area:', blue_area)

BorutaPy(estimator=RandomForestRegressor(max_depth=5, n_estimators=28,
                                         n_jobs=-1,
                                         random_state=RandomState(MT19937) at 0x23599BF3A40),
         n_estimators='auto',
         random_state=RandomState(MT19937) at 0x23599BF3A40)

features in the green area: []
features in the blue area: ['age']


# 예제
https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html#sklearn.datasets.make_classification

In [19]:
from sklearn.datasets import make_classification
x, y = make_classification(n_features=100, n_redundant=3, n_informative=10,
                             n_classes= 2)
x.shape
y.shape

(100, 100)

(100,)

In [20]:
###initialize Boruta
forest = RandomForestRegressor(n_jobs = -1, max_depth = 2)

boruta = BorutaPy(estimator = forest, 
                  n_estimators = 'auto',
                  max_iter = 5 # number of trials to perform
                 )
### fit Boruta (it accepts np.array, not pd.DataFrame)
boruta.fit(x, y)
boruta.support_*1
boruta.support_weak_*1

BorutaPy(estimator=RandomForestRegressor(max_depth=2, n_estimators=707,
                                         n_jobs=-1,
                                         random_state=RandomState(MT19937) at 0x23599BF3A40),
         max_iter=5, n_estimators='auto',
         random_state=RandomState(MT19937) at 0x23599BF3A40)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# End