# Package

In [2]:
from interpret.glassbox import LinearRegression, RegressionTree, ExplainableBoostingClassifier, ExplainableBoostingRegressor
from interpret import show
from interpret.perf import RegressionPerf

from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings('ignore')

from sklearn.datasets import load_boston, load_breast_cancer
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

# Data

In [3]:
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell (and its import) only runs on scikit-learn < 1.2.
data = load_boston()

In [4]:
# Wrap the raw feature matrix in a DataFrame labelled with the dataset's feature names.
reg_df = pd.DataFrame(
    data = data.data,
    columns = data.feature_names,
)

In [5]:
# Store the dataset's target values in a 'target' column alongside the features.
reg_df['target'] = data.target

In [6]:
# Preview the first five rows of the assembled frame.
reg_df.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


## Split Data

In [7]:
# Separate the label column from the feature columns.
y = reg_df['target']
X = reg_df.drop(columns = ['target'])

In [8]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state = 42,
    test_size = 0.2,
)

In [9]:
# Display the (rows, columns) of the training and validation feature frames.
(X_train.shape, X_val.shape)

((404, 13), (102, 13))

## EBM 사용

In [10]:
# EBM regressor configured with validation_size = 50 and early_stopping_rounds = 100.
# NOTE(review): an integer validation_size is presumably a row count rather than a
# fraction -- confirm against the installed interpret version's documentation.
ebm = ExplainableBoostingRegressor(
    early_stopping_rounds = 100,
    validation_size = 50,
)

In [11]:
# Fit the EBM on the training split only; the validation split stays untouched for later scoring.
ebm.fit(X_train, y_train)

ExplainableBoostingRegressor(early_stopping_rounds=100,
                             feature_names=['CRIM', 'ZN', 'INDUS', 'CHAS',
                                            'NOX', 'RM', 'AGE', 'DIS', 'RAD',
                                            'TAX', 'PTRATIO', 'B', 'LSTAT',
                                            'DIS x LSTAT', 'CRIM x LSTAT',
                                            'AGE x LSTAT', 'B x LSTAT',
                                            'TAX x LSTAT', 'NOX x LSTAT',
                                            'INDUS x LSTAT', 'NOX x RM',
                                            'RM x PTRATIO', 'RM x TAX'],
                             feature_types=['continuous', 'continuous',
                                            'continuous', 'categorical',
                                            'continuous', 'continuous',
                                            'continuous', 'continuous',
                                            'continuous', '

explain_global로 전역 변수 중요도를 얻을 수 있다.

In [12]:
# Build the global explanation first, then hand it to show() for rendering.
ebm_global_explanation = ebm.explain_global()
show(ebm_global_explanation)

array 형태로 explain_local 메서드에 넣으면 개별 데이터의 변수 중요도가 도출된다.

In [13]:
# Local explanations for every row of X, passing features and targets as NumPy arrays.
ebm_local_explanation = ebm.explain_local(X.values, y.values)
show(ebm_local_explanation)

### 다른 Black Box 모델들과 비교

- CatBoost
- RandomForest

In [18]:
# Black-box baselines for comparison with the EBM; both use the same fixed seed.
rf = RandomForestRegressor(
    random_state = 42,
)
cb = CatBoostRegressor(
    silent = True,
    random_state = 42,
)

In [19]:
# Fit both comparison models on the same training split used for the EBM.
rf.fit(X_train, y_train)
cb.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x7fdc9bc30520>

In [25]:
# Predict the validation split with each fitted model, in EBM / RandomForest / CatBoost order.
ebm_pred, rf_pred, cb_pred = (
    fitted.predict(X_val) for fitted in (ebm, rf, cb)
)

#### CatBoost Sample

In [20]:
# Wrap CatBoost's predict function, then evaluate it on the validation split.
cb_perf_explainer = RegressionPerf(cb.predict)
cb_perf = cb_perf_explainer.explain_perf(X_val, y_val, name = 'CatBoost')

RegressionPerf는 예측 데이터에 대한 Performance를 평가한다.

일반적인 prediction과 동일하다.

In [21]:
# Render the CatBoost performance explanation.
show(cb_perf)

In [22]:
from sklearn.metrics import mean_squared_error

In [26]:
def get_rmse(model, actual, pred):
    """Print the root-mean-squared error of ``pred`` against ``actual``.

    Parameters
    ----------
    model : object
        Only used for labelling: its class name prefixes the printed line.
    actual : array-like
        Ground-truth target values.
    pred : array-like
        Predicted values to score against ``actual``.

    Returns
    -------
    None
        The score is printed, not returned.
    """
    label = model.__class__.__name__
    # RMSE is the square root of scikit-learn's mean squared error.
    root_error = np.sqrt(mean_squared_error(actual, pred))
    print(f'{label} RMSE = {root_error}')

In [27]:
# Report validation RMSE for each model, in EBM / RandomForest / CatBoost order.
for fitted_model, val_pred in ((ebm, ebm_pred), (rf, rf_pred), (cb, cb_pred)):
    get_rmse(fitted_model, y_val, val_pred)

ExplainableBoostingRegressor RMSE = 3.2885815716545874
RandomForestRegressor RMSE = 2.8109631609391226
CatBoostRegressor RMSE = 2.779159498500485


위에서 RegressionPerf로 도출된 성능과 동일하다. prediction 수행이 병행되는 것 같다.

In [29]:
from interpret.blackbox import MorrisSensitivity

In [30]:
# One Morris sensitivity explainer per model, each built from the model's predict function
# and the training features.
ebm_ss, rf_ss, cb_ss = (
    MorrisSensitivity(predict_fn = fitted.predict, data = X_train)
    for fitted in (ebm, rf, cb)
)

In [32]:
# Compute a named global explanation from each sensitivity explainer.
ebm_global, rf_global, cb_global = (
    explainer.explain_global(name = label)
    for explainer, label in (
        (ebm_ss, 'EBM'),
        (rf_ss, 'RandomForest'),
        (cb_ss, 'CatBoost'),
    )
)

In [33]:
# Pass all three Morris sensitivity explanations to show() in a single call.
show([ebm_global, rf_global, cb_global])