In [8]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
import sys; sys.path.insert(0, '../..') # add parent folder path where lib folder is

In [10]:
import ray
import time

from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score


from utils import helper, config, rayer, kaggle_dataset_helper
from ml.models.base.brisk_xgboost import BriskXGBoost
from ml.models.base.slug_xgboost import SlugXGBoost
from ml.models.base.slug_ann import SlugANN
from ml.models.base.slug_rf import SlugRF
from ml.models.base.slug_knn import SlugKNN
from ml.models.base.brisk_bagging import BriskBagging


from ml.models import common


In [11]:
rayer.get_global_cluster(num_cpus=4)

In [12]:
!ray status --address='raycluster-autoscaler-head-svc.dev.svc.cluster.local:6379'

[2m[1m[36m(scheduler +2s)[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.
[2m[1m[36m(scheduler +2s)[0m Adding 1 node(s) of type small-group.
Node status
---------------------------------------------------------------
Healthy:
 1 head-group
 1 small-group
Pending:
 (no pending nodes)
Recent failures:
 (no failures)

Resources
---------------------------------------------------------------
Usage:
 0.0/3.0 CPU
 0.00/9.313 GiB memory
 0.00/2.697 GiB object_store_memory

Demands:
 {}: 1+ pending tasks/actors
 {'CPU': 1}: 4+ from request_resources()
[0m

In [13]:

@ray.remote
def worker(base_model):     
    base_model.fetch_model()
    return base_model


In [14]:
ds_train, ds_test = kaggle_dataset_helper.get_house_prices_dataset()
ds_train = common.label_encode(ds_train)
ds_test = common.label_encode(ds_test)

ds_train = ds_train.fillna(-1)
ds_test = ds_test.fillna(-1)


df_X = ds_train.loc[:, ds_train.columns != 'SalePrice']
df_y = ds_train['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.33, random_state=config.rand_state)

[2m[1m[36m(scheduler +8s)[0m Resized to 5 CPUs.


In [94]:
brisk_xgb1 = BriskXGBoost('brisk_xgb1', X_train, X_test, y_train, y_test)
brisk_xgb1.boosted_round = 10
brisk_xgb1.n_trials = 10

brisk_xgb2 = BriskXGBoost('brisk_xgb2', X_train, X_test, y_train, y_test)
brisk_xgb2.boosted_round = 10
brisk_xgb2.n_trials = 10

base_models = [brisk_xgb1, brisk_xgb2]

slug_xgb1 = SlugXGBoost('slug_xgb1', X_train, X_test, y_train, y_test)
slug_xgb1.boosted_round = 10
slug_xgb1.n_trials = 10

slug_xgb2 = SlugXGBoost('slug_xgb2', X_train, X_test, y_train, y_test)
slug_xgb2.boosted_round = 10
slug_xgb2.n_trials = 10

base_models_slug = [slug_xgb1, slug_xgb2]

slug_ann_1 = SlugANN('slug_ann_1', X_train, X_test, y_train, y_test)
slug_ann_1.epochs = 50
slug_ann_1.n_trials = 50

slug_ann_2 = SlugANN('slug_ann_2', X_train, X_test, y_train, y_test)
slug_ann_2.epochs = 50
slug_ann_2.n_trials = 50

base_models_ann = [slug_ann_1, slug_ann_2]


slug_rf_1 = SlugRF('slug_rf_1', X_train, X_test, y_train, y_test)
slug_rf_1.max_n_estimators = 150
slug_rf_1.n_trials = 50

slug_rf_2 = SlugRF('slug_rf_2', X_train, X_test, y_train, y_test)
slug_rf_2.max_n_estimators = 100
slug_rf_2.n_trials = 50

base_models_rf = [slug_rf_1, slug_rf_2]


slug_knn_1 = SlugKNN('slug_knn_1', X_train, X_test, y_train, y_test)
slug_knn_1.n_neighbors = 50
slug_knn_1.n_trials = 1000

slug_knn_2 = SlugKNN('slug_knn_2', X_train, X_test, y_train, y_test)
slug_knn_2.n_neighbors = 40
slug_knn_2.n_trials = 1000

base_models_knn = [slug_knn_1, slug_knn_2]


bagging_1 = BriskBagging('bagging_1', X_train, X_test, y_train, y_test)
bagging_1.n_estimators = 50
bagging_1.n_trials = 100

bagging_2 = BriskBagging('bagging_2', X_train, X_test, y_train, y_test)
bagging_2.n_estimators = 40
bagging_2.n_trials = 200

base_models_bagging = [bagging_1, bagging_2]

In [99]:
model_results = ray.get([worker.remote(base_model) for base_model in base_models_knn])

[2m[36m(worker pid=130, ip=10.10.72.10)[0m 2022-11-16T06:45:10PST : INFO : slug_knn : __discover_model__ : 94 : Message : slug_knn_2: Starting training for trials:1000, neighbors  40
[2m[36m(worker pid=288)[0m 2022-11-16T06:45:10PST : INFO : slug_knn : __discover_model__ : 94 : Message : slug_knn_1: Starting training for trials:1000, neighbors  50
[2m[36m(worker pid=288)[0m The default storage cannot be shared by multiple processes. Please use an RDB (RDBStorage) when you use joblib for multi-processing. The usage of RDBStorage can be found in https://optuna.readthedocs.io/en/stable/tutorial/rdb.html.
[2m[36m(worker pid=130, ip=10.10.72.10)[0m 2022-11-16T06:45:38PST : INFO : slug_knn : __discover_model__ : 111 : Message : slug_knn_2: Number of trials: 1000
[2m[36m(worker pid=130, ip=10.10.72.10)[0m 2022-11-16T06:45:38PST : INFO : slug_knn : __discover_model__ : 113 : Message : Best trial:22
[2m[36m(worker pid=130, ip=10.10.72.10)[0m 2022-11-16T06:45:38PST : INFO : slu

In [100]:
models = [model.best_fit for model in model_results]
models

[KNeighborsRegressor(algorithm='kd_tree', n_neighbors=43),
 KNeighborsRegressor(n_neighbors=35)]

In [107]:
#models[0].attr

# import json
# js = json.loads(models[0].save_config())
# js
# js['learner']['gradient_booster']['name']
# json.loads(models[0].save_config())['learner']['gradient_booster']['name']

In [108]:
models[0]

In [21]:
import shap
import pandas as pd

@ray.remote
def __get_shapley_ensemble_attr__(model, df_X):
    #df_X = ray.get(df_X_id)
    print(type(df_X))
    
    explainer = shap.TreeExplainer(model)
    shap_values = explainer(df_X)
    df_shapley_sores = pd.DataFrame(shap_values.values, columns=df_X.columns)
    df_shapley_sores_list = df_shapley_sores.abs().mean().values #sort_values(ascending=False).values
    return df_shapley_sores_list


@ray.remote
def __get_shapley_kernel_attr__(model, df_X,  n_background):
    df_background = df_X.sample(n = n_background)
    kernel_explainer = shap.KernelExplainer(model.predict, df_background)
    kernel_shap_values = kernel_explainer.shap_values(X=X_train)# , ranked_outputs=True, check_additivity=False)    
    df_shapley_sores = pd.DataFrame(kernel_shap_values.values, columns=df_X.columns)
    df_shapley_sores_list = df_shapley_sores.abs().mean().values #sort_values(ascending=False).values
    return df_shapley_sores_list


In [None]:

@ray.remote
def __get_shapley_torch_attr__(model, df_X, n_background):
    
    df_background = df_X.sample(n = n_background)
    df_tensor_background = helper.df_to_tensor(df_background)
    df_X_tensor = helper.df_to_tensor(df_X)                

    explainer_shap = shap.DeepExplainer(model=model, data=df_tensor_background)

    shap_values = explainer_shap.explainer.shap_values(X=df_X_tensor, ranked_outputs=True, check_additivity=False)


    df_shapley_sores = pd.DataFrame(shap_values, columns=df_X.columns)
    df_shapley_sores_list = df_shapley_sores.abs().mean().values #sort_values(ascending=False).values

    return df_shapley_sores_list

In [22]:
X_train_id = ray.put(X_train)
results = ray.get([__get_shapley_ensemble_attr__.remote(model, X_train_id) for model in models])

[2m[36m(__get_shapley_ensemble_attr__ pid=130, ip=10.10.72.10)[0m <class 'pandas.core.frame.DataFrame'>


[2m[36m(worker pid=130, ip=10.10.72.10)[0m IPython could not be loaded!


RayTaskError(AssertionError): [36mray::__get_shapley_ensemble_attr__()[39m (pid=130, ip=10.10.72.10)
  File "/tmp/ipykernel_356/878023724.py", line 9, in __get_shapley_ensemble_attr__
  File "/tmp/ray/session_2022-11-16_05-22-01_174349_8/runtime_resources/pip/6359c2ba15c7e71f0e73b7159cb8c36699f062e1/virtualenv/lib/python3.9/site-packages/shap/explainers/_tree.py", line 149, in __init__
    self.model = TreeEnsemble(model, self.data, self.data_missing, model_output)
  File "/tmp/ray/session_2022-11-16_05-22-01_174349_8/runtime_resources/pip/6359c2ba15c7e71f0e73b7159cb8c36699f062e1/virtualenv/lib/python3.9/site-packages/shap/explainers/_tree.py", line 824, in __init__
    xgb_loader = XGBTreeModelLoader(self.original_model)
  File "/tmp/ray/session_2022-11-16_05-22-01_174349_8/runtime_resources/pip/6359c2ba15c7e71f0e73b7159cb8c36699f062e1/virtualenv/lib/python3.9/site-packages/shap/explainers/_tree.py", line 1455, in __init__
    assert self.name_gbm == "gbtree", "Only the 'gbtree' model type is supported, not '%s'!" % self.name_gbm
AssertionError: Only the 'gbtree' model type is supported, not 'dart'!

[2m[36m(__get_shapley_ensemble_attr__ pid=288)[0m <class 'pandas.core.frame.DataFrame'>


[2m[36m(worker pid=288)[0m IPython could not be loaded!


In [16]:
results

[array([    0.        ,     0.        ,     0.        ,     0.        ,
          121.9312086 ,     0.        ,     0.        ,   128.50971113,
            0.        ,     0.        ,     0.        ,     0.        ,
            0.        ,     0.        ,     0.        ,     0.        ,
            0.        , 43238.73968161,     0.        ,  1973.06564969,
          178.13057451,     0.        ,     0.        ,     0.        ,
            0.        ,     0.        ,     0.        ,    66.02973515,
            0.        ,     0.        ,   100.61496257,     0.        ,
            0.        ,   232.05212035,   758.44243174,     0.        ,
            0.        ,     0.        ,  2787.93408754,     0.        ,
            0.        ,     0.        ,     0.        ,  1139.44870454,
            0.        ,     0.        , 13245.8969563 ,     0.        ,
            0.        ,   394.21993647,     0.        ,     0.        ,
            0.        ,   241.16066089,     0.        ,     0.  

In [104]:
### SHAPLEY arguments
shapley_background_size = 0.3 # 30% of actual dataset
shapley_n_background = int(X_train.shape[0]*shapley_background_size)
df_background = X_train.sample(n = shapley_n_background)
dtrain = xgb.DMatrix(X_train)
ddf_background = xgb.DMatrix(df_background)


In [110]:
### for bagging and knn
kernel_explainer = shap.KernelExplainer(models[0].predict, df_background)
kernel_shap_values = kernel_explainer.shap_values(X=X_train) # , ranked_outputs=True, check_additivity=False)

X does not have valid feature names, but KNeighborsRegressor was fitted with feature names


  0%|          | 0/978 [00:00<?, ?it/s]

X does not have valid feature names, but KNeighborsRegressor was fitted with feature names
2022-11-16T08:48:37CST : INFO : _kernel : explain : 327 : Message : num_full_subsets = 1
2022-11-16T08:48:37CST : INFO : _kernel : explain : 338 : Message : remaining_weight_vector = [0.13061412 0.08822182 0.06704858 0.05436371 0.04592368 0.03990987
 0.03541298 0.0319279  0.02915156 0.02689114 0.02501813 0.02344356
 0.02210393 0.02095268 0.01995493 0.01908404 0.01831928 0.01764436
 0.01704625 0.01651443 0.01604033 0.01561691 0.01523831 0.01489968
 0.01459693 0.01432662 0.01408584 0.01387212 0.01368338 0.01351786
 0.01337405 0.01325071 0.01314678 0.01306141 0.01299391 0.01294374
 0.01291051 0.01289396]
2022-11-16T08:48:37CST : INFO : _kernel : explain : 339 : Message : num_paired_subset_sizes = 39
2022-11-16T08:48:37CST : INFO : _kernel : explain : 378 : Message : weight_left = 0.7949889283823913
X does not have valid feature names, but KNeighborsRegressor was fitted with feature names
2022-11-16T

KeyboardInterrupt: 

In [36]:
tree_explainer = shap.TreeExplainer(model.predict, df_background)
tree_shap_values = tree_explainer.shap_values(X=X_train)# , ranked_outputs=True, check_additivity=False)

InvalidModelError: Model type not yet supported by TreeExplainer: <class 'method'>

In [17]:
explainer

<shap.explainers._kernel.Kernel at 0x7f7d80c5e910>

In [None]:
df_shapley_sores = pd.DataFrame(shap_values.values, columns=X_train.columns)
df_shapley_sores_list = df_shapley_sores.abs().mean().values #sort_values(ascending=False).values

In [71]:
from ml.xai.model.explainable import Explainable

In [111]:
ex = Explainable(X_train, models)

In [1]:
attr_algos = ['SHAP']
#ex.get_attr(attr_algos)

In [2]:
#ex.df_scores