In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [3]:
import sys; sys.path.insert(0, '../..') # add parent folder path where lib folder is

In [4]:
import ray
import pandas as pd
from utils import helper, config, rayer
from ml.models.ensemble_v2 import Ensemble
from ml.xai.model.explainable import Explainable


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
from ml.models import common


In [5]:
rayer.get_global_cluster(num_cpus=5)
!ray status --address='raycluster-autoscaler-head-svc.dev.svc.cluster.local:6379'

Node status
---------------------------------------------------------------
Healthy:
 1 head-group
 2 small-group
Pending:
 (no pending nodes)
Recent failures:
 (no failures)

Resources
---------------------------------------------------------------
Usage:
 0.0/5.0 CPU
 0.00/16.764 GiB memory
 0.00/4.909 GiB object_store_memory

Demands:
 {'CPU': 1}: 5+ from request_resources()
[0m

In [3]:
!ray status --address='raycluster-autoscaler-head-svc.dev.svc.cluster.local:6379'

Node status
---------------------------------------------------------------
Healthy:
 1 head-group
 3 small-group
Pending:
 (no pending nodes)
Recent failures:
 (no failures)

Resources
---------------------------------------------------------------
Usage:
 2.0/7.0 CPU
 0.00/24.214 GiB memory
 0.00/7.114 GiB object_store_memory

Demands:
 {'CPU': 1}: 5+ from request_resources()
[0m

In [7]:
# ray.autoscaler.sdk.request_resources(num_cpus=0)

In [6]:
df_X, df_y = helper.get_covid_dataset()
df_X = df_X.drop(['location'], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.33, random_state=config.rand_state)

In [7]:
ss = StandardScaler()

X_train_scalar = pd.DataFrame(ss.fit_transform(X_train.copy()), columns=X_train.columns)
X_test_scalar = pd.DataFrame(ss.fit_transform(X_test.copy()), columns=X_test.columns)


In [8]:
list_base_models = ['BriskXGBoost', 'SlugXGBoost', 'SlugANN', 'SlugRF', 'SlugKNN', 'BriskBagging']

ensemble_set = Ensemble(
                        list_base_models = list_base_models,
                        n_trials=10, ### for all models
                        boosted_round=50, ### for tree models only
                        epochs=15, ### for DNN models

                        ensemble_boosted_round=10, ### for ensemble model which is also a free model
                        ensemble_n_trials=3,
                                )



In [9]:
ensemble_set.fetch_models(X_train_scalar, X_test_scalar, y_train, y_test, threshold=None)

2022-11-23T06:52:56CST : INFO : ensemble_v2 : fetch_models : 108 : Message : Ensemble: starting discovery process for models [<ml.models.base.v2.brisk_xgboost.BriskXGBoost object at 0x7f7d580b5940>, <ml.models.base.v2.slug_xgboost.SlugXGBoost object at 0x7f7d580b4730>, <ml.models.base.v2.slug_ann.SlugANN object at 0x7f7d580b4910>, <ml.models.base.v2.slug_rf.SlugRF object at 0x7f7d580b4880>, <ml.models.base.v2.slug_knn.SlugKNN object at 0x7f7d580b4520>, <ml.models.base.v2.brisk_bagging.BriskBagging object at 0x7f7d580b4670>]
[2m[33m(raylet)[0m [2022-11-23 04:53:00,387 E 74 74] (raylet) worker_pool.cc:1108: Failed to send exit request: GrpcUnavailable: RPC Error message: Connection reset by peer; RPC Error details: 
[2m[36m(__run_discoveries__ pid=581, ip=10.10.48.229)[0m 2022-11-23T04:53:00PST : INFO : brisk_xgboost : __discover_model__ : 109 : Message : brisk_xgb: Starting training for trials:10, boosted rounds: 50, max depth: 10
[2m[36m(__run_discoveries__ pid=581, ip=10.10.48

In [12]:
ensemble_set.base_model_scores

[[-0.780242599582226,
  -0.8618234540916516,
  <ml.models.base.v2.brisk_xgboost.BriskXGBoost at 0x7f7d580b4340>],
 [-0.36896820977009215,
  -0.6666532672303094,
  <ml.models.base.v2.slug_xgboost.SlugXGBoost at 0x7f7d3ebb5460>],
 [0.5712783142765323,
  0.16147673240819727,
  <ml.models.base.v2.slug_ann.SlugANN at 0x7f7d3ebb5730>],
 [0.6471896784078507,
  0.4251536230476787,
  <ml.models.base.v2.slug_rf.SlugRF at 0x7f7d580b4f40>],
 [0.6695206594539054,
  0.5401046443169623,
  <ml.models.base.v2.slug_knn.SlugKNN at 0x7f7d3e83ad90>],
 [0.9506921261413173,
  0.5390826970012106,
  <ml.models.base.v2.brisk_bagging.BriskBagging at 0x7f7d3e7d0070>]]

In [13]:
attr_algos = ['IG', 'SHAP', 'GradientSHAP']

In [14]:
ex = Explainable(ensemble_set, df_X)
ex.get_attr(attr_algos)

2022-11-23T06:55:12CST : INFO : explainable : get_attr : 148 : Message : attribution methods  ['ig', 'shap', 'gradientshap']
2022-11-23T06:55:12CST : INFO : explainable : get_attr : 151 : Message : calculating variable importance on  brisk_xgb
2022-11-23T06:55:13CST : INFO : explainable : get_attr : 151 : Message : calculating variable importance on  slug_xgb
2022-11-23T06:55:13CST : INFO : explainable : get_attr : 151 : Message : calculating variable importance on  slug_ann
2022-11-23T06:55:13CST : INFO : explainable : get_attr : 151 : Message : calculating variable importance on  slug_rf
2022-11-23T06:55:13CST : INFO : explainable : get_attr : 151 : Message : calculating variable importance on  slug_knn
2022-11-23T06:55:13CST : INFO : explainable : get_attr : 151 : Message : calculating variable importance on  brisk_bagging
[2m[36m(__get_ig_attr__ pid=581, ip=10.10.48.229)[0m IPython could not be loaded!
[2m[36m(__get_gs_attr__ pid=526, ip=10.10.105.191)[0m   return torch.from_

Unnamed: 0,brisk_xgb_shap,slug_xgb_shap,slug_ann_ig,slug_ann_shap,slug_ann_gradientshap,slug_rf_shap,slug_knn_shap,brisk_bagging_shap
new_cases_per_million,0.0,0.08579548,3759912.0,1884617.0,3051007.0,763.94664,14.309589,0.530441
population_cov,0.000273,0.5284828,361956300.0,393710100.0,536733100.0,5.718554,39.301761,0.0
life_expectancy_cov,0.0,0.0,420.0456,30.72453,45.08412,1.469238,0.612097,0.0
Population,0.0,0.1901932,472779800.0,515593000.0,702754300.0,12.663485,38.241803,0.0
Area_km2,0.000253,0.001583993,3285954.0,3684905.0,5693634.0,1.066764,28.207985,0.0
Density_km2,0.0,0.001061057,21100.41,8469.975,6852.935,3.000385,0.478778,0.217915
Year_x,0.0,0.0,1499.756,0.0,0.03092911,0.0,0.0,0.0
Meningitis,0.0,0.0,0.0377746,0.04144062,0.7853202,1.300893,0.412961,0.0
Neoplasms,0.0,0.0,0.8472476,0.1947015,0.3114214,18.083464,0.255938,0.041545
"Fire, heat, and hot substances",0.0,0.00103188,0.01955676,0.008841133,0.5811997,0.348636,0.304805,0.0


[2m[36m(__get_shapley_kernel_attr__ pid=526, ip=10.10.105.191)[0m 2022-11-23T04:55:26PST : INFO : _kernel : solve : 549 : Message : np.sum(w_aug) = 39.000000000000014
[2m[36m(__get_shapley_kernel_attr__ pid=526, ip=10.10.105.191)[0m 2022-11-23T04:55:26PST : INFO : _kernel : solve : 550 : Message : np.sum(self.kernelWeights) = 1.0000000000000004
[2m[36m(__get_shapley_kernel_attr__ pid=526, ip=10.10.105.191)[0m The default of 'normalize' will be set to False in version 1.2 and deprecated in version 1.4.
[2m[36m(__get_shapley_kernel_attr__ pid=526, ip=10.10.105.191)[0m If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:
[2m[36m(__get_shapley_kernel_attr__ pid=526, ip=10.10.105.191)[0m 
[2m[36m(__get_shapley_kernel_attr__ pid=526, ip=10.10.105.191)[0m from sklearn.pipeline import make_pipeline
[2m[36m(__get_shapley_kernel_attr__ pid=526, ip=10.10.105.191)[0m 
[2m[36m(__get_shapley_kernel_attr

In [15]:
ex.df_scores

Unnamed: 0,brisk_xgb_shap,slug_xgb_shap,slug_ann_ig,slug_ann_shap,slug_ann_gradientshap,slug_rf_shap,slug_knn_shap,brisk_bagging_shap
new_cases_per_million,0.0,0.08579548,3759912.0,1884617.0,3051007.0,763.94664,14.309589,0.530441
population_cov,0.000273,0.5284828,361956300.0,393710100.0,536733100.0,5.718554,39.301761,0.0
life_expectancy_cov,0.0,0.0,420.0456,30.72453,45.08412,1.469238,0.612097,0.0
Population,0.0,0.1901932,472779800.0,515593000.0,702754300.0,12.663485,38.241803,0.0
Area_km2,0.000253,0.001583993,3285954.0,3684905.0,5693634.0,1.066764,28.207985,0.0
Density_km2,0.0,0.001061057,21100.41,8469.975,6852.935,3.000385,0.478778,0.217915
Year_x,0.0,0.0,1499.756,0.0,0.03092911,0.0,0.0,0.0
Meningitis,0.0,0.0,0.0377746,0.04144062,0.7853202,1.300893,0.412961,0.0
Neoplasms,0.0,0.0,0.8472476,0.1947015,0.3114214,18.083464,0.255938,0.041545
"Fire, heat, and hot substances",0.0,0.00103188,0.01955676,0.008841133,0.5811997,0.348636,0.304805,0.0


In [24]:
ex.df_scores['cols'] = ex.df_scores.index
ex.df_scores.to_csv('df_scores_3.csv', sep=';', index=False)

In [5]:
df = pd.read_csv('df_scores_3.csv', sep=';')

In [6]:
df_scores_ranked = pd.DataFrame()

for col in df:
    if col != 'cols':
        df_scores_ranked[col] = df[col].rank(na_option = 'bottom', ascending=True, method='max', pct=False)
        # df_scores_ranked
        df_scores_ranked.replace(to_replace = df_scores_ranked.min(), value = 0, inplace=True)


In [7]:
#df_scores_ranked = df_scores_ranked.mode(axis=1)[0]

In [8]:
df_scores_ranked['cols'] = df['cols']

In [9]:
df_scores_ranked

Unnamed: 0,brisk_xgb_shap,slug_xgb_shap,slug_ann_ig,slug_ann_shap,slug_ann_gradientshap,slug_rf_shap,slug_knn_shap,brisk_bagging_shap,cols
0,0.0,37.0,38.0,37.0,37.0,40.0,37.0,38.0,new_cases_per_million
1,40.0,40.0,39.0,39.0,39.0,28.0,40.0,0.0,population_cov
2,0.0,0.0,34.0,34.0,34.0,17.0,33.0,0.0,life_expectancy_cov
3,0.0,39.0,40.0,40.0,40.0,31.0,39.0,0.0,Population
4,39.0,29.0,37.0,38.0,38.0,13.0,38.0,0.0,Area_km2
5,0.0,27.0,36.0,36.0,36.0,22.0,26.0,37.0,Density_km2
6,0.0,0.0,35.0,0.0,0.0,0.0,0.0,0.0,Year_x
7,0.0,0.0,12.0,17.0,27.0,15.0,23.0,0.0,Meningitis
8,0.0,0.0,30.0,30.0,11.0,35.0,12.0,35.0,Neoplasms
9,0.0,26.0,9.0,8.0,19.0,7.0,14.0,0.0,"Fire, heat, and hot substances"


In [14]:
res = df_scores_ranked.mode(axis=1, numeric_only=True).mean(axis=1)
res.index = df_scores_ranked['cols']

In [17]:
res.sort_values(ascending=False)

cols
Population                                    40.000000
population_cov                                39.500000
Area_km2                                      38.000000
new_cases_per_million                         37.000000
Density_km2                                   36.000000
Total                                         22.375000
Neoplasms                                     21.666667
Cirrhosis and other chronic liver diseases    21.125000
Lower respiratory infections                  20.500000
life_expectancy_cov                           17.000000
Self-harm                                     13.666667
Neonatal disorders                            12.500000
Road injuries                                 10.000000
Drowning                                       5.500000
Nutritional deficiencies                       2.500000
Diabetes mellitus                              0.000000
Poisonings                                     0.000000
Protein-energy malnutrition                