In [1]:
import sys; sys.path.insert(0, '../..') # add parent folder path where lib folder is

%load_ext autoreload
%autoreload 2

from utils import helper, config, rayer, kaggle_dataset_helper

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from ml.models.ensemble_tune import Ensemble
from ml.xai.model.explainable_tune import Explainable

from sklearn.metrics import make_scorer
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
from ml.models import common

import ray


import time
import pandas as pd
import pyarrow.fs

In [2]:
rayer.get_global_cluster()

In [3]:
def use_tokamat_ds():
    df = helper.get_tokamat_dataset()
    df = df.reset_index()

    df = common.label_encode(df)
    df = df.fillna(-1)

    potential_targets = ['WTOT', 'WTH', 'PLTH']
    y = df[potential_targets[0]]

    X = df[df.columns[~df.columns.isin(potential_targets)]]
    X = X.drop(['TOK_ID', 'LCUPDATE', 'DATE', 'NEL', 'ENBI'], axis = 1)

    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    return train_test_split(X, y, test_size=0.33)


def use_covid_ds():
    df_X, df_y = helper.get_covid_dataset()
    df_X = df_X.drop(['location'], axis = 1)

    return train_test_split(df_X, df_y, test_size=0.33)


def use_transaction_predictions_ds():
    ds_train, ds_test = kaggle_dataset_helper.get_transaction_predictions_dataset()
    ds_train = common.label_encode(ds_train)
    ds_test = common.label_encode(ds_test)

    ds_train = ds_train.fillna(-1)
    ds_test = ds_test.fillna(-1)

    df_X = ds_train.loc[:, ds_train.columns != 'target']
    df_y = ds_train['target']

    return train_test_split(df_X, df_y, test_size=0.33, random_state=config.rand_state)



def use_house_pricing_ds():
    ds_train, ds_test = kaggle_dataset_helper.get_house_prices_dataset()
    ds_train = common.label_encode(ds_train)
    ds_test = common.label_encode(ds_test)

    ds_train = ds_train.fillna(-1)
    ds_test = ds_test.fillna(-1)


    df_X = ds_train.loc[:, ds_train.columns != 'SalePrice']
    df_y = ds_train['SalePrice']

    return train_test_split(df_X, df_y, test_size=0.33, random_state=config.rand_state)


In [4]:
X_train, X_test, y_train, y_test = use_tokamat_ds()

/mnt/c/Users/rwmas/GitHub/xai/python-asd/xai/notebooks/tune_sklearn/../../utils


In [5]:
r2_scoring = make_scorer(score_func=r2_score, greater_is_better=False)

In [None]:
ens_mdl = Ensemble(   
                                xgb_objective='binary:logistic',  # ["reg:squarederror", "count:poisson", "binary:logistic",  "binary:hinge" ]
                                lgbm_objective='binary',    # https://lightgbm.readthedocs.io/en/latest/Parameters.html
                                pred_class='classification',
                                score_func=None,
                                metric_func=None,
                                list_base_models=[],
                                n_trials=10,          ### common param
                                epochs=15,             ### ANN param
                                boosted_round=10,      ### boosting tree param
                                max_depth=30,          ### boosting tree param
                                max_n_estimators=1500, ### rf param
                                n_estimators=30,       ### bagging param, must be > 10 
                                n_neighbors=30,        ### knn param, must be > 5

                                ensemble_n_estimators=30,  ###  must be > 10
                                ensemble_n_trials=10,
                                timeout=None
                 )

In [5]:
# list_base_models = ['briskbagging', 'briskknn', 'briskxgboost', 'slugxgboost', 'sluglgbm','slugrf']

ens_mdl = Ensemble(   
                                xgb_objective='count:poisson',  # ["reg:squarederror", "count:poisson", "binary:logistic",  "binary:hinge" ]
                                lgbm_objective='poisson',    # https://lightgbm.readthedocs.io/en/latest/Parameters.html
                                pred_class='regression',
                                score_func=None,
                                metric_func=None,
                                list_base_models=['sluglgbm', 'briskxgboost'],
                                n_trials=10,          ### common param
                                epochs=15,             ### ANN param
                                boosted_round=10,      ### boosting tree param
                                max_depth=30,          ### boosting tree param
                                max_n_estimators=1500, ### rf param
                                n_estimators=30,       ### bagging param, must be > 10 
                                n_neighbors=30,        ### knn param, must be > 5

                                ensemble_n_estimators=30,  ###  must be > 10
                                ensemble_n_trials=10,
                                timeout=None
                 )

In [6]:
ens_mdl.fetch_models(X_train, X_test, y_train, y_test)

[2m[36m(run pid=653091)[0m 2022-12-07 07:53:49,749	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'cv': KFold(n_splits=3, random_state=None, shuffle=False), 'early_stop_type': <EarlyStopping.NO_EARLY_STOP: 7>, 'scoring/score': <function _passthrough_scorer at 0x7f2359b15b80>}
[2m[36m(run pid=653091)[0m 2022-12-07 07:53:53,557	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'cv': KFold(n_splits=3, random_state=None, shuffle=False), 'early_stop_type': <EarlyStopping.NO_EARLY_STOP: 7>, 'scoring/score': <function _passthrough_scorer at 0x7f2359b15b80>}
[2m[36m(run pid=653091)[0m 2022-12-07 07:53:57,531	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'cv': KFold(n_splits=3, random_state=None, shuffle=False), 'early_stop_type': <EarlyStopping.NO_EARLY_STOP: 7>, 'scoring/score': <function _passthrough_scorer at 0x7f2359b15b

In [7]:
len(ens_mdl.base_models)

2

In [8]:
# ens_mdl.base_models[0].gs.best_estimator

In [9]:
attr_algos = ['IG', 'SHAP', 'GradientSHAP'] #, 'knockoff']

In [10]:
ex = Explainable(ens_mdl, X_train)             

In [11]:
ex.get_attr(attr_algos)

2022-12-07T08:54:24CET : INFO : explainable_tune : get_attr : 153 : Message : attribution methods  ['ig', 'shap', 'gradientshap']
2022-12-07T08:54:24CET : INFO : explainable_tune : get_attr : 156 : Message : calculating variable importance on  slug_lgbm
2022-12-07T08:54:25CET : INFO : explainable_tune : get_attr : 156 : Message : calculating variable importance on  brisk_xgboost
[2m[36m(__get_shapley_ensemble_attr__ pid=657056)[0m ntree_limit is deprecated, use `iteration_range` or model slicing instead.


cols
IP         62.5
index      58.5
TIME_ID    53.5
RGEO       31.5
SHOT       31.0
           ... 
BGASZ       0.0
BGASA       0.0
NELFORM     0.0
DNELDT      0.0
TOK         0.0
Length: 64, dtype: float64

In [38]:
ens_mdl.base_models[1].gs.best_estimator

KNeighborsRegressor(algorithm='kd_tree', n_neighbors=7, weights='distance')

In [29]:
ens_mdl.ensemble.gs.best_estimator

BaggingRegressor(n_estimators=27)

In [None]:
ens_mdl.scores

[0.907852251809679, 0.9023357075419954]

In [39]:
ens_mdl.predict(X_test)

array([241357.14285714, 164032.85714286, 126121.42857143, 197900.        ,
        94500.        , 122950.        , 271900.        , 179028.57142857,
       525315.28571429, 104935.71428571, 169964.28571429, 241321.42857143,
       218753.57142857, 124928.57142857, 141285.71428571, 145000.        ,
       232383.57142857,  84214.28571429, 148285.71428571, 149071.42857143,
       169392.85714286, 137500.        , 124557.14285714, 174064.28571429,
       197628.57142857, 161517.85714286, 150953.57142857,  82428.57142857,
       266268.07142857, 143885.71428571, 246229.85714286, 193135.71428571,
       129500.        , 242035.71428571, 267855.64285714, 194921.42857143,
       177570.35714286, 126642.85714286, 203685.71428571, 302335.71428571,
       243500.        , 127428.57142857, 172381.07142857, 284242.07142857,
       304321.42857143, 218815.71428571, 127464.28571429, 133857.14285714,
       176000.        , 106000.        , 284785.71428571, 146142.85714286,
       154678.57142857,  

2022-12-06 22:31:32,381	ERROR dataclient.py:323 -- Unrecoverable error in data channel.
