In [2]:
# prologue
import numpy as np
from pandas import Series

import forest_surveyor.reproducible as rp
# from forest_surveyor.experiments import grid_experiment_mp
# from forest_surveyor.routines import *

In [4]:
# Several datasets are available as pre-prepared containers.
# A container holds the data together with some meta-data;
# pre-processing this meta-data once saves time later on.
# Any dataset can be turned into a container by invoking the constructor,
# which is found in the file structures.py.

# rp.datasets is a list of constructors with pre-configured parameters
# and data that is stored locally in the package.
rp.datasets

[<function forest_surveyor.datasets.adult_small_samp_data>,
 <function forest_surveyor.datasets.bankmark_samp_data>,
 <function forest_surveyor.datasets.car_data>,
 <function forest_surveyor.datasets.cardio_data>,
 <function forest_surveyor.datasets.credit_data>,
 <function forest_surveyor.datasets.german_data>,
 <function forest_surveyor.datasets.lending_tiny_samp_data>,
 <function forest_surveyor.datasets.nursery_samp_data>,
 <function forest_surveyor.datasets.rcdv_samp_data>]

In [None]:
# Single-run pipeline: build one dataset container, split it, tune and train
# a random forest, then evaluate it on the held-out split.

# helpers for now
# The constructor list is only reachable as rp.datasets here; a bare
# `datasets` name does not exist on a fresh kernel.
dataset = rp.datasets[0]
project_dir = 'V:\\whiteboxing'
random_state = 123
override_tuning = True
add_trees = 0
eval_model = False

# general data preparation
mydata = dataset(random_state=random_state, project_dir=project_dir)

# train test split - one off hard-coded
# the random_state variable is not used here: the split stays fixed at 123
# while the random state is varied on each forest
train_index, test_index = mydata.get_tt_split_idx(random_state=123)
tt = mydata.tt_split_by_idx(train_index, test_index).to_dict()

# save these for use in R and G-REX
# The target path is passed positionally because the keyword was renamed
# from `path` to `path_or_buf`; header=False pins the header-less layout
# that used to be the Series.to_csv default.
Series(train_index).to_csv(mydata.get_save_path() + 'train_index.csv', index=False, header=False)
Series(test_index).to_csv(mydata.get_save_path() + 'test_index.csv', index=False, header=False)

################ PARAMETER TUNING ###################
############ Only runs when required ################
#####################################################

# NOTE(review): tune_rf, train_rf and evaluate_model are not imported in any
# visible cell - presumably they came from the commented-out
# `from forest_surveyor.routines import *` in the prologue; confirm and
# import them explicitly.
best_params = tune_rf(tt['X_train_enc'], tt['y_train'],
                      save_path=mydata.get_save_path(),
                      random_state=mydata.random_state,
                      override_tuning=override_tuning)

#####################################################

# update best params according to experimental design
best_params['n_estimators'] = best_params['n_estimators'] + add_trees

# train a rf model
rf, enc_rf = train_rf(X=tt['X_train_enc'], y=tt['y_train'],
                      best_params=best_params,
                      encoder=tt['encoder'],
                      random_state=mydata.random_state)

# Evaluate on the test split. The metrics are always computed; eval_model
# only switches the two confusion-matrix plots on or off, so one call
# replaces the previous pair of near-identical branches.
class_labels = mydata.get_label(mydata.class_col, list(range(len(mydata.class_names)))).tolist()
cm, acc, coka, prfs = evaluate_model(prediction_model=enc_rf, X=tt['X_test'], y=tt['y_test'],
                                     class_names=class_labels,
                                     plot_cm=bool(eval_model), plot_cm_norm=bool(eval_model))



In [None]:
# Experiment grid: one entry per run, laid out as
#   forest size (6 levels, slowest) x random seed (30) x dataset (fastest).
# Every array below has exactly n_runs elements so they can be zipped together.
#
# All sizes are derived from rp.datasets rather than from the `datasets` name
# that this cell rebinds at the end, so the cell works on a fresh kernel and
# gives the same result when it is re-run.
n_datasets = len(rp.datasets)
n_seeds = 30
n_tree_steps = 6
n_runs = n_tree_steps * n_seeds * n_datasets  # 180 runs per dataset

# seeds 123..152, each held for one pass over the datasets, and the whole
# seed block repeated once per forest size (previously tiled 180 times, which
# made this array 30x longer than all the others)
random_state = np.tile(np.repeat([i + 123 for i in range(n_seeds)], n_datasets), n_tree_steps)
# 0, 200, ..., 1000 extra trees; each level covers every seed/dataset pair
add_trees = np.repeat([i * 200 for i in range(n_tree_steps)], n_datasets * n_seeds)
# True for the first forest-size level only, False for the remaining levels
override_tuning = np.concatenate((np.tile(np.array([True]), n_seeds * n_datasets),
                                  np.tile(np.array([False]), (n_tree_steps - 1) * n_seeds * n_datasets)))

# settings held constant across the whole grid
alpha_scores = np.tile([0.5], n_runs)
alpha_paths = np.tile([0.5], n_runs)
support_paths = np.tile([0.05], n_runs)
run_anchors = np.tile([False], n_runs)
precis_threshold = np.tile([0.95], n_runs)
n_instances = np.tile([500], n_runs)
n_batches = np.tile([1], n_runs)
eval_model = np.tile([False], n_runs)
which_trees = np.tile(['majority'], n_runs)
disc_path_bins = np.tile([4], n_runs)
disc_path_eqcounts = np.tile([False], n_runs)
# half-open interval [min seed, max seed + 1) spanning every seed used above
iv_low = np.tile([random_state.min()], n_runs)
iv_high = np.tile([random_state.max() + 1], n_runs)
weighting = np.tile(['chisq'], n_runs)
greedy = np.tile(['precision'], n_runs)
forest_walk_async = np.tile([True], n_runs)
chirps_explanation_async = np.tile([True], n_runs)
project_dir = np.tile(['V:\\whiteboxing'], n_runs)
save_rule_accs = np.tile([False], n_runs)

# dataset constructors cycle fastest: one full pass per (forest size, seed) pair
datasets = np.tile(rp.datasets, n_tree_steps * n_seeds)


In [None]:
# scalar settings for one interactive run
project_dir = 'V:\\whiteboxing'
random_state, add_trees = 123, 0
override_tuning, eval_model = True, False

# take the first constructor from the datasets collection
dataset = datasets[0]

# general data preparation: build the container for that dataset
mydata = dataset(project_dir=project_dir, random_state=random_state)


In [None]:
# short aliases: mkdir is os.makedirs, pth is pathlib.Path
from os import makedirs as mkdir
from pathlib import Path as pth
# show the container's save path as a Path object (cell output)
pth(mydata.get_save_path())

In [None]:
# Make sure the save directory exists. mkdir is os.makedirs here, so missing
# parent directories are created too; exist_ok=True makes the call a no-op
# when the directory is already there, which removes the check-then-create
# race and keeps the cell safe to re-run.
mkdir(mydata.get_save_path(), exist_ok=True)