In [None]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all imported modules before each cell execution.
%autoreload 2

In [None]:
# !pip install setuptools wheel -U
# !pip install torch -U
# !pip install scikit-multiflow
# !pip install setuptools
# !pip install tqdm
# !pip install kneefinder
# %pip install numpy_indexed
# %pip install river
# %pip install python_nameof
# %pip install jsonpickle
# !pip install capymoa
# %pip install shap
# %pip install pyyaml

## 1. Imports and low-level configuration

In [None]:
import numpy as np
import numpy_indexed as npi
import pandas as pd
import time
import os

import river
from river.drift.binary import DDM, EDDM
from river.drift import ADWIN, PageHinkley

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from matplotlib.ticker import AutoMinorLocator

from ExperimentControlFlow import *

# Report how many CPUs the kernel can see.
num_cpus = os.cpu_count()
print(f"Number of CPUs: {num_cpus}")

# Cell output: library versions plus a timestamp, recorded for provenance.
(
    np.__version__,
    pd.__version__,
    time.strftime("%D %H:%M:%S", time.localtime()),
)

## 2. Execute experiment


In [None]:
# Number of samples in the warm-up window (this will have to vary by model tested).
WARM_UP_WINDOW_SIZE = 1_000
# Inspector window size; treated as a hyperparameter.
INSPECTOR_WINDOW_SIZE = 500
# Memory size; treated as a hyperparameter.
MEMORY_SIZE = 15

# Accumulator for run results.
results_of_runs = list()

In [None]:
# Each entry is (result-name prefix, flags). The experiment loop reads the flags
# positionally as:
#   (use_SHAP_to_cluster, combine_SHAP_with_point, perform_clustering,
#    scale_for_clustering, use_model_pred)
cust_params_list_standard = [
    ("SHAP_noScale_noModel", (True, True, True, False, False)),
    ("SHAP_noScale_Model", (True, True, True, False, True)),
]

# cust_params_list_noSHAP = [("NoCluster_noScale",(False,False,False,False,False)),("DBSCAN_Scale_noModel",(False,False,True,True,False)),
#     ("DBSCAN_noScale_Model",(False,False,True,False,True)),("DBSCAN_Scale_Model",(False,False,True,True,True))]    
# cust_params_list_SHAP_noPoint = [("SHAP_noScale_noModel",(True,False,True,False,False)),("SHAP_Scale_noModel",(True,False,True,True,False)),
#     ("SHAP_noScale_Model",(True,False,True,False,True)),("SHAP_Scale_Model",(True,False,True,True,True))]
# cust_params_list_SHAP_Point = [("SHAP_noScale_noModel",(True,True,True,False,False)),("SHAP_Scale_noModel",(True,True,True,True,False)),
#     ("SHAP_noScale_Model",(True,True,True,False,True)),("SHAP_Scale_Model",(True,True,True,True,True))]

In [None]:
# Directory holding the datasets; file names below are appended to it.
path_dir = "../datasets/chocolate/"

# Each entry pairs a dataset file name with its list of reference drift positions.
path_drift_set = [
    (
        "nonlinear_abrupt_chocolaterotation_multi.csv",
        [5000, 10000, 15000, 20000],
    ),
    (
        "nonlinear_abrupt_chocolaterotation_noise_and_redunce_binary.csv",
        [4800, 9600, 14400, 19200],
    ),
    (
        "nonlinear_sudden_chocolaterotation_binary.csv",
        [5000, 10000, 15000, 20000],
    ),
    (
        "nonlinear_sudden_chocolaterotation_noise_multi.csv",
        [4800, 9600, 14400, 19200],
    ),
]

In [None]:
# Human-readable timestamp shown as the cell output, marking when the experiment
# section was started. `time` is already imported in the notebook's import cell,
# so it is not re-imported here.
time.ctime()

# Solely supervised

In [None]:
# Execute ad-hoc experiment.
#
# Sweeps every clustering configuration x training direction x oracle-compliance
# level x dataset. Each combination is handed to combined_run for `run_count`
# repetitions, its outcomes are passed to score_outcomes together with the
# reference drift positions, and the path returned by
# serialise_aggregate_results is collected in `res_out1` for the next cell.
DO_AVG=True
limit_size=75000                 # passed to combined_run as max_size
oracle_compliance_list=[.0015]
inspector_curiosity = .01
run_count=5                      # repetitions per configuration

seeds = list(range(1000))        # only the first `run_count` seeds are used

# from sklearn.svm import SVC

if DO_AVG:
    # Before running the experiment, check disk space
    import shutil
    total, used, free = shutil.disk_usage(".")
    print(f"Disk space: {free // (2**20)} MB free")
    if free < 100 * 2**20:  # less than 100MB free
        raise OSError("Not enough disk space to run the experiment. Please free up space and try again.")


    res_out1 = []

    # model_reference = RandomForestClassifier(random_state=42)
    # NOTE(review): this single estimator instance is shared by every run below —
    # confirm that combined_run clones or refits it rather than carrying state over.
    model_object = RandomForestClassifier(random_state=42)

    for prefix, conf in cust_params_list_standard:
        for train_forward in [True,False]:
            for oracle_compliance in oracle_compliance_list:
                for path, drift_golden_source in path_drift_set:
                    full_path = path_dir+path
                    # A fresh detector is built for every configuration.
                    drift_detector = river.drift.PageHinkley()
                    # Unpack the positional flags of the configuration tuple into named options.
                    clustering_params_list={"weight":1,"use_SHAP_to_cluster":conf[0],"combine_SHAP_with_point":conf[1],
                                            "perform_clustering":conf[2],"scale_for_clustering":conf[3],"use_model_pred":conf[4]}


                    run_config = {"file_path":full_path,"training_size":WARM_UP_WINDOW_SIZE,"memory_size":MEMORY_SIZE,"insp_window_size":INSPECTOR_WINDOW_SIZE,
                                "cluster_method":None,"model_object":model_object, "drift_detector":drift_detector, "keep_known_labels":False,
                                "inspector_curiosity_factor":inspector_curiosity, "oracle_compliance":oracle_compliance, "inspector_model":RandomForestClassifier(n_estimators=100,random_state=42),#KNeighborsClassifier()#
                                "clustering_params_list":clustering_params_list, "train_forward":train_forward, "run_quiet":False, "dont_propagate":False}

                    forw_back_pref = ("forw" if run_config["train_forward"] else "back")

                    outcomes = combined_run(run_config,max_size=limit_size,skip_initial=0,prefix=f"{limit_size}_{forw_back_pref}_{prefix}",repetitions=run_count,random_states=seeds[:run_count],save=True)

                    # Keep only the reference drifts that fall below the size limit given to
                    # combined_run. The cap uses this cell's `limit_size` (the notebook defines
                    # no LIMIT_DATASET_SIZE) and the capped list is what gets scored.
                    drift_golden_source_capped = [drift for drift in drift_golden_source if drift < limit_size]
                    agg_res = score_outcomes(outcomes,drift_golden_source_capped,skip_initial=0,train_forward=run_config["train_forward"],train_size=WARM_UP_WINDOW_SIZE,mem_size=MEMORY_SIZE)
                    # First element of each outcome is recorded under "files".
                    agg_res["files"] = [outcome[0] for outcome in outcomes]
                    print(f"\ncumultive results: {agg_res}")

                    saved_file = serialise_aggregate_results(run_config,agg_res,f"{forw_back_pref}_{prefix}")
                    print(f"aggregate_file={saved_file}\n\n")
                    res_out1.append(saved_file)


    print(f"Results gathered\n{res_out1}")



In [None]:
# Combine the aggregate result files gathered in `res_out1`, passing
# "SHAP_4.csv" as the save name and leaving drift re-scoring switched off.
aggregate_results_to_df(
    res_out1,
    save_as_name="SHAP_4.csv",
    rescore_drift=False,
)