In [1]:
import subprocess
import time

In [2]:
from DatasetManager import DatasetManager

  from ._conv import register_converters as _register_converters
Using Theano backend.


# Hyperparameter optimization with unstructured data

In [6]:
# Datasets swept by the unstructured-data hyperparameter search below;
# one batch of jobs is generated per entry, in this order.
datasets = [
    "dc",
    "crm2",
    "github",
]

In [7]:
# Search grid: every combination of the three lists below yields one job
# (or one job per prefix length when the bucketing is "prefix_index").
bucket_encs = [
    "single_index",
]
text_method_encs = [
    "bong_agg",
]
cls_methods = [
    "xgboost",
    "rf",
]

# Bare string literal, not executed as code. Being the last expression of the
# cell, it is echoed back as the cell's output.
"""
bucket_methods = ["single"]
cls_encodings = ["laststate"]
text_methods = ["bong"]
cls_methods = ["xgboost"]
"""

'\nbucket_methods = ["single"]\ncls_encodings = ["laststate"]\ntext_methods = ["bong"]\ncls_methods = ["xgboost"]\n'

In [8]:
# Python script that each generated SLURM job runs for the unstructured-data sweep.
experiments_filename = (
    "experiments_optimize_params_with_unstructured_data.py"
)

In [9]:
# Generate and submit one SLURM job per
# (dataset, bucketing/encoding, text encoding, classifier) combination.
# With "prefix_index" bucketing, a separate job is submitted for every prefix
# length from min_prefix_length up to a dataset-specific maximum; otherwise a
# single job is submitted for the bucketing method as-is.
for dataset in datasets:
    # Value written to "#SBATCH --mem": "dc" requests less than the other datasets.
    memory = 30000 if dataset == "dc" else 50000

    for bucket_enc in bucket_encs:

        for text_method_enc in text_method_encs:

            for cls_method in cls_methods:

                # Number of search iterations passed to the experiment script:
                # a base that depends on the text encoding plus an increment
                # that depends on the classifier.
                n_iter = 30 if ("bong" in text_method_enc or "nb" in text_method_enc) else 20
                n_iter += 10 if cls_method in ["rf", "logit"] else 50

                min_prefix_length = 1
                if bucket_enc == "prefix_index":
                    if dataset == "github":
                        max_prefix_length = 7
                    elif dataset == "crm2":
                        max_prefix_length = 33
                    else:
                        # read the data
                        dataset_manager = DatasetManager(dataset)
                        data = dataset_manager.read_dataset()

                        # determine the maximum (truncated) prefix length
                        if "traffic_fines" in dataset:
                            max_prefix_length = 10
                        elif "bpic2017" in dataset:
                            max_prefix_length = min(20, dataset_manager.get_pos_case_length_quantile(data, 0.90))
                        else:
                            max_prefix_length = min(40, dataset_manager.get_pos_case_length_quantile(data, 0.90))

                    # One method name (and hence one job) per prefix length.
                    cur_method_names = ["%s_%s" % (bucket_enc, nr_events)
                                        for nr_events in range(min_prefix_length, max_prefix_length + 1)]
                else:
                    cur_method_names = [bucket_enc]

                # Single code path for writing and submitting the job script, so
                # the prefix and non-prefix cases cannot drift apart.
                for cur_method_name in cur_method_names:
                    output_filename = "output_files/output_%s_%s_%s_%s.txt" % (dataset, cur_method_name, text_method_enc, cls_method)
                    script_filename = "script_files/%s_%s_%s_%s.sh" % (dataset, cur_method_name, text_method_enc, cls_method)

                    with open(script_filename, "w") as fout:
                        fout.write("#!/bin/bash\n")
                        fout.write("#SBATCH --output=%s\n" % (output_filename))
                        fout.write("#SBATCH --mem=%s\n" % memory)
                        fout.write("#SBATCH --ntasks=4\n")
                        fout.write("#SBATCH --time=7-00\n")

                        fout.write("python %s %s %s %s %s %s" % (experiments_filename,
                                                                 dataset,
                                                                 cur_method_name,
                                                                 text_method_enc,
                                                                 cls_method,
                                                                 n_iter))

                    # Pause between submissions, then hand the script to sbatch.
                    # The command is passed as an argv list so a script path
                    # containing whitespace is not split into several arguments.
                    time.sleep(1)
                    subprocess.Popen(["sbatch", script_filename])


# Hyperparameter optimization for LSTM

In [15]:
# Settings for the LSTM hyperparameter search.
method_names = ["lstm"]
cls_methods = ["lstm"]
results_dir = "val_results_lstm"
n_iter = 30

# `datasets` is assigned three times; only the final assignment is in effect
# when the next cell runs. The earlier lists are kept as alternative selections.
datasets = [
    "sepsis_cases_1", "production",  # "sepsis_cases_2",
    "sepsis_cases_4",
    "traffic_fines_1",
    "bpic2012_declined", "bpic2012_accepted",  # "bpic2012_cancelled",
    "bpic2017_accepted", "bpic2017_cancelled", "bpic2017_refused",
    "hospital_billing_3",
]
datasets = [
    "bpic2011_f1", "bpic2011_f2", "bpic2011_f3", "bpic2011_f4",
    "bpic2015_1_f2", "bpic2015_2_f2", "bpic2015_3_f2", "bpic2015_4_f2", "bpic2015_5_f2",
    "hospital_billing_2",
    "insurance_activity", "insurance_followup",
    "unemployment",
]
datasets = [
    "dc",
    "github",
    "crm2",
]

In [17]:
# Write one GPU job script per (dataset, method, classifier) for the LSTM
# hyperparameter search. Submission is currently disabled (see the
# commented-out lines at the end), so this cell only generates the files.
for dataset_name in datasets:
    # Value for "#SBATCH --mem", chosen from the dataset name.
    wants_max_memory = (
        dataset_name in ["dc", "github", "crm2"]
        or "hospital_billing" in dataset_name
        or "bpic2017" in dataset_name
    )
    if wants_max_memory:
        memory = 50000
    elif any(tag in dataset_name for tag in ("traffic", "unemployment")):
        memory = 30000
    elif "bpic" in dataset_name:
        memory = 20000
    else:
        memory = 10000

    for method_name in method_names:
        for cls_method in cls_methods:
            job_key = (dataset_name, method_name, cls_method)
            params = " ".join([dataset_name, method_name, cls_method, str(n_iter)])
            script_file = "script_files_lstm/run_%s_%s_%s.sh" % job_key

            # Assemble the whole script first, then write it in one call.
            script_lines = [
                "#!/bin/bash\n",
                "#SBATCH --partition=gpu\n",
                "#SBATCH --gres=gpu:1\n",
                "#SBATCH --output=output_files/output_%s_%s_%s.txt\n" % job_key,
                "#SBATCH --mem=%s\n" % memory,
                "#SBATCH --time=7-00\n",
                "source activate cuda8_env\n",
                "python experiments_optimize_params_lstm.py %s" % params,
            ]
            with open(script_file, "w") as fout:
                fout.writelines(script_lines)

            #time.sleep(1)
            #subprocess.Popen(("sbatch %s" % script_file).split())

# Hyperparameter optimization for inter-run stability

In [10]:
# Settings for the inter-run stability hyperparameter search.
method_names = ["single_agg"]
cls_methods = ["rf"]
n_runs = 2
alpha = 1

# `datasets` is assigned three times; only the final assignment is in effect
# when the next cell runs.
datasets = [
    "bpic2011_f1", "bpic2011_f2", "bpic2011_f3", "bpic2011_f4",
    "bpic2015_1_f2", "bpic2015_2_f2", "bpic2015_3_f2", "bpic2015_4_f2", "bpic2015_5_f2",
    "insurance_activity", "insurance_followup",
]
datasets = [
    "bpic2011_f4",
    "bpic2015_1_f2",
    "bpic2015_2_f2",
    "insurance_followup",
]
datasets = [
    "hospital_billing_2",
    "hospital_billing_3",
]

In [11]:
# Generate and submit one SLURM job per (dataset, method, classifier, beta)
# for the multi-run (inter-run stability) hyperparameter search.

# Search iterations per classifier. The reduced table is used for datasets whose
# name contains "hospital", "bpic2017" or "crm2".
n_iter_default = {"rf": 10, "xgboost": 50}
n_iter_reduced = {"rf": 8, "xgboost": 20}

# Fail before anything is submitted if a classifier has no iteration budget.
# Without this check `n_iter` would silently keep whatever value an earlier
# iteration or cell left behind (or be undefined on a fresh kernel).
unsupported_cls_methods = [c for c in cls_methods if c not in n_iter_default]
if unsupported_cls_methods:
    raise ValueError("No n_iter defined for classifier(s): %s" % ", ".join(unsupported_cls_methods))

for dataset_name in datasets:
    is_unstructured = dataset_name in ["dc", "crm2", "github"]
    if is_unstructured:
        experiments_filename = "experiments_optimize_params_single_multirun_unstructured.py"
    else:
        experiments_filename = "experiments_optimize_params_single_multirun.py"

    # Value written to "#SBATCH --mem".
    if (is_unstructured or "hospital_billing" in dataset_name
            or "bpic2017" in dataset_name or "traffic" in dataset_name):
        memory = 50000
    else:
        memory = 20000

    use_reduced_budget = ("hospital" in dataset_name or "bpic2017" in dataset_name
                          or "crm2" in dataset_name)

    for method_name in method_names:
        for cls_method in cls_methods:

            if use_reduced_budget:
                n_iter = n_iter_reduced[cls_method]
            else:
                n_iter = n_iter_default[cls_method]

            for beta in [0]:
                params_dir = "val_results_auc1_rmspd%s" % beta

                params = " ".join([dataset_name, method_name, cls_method, str(n_runs), str(n_iter), str(alpha), str(beta), params_dir])
                script_file = "script_files/run_%s_%s_%s_%s_%s.sh" % (dataset_name, method_name, cls_method, alpha, beta)
                print(script_file)
                with open(script_file, "w") as fout:
                    fout.write("#!/bin/bash\n")
                    fout.write("#SBATCH --output=output_files/output_%s_%s_%s_%s_%s.txt\n" % (dataset_name, method_name,
                                                                                              cls_method, alpha, beta))
                    fout.write("#SBATCH --mem=%s\n" % memory)
                    fout.write("#SBATCH --ntasks=4\n")
                    fout.write("#SBATCH --time=7-00\n")

                    fout.write("python %s %s" % (experiments_filename, params))

                # Pause between submissions; argv is passed as a list so the
                # script path is never split on whitespace.
                time.sleep(1)
                subprocess.Popen(["sbatch", script_file])

script_files/run_hospital_billing_2_single_agg_rf_1_0.sh
script_files/run_hospital_billing_3_single_agg_rf_1_0.sh


# Hyperparameter optimization (regular)

In [3]:
from DatasetManager import DatasetManager

In [4]:
# Settings for the regular hyperparameter search.
method_names = [
    "single_agg",
]
cls_methods = [
    "xgboost",
]
datasets = [
    "unemployment",
]

# Python script that each generated job runs.
experiments_filename = "experiments_optimize_params.py"

In [5]:
# Generate and submit one SLURM job per (dataset, classifier, method) for the
# regular hyperparameter search. When the method name contains "prefix_index",
# a separate job is submitted for every prefix length from min_prefix_length
# up to a dataset-specific maximum.

# Search iterations per classifier.
n_iter_by_cls_method = {"rf": 10, "xgboost": 50}

# Fail before anything is submitted if a classifier has no iteration budget.
# Without this check `n_iter` would silently keep whatever value an earlier
# iteration or cell left behind (or be undefined on a fresh kernel).
unsupported_cls_methods = [c for c in cls_methods if c not in n_iter_by_cls_method]
if unsupported_cls_methods:
    raise ValueError("No n_iter defined for classifier(s): %s" % ", ".join(unsupported_cls_methods))

for dataset_name in datasets:
    # Value written to "#SBATCH --mem".
    if any(tag in dataset_name for tag in ("hospital_billing", "bpic2017", "unemployment",
                                           "traffic", "crm2", "github")):
        memory = 50000
    else:
        memory = 20000

    for cls_method in cls_methods:

        n_iter = n_iter_by_cls_method[cls_method]

        for method_name in method_names:
            min_prefix_length = 1
            if "prefix_index" in method_name:
                if dataset_name == "github":
                    max_prefix_length = 7
                elif dataset_name == "crm2":
                    max_prefix_length = 33
                else:
                    # read the data
                    dataset_manager = DatasetManager(dataset_name)
                    data = dataset_manager.read_dataset()

                    # determine the maximum (truncated) prefix length
                    if "traffic_fines" in dataset_name:
                        max_prefix_length = 10
                    elif "bpic2017" in dataset_name:
                        max_prefix_length = min(20, dataset_manager.get_pos_case_length_quantile(data, 0.90))
                    else:
                        max_prefix_length = min(40, dataset_manager.get_pos_case_length_quantile(data, 0.90))

                # One method name (and hence one job) per prefix length.
                cur_method_names = ["%s_%s" % (method_name, nr_events)
                                    for nr_events in range(min_prefix_length, max_prefix_length + 1)]
            else:
                cur_method_names = [method_name]

            # Single code path for writing and submitting the job script, so
            # the prefix and non-prefix cases cannot drift apart.
            for cur_method_name in cur_method_names:
                params = " ".join([dataset_name, cur_method_name, cls_method, str(n_iter)])
                script_file = "script_files/run_%s_%s_%s.sh" % (dataset_name, cur_method_name, cls_method)
                print(script_file)
                with open(script_file, "w") as fout:
                    fout.write("#!/bin/bash\n")
                    fout.write("#SBATCH --output=output_files/output_%s_%s_%s.txt\n" % (dataset_name, cur_method_name,
                                                                                        cls_method))
                    fout.write("#SBATCH --mem=%s\n" % memory)
                    fout.write("#SBATCH --ntasks=4\n")
                    fout.write("#SBATCH --time=7-00\n")

                    fout.write("python %s %s" % (experiments_filename, params))

                # Pause between submissions; argv is passed as a list so the
                # script path is never split on whitespace.
                time.sleep(1)
                subprocess.Popen(["sbatch", script_file])



script_files/run_unemployment_single_agg_xgboost.sh


# Final results (regular)

In [18]:
# Settings for the final-results runs on the regular datasets.
method_names = [
    "prefix_index",
    "prefix_agg",
]
cls_methods = ["xgboost"]
datasets = [
    "production",
    "sepsis_cases_4",
    "traffic_fines_1",
    "bpic2012_accepted",
    "bpic2012_declined",
    "bpic2011_f4",
    "bpic2015_2_f2",
    "insurance_activity",
    "bpic2017_refused",
    "hospital_billing_3",
]
# Passed through unchanged as the last command-line argument of each job.
truncate = "nottrunc"

# Python script that each generated job runs.
experiments_filename = "experiments.py"

In [19]:
# Generate and submit one SLURM job per (dataset, classifier, method) for the
# final-results runs.
for dataset_name in datasets:
    # Value for "#SBATCH --mem": 30000 when the dataset name contains any of
    # the listed tags, 20000 otherwise.
    needs_more_memory = any(
        tag in dataset_name
        for tag in ("hospital_billing", "bpic2017", "traffic", "crm2")
    )
    memory = 30000 if needs_more_memory else 20000

    for cls_method in cls_methods:

        for method_name in method_names:

            job_key = (dataset_name, method_name, cls_method, truncate)
            params = " ".join(job_key)
            script_file = "script_files/run_final_%s_%s_%s_%s.sh" % job_key
            print(script_file)

            # Assemble the whole script first, then write it in one call.
            script_lines = [
                "#!/bin/bash\n",
                "#SBATCH --output=output_files/output_final_%s_%s_%s_%s.txt\n" % job_key,
                "#SBATCH --mem=%s\n" % memory,
                "#SBATCH --ntasks=4\n",
                "#SBATCH --time=7-00\n",
                "python %s %s" % (experiments_filename, params),
            ]
            with open(script_file, "w") as fout:
                fout.writelines(script_lines)

            # Pause between submissions, then hand the script to sbatch.
            time.sleep(1)
            sbatch_argv = ("sbatch %s" % script_file).split()
            subprocess.Popen(sbatch_argv)



script_files/run_final_production_prefix_index_xgboost_nottrunc.sh
script_files/run_final_production_prefix_agg_xgboost_nottrunc.sh
script_files/run_final_sepsis_cases_4_prefix_index_xgboost_nottrunc.sh
script_files/run_final_sepsis_cases_4_prefix_agg_xgboost_nottrunc.sh
script_files/run_final_traffic_fines_1_prefix_index_xgboost_nottrunc.sh
script_files/run_final_traffic_fines_1_prefix_agg_xgboost_nottrunc.sh
script_files/run_final_bpic2012_accepted_prefix_index_xgboost_nottrunc.sh
script_files/run_final_bpic2012_accepted_prefix_agg_xgboost_nottrunc.sh
script_files/run_final_bpic2012_declined_prefix_index_xgboost_nottrunc.sh
script_files/run_final_bpic2012_declined_prefix_agg_xgboost_nottrunc.sh
script_files/run_final_bpic2011_f4_prefix_index_xgboost_nottrunc.sh
script_files/run_final_bpic2011_f4_prefix_agg_xgboost_nottrunc.sh
script_files/run_final_bpic2015_2_f2_prefix_index_xgboost_nottrunc.sh
script_files/run_final_bpic2015_2_f2_prefix_agg_xgboost_nottrunc.sh
script_files/run_final

# Final results (unstructured)

In [5]:
# Settings for the final-results runs on the unstructured datasets.
method_names = [
    "single_index",
]
text_method_encs = [
    "bong_agg",
]
datasets = [
    "dc",
]
cls_methods = [
    "xgboost",
]

# Python script that each generated job runs; the commented-out line is an
# alternative script.
experiments_filename = "experiments_with_unstructured_data.py"
#experiments_filename = "experiments_with_unstructured_data_fixed_params.py"

In [6]:
# Generate and submit one SLURM job per
# (dataset, classifier, method, text encoding) for the final unstructured runs.
for dataset_name in datasets:
    # Value for "#SBATCH --mem".
    memory = 50000 if ("github" in dataset_name or "crm2" in dataset_name) else 25000

    for cls_method in cls_methods:

        for method_name in method_names:

            for text_method in text_method_encs:

                job_key = (dataset_name, method_name, text_method, cls_method)
                params = " ".join(job_key)
                script_file = "script_files/run_final_%s_%s_%s_%s.sh" % job_key
                print(script_file)

                # Assemble the whole script first, then write it in one call.
                script_lines = [
                    "#!/bin/bash\n",
                    "#SBATCH --output=output_files/output_final_%s_%s_%s_%s.txt\n" % job_key,
                    "#SBATCH --mem=%s\n" % memory,
                    "#SBATCH --ntasks=4\n",
                    "#SBATCH --time=7-00\n",
                    "python %s %s" % (experiments_filename, params),
                ]
                with open(script_file, "w") as fout:
                    fout.writelines(script_lines)

                # Pause between submissions, then hand the script to sbatch.
                time.sleep(1)
                sbatch_argv = ("sbatch %s" % script_file).split()
                subprocess.Popen(sbatch_argv)



script_files/run_final_dc_single_index_bong_agg_xgboost.sh


# Writing predictions (stability)

In [3]:
# Settings for writing predictions (stability experiments).
method_names = ["single_index"]
cls_methods = [
    "xgboost",
    "xgboost_calibrated",
]

# `datasets` is assigned twice; only the final assignment is in effect when the
# next cell runs.
datasets = [
    "bpic2011", "bpic2015", "insurance", "sepsis_cases", "production",
    "bpic2012_accepted", "bpic2012_declined", "bpic2012_cancelled",
    "traffic_fines_1",
    "bpic2017_accepted", "bpic2017_refused", "bpic2017_cancelled",
    "hospital_billing_2", "hospital_billing_3",
]
datasets = [
    "crm2",
]

In [4]:
# Generate and submit one SLURM job per (dataset, classifier, method) for
# writing predictions in the stability experiments.
for dataset_name in datasets:
    # "dc", "crm2" and "github" are run with the *_unstructured driver script.
    is_unstructured = dataset_name in ["dc", "crm2", "github"]
    if is_unstructured:
        experiments_filename = "experiments_write_predictions_stability_unstructured.py"
    else:
        experiments_filename = "experiments_write_predictions_stability.py"

    # Value for "#SBATCH --mem". The second test uses substring matching, so a
    # name that merely contains "crm2" (without being exactly "crm2") lands there.
    if is_unstructured or "hospital_billing" in dataset_name or "bpic2017" in dataset_name:
        memory = 50000
    elif any(tag in dataset_name for tag in ("traffic", "crm2")):
        memory = 20000
    else:
        memory = 10000

    for cls_method in cls_methods:

        for method_name in method_names:

            job_key = (dataset_name, method_name, cls_method)
            params = " ".join(job_key)
            script_file = "script_files/run_writepreds_stab_%s_%s_%s.sh" % job_key
            print(script_file)

            # Assemble the whole script first, then write it in one call.
            script_lines = [
                "#!/bin/bash\n",
                "#SBATCH --output=output_files/output_writepreds_stab_%s_%s_%s.txt\n" % job_key,
                "#SBATCH --mem=%s\n" % memory,
                "#SBATCH --ntasks=4\n",
                "#SBATCH --time=7-00\n",
                "python %s %s" % (experiments_filename, params),
            ]
            with open(script_file, "w") as fout:
                fout.writelines(script_lines)

            # Pause between submissions, then hand the script to sbatch.
            time.sleep(1)
            sbatch_argv = ("sbatch %s" % script_file).split()
            subprocess.Popen(sbatch_argv)



script_files/run_writepreds_stab_crm2_single_index_xgboost.sh
script_files/run_writepreds_stab_crm2_single_index_xgboost_calibrated.sh


# Writing predictions (LSTM)

In [19]:
# Settings for writing LSTM predictions.
method_names = ["lstm"]
cls_methods = [
    "lstm",
    "lstm_calibrated",
]

# `datasets` is assigned three times; only the final assignment is in effect
# when the next cell runs.
datasets = [
    "sepsis_cases_4",
    "traffic_fines_1",
    "bpic2012_declined", "bpic2012_accepted",
    "bpic2017_cancelled", "bpic2017_refused",
    "hospital_billing_3",
    "bpic2011_f1", "bpic2011_f2", "bpic2011_f3", "bpic2011_f4",
    "bpic2015_1_f2", "bpic2015_2_f2", "bpic2015_3_f2", "bpic2015_4_f2", "bpic2015_5_f2",
    "hospital_billing_2",
    "insurance_activity",
    "insurance_followup",
]
datasets = [
    "bpic2017_accepted",
    "bpic2017_refused",
    "hospital_billing_2",
    "hospital_billing_3",
]
datasets = [
    "crm2",
]

# Python script that each generated job runs.
experiments_filename = "experiments_write_predictions_lstm.py"

In [21]:
# Write one GPU job script per (dataset, method, classifier) for writing LSTM
# predictions. Submission is currently disabled (see the commented-out lines
# at the end), so this cell only generates the files.
for dataset_name in datasets:
    # Value written to "#SBATCH --mem", chosen from the dataset name.
    if "github" in dataset_name or "crm2" in dataset_name:
        memory = 50000
    elif "hospital_billing" in dataset_name or "bpic2017" in dataset_name:
        memory = 30000
    elif "traffic" in dataset_name or "bpic2012" in dataset_name:
        memory = 20000
    else:
        memory = 10000

    for method_name in method_names:
        for cls_method in cls_methods:
            params = " ".join([dataset_name, method_name, cls_method])
            script_file = "script_files_lstm/run_writepreds_%s_%s_%s.sh" % (dataset_name, method_name, cls_method)
            with open(script_file, "w") as fout:
                fout.write("#!/bin/bash\n")
                fout.write("#SBATCH --partition=gpu\n")
                fout.write("#SBATCH --gres=gpu:1\n")
                # The log name carries the "writepreds" tag, matching the script
                # name. The untagged "output_<dataset>_<method>_<cls>.txt" name
                # is already used by the LSTM hyperparameter-search jobs, so
                # sharing it would let the two jobs overwrite each other's log.
                fout.write("#SBATCH --output=output_files/output_writepreds_%s_%s_%s.txt\n" % (dataset_name, method_name,
                                                                                               cls_method))
                fout.write("#SBATCH --mem=%s\n" % memory)
                fout.write("#SBATCH --time=7-00\n")
                # NOTE(review): the LSTM hyperparameter-search job script in this
                # notebook runs "source activate cuda8_env" before python; this
                # script does not. Confirm whether that is intended.
                fout.write("python %s %s" % (experiments_filename, params))

            #time.sleep(1)
            #subprocess.Popen(("sbatch %s" % script_file).split())

# Writing predictions (alarms)

In [40]:
# Settings for writing predictions (alarm experiments).
method_names = ["single_agg"]
cls_methods = ["xgboost"]

# `datasets` is assigned four times; only the final assignment is in effect
# when the next cell runs.
datasets = [
    "bpic2011", "bpic2015", "insurance", "sepsis_cases", "production",
    "bpic2012_accepted", "bpic2012_declined", "bpic2012_cancelled",
    "traffic_fines_1",
    "bpic2017_accepted", "bpic2017_refused", "bpic2017_cancelled",
    "hospital_billing_2", "hospital_billing_3",
]
datasets = [
    "sepsis_cases",
]
datasets = [
    "unemployment",
]
datasets = [
    "dc",
    "github",
    "crm2",
]

In [41]:
# Generate and submit one SLURM job per (dataset, classifier, method) for
# writing predictions in the alarm experiments.
for dataset_name in datasets:
    # "dc", "github" and "crm2" are run with the *_unstructured driver script
    # and get the largest memory request.
    is_unstructured = dataset_name in ["dc", "github", "crm2"]
    if is_unstructured:
        experiments_filename = "experiments_write_predictions_alarms_unstructured.py"
    else:
        experiments_filename = "experiments_write_predictions_alarms.py"

    # Value for "#SBATCH --mem". The later tests use substring matching.
    if is_unstructured:
        memory = 50000
    elif any(tag in dataset_name for tag in ("hospital_billing", "bpic2017")):
        memory = 30000
    elif any(tag in dataset_name for tag in ("traffic", "crm2")):
        memory = 20000
    else:
        memory = 10000

    for cls_method in cls_methods:

        for method_name in method_names:

            job_key = (dataset_name, method_name, cls_method)
            params = " ".join(job_key)
            script_file = "script_files/run_writepreds_%s_%s_%s.sh" % job_key
            print(script_file)

            # Assemble the whole script first, then write it in one call.
            script_lines = [
                "#!/bin/bash\n",
                "#SBATCH --output=output_files/output_writepreds_%s_%s_%s.txt\n" % job_key,
                "#SBATCH --mem=%s\n" % memory,
                "#SBATCH --ntasks=4\n",
                "#SBATCH --time=7-00\n",
                "python %s %s" % (experiments_filename, params),
            ]
            with open(script_file, "w") as fout:
                fout.writelines(script_lines)

            # Pause between submissions, then hand the script to sbatch.
            time.sleep(1)
            sbatch_argv = ("sbatch %s" % script_file).split()
            subprocess.Popen(sbatch_argv)



script_files/run_writepreds_dc_single_agg_xgboost.sh
script_files/run_writepreds_github_single_agg_xgboost.sh
script_files/run_writepreds_crm2_single_agg_xgboost.sh


# Test results inter-run stability

In [8]:
# Settings for the inter-run stability test runs.
method_names = ["single_agg"]
cls_methods = ["rf_calibrated"]

# `datasets` is assigned four times; only the final assignment is in effect
# when the next cell runs.
datasets = [
    "sepsis_cases",
    "production",
    "bpic2012_accepted",
    "bpic2012_declined",
    "bpic2012_cancelled",
]
datasets = [
    "bpic2017_accepted",
    "bpic2017_refused",
    "bpic2017_cancelled",
]
datasets = [
    "bpic2011_f2",
    "bpic2011_f3",
    "bpic2015_4_f2",
    "bpic2015_5_f2",
    "insurance_activity",
]
datasets = [
    "hospital_billing_2",
    "hospital_billing_3",
]

# Both values are passed to the experiment script as command-line arguments and
# also appear in the generated script and log file names.
alpha = 1
beta = 5

In [9]:
# Generate and submit one SLURM job per (dataset, classifier, method) for the
# inter-run stability test, using the alpha and beta values set earlier.
for dataset_name in datasets:
    # "dc", "github" and "crm2" are run with the *_unstructured driver script.
    if dataset_name in ["dc", "github", "crm2"]:
        experiments_filename = "experiments_test_interrun_stability_unstructured.py"
    else:
        experiments_filename = "experiments_test_interrun_stability.py"

    # Value for "#SBATCH --mem". The first test is an exact match; the later
    # ones use substring matching (apart from the exact "dc" comparison).
    if dataset_name in ["github", "crm2"]:
        memory = 50000
    elif dataset_name == "dc" or any(tag in dataset_name for tag in ("hospital_billing", "bpic2017")):
        memory = 30000
    elif any(tag in dataset_name for tag in ("traffic", "crm2")):
        memory = 20000
    else:
        memory = 10000

    for cls_method in cls_methods:

        for method_name in method_names:

            job_key = (dataset_name, method_name, cls_method, alpha, beta)
            params = " ".join([dataset_name, method_name, cls_method, str(alpha), str(beta)])
            script_file = "script_files/run_interrun_%s_%s_%s_%s_%s.sh" % job_key
            print(script_file)

            # Assemble the whole script first, then write it in one call.
            script_lines = [
                "#!/bin/bash\n",
                "#SBATCH --output=output_files/output_interrun_%s_%s_%s_%s_%s.txt\n" % job_key,
                "#SBATCH --mem=%s\n" % memory,
                "#SBATCH --ntasks=4\n",
                "#SBATCH --time=7-00\n",
                "python %s %s" % (experiments_filename, params),
            ]
            with open(script_file, "w") as fout:
                fout.writelines(script_lines)

            # Pause between submissions, then hand the script to sbatch.
            time.sleep(1)
            sbatch_argv = ("sbatch %s" % script_file).split()
            subprocess.Popen(sbatch_argv)



script_files/run_interrun_hospital_billing_2_single_agg_rf_calibrated_1_5.sh
script_files/run_interrun_hospital_billing_3_single_agg_rf_calibrated_1_5.sh


# Optimize alarm threshold

In [9]:
# Settings for optimizing the alarm threshold.
method_names = ["single_agg"]
cls_methods = ["xgboost"]

# `datasets` is assigned three times; only the final assignment is in effect
# when the next cell runs.
datasets = [
    "insurance_activity", "insurance_followup",
    "sepsis_cases_1", "sepsis_cases_2", "sepsis_cases_4",
    "production",
    "bpic2012_accepted", "bpic2012_declined", "bpic2012_cancelled",
    "bpic2011_f1", "bpic2011_f2", "bpic2011_f3", "bpic2011_f4",
    "bpic2015_1_f2", "bpic2015_2_f2", "bpic2015_3_f2", "bpic2015_4_f2", "bpic2015_5_f2",
    "bpic2017_cancelled", "bpic2017_refused", "bpic2017_accepted",
    "traffic_fines_1",
    "hospital_billing_2", "hospital_billing_3",
]
datasets = [
    "insurance_followup",
    "bpic2012_declined",
]
datasets = [
    "crm2",
]

# Python script that each generated job runs; the commented-out lines are
# alternative scripts.
#experiments_filename = "experiments_optimize_alarm_threshold.py"
#experiments_filename = "experiments_optimize_alarm_threshold_ccom.py"
experiments_filename = "experiments_optimize_alarm_threshold_eff.py"

In [10]:
# Generate and submit one single-task SLURM job per
# (dataset, classifier, method) for alarm-threshold optimization.
for dataset_name in datasets:
    # Same "#SBATCH --mem" value for every dataset in this experiment.
    memory = 10000

    for cls_method in cls_methods:

        for method_name in method_names:

            job_key = (dataset_name, method_name, cls_method)
            params = " ".join(job_key)
            script_file = "script_files/run_optthresh_%s_%s_%s.sh" % job_key
            print(script_file)

            # Assemble the whole script first, then write it in one call.
            script_lines = [
                "#!/bin/bash\n",
                "#SBATCH --output=output_files/output_optthresh_%s_%s_%s.txt\n" % job_key,
                "#SBATCH --mem=%s\n" % memory,
                "#SBATCH --ntasks=1\n",
                "#SBATCH --time=7-00\n",
                "python %s %s" % (experiments_filename, params),
            ]
            with open(script_file, "w") as fout:
                fout.writelines(script_lines)

            # Pause between submissions, then hand the script to sbatch.
            time.sleep(1)
            sbatch_argv = ("sbatch %s" % script_file).split()
            subprocess.Popen(sbatch_argv)



script_files/run_optthresh_crm2_single_agg_xgboost.sh


# Test alarm thresholds

In [14]:
# Settings for testing the alarm thresholds.
method_names = ["single_agg"]
cls_methods = ["xgboost"]
datasets = [
    "insurance_activity", "insurance_followup",
    "sepsis_cases_1", "sepsis_cases_2", "sepsis_cases_4",
    "production",
    "bpic2012_accepted", "bpic2012_declined", "bpic2012_cancelled",
    "bpic2011_f1", "bpic2011_f2", "bpic2011_f3", "bpic2011_f4",
    "bpic2015_1_f2", "bpic2015_2_f2", "bpic2015_3_f2", "bpic2015_4_f2", "bpic2015_5_f2",
    "bpic2017_cancelled", "bpic2017_refused", "bpic2017_accepted",
    "traffic_fines_1",
    "hospital_billing_2", "hospital_billing_3",
    "github", "dc", "crm2",
]

# Python script that each generated job runs; the commented-out lines are
# alternative scripts.
#experiments_filename = "experiments_test_optimal_alarm_threshold_eff.py"
experiments_filename = "experiments_test_optimal_alarm_threshold_ccom.py"
#experiments_filename = "experiments_test_optimal_alarm_threshold.py"
#experiments_filename = "experiments_test_fixed_alarm_thresholds.py"

In [15]:
# Write and submit one SLURM batch script per (dataset, classifier, method)
# combination; each script runs `experiments_filename` with those three names
# as arguments.
#
# Reads from earlier cells: datasets, cls_methods, method_names,
# experiments_filename. "script_files/" must already exist.
memory = 10000  # same "#SBATCH --mem" request for every dataset in this cell

for dataset_name in datasets:
    for cls_method in cls_methods:
        for method_name in method_names:
            job_id = (dataset_name, method_name, cls_method)
            params = " ".join(job_id)
            script_file = "script_files/run_testalarm_%s_%s_%s.sh" % job_id
            print(script_file)

            script_lines = [
                "#!/bin/bash",
                "#SBATCH --output=output_files/output_testalarm_%s_%s_%s.txt" % job_id,
                "#SBATCH --mem=%s" % memory,
                "#SBATCH --ntasks=1",
                "#SBATCH --time=7-00",
                "python %s %s" % (experiments_filename, params),
            ]
            with open(script_file, "w") as fout:
                fout.write("\n".join(script_lines))

            # Pause one second before each submission.
            time.sleep(1)
            # argv list avoids re-splitting the path on whitespace; check_call
            # waits for sbatch and raises CalledProcessError if it exits
            # non-zero, so a rejected submission is not silently lost.
            subprocess.check_call(["sbatch", script_file])



script_files/run_testalarm_insurance_activity_single_agg_xgboost.sh
script_files/run_testalarm_insurance_followup_single_agg_xgboost.sh
script_files/run_testalarm_sepsis_cases_1_single_agg_xgboost.sh
script_files/run_testalarm_sepsis_cases_2_single_agg_xgboost.sh
script_files/run_testalarm_sepsis_cases_4_single_agg_xgboost.sh
script_files/run_testalarm_production_single_agg_xgboost.sh
script_files/run_testalarm_bpic2012_accepted_single_agg_xgboost.sh
script_files/run_testalarm_bpic2012_declined_single_agg_xgboost.sh
script_files/run_testalarm_bpic2012_cancelled_single_agg_xgboost.sh
script_files/run_testalarm_bpic2011_f1_single_agg_xgboost.sh
script_files/run_testalarm_bpic2011_f2_single_agg_xgboost.sh
script_files/run_testalarm_bpic2011_f3_single_agg_xgboost.sh
script_files/run_testalarm_bpic2011_f4_single_agg_xgboost.sh
script_files/run_testalarm_bpic2015_1_f2_single_agg_xgboost.sh
script_files/run_testalarm_bpic2015_2_f2_single_agg_xgboost.sh
script_files/run_testalarm_bpic2015_3_f2

# Training stability

In [12]:
# Configuration for the training-stability jobs submitted by the next cell:
# one job per (dataset, method, classifier) combination.
datasets = ["crm2"]
method_names = ["single_agg"]
cls_methods = ["rf", "xgboost"]

In [13]:
# Write and submit one SLURM batch script per (dataset, method, classifier)
# combination; each script runs experiments_training_stability.py with those
# three names as arguments.
#
# Reads from earlier cells: datasets, method_names, cls_methods.
# "script_files/" must already exist; job output is directed to
# "output_files_training_stability/".
for dataset_name in datasets:
    # Per-dataset value for the "#SBATCH --mem" line.
    if dataset_name == "crm2":
        memory = 50000
    elif "hospital_billing" in dataset_name or "bpic2017" in dataset_name or dataset_name == "github":
        memory = 30000
    elif "traffic" in dataset_name:
        memory = 20000
    else:
        memory = 10000

    for method_name in method_names:
        for cls_method in cls_methods:
            job_id = (dataset_name, method_name, cls_method)
            script_file = "script_files/run_training_stability_%s_%s_%s.sh" % job_id

            script_lines = [
                "#!/bin/bash",
                "#SBATCH --output=output_files_training_stability/output_%s_%s_%s.txt" % job_id,
                "#SBATCH --ntasks=8",
                "#SBATCH --mem=%s" % memory,
                "#SBATCH --time=7-00",
                "python experiments_training_stability.py %s %s %s" % job_id,
            ]
            with open(script_file, "w") as fout:
                fout.write("\n".join(script_lines))

            # Pause one second before each submission.
            time.sleep(1)
            # argv list keeps the path intact even if it contains whitespace;
            # check_call waits for sbatch and raises CalledProcessError on a
            # non-zero exit instead of discarding the Popen handle unchecked.
            subprocess.check_call(["sbatch", script_file])

# Feature importances (with unstructured data)

In [80]:
# Configuration for the feature-importance jobs (with unstructured data)
# submitted by the next cell.
datasets = ["dc", "crm2"]
method_names = ["single_laststate"]
text_method_encs = ["bong"]
cls_methods = ["rf"]

# Script executed by each submitted job.
experiments_filename = "experiments_with_unstructured_data_feature_importance.py"

In [81]:
# Write and submit one SLURM batch script per (dataset, classifier, method,
# text method) combination; each script runs `experiments_filename` with
# "<dataset> <method> <text_method> <classifier>" as arguments.
#
# Reads from earlier cells: datasets, cls_methods, method_names,
# text_method_encs, experiments_filename. "script_files/" must already exist.
for dataset_name in datasets:
    # Value for the "#SBATCH --mem" line: larger request for github / crm2.
    if "github" in dataset_name or "crm2" in dataset_name:
        memory = 50000
    else:
        memory = 25000

    for cls_method in cls_methods:
        for method_name in method_names:
            for text_method in text_method_encs:
                job_id = (dataset_name, method_name, text_method, cls_method)
                params = " ".join(job_id)
                script_file = "script_files/run_feature_imp_%s_%s_%s_%s.sh" % job_id
                print(script_file)

                script_lines = [
                    "#!/bin/bash",
                    "#SBATCH --output=output_files/output_feature_imp_%s_%s_%s_%s.txt" % job_id,
                    "#SBATCH --mem=%s" % memory,
                    "#SBATCH --ntasks=4",
                    "#SBATCH --time=7-00",
                    "python %s %s" % (experiments_filename, params),
                ]
                with open(script_file, "w") as fout:
                    fout.write("\n".join(script_lines))

                # Pause one second before each submission.
                time.sleep(1)
                # argv list avoids whitespace re-splitting; check_call waits
                # for sbatch and raises CalledProcessError if the submission
                # fails, rather than ignoring the child process.
                subprocess.check_call(["sbatch", script_file])



script_files/run_feature_imp_dc_single_laststate_bong_rf.sh
script_files/run_feature_imp_crm2_single_laststate_bong_rf.sh


# Performance (unstructured)

In [19]:
# Configuration for the performance jobs (with unstructured data) submitted by
# the next cell.
datasets = ["crm2"]
method_names = ["single_agg"]
text_method_encs = ["nb"]
cls_methods = ["rf"]
n_iter = 1  # forwarded to the experiment script as its last argument

# Script executed by each submitted job.
experiments_filename = "experiments_performance_with_unstructured_data.py"

In [20]:
# Write and submit one SLURM batch script per (dataset, classifier, method,
# text method) combination; each script runs `experiments_filename` with
# "<dataset> <method> <text_method> <classifier> <n_iter>" as arguments.
#
# Reads from earlier cells: datasets, cls_methods, method_names,
# text_method_encs, n_iter, experiments_filename. "script_files/" must
# already exist.
for dataset_name in datasets:
    # Value for the "#SBATCH --mem" line: larger request for github / crm2.
    if "github" in dataset_name or "crm2" in dataset_name:
        memory = 50000
    else:
        memory = 25000

    for cls_method in cls_methods:
        for method_name in method_names:
            for text_method in text_method_encs:
                job_id = (dataset_name, method_name, text_method, cls_method)
                # n_iter is only a script argument; it is not part of the file names.
                params = " ".join(job_id + (str(n_iter),))
                script_file = "script_files/run_performance_%s_%s_%s_%s.sh" % job_id
                print(script_file)

                script_lines = [
                    "#!/bin/bash",
                    "#SBATCH --output=output_files/output_performance_%s_%s_%s_%s.txt" % job_id,
                    "#SBATCH --mem=%s" % memory,
                    "#SBATCH --ntasks=1",
                    "#SBATCH --time=7-00",
                    "python %s %s" % (experiments_filename, params),
                ]
                with open(script_file, "w") as fout:
                    fout.write("\n".join(script_lines))

                # Pause one second before each submission.
                time.sleep(1)
                # argv list avoids whitespace re-splitting; check_call waits
                # for sbatch and raises CalledProcessError if the submission
                # fails, rather than ignoring the child process.
                subprocess.check_call(["sbatch", script_file])



script_files/run_performance_crm2_single_agg_nb_rf.sh


# Performance

In [21]:
# Configuration for the performance jobs submitted by the next cell.
datasets = ["crm2"]
method_names = ["prefix_index"]
cls_methods = ["rf", "xgboost", "logit"]
n_iter = 1  # forwarded to the experiment script as its last argument

# Script executed by each submitted job.
experiments_filename = "experiments_performance.py"

In [22]:
# Write and submit one SLURM batch script per (dataset, classifier, method)
# combination; each script runs `experiments_filename` with
# "<dataset> <method> <classifier> <n_iter>" as arguments.
#
# Reads from earlier cells: datasets, cls_methods, method_names, n_iter,
# experiments_filename. "script_files/" must already exist.
for dataset_name in datasets:
    # Value for the "#SBATCH --mem" line: larger request for github / crm2.
    if "github" in dataset_name or "crm2" in dataset_name:
        memory = 50000
    else:
        memory = 25000

    for cls_method in cls_methods:
        for method_name in method_names:
            job_id = (dataset_name, method_name, cls_method)
            # n_iter is only a script argument; it is not part of the file names.
            params = " ".join(job_id + (str(n_iter),))
            script_file = "script_files/run_performance_%s_%s_%s.sh" % job_id
            print(script_file)

            script_lines = [
                "#!/bin/bash",
                "#SBATCH --output=output_files/output_performance_%s_%s_%s.txt" % job_id,
                "#SBATCH --mem=%s" % memory,
                "#SBATCH --ntasks=1",
                "#SBATCH --time=7-00",
                "python %s %s" % (experiments_filename, params),
            ]
            with open(script_file, "w") as fout:
                fout.write("\n".join(script_lines))

            # Pause one second before each submission.
            time.sleep(1)
            # argv list avoids whitespace re-splitting; check_call waits for
            # sbatch and raises CalledProcessError if the submission fails,
            # rather than ignoring the child process.
            subprocess.check_call(["sbatch", script_file])



script_files/run_performance_crm2_prefix_index_rf.sh
script_files/run_performance_crm2_prefix_index_xgboost.sh
script_files/run_performance_crm2_prefix_index_logit.sh
