# Get change point explanations for all event logs in a folder
The process change points are known. Hence, no primary change point detection algorithm needs to be used.

In [1]:
import helper
import pandas as pd
from processdrift.framework import drift_detection
from processdrift.framework import drift_explanation
from processdrift.framework import feature_extraction
from processdrift.framework import pop_comparison
from processdrift.framework import windowing
from processdrift.framework import evaluation
from pm4py.objects.log.importer.xes import importer as xes_importer

In [2]:
datasets = helper.get_datasets_by_criteria(in_folder='data/synthetic/attribute_drift/simple/5000/')

Get explanations under different configurations:

## 1. Fixed window, chi-square
- windowing: fixed
- window_size: [50, 100, 150]
- population_comparer: Chi-Square
- secondary threshold: 0.05
- max_distance: window_size

In [12]:
window_sizes = [50, 100, 150]
SecondaryWindowGeneratorClass = windowing.FixedSizeWindowGenerator
secondary_population_comparer = pop_comparison.ChiSquaredComparer()
secondary_threshold = 0.5
secondary_exclude_attributes = ['concept:name']
max_distance = None

In [18]:
def run_experiments(change_point_explainer, datasets):
    intermediate_results = []
    for dataset in datasets.values():
        data_file_path = dataset['file_path']

        # load the event log into pm4py
        event_log = xes_importer.apply(data_file_path)

        # calculate the drift explanations
        observed_change_point_explanations = drift_explainer.attribute_importance_per_primary_change_point(event_log, max_distance=max_distance)

        # convert change point explanations into simple format
        simple_true_change_point_explanations = helper.get_simple_change_point_format_from_data_info(dataset)
        simple_observed_change_point_explanations =  helper.get_simple_change_point_list_from_explainer(observed_change_point_explanations)    
        
        # set the maximum distance for the evaluation
        max_distance_evaluation = change_point_explainer.max_distance

        result = evaluation.evaluate_explanations(simple_true_change_point_explanations, simple_observed_change_point_explanations, max_distance=max_distance_evaluation)
        intermediate_results.append(result)
        
    aggregated_results = evaluation.aggregate_cp_explanation_results(results_list)
    
    return aggregated_results

In [None]:
for window_size in window_size:
    
    break

In [9]:
results_lists = {}
for i, dataset in enumerate(datasets.values()):
    print(f'Working on dataset {i+1}')

    for window_size in window_sizes:
        if window_size not in results_lists:
            results_lists[window_size] = []

        if max_distance == None:
            max_distance = window_size
        
        secondary_window_generator = SecondaryWindowGeneratorClass(window_size)
        
        data_file_path = dataset['file_path']
        change_points = dataset['change_points']

        true_change_point_explanations =  dataset['change_point_explanations']

        # load the event log into pm4py
        event_log = xes_importer.apply(data_file_path)
        primary_process_drift_detector = drift_detection.DriftDetectorTrueKnown(change_points)
        secondary_drift_detectors = drift_detection.get_all_attribute_drift_detectors(event_log,
                                                                            secondary_window_generator, 
                                                                            secondary_population_comparer, 
                                                                            threshold=secondary_threshold,
                                                                            exclude_attributes=secondary_exclude_attributes)
        drift_explainer = drift_explanation.DriftExplainer(primary_process_drift_detector, secondary_drift_detectors)
        
        # calculate the drift explanations
        observed_change_point_explanations = drift_explainer.attribute_importance_per_primary_change_point(event_log, max_distance=max_distance)

        simple_true = helper.get_simple_change_point_format_from_data_info(dataset)
        simple_detected =  helper.get_simple_change_point_list_from_explainer(observed_change_point_explanations)    
        
        result = evaluation.evaluate_explanations(simple_true, simple_detected, max_distance=max_distance)
        results_lists[window_size].append(result)

Working on dataset 1


parsing log, completed traces :: 100%|██████████| 2500/2500 [00:01<00:00, 1532.80it/s]
parsing log, completed traces :: 100%|██████████| 2500/2500 [00:02<00:00, 1247.16it/s]
parsing log, completed traces :: 100%|██████████| 2500/2500 [00:02<00:00, 911.12it/s] 


Working on dataset 2


parsing log, completed traces :: 100%|██████████| 2500/2500 [00:02<00:00, 1136.89it/s]
parsing log, completed traces :: 100%|██████████| 2500/2500 [00:02<00:00, 870.32it/s] 
parsing log, completed traces :: 100%|██████████| 2500/2500 [00:01<00:00, 1279.93it/s]


Working on dataset 3


parsing log, completed traces :: 100%|██████████| 2500/2500 [00:02<00:00, 1133.31it/s]
parsing log, completed traces :: 100%|██████████| 2500/2500 [00:01<00:00, 1564.55it/s]
parsing log, completed traces :: 100%|██████████| 2500/2500 [00:02<00:00, 1147.77it/s]


Working on dataset 4


parsing log, completed traces :: 100%|██████████| 2500/2500 [00:02<00:00, 937.97it/s] 
parsing log, completed traces :: 100%|██████████| 2500/2500 [00:01<00:00, 1313.69it/s]
parsing log, completed traces :: 100%|██████████| 2500/2500 [00:02<00:00, 1241.85it/s]


Working on dataset 5


parsing log, completed traces :: 100%|██████████| 2500/2500 [00:02<00:00, 1209.02it/s]
parsing log, completed traces :: 100%|██████████| 2500/2500 [00:02<00:00, 1023.61it/s]
parsing log, completed traces :: 100%|██████████| 2500/2500 [00:02<00:00, 1185.55it/s]


Working on dataset 6


parsing log, completed traces :: 100%|██████████| 2500/2500 [00:02<00:00, 1088.26it/s]
parsing log, completed traces :: 100%|██████████| 2500/2500 [00:01<00:00, 1265.89it/s]
parsing log, completed traces :: 100%|██████████| 2500/2500 [00:02<00:00, 1036.88it/s]


Working on dataset 7


parsing log, completed traces :: 100%|██████████| 2500/2500 [00:02<00:00, 1201.38it/s]
parsing log, completed traces :: 100%|██████████| 2500/2500 [00:01<00:00, 1345.10it/s]
parsing log, completed traces :: 100%|██████████| 2500/2500 [00:02<00:00, 1139.85it/s]


Working on dataset 8


parsing log, completed traces :: 100%|██████████| 2500/2500 [00:02<00:00, 1112.24it/s]
parsing log, completed traces :: 100%|██████████| 2500/2500 [00:01<00:00, 1382.10it/s]
parsing log, completed traces :: 100%|██████████| 2500/2500 [00:01<00:00, 1427.92it/s]


Working on dataset 9


parsing log, completed traces :: 100%|██████████| 2500/2500 [00:01<00:00, 1446.19it/s]
parsing log, completed traces :: 100%|██████████| 2500/2500 [00:01<00:00, 1291.42it/s]
parsing log, completed traces :: 100%|██████████| 2500/2500 [00:01<00:00, 1331.99it/s]


Working on dataset 10


parsing log, completed traces :: 100%|██████████| 2500/2500 [00:01<00:00, 1270.76it/s]
parsing log, completed traces :: 100%|██████████| 2500/2500 [00:02<00:00, 1226.12it/s]
parsing log, completed traces :: 100%|██████████| 2500/2500 [00:01<00:00, 1474.64it/s]


In [10]:
# get the average results per window size
average_results_per_window_size = {}

for window_size, results_list in results_lists.items():
    average_results = evaluation.aggregate_cp_explanation_results(results_list)
    average_results_per_window_size[window_size] = average_results

In [15]:
import pandas as pd
results_df = pd.DataFrame().from_dict(average_results_per_window_size, orient='index')
results_df = results_df.drop(columns=['lags'])

In [16]:
results_df

Unnamed: 0,number_of_correct_detections,number_of_true_changes,number_of_detections,precision,recall,f1_score,mean_lag
50,45,50,1439,0.031272,0.9,0.060443,24.844444
100,25,50,986,0.025355,0.5,0.048263,7.2
150,25,50,781,0.03201,0.5,0.060168,15.92
