# Secondary Drift Detection for Simple Configuration

## Setup
### Imports

In [1]:
# imports
import helper
from pm4py.objects.log.importer.xes import importer as xes_importer

from processdrift.framework import drift_detection
from processdrift.framework import drift_explanation
from processdrift.framework import feature_extraction
from processdrift.framework import pop_comparison
from processdrift.framework import windowing
from processdrift.framework import evaluation

import time
import os

### Data settings and event log loading

In [2]:
input_path = r'data\synthetic\attribute_drift\complex\10000'
output_path = r'results\synthetic\attribute_drift\complex\10000'
size = 10000
number_relevant_attributes = 5

results_path = r'results\complex\10000\100_iter_results.csv'
delete_if_results_exists = False

window_generator_types = ['fixed', 'adaptive']
window_sizes= [100, 200]
population_comparers = [pop_comparison.KSTestPopComparer(), pop_comparison.ChiSquaredComparer()] # pop_comparison.HellingerDistanceComparer()]# pop_comparison.KSTestPopComparer(), pop_comparison.ChiSquaredComparer()]
thresholds = [0.05]
max_distances = [200]
slide_bys = [5, 10]

# always exclude the concept name as an attribute
exclude_attributes = ['concept:name']

In [3]:
# get the true change points and true change point explanations
true_change_points = helper.get_change_points_maardji_et_al_2013(10000)
true_change_point_explanations = [(true_change_points[i], f'relevant_attribute_{i+1:02d}') for i in range(number_relevant_attributes)]

In [4]:
# load all event logs from the input path
event_log_file_paths = helper.get_all_files_in_dir(input_path, include_files_in_subdirs=True)

In [5]:
# primary drift detector stays always the same
primary_process_drift_detector = drift_detection.DriftDetectorTrueKnown(true_change_points)

In [6]:
# build all possible configuration:
configurations = []
for window_generator_type in window_generator_types:
    for window_size in window_sizes:
        for population_comparer in population_comparers:
            for threshold in thresholds:
                for max_distance in max_distances:
                    for slide_by in slide_bys:
                        configurations.append({
                            'window_generator_type': window_generator_type,
                            'window_size': window_size,
                            'population_comparer': population_comparer,
                            'threshold': threshold,
                            'max_distance': max_distance,
                            'slide_by': slide_by
                        })

In [7]:
# delete results file if exists
if delete_if_results_exists:
    if os.path.exists(results_path):
        os.remove(results_path)

In [8]:
# iterate all datasets with all settings
for i, event_log_file_path in enumerate(event_log_file_paths):
    
    print(f'Event log {i}')
    event_log = xes_importer.apply(event_log_file_path)

    for configuration in configurations:
        print(f'\nEvaluating configuration {configuration}')
        
        start_time = time.time()

        window_generator_type = configuration['window_generator_type']
        window_size = configuration['window_size']
        population_comparer = configuration['population_comparer']
        threshold = configuration['threshold']
        max_distance = configuration['max_distance']
        slide_by = configuration['slide_by']
        
        window_generator = None
        # build the secondary drift detector
        if window_generator_type == 'fixed':
            window_generator = windowing.FixedSizeWindowGenerator(window_size, slide_by=slide_by)
        elif window_generator_type == 'adaptive':
            window_generator = windowing.AdaptiveWindowGenerator(window_size, slide_by=slide_by)

        secondary_drift_detectors = drift_detection.get_all_attribute_drift_detectors(event_log,
                                                                            window_generator, 
                                                                            population_comparer,
                                                                            level='trace', 
                                                                            threshold=threshold,
                                                                            exclude_attributes=exclude_attributes)

        drift_explainer = drift_explanation.DriftExplainer(primary_process_drift_detector, secondary_drift_detectors)

        # calculate the drift explanations
        observed_changes = drift_explainer.get_primary_and_secondary_changes(event_log, max_distance)
        observed_drift_point_explanations = drift_explanation.attribute_importance_per_primary_change_point(observed_changes, max_distance)
        
        # evaluate the change point explanations
        observed_drift_point_explanations_simple =  helper.get_simple_change_point_list_from_explainer(observed_drift_point_explanations)    

        result = evaluation.evaluate_explanations(true_change_point_explanations, observed_drift_point_explanations_simple, max_distance=window_size)
        
        # get end time
        end_time = time.time()
        # get the compute time and write into results
        compute_time = end_time - start_time
        
        # write the configuration results to file
        helper.append_config_results(results_path, event_log_file_path, configuration, result, compute_time)

Event log 0


  from .autonotebook import tqdm as notebook_tqdm
parsing log, completed traces :: 100%|██████████| 10000/10000 [00:08<00:00, 1245.99it/s]



Evaluating configuration {'window_generator_type': 'fixed', 'window_size': 100, 'population_comparer': KSTestPopComparer, 'threshold': 0.05, 'max_distance': 200, 'slide_by': 5}

Evaluating configuration {'window_generator_type': 'fixed', 'window_size': 100, 'population_comparer': KSTestPopComparer, 'threshold': 0.05, 'max_distance': 200, 'slide_by': 10}

Evaluating configuration {'window_generator_type': 'fixed', 'window_size': 100, 'population_comparer': ChiSquaredComparer, 'threshold': 0.05, 'max_distance': 200, 'slide_by': 5}

Evaluating configuration {'window_generator_type': 'fixed', 'window_size': 100, 'population_comparer': ChiSquaredComparer, 'threshold': 0.05, 'max_distance': 200, 'slide_by': 10}

Evaluating configuration {'window_generator_type': 'fixed', 'window_size': 200, 'population_comparer': KSTestPopComparer, 'threshold': 0.05, 'max_distance': 200, 'slide_by': 5}

Evaluating configuration {'window_generator_type': 'fixed', 'window_size': 200, 'population_comparer': KS

KeyboardInterrupt: 

The runtime far exceeds that of previous runs (ca. 1000 seconds = 16 Minutes per run). The only cause of difference can be that there are more attribute values. Next, we perform a more detailed runtime analysis.

In [22]:
configuration = configurations[0]
print(configuration)

{'window_generator_type': 'fixed', 'window_size': 100, 'population_comparer': KSTestPopComparer, 'threshold': 0.05, 'max_distance': 200, 'slide_by': 5}


In [15]:
# build the change point explainer
window_generator_type = configuration['window_generator_type']
window_size = configuration['window_size']
population_comparer = configuration['population_comparer']
threshold = configuration['threshold']
max_distance = configuration['max_distance']
slide_by = configuration['slide_by']

window_generator = None
# build the secondary drift detector
if window_generator_type == 'fixed':
    window_generator = windowing.FixedSizeWindowGenerator(window_size, slide_by=slide_by)
elif window_generator_type == 'adaptive':
    window_generator = windowing.AdaptiveWindowGenerator(window_size, slide_by=slide_by)

secondary_drift_detectors = drift_detection.get_all_attribute_drift_detectors(event_log,
                                                                    window_generator, 
                                                                    population_comparer,
                                                                    level='trace', 
                                                                    threshold=threshold,
                                                                    exclude_attributes=exclude_attributes)

primary_process_drift_detector = drift_detection.DriftDetectorTrueKnown(helper.get_change_points_maardji_et_al_2013(2500))

drift_explainer = drift_explanation.DriftExplainer(primary_process_drift_detector, secondary_drift_detectors)

In [16]:
import cProfile

In [17]:
# set the event log to some small log
event_log = xes_importer.apply(r'data\synthetic\attribute_drift\simple\2500\cb2.5k_02e43623-eed0-4254-81c4-2046faecccba.xes')

parsing log, completed traces :: 100%|██████████| 2500/2500 [00:02<00:00, 1010.90it/s]


In [30]:
# calculate the drift explanations
import pstats

profiler = cProfile.Profile()
stats = profiler.run("drift_explainer.get_primary_and_secondary_changes(event_log, max_distance)")
pstats.Stats(stats).sort_stats(2).print_stats()
# observed_drift_point_explanations = drift_explanation.attribute_importance_per_primary_change_point(observed_changes, max_distance)

         71077508 function calls (71054545 primitive calls) in 125.357 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000  125.357  125.357 {built-in method builtins.exec}
        1    0.000    0.000  125.357  125.357 <string>:1(<module>)
        1    0.000    0.000  125.357  125.357 c:\Users\Lennart\Documents\1_Workspaces\explaining_concept_drift\processdrift\framework\drift_explanation.py:24(get_primary_and_secondary_changes)
       10    0.000    0.000  125.352   12.535 c:\Users\Lennart\Documents\1_Workspaces\explaining_concept_drift\processdrift\framework\drift_detection.py:44(get_changes)
       10    0.197    0.020  125.304   12.530 c:\Users\Lennart\Documents\1_Workspaces\explaining_concept_drift\processdrift\framework\drift_detection.py:71(_get_change_series)
     4620    0.147    0.000   90.347    0.020 c:\Users\Lennart\Documents\1_Workspaces\explaining_concept_drift\processdrift\framework

<pstats.Stats at 0x21c6371a9b0>

In [28]:
start = time.time()
result = drift_explainer.get_primary_and_secondary_changes(event_log, max_distance)
duration = time.time() - start
print(duration)

108.62430238723755
