# Attribute importance ranking for synthetic data with known change points
The process change points are known. Hence, no change point detection algorithm needs to be used.

In [1]:
import pm4py
import helper
import pandas as pd

In [2]:
# Path of the XES event log analysed by this notebook.
# The commented-out assignments are alternative synthetic logs; keep exactly
# one assignment active to choose the data set.
# data_file_path = 'data/synthetic/maardji et al 2013_xes_attributes/logs/ROI/ROI2.5k.xes'
# data_file_path = 'data/synthetic/maardji et al 2013_xes_attributes/logs/cb/cb10k.xes'
data_file_path = 'data/synthetic/generated/new_attribute_values/2500_sudden_5_00.xes'

## Load an event log

In [3]:
# Read the metadata that belongs to the selected event log and display it.
data_info = helper.get_data_information(data_file_path)
# A bare expression is only echoed when it is the last statement of a cell,
# so the info has to be shown explicitly because more code follows.
display(data_info)

# Known (ground-truth) change points of the log; they are handed to
# DriftDetectorTrueKnown further down instead of running a detection algorithm.
change_points = data_info['change_points']

In [4]:
# load the event log into pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer
# Parse the XES file selected via data_file_path; the resulting log object is
# the input for the window generator and the drift detectors below.
event_log = xes_importer.apply(data_file_path)

  from .autonotebook import tqdm as notebook_tqdm
parsing log, completed traces :: 100%|████████████████████████████████████████████| 2500/2500 [00:02<00:00, 946.31it/s]


In [5]:
from processdrift.framework import drift_detection
from processdrift.framework import drift_explanation
from processdrift.framework import feature_extraction
from processdrift.framework import pop_comparison
from processdrift.framework import windowing

In [6]:
# # build the primary process drift detector
# primary_feature_extractor = feature_extraction.RelationalEntropyFeatureExtractor()
# primary_window_generator = windowing.FixedSizeWindowGenerator(window_size=150, slide_by=5)
# primary_population_comparer = pop_comparison.KSTestPopComparer()
# primary_threshold = 0.5
# primary_process_drift_detector = drift_detection.DriftDetector(primary_feature_extractor, 
#                                                                primary_window_generator,
#                                                                primary_population_comparer, 
#                                                                threshold=primary_threshold)

In [7]:
# # build the primary process drift detector
# primary_feature_extractor = feature_extraction.RelationshipTypesCountFeatureExtractor('Relationship types count')
# primary_window_generator = windowing.FixedSizeWindowGenerator(window_size=150, slide_by=1)
# primary_population_comparer = pop_comparison.HotellingsTSquaredPopComparer()
# primary_threshold = 0.5
# primary_process_drift_detector = drift_detection.DriftDetector(primary_feature_extractor, 
#                                                                primary_window_generator,
#                                                                primary_population_comparer, 
#                                                                threshold=primary_threshold)

In [8]:
# build the primary process drift detector
# The feature extractor, window generator and population comparer created
# here are also used directly by the following cells to inspect one window pair.
primary_feature_extractor = feature_extraction.RunsFeatureExtractor()
primary_window_generator = windowing.FixedSizeWindowGenerator(window_size=100, slide_by=10)
primary_population_comparer = pop_comparison.ChiSquaredComparer()
primary_threshold = 0.5
primary_process_drift_detector = drift_detection.DriftDetector(primary_feature_extractor, 
                                                               primary_window_generator,
                                                               primary_population_comparer, 
                                                               threshold=primary_threshold)

# NOTE(review): this assignment replaces the DriftDetector built just above,
# so in the cells shown here that detector (and primary_threshold) is never
# used; the primary change points come from the known `change_points` instead.
primary_process_drift_detector = drift_detection.DriftDetectorTrueKnown(change_points)

In [9]:
# Take one fixed window pair (not a random one) from the window generator so
# that the primary pipeline can be inspected by hand in the next cells.
import itertools
gen = primary_window_generator.get_windows(event_log)

# zero-based position of the window pair to pull from the generator
index = 10
# islice skips the first `index` pairs; next() then returns the pair at that position
window_a, window_b = next(itertools.islice(gen, index, None))
# print where each of the two windows starts
print(window_a.start)
print(window_b.start)

110
210


In [10]:
# Run the primary feature extractor on the log of each of the two windows.
features_window_a = primary_feature_extractor.extract(window_a.log)
features_window_b = primary_feature_extractor.extract(window_b.log)

In [11]:
# Compare the extracted features of both windows with the primary comparer.
# NOTE(review): the result is presumably a p-value of the chi-squared test — confirm.
comp_result = primary_population_comparer.compare(features_window_a, features_window_b)
comp_result

5.878447887534581e-06

In [12]:
# Debugging aid: call the comparer's private _preprocess step directly to see
# the two arrays it derives from the window features.
preprocessed_a, preprocessed_b = primary_population_comparer._preprocess(features_window_a, features_window_b)

In [13]:
# Show both preprocessed arrays; display() is needed because only the last
# expression of a cell is echoed automatically.
display(preprocessed_a)
display(preprocessed_b)

array([29, 19,  8,  7,  6,  6,  5,  4,  4,  4,  4,  1,  1,  1,  1])

array([27, 36,  2,  1,  2,  5,  9,  1,  3,  2,  9,  0,  0,  3,  0])

In [14]:
# Compare the already preprocessed arrays.
# NOTE(review): the value printed here differs from the comparison of the raw
# features above; presumably compare() preprocesses its inputs itself, so the
# arrays may be preprocessed a second time here — confirm before relying on it.
primary_population_comparer.compare(preprocessed_a, preprocessed_b)

1.4444852779215397e-08

In [15]:
# # generate features
# features_window_a = primary_feature_extractor.extract(window_a.log)
# display(features_window_a.head())
# features_window_b = primary_feature_extractor.extract(window_b.log)
# display(features_window_b.head())

In [16]:
# # perform test for features
# result = pop_comparison.HotellingsTSquaredPopComparer().compare(features_window_a, features_window_b)

In [17]:
# Set up the secondary (attribute-level) drift detectors.
# They reuse the window generator of the primary perspective, so both
# perspectives are evaluated on the same windows.
secondary_window_generator = primary_window_generator
secondary_population_comparer = pop_comparison.HellingerDistanceComparer()
secondary_threshold = 0.7
# 'concept:name' is left out of the attribute detectors.
secondary_exclude_attributes = ['concept:name']
secondary_drift_detectors = drift_detection.get_all_attribute_drift_detectors(
    event_log,
    secondary_window_generator,
    secondary_population_comparer,
    threshold=secondary_threshold,
    exclude_attributes=secondary_exclude_attributes,
)

In [18]:
# Combine the primary detector (here: the known change points) with the
# secondary attribute detectors in one explainer object.
drift_explainer = drift_explanation.DriftExplainer(primary_process_drift_detector, secondary_drift_detectors)

In [19]:
# primary_and_secondary_change_series = drift_explainer.get_primary_and_secondary_change_series(event_log)
# drift_explanation.plot_primary_and_secondary_change_series(primary_and_secondary_change_series)

In [20]:
# primary, secondaries = primary_and_secondary_change_series

In [21]:
# drift_detection._get_change_points_from_series(secondaries['Attribute: relevant_attribute_4'], threshold=0.7, min_observations_below=3, min_distance_change_streaks=3)

In [23]:
# For each primary change point, collect the secondary detectors that report
# a change point nearby. The result maps the primary change point to a list of
# entries with 'detector', 'detector_change_point' and 'distance'.
# NOTE(review): max_distance presumably bounds the absolute 'distance' that is
# still reported — confirm against DriftExplainer.
change_point_explanations = drift_explainer.attribute_importance_per_primary_change_point(event_log, max_distance=100)
change_point_explanations

{250: [{'detector': 'Attribute: relevant_attribute_1',
   'detector_change_point': 228,
   'distance': -22}],
 500: [{'detector': 'Attribute: relevant_attribute_2',
   'detector_change_point': 508,
   'distance': 8}],
 750: [{'detector': 'Attribute: relevant_attribute_3',
   'detector_change_point': 708,
   'distance': -42}],
 1000: [],
 1250: [{'detector': 'Attribute: relevant_attribute_5',
   'detector_change_point': 1238,
   'distance': -12}],
 1500: [],
 1750: [],
 2000: [],
 2250: []}