### Insert attribute change
The following attribute behaviors can be combined freely:
1. Types of Drift (Lu et al. 2019)
    1. "*Sudden drift*: A new concept occurs within a short time"
    2. "*Gradual drift*: A new concept replaces an old one over a period of time"
    3. "*Incremental drift*: An old concept incrementally changes to a new concept over a period of time".
    4. "*Reoccurring Concepts*: An old concept may reoccur after some time"
2. Attribute data type (Optional: add Ordinal)
    1. Categorical (nominal)
    2. Continuous
3. Attribute level
    1. Trace
    2. Event
4. Noise level
    1. None (0%)
    2. Low (10%)
    3. Medium (25%)
    4. Strong (50%)
5. Concept change
    1. Missing data
    2. Completely new distribution
    3. For categorical data: Oversampling of one class
    4. For categorical data: Undersampling of one class
    5. For continuous data: Increase of mean
    6. For continuous data: Decrease of mean
6. Data Stationarity
    1. Strong decrease
    2. Weak decrease
    3. Stationary
    4. Weak increase
    5. Strong increase
7. Location of Attribute Change
    1. Normally distributed before changepoint

In [81]:
import os
import json
import helper
from opyenxes.data_in import XesXmlParser
from opyenxes.data_out import XesXmlSerializer
from opyenxes.model import XAttributeLiteral
import numpy as np

In [6]:
# select a dataset to augment
# query the project helper for synthetic datasets of size 2500
datasets = helper.get_datasets_by_criteria(is_synthetic=True, size=2500)
# take the first key yielded by iterating the result
# (presumably a mapping from dataset name to its info - verify against helper)
dataset = next(iter(datasets))
# look up the info record of the selected dataset
dataset_info = datasets[dataset]

In [166]:
def _opyenxes_read_xes(data_file_path, multiple_logs=False):
    """Parse an XES file into opyenxes event log object(s).

    Args:
        data_file_path: Location of the XES file to parse.
        multiple_logs: When true, hand back everything the parser produced;
            otherwise only the first parsed log is returned.

    Return:
        All parsed logs if ``multiple_logs`` is set, else the first log.
    """
    parser = XesXmlParser.XesXmlParser()

    with open(data_file_path) as xes_file:
        parsed_logs = parser.parse(xes_file)

    # callers usually want the single log contained in the file
    return parsed_logs if multiple_logs else parsed_logs[0]

In [179]:
def _opyenxes_write_xes(log, data_file_path):
    """Writes an XES event log with opyenxes.

    Args:
        log: opyenxes event log to serialize.
        data_file_path: Path to the data file to write. The file is opened
            in write mode, so an existing file is overwritten.
    """
    opyenxes_xes_serializer = XesXmlSerializer.XesXmlSerializer()

    with open(data_file_path, 'w') as data_file:
        opyenxes_xes_serializer.serialize(log, data_file)

In [20]:
# read the event log of the selected dataset
# (only the first log of the file is returned because multiple_logs defaults to False)
opyenxes_log = _opyenxes_read_xes(dataset_info['file_path'])

In [153]:
class AttributeGenerator:
    """Simulates attributes for a given opyenxes event log."""

    def __init__(self, opyenxes_log, change_points, timestamp_field='time:timestamp'):
        """Initialize an attribute generator which can be used to generate multiple attributes for a given event log.

        Args:
            opyenxes_log: opyenxes event log
            change_points: List of change points (trace indices)
            timestamp_field: Field name of the event timestamp field
        """
        self.opyenxes_log = opyenxes_log
        self.change_points = change_points
        self.timestamp_field = timestamp_field

        # explanations recorded by the generate_* methods, keyed by attribute name
        self.change_points_explained = {}

        # get start and end trace
        self.start_trace_id = 0
        self.end_trace_id = len(opyenxes_log) - 1

        # the start timestamp is taken from the first event of the first trace
        self.start_time = opyenxes_log[0][0].get_attributes()[timestamp_field].get_value()

        # traces may overlap in time, so the last trace does not necessarily end last:
        # take the latest final-event timestamp over all traces
        self.end_time = max(
            trace[-1].get_attributes()[timestamp_field].get_value()
            for trace in opyenxes_log
        )

    def _generate_attribute(self):
        """Placeholder for shared generation logic; not implemented yet."""
        pass

    def generate_continuous_attribute(self, attribute_name, min_value, max_value):
        """Placeholder for continuous attribute generation; not implemented yet."""
        pass

    def generate_categorical_attribute(self, attribute_name, count_attribute_values=3,
                                       number_changes=1, change_location_standard_deviation=10,
                                       drift_type='sudden', attribute_level='trace',
                                       noise_level='none', concept_change='oversampling',
                                       data_stationarity='stationary', location_attribute_change='normal'):
        """Add a categorical trace attribute that drifts shortly before selected change points.

        Every trace first receives a value drawn from a random baseline
        distribution. For each selected change point a new random distribution
        is drawn and all traces from the attribute change point onwards are
        re-sampled from it. The attribute change point lies before the change
        point; its distance is the absolute value of a normally distributed
        deviation. The values are written into the trace attributes of
        ``self.opyenxes_log`` in place.

        Details per change point (the attribute change point and the name of
        the explaining attribute) are stored in
        ``self.change_points_explained[attribute_name]``.

        Args:
            attribute_name: Key of the trace attribute to create.
            count_attribute_values: Number of distinct categories
                (``value_1`` ... ``value_n``).
            number_changes: Number of change points to explain with this
                attribute; ``None`` selects all. Values larger than the number
                of change points are capped, as change points are sampled
                without replacement.
            change_location_standard_deviation: Standard deviation (in traces)
                of the distance between attribute change point and change point.
            drift_type, attribute_level, noise_level, concept_change,
            data_stationarity, location_attribute_change: Accepted for the
                planned feature set but not evaluated yet; only sudden drift on
                trace level without noise is generated.

        Returns:
            Sorted numpy array with the change points explained by this attribute.
        """
        # TODO implement a way that only select changepoints can be explained by attribute drift
        if number_changes is None:
            number_changes = len(self.change_points)
        # a change point can only be explained once, hence sampling without replacement
        number_changes = min(number_changes, len(self.change_points))

        trace_count = len(self.opyenxes_log)
        attribute_value_candidates = [f'value_{attribute_number + 1}'
                                      for attribute_number in range(count_attribute_values)]

        # assign a static percentage of category occurrences (the baseline)
        # use the Dirichlet distribution
        baseline_probabilities = np.random.dirichlet(np.ones(count_attribute_values), size=1)[0]

        # attribute values start with just the baseline (one entry per trace); the drift is overlaid
        attribute_values = np.random.choice(
            attribute_value_candidates, size=trace_count, p=baseline_probabilities
        ).tolist()

        # introduce drift
        explainable_changepoints = np.random.choice(self.change_points, number_changes, replace=False)
        explainable_changepoints.sort()

        print(self.change_points)
        change_points_explained = {change_point: {} for change_point in self.change_points}
        for change_point in explainable_changepoints:
            # select an attribute change point located before the change point
            attribute_change_point_deviation = np.abs(
                np.random.normal(0, change_location_standard_deviation))
            attribute_change_point = int(change_point - attribute_change_point_deviation)
            # handle the case that we try to get a non-existing trace
            attribute_change_point = max(attribute_change_point, 0)

            change_points_explained[change_point]['attribute_change_point'] = attribute_change_point
            change_points_explained[change_point]['explained_by_attribute'] = attribute_name

            # start drift at the attribute change point # TODO implement other drift variants
            changed_probabilities = np.random.dirichlet(np.ones(count_attribute_values), size=1)[0]

            # only the traces from the attribute change point onwards need new values
            drifted_trace_count = trace_count - attribute_change_point
            if drifted_trace_count > 0:
                attribute_values[attribute_change_point:] = np.random.choice(
                    attribute_value_candidates, size=drifted_trace_count, p=changed_probabilities
                ).tolist()

        # keep the explanations available without changing the return value
        self.change_points_explained[attribute_name] = change_points_explained

        # write attribute values into the event log
        # TODO implement for event attributes
        for trace, attribute_value in zip(self.opyenxes_log, attribute_values):
            # build the new attribute
            attribute = XAttributeLiteral.XAttributeLiteral(key=attribute_name, value=attribute_value)

            # add the new attribute to the existing trace attributes dictionary
            trace_attributes = trace.get_attributes()
            trace_attributes[attribute_name] = attribute

            # update the trace attribute dictionary
            trace.set_attributes(trace_attributes)

        # return the change points that are explained by the new attribute
        return explainable_changepoints

In [154]:
# create the attribute generator for the loaded log and the change points of the selected dataset
ag = AttributeGenerator(opyenxes_log, dataset_info['changepoints'])

In [155]:
# add a categorical trace attribute named 'supplier' using the default settings
explainable_changepoints = ag.generate_categorical_attribute('supplier')
# NOTE(review): trailing no-op statement, presumably to keep the cell from echoing a value
pass

[250, 500, 750, 1000, 1250, 1500, 1750, 2000, 2250]


In [159]:
# show the attribute dictionary of the first trace
ag.opyenxes_log[0].get_attributes()

{'concept:name': <opyenxes.model.XAttributeLiteral.XAttributeLiteral at 0x2509ed560b0>,
 'supplier': <opyenxes.model.XAttributeLiteral.XAttributeLiteral at 0x250982a3a00>}

In [177]:
# show the generated 'supplier' value of the trace at index 50
ag.opyenxes_log[50].get_attributes()['supplier'].get_value()

'value_2'

In [180]:
# write the log held by the attribute generator to 'altered_log.xes'
_opyenxes_write_xes(ag.opyenxes_log, 'altered_log.xes')

Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (5538.06689453125 msec.)



In [165]:
# read xes into opyenxes
# NOTE(review): `xes_file_path` is not assigned in any visible cell and must be
# defined before this cell is run.
opyenxes_xes_parser = XesXmlParser.XesXmlParser()
# use a context manager so the file handle is closed after parsing
with open(xes_file_path) as xes_file:
    opyenxes_logs = opyenxes_xes_parser.parse(xes_file)
# keep only the first parsed log
opyenxes_log = opyenxes_logs[0]

NameError: name 'xes_file_path' is not defined

In [23]:
# get a trace from the log (the element at index 0)
trace = opyenxes_log[0]

In [26]:
# get the attribute dictionary for this trace
trace_attributes = trace.get_attributes()

# get a specific attribute value: look up the attribute by key, then unwrap its value
trace_attributes['concept:name'].get_value()

'61'

In [30]:
# add an attribute to the trace's attributes

# create the new trace attribute
from opyenxes.model import XAttributeLiteral
key = 'test_attribute'
attribute = XAttributeLiteral.XAttributeLiteral(key=key, value='test_value')

# register the new attribute in the attribute dictionary under its key
trace_attributes[key] = attribute

In [33]:
# show the attribute dictionary after adding the new entry
trace_attributes

{'concept:name': <opyenxes.model.XAttributeLiteral.XAttributeLiteral at 0x1c97a8ce1d0>,
 'test_attribute': <opyenxes.model.XAttributeLiteral.XAttributeLiteral at 0x1c902712da0>}

In [34]:
# hand the extended attribute dictionary back to the trace
trace.set_attributes(trace_attributes)

In [35]:
# read the new attribute back from the log to check that it is present
opyenxes_log[0].get_attributes()[key].get_value()

'test_value'

In [39]:
# write out event log: create the serializer used for the export
xes_xml_serializer = XesXmlSerializer.XesXmlSerializer()

In [43]:
# target file for the augmented log
data_out_path = 'data/synthetic/maardji et al 2013_xes/augmented.xes'
# write the XES log out
with open(data_out_path, 'w') as to_file:
    # save log back to XES file
    # NOTE(review): if `opyenxes_log` is already a single log, `opyenxes_log[0]` is its
    # first trace rather than the whole log - confirm what is meant to be serialized
    xes_xml_serializer.serialize(opyenxes_log[0], to_file)

Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (5501.7255859375 msec.)

