# Preprocess Data

In [58]:
import os
import opyenxes
from opyenxes.data_in import XMxmlParser
from opyenxes.data_out import XesXmlSerializer
import json
import helper

## Maaradji et al. 2013 Dataset
- The dataset needs to be converted to XES for importing into pm4py
- For each event log, a ground-truth list of all changepoints should be generated (one changepoint after every 10% of the traces).
- Simplify the folder structure to a flat list of XES event logs plus their corresponding changepoints.
- Add artificial feature data that can explain the concept drift.

In [87]:
# Source directory (original MXML logs) and target directory (converted XES
# logs), both relative to the working directory.
from_path, to_path = (
    'data/synthetic/maardji et al 2013_mxml/',
    'data/synthetic/maardji et al 2013_xes/',
)

### Convert to XES

In [60]:
# Collect the paths of all MXML event logs found anywhere below from_path.
list_of_files = []
for dirpath, dirnames, filenames in os.walk(from_path):
    # Prune Jupyter's hidden checkpoint folders in place so os.walk does not
    # descend into them. Otherwise autosaved copies (presumably the origin of
    # 'cb7.5k-checkpoint.mxml') are picked up as if they were dataset logs.
    dirnames[:] = [name for name in dirnames if name != '.ipynb_checkpoints']
    list_of_files.extend(os.path.join(dirpath, name) for name in filenames)

# Compare case-insensitively: the dataset mixes 'mxml' and 'MXML' suffixes.
list_of_mxml_log_paths = [
    path for path in list_of_files if path.lower().endswith('mxml')
]

In [63]:
# Convert each MXML log to XES and save it below <to_path>/logs.
to_path_logs = os.path.join(to_path, 'logs')

# Create the directory the XES files are actually written to. Creating only
# `to_path` is not enough: opening a file for writing fails when the 'logs'
# sub-directory is missing. makedirs also creates `to_path` itself if needed,
# and exist_ok makes re-running the cell harmless.
os.makedirs(to_path_logs, exist_ok=True)

# Paths of all XES files written by this cell, in conversion order.
xes_log_file_paths = []
for mxml_log_path in list_of_mxml_log_paths:
    # Read the MXML log into OpyenXes.
    mxml_parser = XMxmlParser.XMxmlParser()
    with open(mxml_log_path) as mxml_log_file:
        parsed_logs = mxml_parser.parse(mxml_log_file)

    # Our MXML files always contain exactly one log, so take the first one.
    parsed_log = parsed_logs[0]

    # Output file name: same base name as the input, with the suffix
    # replaced by '.xes'.
    file_name = os.path.splitext(os.path.basename(mxml_log_path))[0] + '.xes'

    # Normalise the path so that it uses the platform's separators
    # consistently; these paths are reused later as dictionary keys.
    to_file_path = os.path.normpath(os.path.join(to_path_logs, file_name))
    xes_log_file_paths.append(to_file_path)

    # Serialize the parsed log as XES.
    with open(to_file_path, 'w') as to_file:
        XesXmlSerializer.XesXmlSerializer().serialize(parsed_log, to_file)

cb10k.mxml
cb2.5k.mxml
cb5k.mxml
cb7.5k.mxml
cb7.5k-checkpoint.mxml
cd10k.mxml
cd2.5k.MXML
cd5k.MXML
cd7.5k.MXML
cf10k.mxml
cf2.5k.mxml
cf5k.mxml
cf7.5k.mxml
cm10k.mxml
cm2.5k.mxml
cm5k.mxml
cm7.5k.mxml
cp10k.mxml
cp2.5k.mxml
cp5k.mxml
cp7.5k.mxml
fr10k.MXML
fr2.5k.MXML
fr5k.MXML
fr7.5k.MXML
IOR10k.MXML
IOR2.5k.mxml
IOR5k.MXML
IOR7.5k.mxml
IRO10k.MXML
IRO2.5k.mxml
IRO5k.MXML
IRO7.5k.mxml
lp10k.mxml
lp2.5k.MXML
lp5k.mxml
lp7.5k.mxml
OIR10k.MXML
OIR2.5k.MXML
OIR5k.MXML
OIR7.5k.MXML
ORI10k.MXML
ORI2.5k.MXML
ORI5k.MXML
ORI7.5k.MXML
pl10k.mxml
pl2.5k.mxml
pl5k.mxml
pl7.5k.mxml
pm10k.mxml
pm2.5k.MXML
pm5k.mxml
pm7.5k.MXML
re10k.mxml
re2.5k.mxml
re5k.mxml
re7.5k.mxml
RIO10k.MXML
RIO2.5k.MXML
RIO5k.MXML
RIO7.5k.MXML
ROI10k.MXML
ROI2.5k.MXML
ROI5k.MXML
ROI7.5k.MXML
rp10k.mxml
rp2.5k.MXML
rp5k.mxml
rp7.5k.MXML
sw10k.mxml
sw2.5k.MXML
sw5k.mxml
sw7.5k.MXML


In [69]:
# Normalise the stored 'file_path' of every entry in place. Only the value
# inside each entry is rewritten; the dictionary keys are left untouched.
for info in data_dictionary.values():
    info['file_path'] = os.path.normpath(info['file_path'])

In [93]:
# Create a data dictionary that maps each XES log path to its metadata,
# including the list of changepoint locations for that log.

# File-name markers and the number of traces they stand for. Order matters:
# '5k' must be checked last because it is also a substring of '2.5k' and '7.5k'.
size_markers = (
    ('2.5k', 2500),
    ('7.5k', 7500),
    ('10k', 10000),
    ('5k', 5000),
)

data_dictionary = {}
for log_file_path in xes_log_file_paths:
    # Cleaned file name: base name without the suffix.
    file_name = os.path.splitext(os.path.basename(log_file_path))[0]

    # Derive the log size from the first marker contained in the file name.
    size = next(
        (traces for marker, traces in size_markers if marker in file_name),
        None,
    )
    if size is None:
        # Fail with a clear message instead of a TypeError in the changepoint
        # computation below.
        raise ValueError(
            'Cannot derive the log size from file name %r' % file_name
        )

    # One changepoint after every 10% of the traces (9 changepoints in total).
    # Integer arithmetic keeps the results exact without a float round trip.
    changepoints = [size * step // 10 for step in range(1, 10)]

    # TODO Could implement the change pattern name here. For now, users can
    # look this up through the file_name and the paper by Maaradji et al. 2013.
    data_dictionary[log_file_path] = {
        'file_path': log_file_path,
        'file_name': file_name,
        'drift_type': 'sudden',
        'dataset': 'maardji et al 2013',
        'is_synthetic': True,
        'has_generated_attributes': False,
        'size': size,
        'changepoints': changepoints,
    }

# Display the result as the cell output.
data_dictionary

{'data/synthetic/maardji et al 2013_xes/logs\\cb10k.xes': {'file_path': 'data/synthetic/maardji et al 2013_xes/logs\\cb10k.xes',
  'file_name': 'cb10k',
  'drift_type': 'sudden',
  'dataset': 'maardji et al 2013',
  'is_synthetic': True,
  'has_generated_attributes': False,
  'size': 10000,
  'changepoints': [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]},
 'data/synthetic/maardji et al 2013_xes/logs\\cb2.5k.xes': {'file_path': 'data/synthetic/maardji et al 2013_xes/logs\\cb2.5k.xes',
  'file_name': 'cb2.5k',
  'drift_type': 'sudden',
  'dataset': 'maardji et al 2013',
  'is_synthetic': True,
  'has_generated_attributes': False,
  'size': 2500,
  'changepoints': [250, 500, 750, 1000, 1250, 1500, 1750, 2000, 2250]},
 'data/synthetic/maardji et al 2013_xes/logs\\cb5k.xes': {'file_path': 'data/synthetic/maardji et al 2013_xes/logs\\cb5k.xes',
  'file_name': 'cb5k',
  'drift_type': 'sudden',
  'dataset': 'maardji et al 2013',
  'is_synthetic': True,
  'has_generated_attributes': Fa

In [101]:
# save data_dictionary
# NOTE(review): update_data_dictionary comes from the project-local `helper`
# module, which is not shown here. Presumably it persists these entries to a
# stored data dictionary; confirm where and how before relying on it.
helper.update_data_dictionary(data_dictionary)

In [86]:
# Select one converted log and look up its entry in the data dictionary.
# The dictionary keys were built with os.path.normpath, so the lookup key is
# normalised the same way. A hard-coded backslash path would only match on
# Windows.
file_path = os.path.normpath(
    'data/synthetic/maardji et al 2013_xes/logs/cb2.5k.xes'
)
data_info = data_dictionary[file_path]

In [None]:
# TODO: unfinished cell. The right-hand side of this assignment is missing,
# so the statement does not parse until the expression is filled in.
opyenxes_log = 

In [78]:
class AttributeSimulator:
    """Simulates attributes for the given event log."""

    def __init__(self, log, changepoints):
        """Create a new attribute simulator for an event log.

        Args:
            log: Event log that supports ``len()`` and is indexable as
                ``log[trace][event]``, where each event maps
                ``'time:timestamp'`` to a datetime. The log and its first and
                last trace must be non-empty, because their events are
                indexed directly.
                # TODO pm4py or opyenxes event log?
            changepoints: Changepoint locations belonging to ``log``; stored
                unchanged on the instance.
        """
        self.log = log
        # Keep the changepoints on the instance: the constructor takes them,
        # so they must not be silently discarded.
        self.changepoints = changepoints

        self.first_trace_index = 0
        self.last_trace_index = len(log) - 1

        # Timestamp of the first event of the first trace and of the last
        # event of the last trace, with any timezone information dropped.
        self.first_trace_timestamp = log[0][0]['time:timestamp'].replace(tzinfo=None)
        self.last_trace_timestamp = log[-1][-1]['time:timestamp'].replace(tzinfo=None)

In [None]:
# We start with the implementation of one type of attribute behavior.

# Attribute settings.
attribute_level = 'trace'
attribute_type = 'categorical'
number_categorical_values = 5

# Drift settings.
drift_type = 'sudden'
concept_change = 'categorical_oversampling'

# Remaining settings. `data_stationariy` keeps its original spelling so that
# any existing reference to the name still resolves.
noise_level = 'low'
data_stationariy = 'stationary'