# Preprocess Data

In [58]:
import os
import opyenxes
from opyenxes.data_in import XMxmlParser
from opyenxes.data_out import XesXmlSerializer

## Maaradji et al. 2013 Dataset
- The dataset needs to be converted to XES for importing into pm4py
- For each event log, a true list of all changepoints should be generated (every 10% of traces).
- Simplify the folder structure in only a list of XES event logs plus their according changepoints.
- Add artificial feature data that can explain the concept drift.

In [59]:
# set the from and to path
from_path = 'data/synthetic/maardji et al 2013_mxml/'
to_path = 'data/synthetic/maardji et al 2013_xes/'

In [60]:
# get all mxml event log files from the from_path
list_of_files = list()
for (dirpath, dirnames, filenames) in os.walk(from_path):
    list_of_files += [os.path.join(dirpath, file) for file in filenames]

list_of_mxml_log_paths = [file for file in list_of_files if file.lower().endswith('mxml')]

In [None]:
# convert each log to XES and save it in the to path
to_path_logs = os.path.join(to_path, 'logs')
    
# create to_path if not existing
if not os.path.exists(to_path):
    # Create a new directory because it does not exist 
    os.makedirs(to_path)

xes_log_file_paths = []
for mxml_log_path in list_of_mxml_log_paths:
    # read the mxml log into OpyenXes
    mxml_parser = XMxmlParser.XMxmlParser()
    parsed_logs = None
    with open(mxml_log_path) as mxml_log_file:
        parsed_logs = mxml_parser.parse(mxml_log_file)
    
    # Our mxml files always only contain one log. Therefore, access this log
    parsed_log = parsed_logs[0]
    
    # get the output path
    file_name = os.path.basename(mxml_log_path)
    print(file_name)
    # remove the suffix
    file_name = os.path.splitext(file_name)[0]
    # append .xes
    file_name = file_name + '.xes'
    
    to_file_path = os.path.join(to_path_logs, file_name)
    xes_log_file_paths.append(to_file_path)
    
    # write the XES log out
    with open(to_file_path, 'w') as to_file:
        # save log back to XES file
        XesXmlSerializer.XesXmlSerializer().serialize(parsed_log, to_file)

cb10k.mxml
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (23797.61181640625 msec.)

cb2.5k.mxml
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (8428.402587890625 msec.)

cb5k.mxml
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (13311.8203125 msec.)

cb7.5k.mxml
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (20389.671875 msec.)

cb7.5k-checkpoint.mxml
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (23568.2236328125 msec.)

cd10k.mxml
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (29876.697998046875 msec.)

cd2.5k.MXML
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG

In [None]:
# create a data dictionary that has a list of changepoint locations for each event log
data_dictionary = {}
for log_file_path in xes_log_file_paths:
    log_info = {}
    log_info['file_path'] = log_file_path
    
    # get the cleaned file name
    file_name =  os.path.basename(log_file_path)
    # remove the suffix
    file_name = os.path.splitext(file_name)[0]
    log_info['file_name'] = file_name
    
    log_info['drift_type'] = 'sudden'
    log_info['dataset'] = 'maardji et al 2013'
    
    size = None
    if '2.5k' in file_name:
        size = 2500
    elif '7.5k' in file_name:
        size = 7500
    elif '10k' in file_name:
        size = 10000
    elif '5k' in file_name: # This statement has to come last so that there is no confusion with the other names that have a 5 in them.
        size = 5000
    log_info['size'] = size
    
    changepoints = [(int) (size * (i+1)/10) for i in range(9)]
    log_info['changepoints'] = changepoints
    
    # TODO Could implement the change pattern name here. For now, users can looks this up through the file_name and paper by Maardji et al. 2013
    data_dictionary[log_file_path] = log_info