# Import and Preprocess Data
Helper notebook to
1. unpack data
2. transform it into XES format
3. create a data dictionary, a catalog with all information about the data

In [1]:
import os
import opyenxes
from opyenxes.data_in import XMxmlParser
from opyenxes.data_out import XesXmlSerializer
import json
import helper
import zipfile
import shutil
import gzip

# Synthetic

### Maaradji et al. 2013 Dataset

72 event logs with different kinds of concept drift.

Instructions:
1. Download from https://data.4tu.nl/articles/dataset/Business_Process_Drift/12712436
2. Specify the file path to the zip file below.
3. Execute cells below.
4. The extracted files should be placed in d"ata/synthetic/maardji et al 2015_mxml"

This script does:
- unpack the data
- convert the data from MXML to XES
- create data dictionary entries
    - For each event log, a true list of all changepoints should be generated (every 10% of traces).

In [2]:
# change the from path to your download location
zip_path = 'data/synthetic/maaradji et al 2013.zip'

#### 1. Unpack the data

In [3]:
# unpacks the data into the correct folder
unzipped_path = 'data/synthetic/maardji et al 2013_mxml/'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(unzipped_path)

#### 2. Convert to XES

In [4]:
# set the from and to path
mxml_path = unzipped_path
xes_path = 'data/synthetic/maardji et al 2013_xes/'

In [5]:
# replicate the folder that has the MXML files but convert them into .XES files
def all_mxml_to_xes(mxml_path, xes_path):
    # get all mxml files from the path
    all_files = set()
    for (dirpath, dirnames, filenames) in os.walk(mxml_path):
        all_files.update([os.path.normpath(os.path.join(dirpath, file)) for file in filenames])
    all_mxml_files = set({file for file in all_files if file.lower().endswith('mxml')})
    all_none_mxml_files = all_files - all_mxml_files
    # print(all_none_mxml_files)
    
    # copy all none mxml files without changing them
    for none_mxml_file in all_none_mxml_files:
        new_path = helper.create_and_get_new_path(none_mxml_file, mxml_path, xes_path)
        
        # copy content 
        shutil.copy2(none_mxml_file, new_path)
    
    # change all mxml to xes
    xes_log_file_paths = []
    for mxml_log_path in all_mxml_files:
        # read the mxml log into OpyenXes
        mxml_parser = XMxmlParser.XMxmlParser()
        parsed_logs = None
        with open(mxml_log_path) as mxml_log_file:
            parsed_logs = mxml_parser.parse(mxml_log_file)
        
        # Our mxml files always only contain one log. Therefore, access this log
        parsed_log = parsed_logs[0]
        
        new_path = helper.create_and_get_new_path(mxml_log_path, mxml_path, xes_path, new_extension='.xes')
        print(new_path)
        
        # write the XES log out
        with open(new_path, 'w') as new_file:
            # save log back to XES file
            XesXmlSerializer.XesXmlSerializer().serialize(parsed_log, new_file)
        
        xes_log_file_paths.append(new_path)

    
    return xes_log_file_paths

In [6]:
xes_log_file_paths = all_mxml_to_xes(mxml_path, xes_path)

data\synthetic\maardji et al 2013_xes\logs\IOR\IOR7.5k.xes
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (30181.445556640625 msec.)

data\synthetic\maardji et al 2013_xes\logs\OIR\OIR7.5k.xes
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (24476.035400390625 msec.)

data\synthetic\maardji et al 2013_xes\logs\OIR\OIR5k.xes
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (12872.295654296875 msec.)

data\synthetic\maardji et al 2013_xes\logs\ROI\ROI2.5k.xes
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (5811.291015625 msec.)

data\synthetic\maardji et al 2013_xes\logs\fr\fr5k.xes
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (11419.871826171875 msec.)

data\synth

In [7]:
xes_log_file_paths

['data\\synthetic\\maardji et al 2013_xes\\logs\\IOR\\IOR7.5k.xes',
 'data\\synthetic\\maardji et al 2013_xes\\logs\\OIR\\OIR7.5k.xes',
 'data\\synthetic\\maardji et al 2013_xes\\logs\\OIR\\OIR5k.xes',
 'data\\synthetic\\maardji et al 2013_xes\\logs\\ROI\\ROI2.5k.xes',
 'data\\synthetic\\maardji et al 2013_xes\\logs\\fr\\fr5k.xes',
 'data\\synthetic\\maardji et al 2013_xes\\logs\\pm\\pm2.5k.xes',
 'data\\synthetic\\maardji et al 2013_xes\\logs\\cb\\cb10k.xes',
 'data\\synthetic\\maardji et al 2013_xes\\logs\\cf\\cf5k.xes',
 'data\\synthetic\\maardji et al 2013_xes\\logs\\ORI\\ORI7.5k.xes',
 'data\\synthetic\\maardji et al 2013_xes\\logs\\rp\\rp5k.xes',
 'data\\synthetic\\maardji et al 2013_xes\\logs\\cm\\cm5k.xes',
 'data\\synthetic\\maardji et al 2013_xes\\logs\\re\\re10k.xes',
 'data\\synthetic\\maardji et al 2013_xes\\logs\\OIR\\OIR10k.xes',
 'data\\synthetic\\maardji et al 2013_xes\\logs\\cd\\cd5k.xes',
 'data\\synthetic\\maardji et al 2013_xes\\logs\\IOR\\IOR10k.xes',
 'data\\synt

#### 3. Create Data Dictionary

In [8]:
# create a data dictionary that has a list of changepoint locations for each event log
data_dictionary = {}
for log_file_path in xes_log_file_paths:
    log_info = {}
    log_info['file_path'] = log_file_path
    
    # get the cleaned file name
    file_name =  os.path.basename(log_file_path)
    # remove the suffix
    file_name = os.path.splitext(file_name)[0]
    log_info['file_name'] = file_name
    
    log_info['drift_type'] = 'sudden'
    log_info['dataset'] = 'maardji et al 2013'
    log_info['is_synthetic'] = True
    log_info['has_generated_attributes'] = False
    
    size = None
    if '2.5k' in file_name:
        size = 2500
    elif '7.5k' in file_name:
        size = 7500
    elif '10k' in file_name:
        size = 10000
    elif '5k' in file_name: # This statement has to come last so that there is no confusion with the other names that have a 5 in them.
        size = 5000
    log_info['size'] = size
    
    change_points = [(int) (size * (i+1)/10) for i in range(9)]
    log_info['change_points'] = change_points
    
    # TODO Could implement the change pattern name here. For now, users can looks this up through the file_name and paper by Maardji et al. 2013
    data_dictionary[log_file_path] = log_info

In [9]:
# save the data dictionary
helper.update_data_dictionary(data_dictionary)

## 2013 BPI Challenge Dataset
TODO: Insert instructions, add to data dictionary

In [10]:
# unpack the 2013 bpi challenge dataset
path = 'data/real/bpi_challenge_2013_incidents.xes.gz'

with gzip.open(path, 'rb') as f_in:
    new_path = path[:-3]
    with open(new_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)