In [None]:
import os
import re
import gzip
import glob
import json
import shutil
import zipfile
import pandas as pd
import urllib.request
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.importer.xes import importer as xes_importer

dataset_dir = '../datasets/'

# Make sure the dataset directory exists (no-op when it is already there).
os.makedirs(dataset_dir, exist_ok=True)

# The datasets to fetch are listed in datasets.json; every entry provides the
# 'name', 'filename' and 'url' keys read below.
with open(dataset_dir + 'datasets.json') as f:
    datasets = json.load(f)

# Download (and, for .zip / .gz archives, decompress) each dataset once.
for dataset in datasets:
    full_path = dataset_dir + dataset['filename']
    # A file at the target path is taken to mean a finished download.
    if os.path.exists(full_path):
        print(dataset['name'], 'has been already downloaded.')
        continue

    print('Downloading', dataset['name'])
    # Download under a temporary name and rename only on success, so that an
    # interrupted transfer is never mistaken for a finished download by the
    # existence check above on the next run.
    partial_path = full_path + '.part'
    urllib.request.urlretrieve(dataset['url'], partial_path)
    os.replace(partial_path, full_path)

    if dataset['filename'].endswith('.zip'):
        print('Decompressing', dataset['name'])
        # Zip archives are unpacked directly into the dataset directory.
        with zipfile.ZipFile(full_path) as zip_ref:
            zip_ref.extractall(dataset_dir)
    elif dataset['filename'].endswith('.gz'):
        print('Decompressing', dataset['name'])
        # The decompressed file keeps the archive's name minus the ".gz".
        output_filename = full_path[:-len('.gz')]
        with gzip.open(full_path, 'rb') as f_in, \
                open(output_filename, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)


In [None]:
# Turn every XES event log under ../datasets/ (searched recursively) into a
# CSV file written next to it.
xes_paths = glob.glob('../datasets/**/*.xes', recursive=True)
for xes_path in xes_paths:
    # Same path with the trailing ".xes" swapped for ".csv".
    csv_path = re.sub(r'\.xes$', '.csv', xes_path)
    print('Converting', xes_path, 'to', csv_path)
    # Parse the XES file, then convert the resulting log to a data frame.
    event_log = xes_importer.apply(xes_path)
    to_frame = log_converter.Variants.TO_DATA_FRAME
    events = log_converter.apply(event_log, variant=to_frame)
    # Print the first rows as a quick look at what was converted.
    print(events.head())
    # Store the frame as CSV without the pandas index column.
    events.to_csv(csv_path, index=False)

In [None]:
# Change the field delimiter of the ';'-separated CSV files listed below to ','.
# NOTE(review): only BPIC2016 files are listed here; add the BPIC2014 paths if
# they need the same treatment.
import re
files = [
    '../datasets/BPIC2016_Clicks_Logged_In.csv',
    '../datasets/BPIC2016_Clicks_NOT_Logged_In.csv'
]
def convert(filename):
    """Rewrite ``filename`` in place with every ';' replaced by ','.

    The converted text is first written to a sibling ``<filename>.tmp`` file
    and then moved over the original, so the original is only replaced once
    the whole file has been converted.

    The file is read and written as latin_1 with newline translation turned
    off, so apart from the delimiter the content is left byte-for-byte
    unchanged and calling this again on a converted file is harmless.

    NOTE: this is a plain character substitution, not CSV parsing. A ';'
    inside a quoted field is replaced too, and a field that already contains
    ',' will be split into extra columns.

    Parameters
    ----------
    filename : str
        Path of the ';'-delimited file to convert.

    Raises
    ------
    OSError
        If ``filename`` cannot be read or the converted file cannot be written.
    """
    print('convert the delimiter of', filename)
    tmp_filename = filename + '.tmp'
    with open(filename, encoding='latin_1', newline='') as inputs, \
            open(tmp_filename, 'w', encoding='latin_1', newline='') as outputs:
        for line in inputs:
            outputs.write(re.sub(';', ',', line))
    # Both handles are closed (and flushed) here; now swap the result in.
    os.replace(tmp_filename, filename)

for path in files:
    # Report a missing file and carry on with the rest instead of aborting.
    if os.path.exists(path):
        convert(path)
    else:
        print('skip', path, '(file not found)')

In [None]:
# Peek at the first two rows of the BPIC2014 change log.
# (To preview every path listed in `files` instead:
#  [pd.read_csv(x, nrows=2) for x in files])
change_log_csv = '../datasets/BPIC2014_change_log.csv'
pd.read_csv(change_log_csv, nrows=2)