In [2]:
# %load /home/jonathan/.ipython/profile_default/startup/01-setup.py
# startup settings for Jupyter notebooks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, sys

plt.style.use('ggplot')
plt.rcParams['font.size'] = 15.0
plt.rcParams['axes.labelsize'] = 15.0
plt.rcParams['xtick.labelsize'] = 15.0
plt.rcParams['ytick.labelsize'] = 15.0
plt.rcParams['legend.fontsize'] = 15.0

%matplotlib inline

# raise the maximum displayed column width so long cell values are not truncated
pd.set_option('display.max_colwidth', 1000)

# switch off the chained-assignment check so pandas emits no warnings for it
pd.set_option('mode.chained_assignment', None)


In [9]:
import itertools as itrs
from podspy.structure import CausalMatrix
from podspy.log import constants as cnst
from podspy.log import factory as fty
from podspy.log import table as tb

In [16]:
# create event log
# Contains the following traces (with frequency)
# 45 <a, c, d>
# 42 <b, c, d>
# 38 <a, c, e>
# 22 <b, c, e>
t0 = ['a', 'c', 'd']
t1 = ['b', 'c', 'd']
t2 = ['a', 'c', 'e']
t3 = ['b', 'c', 'e']

traces = [t0, t1, t2, t3]

freq0 = 45
freq1 = 42
freq2 = 38
freq3 = 22

freqs = [freq0, freq1, freq2, freq3]

# materialise the pairs: a bare zip() iterator is exhausted after one pass,
# which would silently yield nothing if the loop below were run again
trace_freq_list = list(zip(traces, freqs))

caseid = 0
event_rows = []

for trace, freq in trace_freq_list:
    for i in range(freq):
        # one (caseid, activity) row per event; each repetition of a trace
        # variant is a separate case with its own caseid
        event_rows.extend((caseid, activity) for activity in trace)

        caseid += 1

labels = [cnst.CASEID, 'concept:name']
event_df = pd.DataFrame.from_records(event_rows, columns=labels)

# One row per case, with the case named after its caseid. Built with
# drop_duplicates() + assign() so that no column is written onto a slice of
# event_df (which only worked quietly because chained-assignment warnings
# are disabled in the startup cell).
trace_df = event_df[[cnst.CASEID]].drop_duplicates()
trace_df = trace_df.assign(**{'concept:name': trace_df[cnst.CASEID]})

# sanity check: exactly one trace row per generated case
assert len(trace_df) == sum(freqs)

In [14]:
from opyenxes.factory.XFactory import XFactory
from opyenxes.extension.XExtensionManager import XExtensionManager
from opyenxes.extension.XExtension import XExtension
from opyenxes.classification.XEventAndClassifier import XEventAndClassifier
from opyenxes.classification.XEventLifeTransClassifier import XEventLifeTransClassifier
from opyenxes.classification.XEventNameClassifier import XEventNameClassifier
from opyenxes.classification.XEventResourceClassifier import XEventResourceClassifier

In [18]:
# Lookup tables from attribute key to the XFactory method used to create that
# attribute: one table for trace-level attributes, one for event-level ones.
trace_attrib_to_factory = {'concept:name': XFactory.create_attribute_literal}
event_attrib_to_factory = {'concept:name': XFactory.create_attribute_literal}


def get_extension(attrib_name):
    """Look up the extension registered for an attribute key's prefix.

    The prefix is the text before the first ``':'`` in the key, e.g.
    ``'concept'`` for ``'concept:name'``.

    :param attrib_name: attribute key
    :return: the result of ``XExtensionManager().get_by_prefix(prefix)``,
        or ``None`` when the key contains no ``':'``
    """
    prefix, colon, _ = attrib_name.partition(':')

    # no separator means the key carries no extension prefix
    if not colon:
        return None

    return XExtensionManager().get_by_prefix(prefix)
    

def create_attribute(attrib, attrib_type, attrib_to_factory_map, extensions):
    """Create one attribute through the factory registered for its key.

    :param attrib: the attribute value
    :param attrib_type: the attribute key, e.g. ``'concept:name'``
    :param attrib_to_factory_map: mapping from attribute key to a callable
        invoked as ``factory(attrib_type, attrib, extension)``
    :param extensions: set collecting every extension used so far; the
        extension of a prefixed key is added to it
    :return: whatever the factory callable returns
    :raises KeyError: if ``attrib_type`` has no entry in the factory map
    :raises AssertionError: if a prefixed key does not resolve to an
        ``XExtension`` instance
    """
    make_xattrib = attrib_to_factory_map[attrib_type]

    # keys without a prefix are created without an extension
    if ':' not in attrib_type:
        return make_xattrib(attrib_type, attrib, None)

    ext = get_extension(attrib_type)

    assert isinstance(ext, XExtension), \
        'Extension is a {}'.format(str(type(ext)))

    # remember the extension so that it can be declared on the log later
    extensions.add(ext)

    return make_xattrib(attrib_type, attrib, ext)

    
def convert_event_row_to_xevent(event_attrib_types, event_row, extensions):
    """Build an event from one row of attribute values.

    :param event_attrib_types: attribute keys, positionally aligned with
        ``event_row``
    :param event_row: attribute values of a single event
    :param extensions: set collecting the extensions used by the attributes
    :return: the event created by ``XFactory.create_event``
    """
    xattribs = XFactory.create_attribute_map()

    for position, value in enumerate(event_row):
        xattrib = create_attribute(value, event_attrib_types[position],
                                   event_attrib_to_factory,
                                   extensions)

        # the map is keyed by the attribute's own key
        xattribs[xattrib.get_key()] = xattrib

    return XFactory.create_event(xattribs)


def create_trace(trace_attrib_types, trace_row,
                 event_attrib_types, event_df,
                 extensions):
    """Build a trace from its attribute row and the rows of its events.

    Every row of ``event_df`` is treated as an event of this trace; only the
    columns listed in ``event_attrib_types`` are read from it.

    :param trace_attrib_types: attribute keys, positionally aligned with
        ``trace_row``
    :param trace_row: attribute values of the trace
    :param event_attrib_types: columns of ``event_df`` to turn into event
        attributes
    :param event_df: dataframe holding the events of this trace
    :param extensions: set collecting the extensions used by the attributes
    :return: the trace created by ``XFactory.create_trace`` with one event
        appended per row of ``event_df``
    """
    # trace-level attributes first
    trace_xattribs = XFactory.create_attribute_map()

    for position, value in enumerate(trace_row):
        xattrib = create_attribute(value, trace_attrib_types[position],
                                   trace_attrib_to_factory,
                                   extensions)
        trace_xattribs[xattrib.get_key()] = xattrib

    xtrace = XFactory.create_trace(trace_xattribs)

    # then one event per dataframe row, in row order
    for _, event in event_df.iterrows():
        values = [event[key] for key in event_attrib_types]
        xtrace.append(
            convert_event_row_to_xevent(event_attrib_types, values,
                                        extensions)
        )

    return xtrace


def create_xlog(trace_df, event_df, caseid_col='caseid'):
    """Build a log from a trace table and an event table.

    Every column other than ``caseid_col`` becomes an attribute of the trace
    (for ``trace_df``) or of the event (for ``event_df``). Events are attached
    to the trace whose ``caseid_col`` value they share; a trace without
    matching events is added with no events, and events whose case id does
    not occur in ``trace_df`` are ignored.

    :param trace_df: dataframe with one row per trace, including
        ``caseid_col``
    :param event_df: dataframe with one row per event, including
        ``caseid_col``
    :param caseid_col: name of the column linking events to traces
    :return: the log created by ``XFactory.create_log``, with one trace per
        row of ``trace_df``, the extensions used by its attributes, and two
        classifiers named 'Activity classifier' and 'Resource classifier'
    """
    xlog = XFactory.create_log()

    # the linking column is structural, not an attribute
    trace_attrib_types = [col for col in trace_df.columns
                          if col != caseid_col]
    event_attrib_types = [col for col in event_df.columns
                          if col != caseid_col]

    extensions = set()

    # Group the events once (case id -> row positions, original row order
    # kept) instead of re-scanning the whole event table for every trace.
    # Skipped when there are no traces, so event_df is not touched then.
    if len(trace_df) > 0:
        event_positions = event_df.groupby(caseid_col, sort=False).indices
    else:
        event_positions = {}

    for _, trace in trace_df.iterrows():
        trace_row = [trace[key] for key in trace_attrib_types]

        # get the events that are related to this trace; an unknown case id
        # selects no rows
        caseid = trace[caseid_col]
        events = event_df.iloc[event_positions.get(caseid, [])]

        xtrace = create_trace(trace_attrib_types, trace_row,
                              event_attrib_types, events, extensions)

        xlog.append(xtrace)

    # add the used extensions
    for ext in extensions:
        xlog.get_extensions().add(ext)

    # create a classifier that classifies using activity and lifecycle
    activity_clf = XEventNameClassifier()
    lifecycle_clf = XEventLifeTransClassifier()
    activity_lifecycle_clf = XEventAndClassifier([activity_clf,
                                                  lifecycle_clf])
    activity_lifecycle_clf.set_name('Activity classifier')
    resource_clf = XEventResourceClassifier()
    resource_clf.set_name('Resource classifier')

    xlog.get_classifiers().append(activity_lifecycle_clf)
    xlog.get_classifiers().append(resource_clf)

    return xlog

In [19]:
# build the log from the trace and event tables created above
xlog = create_xlog(trace_df, event_df)

In [20]:
from opyenxes.data_out.XesXmlGZIPSerializer import XesXmlGZIPSerializer

In [21]:
# save the log in XES format
xlog_fp = './data/small-log.xes.gz'

# make sure the output directory exists, so that open() does not fail when
# the notebook is run from a location without a ./data folder
os.makedirs(os.path.dirname(xlog_fp), exist_ok=True)

with open(xlog_fp, 'w') as f:
    XesXmlGZIPSerializer().serialize(xlog, f)

Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (21.628173828125 msec.)



In [24]:
%%bash
# stop at the first failing command instead of running the later steps anyway
set -euo pipefail

# decompress next to the archive with plain gzip (the .gz file is kept),
# instead of extracting into the working directory and moving the result
gzip -dc ./data/small-log.xes.gz > ./data/small-log.xes

# preview the beginning of the XES document
head -n 20 ./data/small-log.xes

<?xml version="1.0" ?>
<!--This file has been generated with the OpenXES library. It conforms-->
<!--to the XML serialization of the XES standard for log storage and-->
<!--management.-->
<!--XES standard version: 1.0-->
<!--OpenXES library version: 1.0RC7-->
<!--OpenXES is available from http://www.openxes.org/-->
<log openxes.version="1.0RC7" xes.features="nested-attributes" xes.version="1.0">
	<extension name="Concept" prefix="concept" uri="http://www.xes-standard.org/concept.xesext"/>
	<classifier keys="concept:name lifecycle:transition" name="Activity classifier"/>
	<classifier keys="org:resource" name="Resource classifier"/>
	<trace>
		<string key="concept:name" value="0"/>
		<event>
			<string key="concept:name" value="a"/>
		</event>
		<event>
			<string key="concept:name" value="c"/>
		</event>
		<event>
