In [1]:
%load_ext autoreload
%autoreload 2
%load_ext memory_profiler

%matplotlib inline

In [2]:
from exp.eventlog import *

In [3]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [4]:
import networkx as nx
from nxpd import draw
from nxpd import nxpdParams
nxpdParams['show'] = 'ipynb'

In [5]:
%%time
%%memit
# Resolve the BPIC 2012 dataset via untar_data (presumably downloads/extracts the
# archive — TODO confirm) and parse it with import_xes into `log`.
# All optional import_xes flags (extensions, classifiers, schema, log_attributes)
# are passed as False.
path=untar_data(URLs.BPIC_2012)
log=import_xes(path,extensions=False,classifiers=False,schema=False,log_attributes=False)

BPIC_2012/BPI_Challenge_2012.xes.gz
failed to parse date: 1970-01-01T00:00:00.000+01:00
failed to parse date: 1970-01-01T00:00:00.000+01:00
failed to parse date: 2012-04-23T00:00:00.000+02:00
failed to parse date: 2011-10-01T00:38:44.546+02:00
failed to parse date: 2012-03-14T16:04:54.681+01:00
peak memory: 1403.04 MiB, increment: 1270.39 MiB
CPU times: user 6.88 s, sys: 433 ms, total: 7.31 s
Wall time: 7.41 s


In [6]:
# Short alias for the event table of the imported log; used by the cells below.
df=log.events

In [7]:
# Sort events by (trace_id, event_id) and group them per trace, so that each
# group lists the events of one trace in that sorted order.
group_by_case=df.sort_values(['trace_id','event_id'],ascending=True).groupby('trace_id')


In [8]:
# Activity name -> integer id, numbered in order of first appearance in the log.
aToi= {j:i for (i,j) in enumerate(df['concept:name'].drop_duplicates())}
# Integer id -> activity name: invert the forward map instead of de-duplicating
# the column a second time.
iToa= {i:j for (j,i) in aToi.items()}


In [9]:
# Resource -> integer id, numbered in order of first appearance in the log.
uToi= {j:i for (i,j) in enumerate(df['org:resource'].drop_duplicates())}
# Integer id -> resource: invert the forward map instead of de-duplicating
# the column a second time.
iTou= {i:j for (j,i) in uToi.items()}

In [None]:
def transistion_matrix(c,d,group_by_case):
    """
    Counts directly-follows transitions of one column over all groups.

    For every group, each pair of consecutive values (v, w) found in column
    ``c`` adds one to cell ``[d[v], d[w]]`` of the result.

    :param c: name of the column whose consecutive values are counted
    :param d: dict mapping each value of that column to its matrix index
    :param group_by_case: pandas groupby (normally grouped by case)
    :return: float numpy array of shape (len(d), len(d)) holding the counts
    """
    size=len(d)
    counts = np.zeros(shape=(size,size))
    for _, events in group_by_case:
        sequence=events[c].values
        # Pair every value with its successor; groups shorter than two yield no pairs.
        for current, following in zip(sequence[:-1], sequence[1:]):
            counts[d[current],d[following]]+=1

    return counts


In [None]:
# Activity-to-activity transition counts, plus a DataFrame view of the same
# matrix whose rows and columns are both labelled with the activity names.
tm=transistion_matrix('concept:name',aToi,group_by_case)
_df=pd.DataFrame(tm,index=list(aToi),columns=list(aToi))
_df

In [None]:
def control_flow(F,d,node_freq,dpi=120,LR=True,xmin=5000,activity_frequency=1000):
    """
    Draws a frequency-filtered directed graph from a transition matrix.

    :param F: 2-D numpy array; F[x, y] is the weight of the edge from node x to node y
    :param d: lookup from matrix index to node name (used as d[x]); names must be
              strings because they are concatenated into the node label
    :param node_freq: lookup from node name to its frequency
    :param dpi: value stored in the graph attribute 'dpi'
    :param LR: if truthy, the graph attribute 'rankdir' is set to 'LR'
    :param xmin: only matrix cells strictly greater than this value are considered
    :param activity_frequency: only nodes whose frequency is strictly greater than
                               this value are added to the graph
    :return: the result of nxpd's draw(G)
    """
    G = nx.DiGraph()
    # Honour the caller's dpi (it used to be overwritten with a hard-coded 120).
    G.graph['dpi'] = dpi
    if LR:
        G.graph['rankdir'] = 'LR'
    x_min=xmin
    # An empty matrix has no maximum; no cell passes the filter in that case anyway.
    x_max=F.max() if F.size else x_min
    # Edge pen widths are scaled linearly from [x_min, x_max] into [y_min, y_max].
    y_min = 1.0
    y_max = 5.0

    nodes=set()

    for (x,y), value in np.ndenumerate(F):
        if value<=x_min:
            continue
        src,dst=d[x],d[y]
        src_ok=node_freq[src]>activity_frequency
        dst_ok=node_freq[dst]>activity_frequency
        # A frequent endpoint is kept as a node even when its partner is filtered out.
        if src_ok:
            nodes.add(src)
        if dst_ok:
            nodes.add(dst)
        if src_ok and dst_ok:
            # x_max >= value > x_min here, so the denominator is strictly positive.
            yv = y_min + (y_max-y_min) * float(value-x_min) / float(x_max-x_min)
            G.add_edge(src,dst, label=''+str(value), penwidth=yv)

    for ai in nodes:
        text = ai + '\n (' + str(node_freq[ai]) + ')'
        G.add_node(ai, label=text)

    return draw(G)

In [None]:
# Number of events per activity name, as a plain dict {activity name: count}.
A_dict=df.groupby('concept:name').size().to_dict()



In [None]:
len(aToi)

In [None]:
def view(edge_frequence=1,activity_frequence=1,trace_frequence=1):
    """
    Widget callback: redraws the control-flow graph for the given slider values.

    Each slider value s is turned into a threshold max_count/(s+1), where
    max_count is the largest value in the global A_dict, so larger slider values
    give lower thresholds. The two thresholds are printed and then passed to
    control_flow as xmin (edges) and activity_frequency (nodes).

    :param edge_frequence: slider value controlling the edge threshold
    :param activity_frequence: slider value controlling the node threshold
    :param trace_frequence: accepted so a slider can be bound to it, but currently
                            unused; control_flow has no matching parameter, and
                            forwarding it raised a TypeError
    """
    _max=np.max(list(A_dict.values()))
    edge_threshold=_max/(edge_frequence+1)
    activity_threshold=_max/(activity_frequence+1)
    print(edge_threshold,activity_threshold)
    display(control_flow(tm,iToa,A_dict,activity_frequency=activity_threshold,xmin=edge_threshold,LR=False))


In [None]:
A_dict

In [None]:
%matplotlib inline
from ipywidgets import interactive
import matplotlib.pyplot as plt
import numpy as np


# Bind view() to three integer sliders, each ranging from 1 to 100.
interactive_plot = interactive(view, edge_frequence=(1,100),activity_frequence=(1,100),trace_frequence=(1,100))
# The last child of the interactive container is taken as the output area and
# given a fixed height (presumably to stop the page jumping on redraw — TODO confirm).
output = interactive_plot.children[-1]
output.layout.height = '1000px'
interactive_plot

In [None]:
def trace_vis(edges,dpi=120,rankdir='LR',inline=True):
    """
    Builds a directed graph from labelled edges and optionally displays it.

    Edges sharing the same (source, target) pair are merged into one edge whose
    label is the comma-separated list of the individual labels, in input order.

    :param edges: iterable of (source, target, label) triples; source and target
                  are converted to strings and used as node names
    :param dpi: value stored in the graph attribute 'dpi'
    :param rankdir: value stored in the graph attribute 'rankdir'
    :param inline: if truthy, the graph is drawn with nxpd and displayed;
                   otherwise nothing is rendered
    """
    G = nx.DiGraph()
    G.graph['dpi'] = dpi
    G.graph['rankdir'] = rankdir
    # Key by a (source, target) tuple rather than a delimiter-joined string, so
    # node names containing the old '-#-#' delimiter cannot corrupt the edge.
    labels_by_edge={}
    for e in edges:
        labels_by_edge.setdefault((str(e[0]),str(e[1])),[]).append(str(e[2]))
    for (source,target),labels in labels_by_edge.items():
        G.add_edge(source,target,label=', '.join(labels))
    if inline:
        display(draw(G,show='ipynb'))
        
def most_frequent_traces(group_by_case,kmf=5):
    """
    Counts how often each distinct activity sequence (trace variant) occurs.

    Every group is encoded as a string of activity ids joined by '-' (ids come
    from the global aToi), some summary statistics are printed, and the variants
    are returned sorted by descending frequency.

    :param group_by_case: pandas groupby whose groups have a 'concept:name' column
    :param kmf: unused; kept for interface compatibility
    :return: DataFrame with columns 'count' (occurrences) and 'as' (encoded trace),
             sorted by 'count' descending; empty if there are no groups
    """
    activity_strings=[
        [n,'-'.join(str(aToi[i]) for i in g['concept:name'].values)]
        for n,g in group_by_case
    ]

    # No traces: there is no sample to print and min/max are undefined.
    if not activity_strings:
        print('Number of traces:',0)
        return pd.DataFrame(columns=['count','as'])

    print('Number of traces:',len(activity_strings),'Sample:', activity_strings[0])

    # Occurrences per encoded trace; computed once and reused below.
    count=pd.DataFrame(activity_strings, columns=['trace_id',"a"]).groupby('a').size()
    # Some statistics about trace frequencies
    print('Mean',np.mean(count))
    print('Min',np.min(count),np.argmin(count.values))
    print('Max',np.max(count),np.argmax(count.values))
    # Group keys are unique, so the number of variants is the length of count.
    print('Number of different activity traces',len(count))

    # Most frequent traces first
    mostfrequent=pd.DataFrame(list(count),columns=['count'])
    mostfrequent['as']=count.index
    mostfrequent=mostfrequent.sort_values('count',ascending=False)

    return mostfrequent
    
    
def vis_most_frequent_traces(mostfrequent,kmf=5):
    """
    Prints coverage statistics and draws the kmf most frequent trace variants.

    :param mostfrequent: DataFrame with columns 'count' and 'as' (activity ids
                         joined by '-'), sorted by 'count' descending; presumably
                         the result of most_frequent_traces
    :param kmf: number of leading rows (most frequent variants) to visualise
    """
    counts=mostfrequent['count']
    # How many traces the top-i variants cover
    for i in [30,50,100,150,200,300,500]:
        print(f"sum of mostfrequent_{i}",np.sum(counts.values[0:i]))
    # How many traces belong to variants occurring more than i times
    for i in [5,10,20,50,100,200,500]:
        print(f"sum of frequency_greater_{i}",np.sum(counts[counts>i].values))
    # The total is derived from the table itself: the name number_of_traces was
    # previously undefined in this scope. Equals the number of traces when
    # mostfrequent holds one row per variant.
    number_of_traces=counts.sum()
    print(f"{kmf}. frequent traces:",mostfrequent[:kmf]['count'].sum(),'/',number_of_traces)
    print('---------------------------------')

    for k,(index,row) in enumerate(mostfrequent[:kmf].iterrows(),start=1):
        count,a=row['count'],row['as']
        a_=a.split('-')
        print(f"{k}. frequent",count)
        print(a)
        # Decode consecutive id pairs back to activity names; the edge label is the step position.
        trace_vis([(iToa[int(a_[i])],iToa[int(a_[i+1])],i) for i in range(0,len(a_)-1)])

        
        

In [None]:
# Scratch experiment: pass a dict of dicts through a DataFrame and back to a dict.
# The inner dicts have different keys ('y' vs 'z'), so the result shows how
# pandas represents the missing entries.
x={'a':{'x':1,'y':2},'b':{'x':3,'z':7}}
pd.DataFrame(x).to_dict()


In [10]:
%%time
# Encode each case as its activity ids joined by '-', collected as [case, trace] pairs.
case_trace_list=[]
for case,g in group_by_case:
    trace='-'.join(str(aToi[i]) for i in g['concept:name'].values)
    case_trace_list += [[case,trace]]
    
       

CPU times: user 2.23 s, sys: 100 µs, total: 2.23 s
Wall time: 2.23 s


In [44]:
%%time



CPU times: user 2.12 s, sys: 0 ns, total: 2.12 s
Wall time: 2.12 s


Unnamed: 0_level_0,count
trace,Unnamed: 1_level_1
0-1-18,3429
0-1-20-20-18-20,1872
0-1-20-20-20-20-18-20,271
0-1-20-20-2-3-20-3-18-3,209
0-1-2-3-3-18-3,160
0-1-2-3-3-19-3,134
0-1-20-20-2-3-20-3-3-3-18-3,126
0-1-2-3-3-3-3-18-3,93
0-1-2-3-3-3-3-19-3,87
0-1-20-20-2-3-20-3-3-3-3-3-18-3,74


In [22]:
case_trace_list

trace_id
173688    0-1-2-3-3-4-5-6-7-8-9-3-9-9-9-9-9-10-11-9-11-1...
173691    0-1-2-3-3-3-3-4-6-5-7-8-9-3-9-9-9-5-16-7-8-9-9...
173694    0-1-2-3-3-3-3-3-3-3-3-4-6-5-7-8-9-3-9-5-16-7-8...
173697                                               0-1-18
173700                                               0-1-18
                                ...                        
214364    0-1-2-3-3-3-3-4-6-5-7-8-9-3-9-5-16-7-8-9-9-9-9...
214367                                               0-1-18
214370                                      0-1-20-20-18-20
214373                 0-1-20-20-2-3-20-3-4-5-6-7-8-9-3-9-9
214376                                      0-1-20-20-18-20
Length: 13087, dtype: object

Unnamed: 0_level_0,trace
trace_id,Unnamed: 1_level_1
173688,0-1-2-3-3-4-5-6-7-8-9-3-9-9-9-9-9-10-11-9-11-1...
173691,0-1-2-3-3-3-3-4-6-5-7-8-9-3-9-9-9-5-16-7-8-9-9...
173694,0-1-2-3-3-3-3-3-3-3-3-4-6-5-7-8-9-3-9-5-16-7-8...
173697,0-1-18
173700,0-1-18
173703,0-1-2-3-3-3-3-19-3
173706,0-1-20-20-2-3-20-3-3-3-3-3-18-3
173709,0-1-2-3-3-3-3-3-3-3-3-19
173712,0-1-20-20-2-3-20-3-3-3-3-3-19-3
173715,0-1-2-3-3-3-3-4-5-6-7-8-9-3-9-9-9-10-11-9-11-1...


CPU times: user 4.36 ms, sys: 13 µs, total: 4.38 ms
Wall time: 3.94 ms


In [42]:
_df

Unnamed: 0_level_0,case
trace,Unnamed: 1_level_1
0-1-2-3-3-4-5-6-7-8-9-3-9-9-9-9-9-10-11-9-11-12-13-14-15-11,173688
0-1-2-3-3-3-3-4-6-5-7-8-9-3-9-9-9-5-16-7-8-9-9-9-9-9-10-11-9-11-11-11-11-11-14-13-12-15-11,173691
0-1-2-3-3-3-3-3-3-3-3-4-6-5-7-8-9-3-9-5-16-7-8-9-9-9-9-9-9-9-9-9-5-16-7-8-9-9-9-9-9-9-9-9-9-9-9-9-9-10-11-9-11-14-13-12-15-11-17,173694
0-1-18,173697
0-1-18,173700
0-1-2-3-3-3-3-19-3,173703
0-1-20-20-2-3-20-3-3-3-3-3-18-3,173706
0-1-2-3-3-3-3-3-3-3-3-19,173709
0-1-20-20-2-3-20-3-3-3-3-3-19-3,173712
0-1-2-3-3-3-3-4-5-6-7-8-9-3-9-9-9-10-11-9-11-18-21-11,173715


Unnamed: 0_level_0,count
trace,Unnamed: 1_level_1
0-1-18,3429
0-1-20-20-18-20,1872
0-1-20-20-20-20-18-20,271
0-1-20-20-2-3-20-3-18-3,209
0-1-2-3-3-18-3,160
0-1-2-3-3-19-3,134
0-1-20-20-2-3-20-3-3-3-18-3,126
0-1-2-3-3-3-3-18-3,93
0-1-2-3-3-3-3-19-3,87
0-1-20-20-2-3-20-3-3-3-3-3-18-3,74


In [None]:
_df

In [None]:
# Copy the index into a regular 'trace' column.
# NOTE(review): the original line was not valid Python (unterminated string,
# missing '='); this is the most likely intended statement — confirm.
_df['trace']=_df.index

In [None]:
%time
# Group the rows by their second column (label 1) and list the ten groups
# with the most rows. A dangling `for` header without a body (a SyntaxError)
# and an unused list were removed.
# NOTE(review): relies on a global `activity_strings`; presumably a list of
# [trace_id, trace_string] rows — confirm it is defined before running.
_df=pd.DataFrame(activity_strings).groupby(1)

_df.count().sort_values(0,ascending=False)[:10]

In [None]:
%%timeit -n 10
# Encode every trace as its activity ids joined by '-'.
activity_strings=[
    [n,'-'.join(str(aToi[i]) for i in g['concept:name'].values)]
    for n,g in group_by_case
]

#print('Number of traces:',len(activity_strings),'Sample:', activity_strings[0])
number_of_traces=len(activity_strings)

activity_frequences=pd.DataFrame(activity_strings, columns=['trace_id',"a"])
# Occurrences per encoded trace.
af_dict=activity_frequences.groupby('a').size().to_dict()

# Attach each row's variant frequency with a vectorised lookup instead of a
# Python-level iterrows loop.
activity_frequences['frequencies']=activity_frequences['a'].map(af_dict)
mft=activity_frequences.groupby('a').mean().sort_values('frequencies',ascending=False)
mft[:10]

In [55]:
#export
class BasicPD:
    """
    Basic process-discovery statistics computed from an event log.

    On construction the events are sorted by (trace_id, event_id) and grouped
    per trace; from that the class derives id lookups for activities and
    resources, their directly-follows count matrices, and the table of most
    frequent trace variants.

    All state lives on the instance; no notebook globals are read.
    """
    def __init__(self,eventLog):
        """
        :param eventLog: object exposing an ``events`` DataFrame with the columns
                         'trace_id', 'event_id', 'concept:name' and 'org:resource'
        """
        self.eventLog=eventLog

        # Events of each trace, in (trace_id, event_id) order.
        self.groupByTraces=eventLog.events.sort_values(['trace_id','event_id'],ascending=True).groupby('trace_id')
        # Activity name <-> integer id, numbered by first appearance in the event table.
        self.aToi={j:i for (i,j) in enumerate(eventLog.events['concept:name'].drop_duplicates())}
        self.iToa={i:j for (i,j) in enumerate(self.aToi)}
        # Resource <-> integer id, numbered by first appearance in the event table.
        self.uToi={j:i for (i,j) in enumerate(eventLog.events['org:resource'].drop_duplicates())}
        self.iTou={i:j for (i,j) in enumerate(self.uToi)}
        # Directly-follows counts between activities (cfm) and between resources (hwm).
        self.cfm=self.transition_matrix('concept:name',self.aToi,self.groupByTraces)
        self.hwm=self.transition_matrix('org:resource',self.uToi,self.groupByTraces)
        # Filled by calc_mft: variant counts, and encoded trace -> case lookup.
        self.traceTocase=None
        self.mft,self.traceTocase=self.calc_mft()

    def transition_matrix(self,c,d,group_by_case):
        """
        Counts directly-follows transitions of one column over all groups.

        :param c: Column string
        :param d: Dict<string,id>
        :param group_by_case: pandas groupby (normally grouped by cases)
        :return: float numpy array of shape (len(d), len(d)); cell [d[v], d[w]]
                 counts how often value w directly follows value v within a group
        """
        unique_len=len(d)
        F = np.zeros(shape=(unique_len,unique_len))
        for name, group in group_by_case:
            activities=group[c].values
            for i in range(0,len(activities)-1):
                F[d[activities[i]],d[activities[i+1]]]+=1

        return F

    def _trace_rep(self,g):
        """
        Encodes one trace as its activity ids joined by '-'.

        :param g: DataFrame holding the events of a single trace, with a
                  'concept:name' column
        :return: string such as '0-1-18'
        """
        return '-'.join(str(self.aToi[i]) for i in g['concept:name'].values)

    def calc_mft(self):
        """
        Computes the most frequent trace variants.

        The encoded-trace -> case table is built on the first call and reused
        from self.traceTocase afterwards.

        :return: tuple (mft, df); mft is a DataFrame indexed by encoded trace
                 with a 'count' column, sorted by count descending; df is a
                 DataFrame indexed by encoded trace (index name 'trace') with a
                 'case' column holding the trace ids
        """
        # `is None` rather than truthiness: the truth value of a DataFrame is ambiguous.
        if self.traceTocase is None:
            cases=[]
            traces=[]
            for case,g in self.groupByTraces:
                cases.append(case)
                traces.append(self._trace_rep(g))
            df=pd.DataFrame({'case':cases},index=pd.Index(traces,name='trace'))
        else:
            df=self.traceTocase

        mft=df.groupby(level=0).size().rename('count').to_frame().sort_values('count',ascending=False)
        return mft,df

%time bpd=BasicPD(log)


CPU times: user 8.16 s, sys: 0 ns, total: 8.16 s
Wall time: 8.15 s


array([[0.0000e+00, 1.3087e+04, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
       [0.0000e+00, 0.0000e+00, 4.8520e+03, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 3.4290e+03, 0.0000e+00,
        4.7390e+03, 0.0000e+00, 0.0000e+00, 6.7000e+01],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, 7.3670e+03, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
       [0.000