In [4]:
import pm4py
import datetime
import pandas as pd
import time
import statistics
import numpy as np
pd.options.display.max_colwidth = 500
import warnings
warnings.filterwarnings("ignore")

In [4]:
def checkDatetime(df,time,formatDate=None):
    """
    This function reports whether the timestamp column of a log already holds
    datetime values (instead of, for instance, strings).

    Inputs:
    df:dataframe which represents the log
    time:name of the column of timestamp in the dataframe
    formatDate: format of the dates when they are stored as strings. It is
    accepted for compatibility but currently unused, because the conversion
    is still disabled (see the TODO below).

    Output:
    the same dataframe that was received; no conversion is performed
    """
    if len(df) == 0:
        # An empty log has no value to inspect.
        return df

    # Inspect the first row by position: the index of a filtered or re-indexed
    # log does not necessarily contain the label 0.
    first_value = df[time].iloc[0]

    if isinstance(first_value, datetime.datetime):
        print("It is correct")
    else:
        # TODO: check that this conversion works before enabling it:
        # df[time] = pd.to_datetime(df[time], format=formatDate)
        print("It is not correct")
    return df

In [2]:
def calculateCycleTimeOfEachPair(df,case_id,time,activity):
    """
    This function calculates the cycle time between each pair of consecutive
    events in a log.

    Each row is compared with the row that follows it, so the rows of one case
    must be contiguous in the dataframe for the pairs to be meaningful.

    Inputs:
    df:dataframe which represents the log (it is modified in place)
    case_id: name of the column which represent the cases in the dataframe
    time: name of the column of the timestamps in the dataframe
    activity: name of the column of the activities in the dataframe

    Outputs:
    the dataframe of the input with four different columns:
    Initial activity:activity that starts a transition
    Final activity: activity that ends a transition
    Cycle time: cycle time of the transition, in minutes
    Transition: tuple of initial activity and final activity

    Rows without a following event of the same case (the last event of every
    case and the last row of the log) receive "-" in the activity/transition
    columns and 0 as cycle time.
    """
    noTransition="-"

    # Plain Python lists are read once here; this avoids building a row
    # Series with df.iloc[i] on every iteration.
    rows=list(zip(df[case_id].tolist(),df[time].tolist(),df[activity].tolist()))

    cycleTimes=[]
    initialActivities=[]
    finalActivities=[]
    transitions=[]

    for (case,stamp,act),(nextCase,nextStamp,nextAct) in zip(rows,rows[1:]):
        if case==nextCase:
            # Both events belong to the same case: record the transition.
            cycleTimes.append((nextStamp-stamp).total_seconds()/60)
            initialActivities.append(act)
            finalActivities.append(nextAct)
            transitions.append((act,nextAct))
        else:
            # Case boundary: the current row is the last event of its case.
            cycleTimes.append(0)
            initialActivities.append(noTransition)
            finalActivities.append(noTransition)
            transitions.append(noTransition)

    if rows:
        # The last row of the log has no successor. The guard keeps the lists
        # empty for an empty log so they still match the dataframe length.
        cycleTimes.append(0)
        initialActivities.append(noTransition)
        finalActivities.append(noTransition)
        transitions.append(noTransition)

    # new columns are added with these values:
    df['Initial Activity']=initialActivities
    df['Final Activity']=finalActivities
    df['Cycle time']=cycleTimes
    df['Transition']=transitions

    return df

In [11]:
def calculateCycleProcess(df_cycle_time,case_id,time,activity,measure):
    """
    This function calculates the average or median of the cycle time of a process

    Inputs:
    df_cycle_time:dataframe which represents the log; it must contain the
    'Cycle time' column, expressed in minutes as calculateCycleTimeOfEachPair
    produces it
    case_id: name of the column which represent the cases in the dataframe (unused)
    time: name of the column of the timestamps in the dataframe (unused)
    activity: name of the column of the activities in the dataframe (unused)
    measure:string which represents the measure (if it is "Average" the average will be calculated.
    In other cases the median will be calculated )

    Output:
    The mean or median of the cycle time of the process, in hours. NaN is
    returned when the log has no non-zero cycle time. The value is also printed.
    """
    # Rows with a cycle time of 0 are the placeholders of rows without a
    # transition, so they are left out of the statistic.
    cycleTimes=df_cycle_time.loc[df_cycle_time['Cycle time']!=0,'Cycle time']

    if len(cycleTimes)==0:
        # Nothing to aggregate: both measures report NaN instead of the median
        # branch raising while the mean branch returns NaN.
        cycleTime=float("nan")
    elif measure=="Average":
        # minutes -> hours
        cycleTime=np.mean(cycleTimes)/60
    else:
        # minutes -> hours
        cycleTime=statistics.median(cycleTimes)/60

    print(cycleTime)
    return cycleTime

In [74]:
def calculateBottlenecksProcess(df_cycle_time,activity,measure):
    """
    This function calculates the bottleneck of a process in terms of the cycle time

    Inputs:
    df_cycle_time:dataframe which represents the log; it must contain the
    'Cycle time' (minutes), 'Initial Activity' and 'Final Activity' columns
    that calculateCycleTimeOfEachPair adds
    activity: name of the column of the activities in the dataframe
    measure:string which represents the measure to be considered to detect the bottleneck:
        If it is "Average", the activity with the greatest average of cycle time will be considered as bottleneck
        If it is "Max", the event with the greatest cycle time will be considered as bottleneck
        If it is none of the previous strings, the activity with the greatest median of the cycle time will be
        considered as bottleneck

    Output:
    A tuple (bottleneck activity, cycle time measure of that activity).
    For "Average" and the median the measure is expressed in hours; for "Max"
    it is the raw cycle time of the slowest transition (minutes) and the
    activity is the initial activity of that transition.
    ("", 0) is returned when the log has no non-zero cycle time.
    """
    # Rows with a cycle time of 0 are the placeholders of rows without a
    # transition, so they never take part in the detection.
    transitions=df_cycle_time[df_cycle_time['Cycle time']!=0]

    if transitions.empty:
        return "",0

    if measure=="Max":#the single transition with the biggest cycle time
        # Positional lookup, so a duplicated index cannot select several rows;
        # on ties the first slowest transition wins.
        slowest=transitions.iloc[int(transitions['Cycle time'].argmax())]
        bottleneckActivity=slowest['Initial Activity']
        valueBottleneckCycleTime=slowest['Cycle time']
        print("Max combination is:"+" "+str(slowest['Initial Activity'])+" --> "+str(slowest['Final Activity'])+" , "+"Max cycle time: "+ str(valueBottleneckCycleTime))
        return bottleneckActivity,valueBottleneckCycleTime

    # Average / median: aggregate the cycle time per activity, convert the
    # minutes to hours and put the slowest activity first.
    timesPerActivity=transitions.groupby(activity)['Cycle time']
    if measure=="Average":
        measureName="average"
        aggregated=timesPerActivity.mean()
    else:
        measureName="median"
        aggregated=timesPerActivity.median()
    ranking=(aggregated/60).sort_values(ascending=False)

    bottleneckActivity=ranking.index[0]
    valueBottleneckCycleTime=ranking.iloc[0]
    print("The activity with the greatest "+measureName+" of cycle time is: "+ str(bottleneckActivity))

    return bottleneckActivity,valueBottleneckCycleTime

In [21]:
def calculateCycleTimeOfEachPairForEachSubdataframe(case_id,time,activity,*dataframes):
    """
    This function calculates the cycle time between each pair of events for several dataframes

    Every dataframe is processed with calculateCycleTimeOfEachPair, so the
    pairing rules live in a single place.

    Inputs:

    case_id: name of the column which represent the cases in the dataframe
    time: name of the column of the timestamps in the dataframe
    activity: name of the column of the activities in the dataframe
    dataframes: dataframes which represents the logs (they are modified in place)

    Outputs:
    a list with the dataframes of the inputs, in the same order, each one
    with four different columns:
    Initial activity:activity that starts a transition
    Final activity: activity that ends a transition
    Cycle time: cycle time of the transition
    Transition: tuple of initial activity and final activity

    Side effects:
    each result is also stored in the global variable df_<position>
    (df_0, df_1, ...) and the names of those variables are printed.
    """
    print("The results obtained are stored in the following variables:")

    results=[]
    for position,df in enumerate(dataframes):
        enriched=calculateCycleTimeOfEachPair(df,case_id,time,activity)

        # Kept for backward compatibility: existing notebooks read the results
        # from these global variables rather than from the return value.
        variableName="df_" + str(position)
        globals()[variableName] = enriched
        print(variableName)

        results.append(enriched)

    return results
        


In [32]:
def calculateCycleProcessForSubdataframes(case_id,time,activity,measure,*dataframes):
    """
    This function calculates the average or median of the cycle time of several processes

    Inputs:
    case_id: name of the column which represent the cases in the dataframe (unused)
    time: name of the column of the timestamps in the dataframe (unused)
    activity: name of the column of the activities in the dataframe (unused)
    measure:string which represents the measure (if it is "Average" the average will be calculated.
    In other cases the median will be calculated )
    dataframes: several dataframes which represents the logs; each one must
    contain the 'Cycle time' column, expressed in minutes as
    calculateCycleTimeOfEachPair produces it

    Output:
    A list with the mean or median of the cycle time of each process, in
    hours and in the same order as the dataframes. NaN is used for a
    dataframe without non-zero cycle times. Every value is also printed.
    """
    results=[]

    for position,df_cycle_time in enumerate(dataframes):
        print("Cycle time dataframe", position, ":")

        # Rows with a cycle time of 0 are the placeholders of rows without a
        # transition, so they are left out of the statistic.
        cycleTimes=df_cycle_time.loc[df_cycle_time['Cycle time']!=0,'Cycle time']

        if len(cycleTimes)==0:
            # Nothing to aggregate: report NaN for both measures.
            cycleTime=float("nan")
        elif measure=="Average":
            # minutes -> hours
            cycleTime=np.mean(cycleTimes)/60
        else:
            # minutes -> hours
            cycleTime=statistics.median(cycleTimes)/60

        print(cycleTime)
        results.append(cycleTime)

    return results