In [1]:
import pandas as pd
import datetime
from os import listdir
import os, os.path
from os.path import isfile, join
import numpy as np
import csv
import re
import pandas as pd
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.log.exporter.xes import exporter as xes_exporter
from pm4py.objects.log.util import dataframe_utils
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Locate the notebook directory and its parent, and make the parent importable.
# The working directory is used instead of inspecting the current frame's file.
# NOTE(review): inside a notebook kernel the frame's "file" is a per-cell name
# generated by IPython, not the notebook's own path (recent ipykernel versions
# appear to place it in a temporary directory), so it is not a reliable anchor
# for locating the data folder -- confirm the kernel is started from the
# notebook's directory.
import os,sys,inspect
currentdir = os.path.abspath(os.getcwd())
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)

In [3]:
#read the data
# The path is built with os.path.join so the separator is correct on every
# operating system (a hard-coded "\\" only resolves on Windows).
# The first CSV column is used as the index.
df_prueba = pd.read_csv(os.path.join(parentdir, "Log of analysis of time aspects.csv"), index_col=0)

In [4]:
# Collapse the event log into one row per answer (one 'case:concept:name'):
#   - 'Operations' is the list of 'Variant' values of the answer, in row order;
#   - 'Organization', 'Year' and 'Question' are read from the first row of the group.
# A plain string key is used for the groupby so that the group label stays a
# scalar (a one-element list key makes recent pandas versions yield 1-tuples).
# The rows are collected in a list and the dataframe is built once, because
# DataFrame.append is gone in pandas 2.x and copies the whole frame on every
# call in older versions.
df_analisis = df_prueba.groupby(by='case:concept:name')
filas_agrupadas = []
for name, group in df_analisis:
    primera_fila = group.iloc[0]  # answer-level attributes are taken from the first row
    filas_agrupadas.append({
        'case:concept:name': name,
        'Operations': group['Variant'].to_list(),
        'Organization': primera_fila['Organization'],
        'Year': primera_fila['Year'],
        'Question': primera_fila['Question'],
    })

# Passing `columns` fixes the column order and keeps the expected (empty) schema
# even when the log contains no cases.
df_agrupado = pd.DataFrame(
    filas_agrupadas,
    columns=['case:concept:name','Operations','Organization', 'Year', 'Question'],
)

In [5]:
# Show the grouped dataframe (one row per 'case:concept:name').
df_agrupado

Unnamed: 0,case:concept:name,Operations,Organization,Year,Question
0,A1 P1 BPI 2017,[Calculate cycle time of the whole process for...,University of Liechtenstein\nVienna University...,2017,P1
1,A1 P1-P2 BPI 2020,"[Filter traces by activities, Calculate cycle ...","Technische Universität, Berlin",2020,P1-P2
2,A1 P2 BPI 2019,"[Group events by time, Calculate cycle time of...",University of Melbourne,2019,P2
3,A1 P5 BPI 2015,[Calculate cycle time of the whole process for...,Meijer & Van der Ham Management Consultants,2015,P5
4,A1 P5 BPI 2020,"[Filter traces by activities, Calculate cycle ...","Technische Universität, Berlin",2020,P5
...,...,...,...,...,...
105,A8 P1 BPI 2017,"[Filter traces by activities, Calculate waitin...",Universidade Federal do Estado do Rio de Janeiro,2017,P1
106,A8 P5 BPI 2015,[Calculate cycle time of the whole process for...,GRADIENT ECM,2015,P5
107,A9 P1 BPI 2017,"[Calculate processing time, Calculate waiting ...",POSTECH,2017,P1
108,A9 P2 BPI 2019,"[Filter traces by year, Calculate cycle time o...",KPMG Netherlands,2019,P2


In [6]:
#function to calculate the sorensen index using sets
def sorensenIndex(list1,list2):
    """Return the Sorensen-Dice similarity of two collections, compared as sets.

    The index is ``2 * |A & B| / (|A| + |B|)``, where A and B are the sets of
    distinct elements of ``list1`` and ``list2``; duplicates and ordering are
    ignored.

    Parameters
    ----------
    list1, list2 : iterable of hashable
        The two collections to compare.

    Returns
    -------
    float
        A value between 0.0 (no common element) and 1.0 (same distinct
        elements). Two empty collections are treated as identical and yield
        1.0 instead of raising ``ZeroDivisionError``.
    """
    set1 = set(list1)
    set2 = set(list2)
    total = len(set1) + len(set2)
    if total == 0:
        # Both inputs are empty: the ratio is undefined, so report "identical".
        return 1.0
    return 2.0 * len(set1 & set2) / total

In [7]:
#we filter the answers related to 2020, which is the BPI with the predominant organization (whose name is sberbank)
df_2020 = df_agrupado[df_agrupado['Year']==2020]
df2 = df_2020.groupby(by=['Year', 'Question'])#we group the answers depending on the year and the question
# Columns of the dataframe that stores the pairwise comparisons between answers:
#   analisis       -> the pair of answers compared ("<case i>-<case j>")
#   organizaciones -> the kind of pair (see below)
#   sorensen       -> Sorensen index of the two answers' operations
columns=['analisis','organizaciones','sorensen']

# Every unordered pair of answers to the same question is compared once. Pairs are
# labelled by the organizations involved:
#   No_Sberbank-No_Sberbank: both answers come from other organizations
#   No_Sberbank-Sberbank:    one answer comes from the predominant organization
#   Sberbank-Sberbank:       both answers come from the predominant organization
# The rows are collected in a list and the dataframe is built once, because
# DataFrame.append is gone in pandas 2.x and copies the whole frame on every call
# in older versions.
filas_metricas = []
for name, group in df2:
    # Pull the columns out once per group instead of re-reading rows with iloc
    # inside the quadratic pair loop.
    casos = group['case:concept:name'].to_list()
    operaciones = group['Operations'].to_list()
    etiquetas = ['Sberbank' if organizacion == 'Sberbank' else 'No_Sberbank'
                 for organizacion in group['Organization']]
    for i in range(len(casos)):
        for j in range(i+1,len(casos)):
            filas_metricas.append({
                'analisis': casos[i] + "-" + casos[j],
                'organizaciones': etiquetas[i] + "-" + etiquetas[j],
                'sorensen': sorensenIndex(operaciones[i],operaciones[j]),
            })

df_metricas = pd.DataFrame(filas_metricas, columns=columns)
# The label depends on the order of the pair; fold both mixed orders into one.
df_metricas = df_metricas.replace({'Sberbank-No_Sberbank':'No_Sberbank-Sberbank'})
df_metricas

Unnamed: 0,analisis,organizaciones,sorensen
0,A1 P1-P2 BPI 2020-A10 P1-P2 BPI 2020,No_Sberbank-No_Sberbank,0.181818
1,A1 P1-P2 BPI 2020-A11 P1-P2 BPI 2020,No_Sberbank-Sberbank,0.666667
2,A1 P1-P2 BPI 2020-A12 P1-P2 BPI 2020,No_Sberbank-Sberbank,0.400000
3,A1 P1-P2 BPI 2020-A13 P1-P2 BPI 2020,No_Sberbank-Sberbank,0.400000
4,A1 P1-P2 BPI 2020-A16 P1-P2 BPI 2020,No_Sberbank-No_Sberbank,0.166667
...,...,...,...
552,A30 P6 BPI 2020-A37 P6 BPI 2020,No_Sberbank-Sberbank,0.000000
553,A30 P6 BPI 2020-A7 P6 BPI 2020,No_Sberbank-No_Sberbank,0.666667
554,A35 P6 BPI 2020-A37 P6 BPI 2020,No_Sberbank-Sberbank,0.000000
555,A35 P6 BPI 2020-A7 P6 BPI 2020,No_Sberbank-No_Sberbank,0.000000


In [8]:
#get the avg of the sorensen index of each group of comparisons:
# 'sorensen' is selected explicitly (as a one-column dataframe) so the text column
# 'analisis' is never averaged; recent pandas versions raise a TypeError instead of
# silently dropping non-numeric columns.
df_metricas.groupby(by=['organizaciones'])[['sorensen']].mean()

Unnamed: 0_level_0,sorensen
organizaciones,Unnamed: 1_level_1
No_Sberbank-No_Sberbank,0.165895
No_Sberbank-Sberbank,0.173252
Sberbank-Sberbank,0.124987


In [9]:
from scipy import stats

# One dataframe of pairwise comparisons per kind of organization pair.
group1 = df_metricas.loc[df_metricas['organizaciones'].eq('Sberbank-Sberbank')]
group2 = df_metricas.loc[df_metricas['organizaciones'].eq('No_Sberbank-Sberbank')]
group3 = df_metricas.loc[df_metricas['organizaciones'].eq('No_Sberbank-No_Sberbank')]
# Kruskal-Wallis comparison of the three groups, currently disabled:
# stats.kruskal(group1, group2, group3)

In [10]:
# Variance (ddof=0) of the Sorensen index within each kind of comparison.
df_metricas['sorensen'].groupby(df_metricas['organizaciones']).var(ddof=0)

organizaciones
No_Sberbank-No_Sberbank    0.046052
No_Sberbank-Sberbank       0.043726
Sberbank-Sberbank          0.035523
Name: sorensen, dtype: float64

In [11]:
# Standard deviation (ddof=0) of the Sorensen index within each kind of comparison.
df_metricas['sorensen'].groupby(df_metricas['organizaciones']).std(ddof=0)

organizaciones
No_Sberbank-No_Sberbank    0.214598
No_Sberbank-Sberbank       0.209109
Sberbank-Sberbank          0.188477
Name: sorensen, dtype: float64