# <b>4 - Experiments</b>

<b>Purpose:</b>   
Testing on various features.
   
-----

<b>Initial Imports</b>   
Imports for specific techniques are made later, in the sections that use them.

In [2]:
import random
from datetime import datetime, timedelta
import pandas as pd
import pm4py

-----

## <b>Simulation</b>

Summary of the random process:   
There are six forms of the process flow. All begin with 'start' and all terminate with 'end'. There's an 80% chance that the process will follow the first two flows. There's a 14% chance of an approval required on a normal flow. There's a 5% chance of a special approval flow. There's a 1% chance of a process bypass. A random number is used to simulate which flow is followed.   
However, for a fixed interval of flows (flow ids 5001-5501), the probabilities change and the special approval flow becomes 89% likely (was 5%).  
10,000 simulations were completed.

In [3]:
# Event sequences of the six simulated process flows, keyed by scenario id.
# Every flow is wrapped in the shared 'start' ... 'end' events.
process_scenarios = {
    scenario_id: ["start", *inner_steps, "end"]
    for scenario_id, inner_steps in (
        ("a", ("a", "b", "c")),                # normal flow via b, c
        ("b", ("a", "d", "e")),                # normal flow via d, e
        ("c", ("a", "b", "c", "a_approval")),  # flow "a" followed by an approval
        ("d", ("a", "d", "e", "b_approval")),  # flow "b" followed by an approval
        ("e", ("a", "f", "f_approval")),       # special approval flow
        ("f", ("bypass",)),                    # process bypass
    )
}

In [4]:
def get_action_date(f_days):
    """Return the datetime that lies ``f_days`` days after 2001-01-01.

    Parameters
    ----------
    f_days : int or float
        Day offset from the fixed process start date (2001-01-01 00:00).

    Returns
    -------
    datetime
        The process start date shifted by ``f_days`` days.
    """
    offset = timedelta(days=f_days)
    return datetime(year=2001, month=1, day=1) + offset
    

In [5]:
def get_scenario(f_rand_val, f_scenarios, f_outlier):
    """Pick the process flow that a random draw selects.

    The draw is compared against cumulative upper bounds, so an integer
    draw uniformly distributed over 1..100 gives these percentages:

    * ``f_outlier == 0`` (normal period):
      A 40%, B 40%, C 7%, D 7%, E 5%, F 1%
    * ``f_outlier == 1`` (changed period):
      A 0%, B 0%, C 5%, D 5%, E 89%, F 1%

    Parameters
    ----------
    f_rand_val : int or float
        Random draw; intended range is 1..100.
    f_scenarios : mapping
        Maps the scenario ids 'a'..'f' to their event sequences.
    f_outlier : int
        0 for the normal probabilities, 1 for the changed probabilities.

    Returns
    -------
    The entry of ``f_scenarios`` selected by the draw, or ``''`` when the
    draw compares false against every bound (e.g. NaN).

    Raises
    ------
    ValueError
        If ``f_outlier`` is neither 0 nor 1. Previously this fell through
        both branches and failed with an UnboundLocalError.
    """
    # Inclusive upper bound of each bucket, in ascending order.
    if f_outlier == 0:
        upper_bounds = ((40, 'a'), (80, 'b'), (87, 'c'), (94, 'd'), (99, 'e'))
    elif f_outlier == 1:
        upper_bounds = ((5, 'c'), (10, 'd'), (99, 'e'))
    else:
        raise ValueError(f"f_outlier must be 0 or 1, got {f_outlier!r}")

    for upper_bound, scenario_id in upper_bounds:
        if f_rand_val <= upper_bound:
            return f_scenarios[scenario_id]

    # Everything above 99 is the bypass flow in both periods.
    if f_rand_val > 99:
        return f_scenarios['f']

    # Only reachable when no comparison holds (e.g. NaN); keep the original
    # empty-string result for that case.
    return ''

In [6]:
def simulate_event(f_flow_id, f_date_id, f_outlier_flag):
    """Simulate one process flow and return its dated events.

    A random scenario is drawn via ``get_scenario`` and every event of that
    scenario is placed on its own consecutive day, starting at day offset
    ``f_date_id``.

    Parameters
    ----------
    f_flow_id : int
        Identifier stored with every event of this flow.
    f_date_id : int
        Day offset (passed to ``get_action_date``) of the flow's first event.
    f_outlier_flag : int
        Forwarded to ``get_scenario``: 0 for the normal probabilities,
        1 for the changed probabilities.

    Returns
    -------
    tuple
        ``(events, next_date_id)`` where ``events`` is a list of
        ``[flow_id, 'MM/DD/YYYY' date string, event name]`` rows and
        ``next_date_id`` is the first day offset not used by this flow.
    """
    # Draw an integer uniformly from 1..100 so the thresholds in get_scenario
    # correspond to exact percentages. The former
    # int(round(random.random() * 100, 2)) produced 0..99 (100 only on a
    # rounding edge), which inflated the first bucket and left the '> 99'
    # bypass scenario practically unreachable.
    rand_event = random.randint(1, 100)
    flow_scenario = get_scenario(rand_event, process_scenarios, f_outlier_flag)

    flow_events = [
        [f_flow_id, get_action_date(f_date_id + day_offset).strftime('%m/%d/%Y'), event_name]
        for day_offset, event_name in enumerate(flow_scenario)
    ]
    return flow_events, f_date_id + len(flow_scenario)

In [7]:
# Simulation settings.
RANDOM_SEED = 42            # fixed seed so a fresh Restart & Run All reproduces the same log
N_FLOWS = 10000             # number of process flows to simulate
OUTLIER_FIRST_FLOW = 5001   # first flow id simulated with the changed probabilities
OUTLIER_LAST_FLOW = 5501    # last flow id (inclusive) simulated with the changed probabilities

# Seed here so that re-running this cell regenerates an identical event list.
random.seed(RANDOM_SEED)

date_id = 1
flow_id = 1
simulation_list = []
while flow_id <= N_FLOWS:
    # Flows inside the outlier window use the changed scenario probabilities.
    outlier_flag = 1 if OUTLIER_FIRST_FLOW <= flow_id <= OUTLIER_LAST_FLOW else 0
    # simulate_event returns the next unused day offset, so event dates keep
    # advancing from one flow to the next.
    sim_flow, date_id = simulate_event(flow_id, date_id, outlier_flag)
    simulation_list.extend(sim_flow)
    flow_id += 1

In [8]:
# One row per simulated event: [flow id, event date string, event name].
simulation_df = pd.DataFrame(data=simulation_list)

In [9]:
# Label the three positional columns of the simulated event rows.
simulation_df.columns = [
    "flow_id",
    "event_date",
    "event",
]

----- 

## <b>Process Mining</b>

In this section, two flows will be shown: (1) expected flow, (2) changed flow.   
Next, the described analysis will be performed.  
Then, the results of the analysis will be shown,   
and the flows resulting from the analysis will be visualized. 

### <b>Initial set up</b>

In [10]:
from pm4py.objects.log.util import dataframe_utils
from pm4py.objects.conversion.log import converter as log_converter

In [11]:
# Let pm4py convert timestamp-like columns of the simulated events; no explicit
# format is passed, so the 'MM/DD/YYYY' strings in 'event_date' rely on the
# helper's own parsing.
# NOTE(review): the helper may convert simulation_df in place and return the
# same object - confirm before reusing simulation_df as "raw" data.
sim_event_df = dataframe_utils.convert_timestamp_columns_in_df(simulation_df)

In [12]:
# Order the events chronologically.
sim_event_df = sim_event_df.sort_values(by='event_date')

In [13]:
# Rename the columns to the keys used for the event log below.
sim_event_df.columns = [
    'case:concept:name',  # was flow_id (used as the case id key)
    'time:timestamp',     # was event_date
    'concept:name',       # was event
]

In [14]:
# Split the events by flow id so the subsets match the simulation, where flows
# 5001-5501 (inclusive) were generated with the changed probabilities.
# The former bounds (< 5000, and > 5000 & < 5500) dropped normal flow 5000 and
# outlier flows 5500 and 5501 from both subsets.

# Expected flow: everything simulated before the probabilities changed (1-5000).
# Flows after 5501 are also normal but are intentionally not part of this baseline.
sim_normevent_df = sim_event_df[sim_event_df['case:concept:name'] <= 5000]

# Changed flow: every flow simulated inside the outlier window (both ends inclusive).
sim_outevent_df = sim_event_df[sim_event_df['case:concept:name'].between(5001, 5501)]

In [15]:
# Converter parameters: the 'case:concept:name' column is the case id key.
log_util_params = {
    log_converter.Variants.TO_EVENT_LOG.value.Parameters.CASE_ID_KEY: 'case:concept:name',
}

In [16]:
# Convert the full, baseline and outlier event frames (in that order) to
# pm4py event logs using the same parameters and converter variant.
sim_event_log, sim_normevent_log, sim_outevent_log = (
    log_converter.apply(
        event_frame,
        parameters=log_util_params,
        variant=log_converter.Variants.TO_EVENT_LOG,
    )
    for event_frame in (sim_event_df, sim_normevent_df, sim_outevent_df)
)

In [17]:
from pm4py.objects.log.util import get_log_representation

In [46]:
# Feature representation of the full log built from a single attribute group:
# the event-level 'concept:name' string attribute. All other groups are empty.
data, feature_names = get_log_representation.get_representation(
    sim_event_log,
    str_ev_attr=["concept:name"],
    str_tr_attr=[],
    num_ev_attr=[],
    num_tr_attr=[],
    str_evsucc_attr=[],
)

In [47]:
# Put the features in a frame, dropping the 'event:concept:name@' prefix from
# each feature name so the columns carry the bare event names.
test_df = pd.DataFrame(
    data,
    columns=[name.replace('event:concept:name@', '') for name in feature_names],
)

In [48]:
# Preview the event-attribute features: one column per event name.
test_df.head()

Unnamed: 0,a,a_approval,b,b_approval,c,d,e,end,f,f_approval,start
0,1,0,0,0,0,1,1,1,0,0,1
1,1,0,1,0,1,0,0,1,0,0,1
2,1,0,0,0,0,1,1,1,0,0,1
3,1,0,0,0,0,1,1,1,0,0,1
4,1,0,1,0,1,0,0,1,0,0,1


In [52]:
# Same representation call, but built only from the trace-level 'concept:name'
# string attribute; the event-level groups are left empty.
data2, feature_names2 = get_log_representation.get_representation(
    sim_event_log,
    str_ev_attr=[],
    str_tr_attr=["concept:name"],
    num_ev_attr=[],
    num_tr_attr=[],
    str_evsucc_attr=[],
)

In [53]:
# Put the trace-attribute features in a frame, dropping the
# 'trace:concept:name@' prefix from each feature name.
test2_df = pd.DataFrame(
    data2,
    columns=[name.replace('trace:concept:name@', '') for name in feature_names2],
)

In [54]:
# Preview the trace-attribute features: the columns are the individual case ids.
test2_df.head()

Unnamed: 0,1,10,100,1000,10000,1001,1002,1003,1004,1005,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
# Column totals: a total of 1 per column means each case id marks exactly one
# trace, so this representation only re-encodes the case identity.
test2_df.sum()

1        1
10       1
100      1
1000     1
10000    1
        ..
9995     1
9996     1
9997     1
9998     1
9999     1
Length: 10000, dtype: int64