In [1]:
import pm4py
import csv
import pandas as pd
import time
import numpy as np
import matplotlib.pyplot as plt
import random
from datetime import datetime
import os
import pickle

## Investigating Matrix-2 being reinvested in when 6th place is taken

In [2]:
# load balance sheet (see 1_Forsage_Ether_analysis.ipynb)

curr_dir = os.getcwd()
dir_path = os.path.dirname(curr_dir)

file = "balance"
path = os.path.join(dir_path, "resources", file + ".pkl")
balance = pickle.load(open(path, 'rb'))

In [3]:
# load log as dataframe (see 2_Forsage_log_sampling.ipynb)

curr_dir = os.getcwd()
dir_path = os.path.dirname(curr_dir)

file = "df_log"
path = os.path.join(dir_path, "resources", file + ".pkl")
df_log = pickle.load(open(path, 'rb'))

In [13]:
# load sampled user addresses (see 2_Forsage_log_sampling.ipynb)

curr_dir = os.getcwd()
dir_path = os.path.dirname(curr_dir)

loaded_data_list = [None, None, None]
file_list = ["pyramide_top", "pyramide_center_top", "pyramide_bottom"]
counter_list = [0, 1, 2]

for i in counter_list:
    path = os.path.join(dir_path, "resources", file_list[i] + ".pkl")
    loaded_data_list[i] = pickle.load(open(path, 'rb'))

pyramide_top, pyramide_center_top, pyramide_bottom = loaded_data_list

In [14]:
# build restructured log
df_restructured = df_log.copy()
nm1_mask = df_log["concept:name"] == "New User-Place Matrix-1"
nm2_mask = df_log["concept:name"] == "New User-Place Matrix-2"

df_restructured.loc[nm1_mask, "old_case:concept:name"] = df_restructured["case:concept:name"][nm1_mask]
df_restructured.loc[nm2_mask, "old_case:concept:name"] = df_restructured["case:concept:name"][nm2_mask]

df_restructured.loc[nm1_mask, "case:concept:name"] = df_restructured["referrer"][nm1_mask]
df_restructured.loc[nm2_mask, "case:concept:name"] = df_restructured["referrer"][nm2_mask]
df_restructured.loc[nm1_mask, "case:ident:piid"] = df_restructured["referrer"][nm1_mask]
df_restructured.loc[nm2_mask, "case:ident:piid"] = df_restructured["referrer"][nm2_mask]

df_restructured.drop(df_restructured.index[~(nm2_mask)], inplace=True)
df_restructured.reset_index(inplace=True)
df_restructured.drop(columns=["index"], inplace=True)

In [15]:
# create smaller data sets for logs
# Combine sign-ups is a matrix with reinvestment-events. That means concatenating the normal log and the restructured log.
def log_cutter_restructured(pyramide_slice):
    mask = df_restructured["case:concept:name"].isin(pyramide_slice)
    df_sublog = df_restructured[mask]
    # add place to activity label
    mask_place = df_sublog["place"] != ""
    df_sublog.loc[:, 'place'] = df_sublog['place'].apply(str).str[:-2]
    df_sublog.iloc[(mask_place).values, 4] = df_sublog[["concept:name", "place"]][mask_place].apply(" : Place ".join, axis=1)
    
    # concatenate
    mask_user = df_log["case:concept:name"].isin(pyramide_slice)
    df_restructured_concat = pd.concat([df_sublog, df_log[mask_user]], ignore_index=True, sort=False)
    # set timestamp as such
    df_restructured_concat['time:timestamp'] = pd.to_datetime(df_restructured_concat['time:timestamp'])
    # sort
    df_sorted = df_restructured_concat.sort_values(by=['time:timestamp', 'logIndex'])
    #mask for M2
    mask_new_referral = df_sorted['concept:name'].str.contains("New User-Place Matrix-2 : Place")
    mask_reinvest = df_sorted['concept:name'].str.contains("Reinvest Matrix-2")
    df_return = df_sorted[mask_new_referral | mask_reinvest]
    df_return = df_return.fillna('')
    
    df_return.reset_index(inplace=True)
    df_return.drop(columns=["index"], inplace=True)
    return df_return

df_top_m2 = log_cutter_restructured(pyramide_top)
df_center_top_m2 = log_cutter_restructured(pyramide_center_top)
df_bottom_m2 = log_cutter_restructured(pyramide_bottom)

from pm4py.objects.conversion.log import converter as log_converter
log_top_m2 = log_converter.apply(df_top_m2)
log_center_top_m2 = log_converter.apply(df_center_top_m2)
log_bottom_m2 = log_converter.apply(df_bottom_m2)

In [20]:
# modify PM4Py to check directly-follows relation for TWO activities

'''
    This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).

    PM4Py is free software: you can redistribute it and/or modify -> THIS VERSION IS MODIFIED
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    PM4Py is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with PM4Py.  If not, see <https://www.gnu.org/licenses/>.
'''
from enum import Enum

from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.obj import EventLog
from pm4py.util import exec_utils
from pm4py.util.constants import PARAMETER_CONSTANT_ATTRIBUTE_KEY, PARAMETER_CONSTANT_RESOURCE_KEY, \
    PARAMETER_CONSTANT_TIMESTAMP_KEY
from pm4py.util.xes_constants import DEFAULT_NAME_KEY, DEFAULT_RESOURCE_KEY, DEFAULT_TIMESTAMP_KEY
import deprecation

from typing import Optional, Dict, Any, Union, Tuple, List
from pm4py.objects.log.obj import EventLog, EventStream, Trace


class Parameters(Enum):
    ATTRIBUTE_KEY = PARAMETER_CONSTANT_ATTRIBUTE_KEY
    TIMESTAMP_KEY = PARAMETER_CONSTANT_TIMESTAMP_KEY
    RESOURCE_KEY = PARAMETER_CONSTANT_RESOURCE_KEY
    POSITIVE = "positive"
    ENABLE_TIMESTAMP = "enable_timestamp"
    TIMESTAMP_DIFF_BOUNDARIES = "timestamp_diff_boundaries"


POSITIVE = Parameters.POSITIVE
ENABLE_TIMESTAMP = Parameters.ENABLE_TIMESTAMP
TIMESTAMP_DIFF_BOUNDARIES = Parameters.TIMESTAMP_DIFF_BOUNDARIES


def timestamp_list_is_ge(a, b):
    for i in range(len(a)):
        if a[i] < b[i][0]:
            return False
    return True


def timestamp_list_is_le(a, b):
    for i in range(len(a)):
        if a[i] > b[i][1]:
            return False
    return True


#@deprecation.deprecated('2.2.6', '3.0.0', 'please use pm4py.algo.filtering.log.ltl.ltl_checker.eventually_follows')



def A_next_B(log: EventLog, A: str, B: str, parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> EventLog:
    """
    Applies the A next B next C rule

    Parameters
    ------------
    log
        Log
    A
        A attribute value
    B
        B attribute value
    C
        C attribute value
    parameters
        Parameters of the algorithm, including the attribute key and the positive parameter:
        - If True, returns all the cases containing A, B and C and in which A was directly followed by B and B was directly followed by C
        - If False, returns all the cases not containing A or B or C, or in which none instance of A was directly
        followed by an instance of B and B was directly followed by C

    Returns
    ------------
    filtered_log
        Filtered log
    """
    if parameters is None:
        parameters = {}

    log = log_converter.apply(log, variant=log_converter.Variants.TO_EVENT_LOG, parameters=parameters)

    attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)
    positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)

    new_log = EventLog(list(), attributes=log.attributes, extensions=log.extensions, classifiers=log.classifiers,
                       omni_present=log.omni_present, properties=log.properties)

    for trace in log:
        found = True # here: found == conforming

        # Check occurrences of A. Are they all As followed directly by B?
        for i in range(len(trace) - 1):
            if trace[i].get(attribute_key) == A:
                if trace[i + 1].get(attribute_key) != B:
                    found = False
                    break
        
        if found:
            if positive:
                new_log.append(trace)
        elif not positive:
            new_log.append(trace)
        
    return new_log

In [21]:
# compute diversions

A = "New User-Place Matrix-2 : Place 6"
B = "Reinvest Matrix-2"

def compute_diversions(log):
    new_log = A_next_B(log, A, B)
    n_diversions = len(log)-len(new_log)
    return n_diversions

n_diversions_top = compute_diversions(log_top_m2)
n_diversions_center_top = compute_diversions(log_center_top_m2)
n_diversions_bottom = compute_diversions(log_bottom_m2)

print("Number non-conforming cases top: " + str(n_diversions_top))
print("Number non-conforming cases center top: " + str(n_diversions_center_top))
print("Number non-conforming cases bottom: " + str(n_diversions_bottom))

Number non-conforming cases top: 822
Number non-conforming cases center top: 2
Number non-conforming cases bottom: 274
