In [1]:
import os
import warnings
warnings.filterwarnings(action='ignore')
import pandas as pd
from datetime import datetime
from utils.filtering import generate_system

# Directory the notebook is executed from; input files are expected in its "input" sub-folder.
org_path = os.getcwd()
# os.path.join inserts exactly one separator, also when org_path already ends with one.
input_path = os.path.join(org_path, "input")

from patterns.FormBased import FormBased
from patterns.CollateralEvent import CollateralEvent
from patterns.ScatteredCase import ScatteredCase
from patterns.PollutedLabel import PollutedLabel
from patterns.ScatteredEvent import ScatteredEvent
from patterns.SynonymousLabel import SynonymousLabel
from patterns.HomonymousLabel import HomonymousLabel
from patterns.ElusiveCase import ElusiveCase
from patterns.UnanchoredEvent import UnanchoredEvent
from patterns.DistortedLabel import DistortedLabel
from patterns.InadvertentTimeTravel import InadvertentTimeTravel

### 0. Import and preprocess an event log

In [2]:
# Name of the CSV file inside `input_path`; it also selects the column layout below.
dat = 'credit-card-new.csv'

# os.path.join keeps the path portable (a hard-coded '\\' only works on Windows).
event_log = pd.read_csv(os.path.join(input_path, dat))

# For each known dataset family: {source column -> canonical column} plus the
# strptime-style format of its timestamp strings.
# Branch order matters: 'credit-card-new' must be tested before the more
# general 'credit-card' substring.
if any(tag in dat for tag in ('Small', 'Medium', 'Large', 'Huge', 'Wide')):
    column_map = {'Case': 'Case', 'Activity': 'Activity', 'Timestamp': 'Timestamp'}
    form = "%Y/%m/%d %H:%M:%S.%f"
elif 'credit-card-new' in dat:
    column_map = {'Case ID': 'Case', 'Activity': 'Activity',
                  'Start Timestamp': 'Timestamp', 'Resource': 'Resource'}
    form = "%Y-%m-%d %H:%M:%S.%f"
elif any(tag in dat for tag in ('mccloud', 'credit-card')):
    column_map = {'Case': 'Case', 'Activity': 'Activity', 'Timestamp': 'Timestamp'}
    form = "%Y-%m-%d %H:%M:%S.%f"
else:
    # Fallback layout. Its columns are renamed too, because every step below
    # addresses the canonical names 'Case' and 'Timestamp'.
    column_map = {'Case.ID': 'Case', 'Activity': 'Activity',
                  'Complete.Timestamp': 'Timestamp'}
    form = "%Y-%m-%d %H:%M:%S.%f"

# Select + rename yields a fresh frame, so the column assignments below do not
# write into a slice of `event_log`.
extracted_data = event_log[list(column_map)].rename(columns=column_map)

# Parse timestamps in one vectorised call; strings that do not match `form`
# still raise, while missing values become NaT.
extracted_data['Timestamp'] = pd.to_datetime(extracted_data['Timestamp'], format=form)

# Reorder rows: by case, then chronologically, then by activity name for ties.
extracted_data = extracted_data.sort_values(['Case', 'Timestamp', 'Activity'])

# Record which rows have a case ID *before* the string cast: astype(str) turns
# NaN into the literal 'nan', which dropna() would no longer recognise.
has_case = extracted_data['Case'].notna()
extracted_data['Case'] = extracted_data['Case'].astype(str)

# EL is the working event log used by every pattern below: rows without a case ID removed.
EL = extracted_data.loc[has_case].copy()

EL.head()

Unnamed: 0,Case,Activity,Timestamp,Resource
62310,0,Check for completeness,2023-09-29 09:00:00.000,Clerk-000001
62309,0,New online application received,2023-09-29 09:00:00.000,
62311,0,Perform checks,2023-09-29 09:08:36.418,Clerk-000003
62312,0,Make decision,2023-09-29 09:09:43.339,Manager-000001
62313,0,Notify accept,2023-09-29 09:16:37.243,Manager-000003


### 1. Form-based pattern

In [3]:
# Apply the FormBased pattern to EL. Judging by the filtering log printed by
# this cell, candidate cases are narrowed by the [tstart, tend] window, the
# Declare rule, the `which` sub-sequence and finally a random `ratio` share.
EL_polluted = FormBased(
    EL,
    which=['Make decision', 'Notify accept', 'Deliver card'],
    ratio=0.7,
    tstart="2023-09-26 09:00:00.000",
    tend="2023-12-26 09:00:00.000",
    DecConstraint=(
        "Chain Response[Make decision, Notify accept] "
        "|A.Resource is Manager-000001 "
        "|T.Resource is Manager-000003 |"
    ),
    case_id_key="Case",
    timestamp_key="Timestamp",
)

Filtering step 1 . The number of cases in the time interval ( 2023-09-26 09:00:00.000 , 2023-12-26 09:00:00.000 ):  1505
Filtering step 2 . The number of cases by declare rule:  34
Filtering step 3 . The number of cases containing the defined subseq  ['Make decision', 'Notify accept', 'Deliver card'] :  34
Filtering step 4 . The number of cases to be filtered by defined random portion:  24


In [4]:
# Preview at most four rows whose 'label' is not the empty string.
EL_polluted.loc[EL_polluted['label'].ne("")].head(4)

Unnamed: 0,Case,Activity,Timestamp,Resource,label
3,0,Make decision,2023-09-29 09:09:43.339,Manager-000001,form-based events(2023-09-29 09:09:43.339000)
4,0,Notify accept,2023-09-29 09:09:43.339,Manager-000003,form-based events(2023-09-29 09:16:37.243000)
5,0,Deliver card,2023-09-29 09:09:43.339,Manager-000002,form-based events(2023-09-29 09:39:19.386000)
26,1113,Make decision,2023-12-04 07:15:07.899,Manager-000001,form-based events(2023-12-04 07:15:07.899000)


### 2. Collateral Event pattern

In [5]:
# Apply the CollateralEvent pattern to EL. `target_collats` names the source
# activity and the two collateral labels; `timep`/`unit` presumably bound the
# time offset of the generated events (TODO confirm against the pattern code).
EL_polluted = CollateralEvent(
    EL,
    target_collats="[Activity:'Make decision'>>('Make decision_signed1', 'Make decision_signed2')]",
    ratio=0.7,
    timep=1,
    unit='sec',
    tstart="2023-09-26 09:00:00.000",
    tend="2023-12-26 09:00:00.000",
    DecConstraint=(
        "Chain Response[Make decision, Notify accept] "
        "|A.Resource is Manager-000001 "
        "|T.Resource is Manager-000003 |"
    ),
    case_id_key="Case",
    timestamp_key="Timestamp",
    activity_key="Activity",
)

Filtering step 1 . The number of cases in the time interval ( 2023-09-26 09:00:00.000 , 2023-12-26 09:00:00.000 ):  1505
Filtering step 2 . The number of cases by declare rule:  34
Filtering step 3 . The number of cases to be filtered by defined random portion:  24


In [6]:
# Preview at most four rows whose 'label' is not the empty string.
EL_polluted.loc[EL_polluted['label'].ne("")].head(4)

Unnamed: 0,Case,Activity,Timestamp,Resource,label
4,0,Make decision_signed1,2023-09-29 09:09:44.239,Manager-000001,collateral events(Make decision)
5,0,Make decision_signed2,2023-09-29 09:09:44.239,Manager-000001,collateral events(Make decision)
13,1085,Make decision_signed1,2023-12-01 13:15:18.596,Manager-000001,collateral events(Make decision)
14,1085,Make decision_signed2,2023-12-01 13:15:18.996,Manager-000001,collateral events(Make decision)


### 3. Scattered Case pattern

In [7]:
# Derive a log with a 'System' column from EL (nsys=10), then apply the
# ScatteredCase pattern for the systems listed in `syslist`.
ELsys = generate_system(EL, nsys=10)

EL_polluted = ScatteredCase(
    ELsys,
    log_name='EL_scattered2',
    syslist="[System:('System2', 'System3')]",
    tstart="2023-09-26 09:00:00.000",
    tend="2023-12-26 09:00:00.000",
    case_id_key="Case",
    timestamp_key="Timestamp",
)

In [8]:
# Preview at most four rows whose 'label' is not the empty string.
EL_polluted.loc[EL_polluted['label'].ne("")].head(4)

Unnamed: 0,Case,Activity,Timestamp,Resource,System,label
12949,0,Perform checks,2023-09-29 09:08:36.418,Clerk-000003,System0,"Scattered cases(['Check for completeness', 'Ne..."
12950,1,Perform checks,2023-09-29 09:10:28.617,Clerk-000004,System0,"Scattered cases(['Check for completeness', 'Ne..."
12951,2,Perform checks,2023-09-29 09:25:09.241,Clerk-000006,System0,"Scattered cases(['Check for completeness', 'Ne..."
12952,3,Perform checks,2023-09-29 11:26:10.630,Clerk-000002,System0,"Scattered cases(['Check for completeness', 'Ne..."


In [11]:
# Show the first non-empty label in full (the table preview truncates it).
(
    EL_polluted
    .loc[EL_polluted['label'].ne(""), 'label']
    .reset_index(drop=True)[0]
)

"homonymous Label(Activity:'Check for completeness')"

### 4. Scattered Event pattern

In [12]:

# Scattered Event pattern applied to EL.
# Alternative `target` kept from the original note:
#   "[Activity:('Make decision', 'Notify accept', 'Deliver card')]"
# `action` is the template for the scattered text and `loc` names the column
# ('Description') that receives it, as the preview of the result shows.
EL_polluted = ScatteredEvent(
    EL,
    target="[Activity:'Make decision'>>('Make revision1', 'Make revision2')]",
    action="[Resource]_[0-9:{2}][a-zA-Z:{5}]_[Timestamp*(%Y%m%d %H%M%S%f)]",
    loc="[Description:idx(-1)]",
    Del=True,
    ratio=0.7,
    tstart="2023-09-26 09:00:00.000",
    tend="2023-12-26 09:00:00.000",
    DecConstraint=(
        "Chain Response[Make decision, Notify accept] "
        "|A.Resource is Manager-000001 "
        "|T.Resource is Manager-000003 |"
    ),
    case_id_key="Case",
    activity_key="Activity",
    timestamp_key="Timestamp",
)

Filtering step 1 . The number of cases in the time interval ( 2023-09-26 09:00:00.000 , 2023-12-26 09:00:00.000 ):  1505
Filtering step 2 . The number of cases by declare rule:  34
Filtering step 3 . The number of cases to be filtered by defined random portion:  24


In [13]:
# Preview at most four rows whose 'label' is not the empty string.
EL_polluted.loc[EL_polluted['label'].ne("")].head(4)

Unnamed: 0,Case,Activity,Timestamp,Resource,Description,label
3,0,Make decision,2023-09-29 09:09:43.339,Manager-000001,"[Clerk-000003_43IOuSD_20230929 091201307000, C...","Scattered Events(Scattered attr = ['Resource',..."
10,1085,Make decision,2023-12-01 13:15:17.696,Manager-000001,"[Manager-000001_95Pgvch_20231201 131733798000,...","Scattered Events(Scattered attr = ['Resource',..."
26,1113,Make decision,2023-12-04 07:15:07.899,Manager-000001,"[Manager-000003_66dYlNZ_20231204 073055915000,...","Scattered Events(Scattered attr = ['Resource',..."
33,1163,Make decision,2023-12-06 03:27:38.206,Manager-000001,"[nan_34XTwPq_20231206 032815018666, nan_20bayi...","Scattered Events(Scattered attr = ['Resource',..."


In [14]:
# Show the first non-empty label in full (the table preview truncates it).
(
    EL_polluted
    .loc[EL_polluted['label'].ne(""), 'label']
    .reset_index(drop=True)[0]
)

"Scattered Events(Scattered attr = ['Resource', 'Timestamp'], Activity:['Make revision1', 'Make revision2'])"

### 5. Polluted Label pattern

In [3]:
# Polluted Label pattern applied to EL.
# `target` lists the activities to pollute (TBD: also numeric range);
# `action` is the template of the polluted label, as seen in the preview
# (e.g. 'Perform checks_98oFGFE_20230929 090836418000').
EL_polluted = PollutedLabel(
    EL,
    target="[Activity:('Perform checks', 'Make decision')]",
    action="[Activity]_[0-9:{2}][a-zA-Z:{5}]_[Timestamp*(%Y%m%d %H%M%S%f)]",
    ratio=0.7,
    tstart="2023-09-26 09:00:00.000",
    tend="2023-12-26 09:00:00.000",
    DecConstraint=(
        "Chain Response[Make decision, Notify accept] "
        "|A.Resource is Manager-000001 "
        "|T.Resource is Manager-000003 |"
    ),
    case_id_key="Case",
    timestamp_key="Timestamp",
)

Filtering step 1. The number of cases in the time interval (2023-09-26 09:00:00.000, 2023-12-26 09:00:00.000): 1505
Filtering step 2. The number of cases by declare rule: 34
Filtering step 3. The number of cases to be filtered by defined random portion: 24
1283
    Case                         Activity               Timestamp  \
92  1283           Check for completeness 2023-12-13 12:52:23.894   
93  1283  New online application received 2023-12-13 12:52:23.894   
94  1283                   Perform checks 2023-12-13 12:53:43.920   
95  1283                    Make decision 2023-12-13 13:08:36.949   
96  1283                    Notify accept 2023-12-13 13:20:52.505   
97  1283                     Deliver card 2023-12-13 14:20:03.871   
98  1283                     EVENT 13 END 2023-12-13 14:30:23.750   

          Resource case:concept:name label  
92    Clerk-000001              1283        
93             NaN              1283        
94    Clerk-000002              1283        
95  M

In [4]:
# Preview at most four rows whose 'label' is not the empty string.
EL_polluted.loc[EL_polluted['label'].ne("")].head(4)

Unnamed: 0,Case,Activity,Timestamp,Resource,case:concept:name,label
2,0,Perform checks_98oFGFE_20230929 090836418000,2023-09-29 09:08:36.418,Clerk-000003,0,polluted Label(Activity:'Perform checks')
3,0,Make decision_22SpmXv_20230929 090943339000,2023-09-29 09:09:43.339,Manager-000001,0,polluted Label(Activity:'Make decision')
9,1085,Perform checks_47abVfs_20231201 131322797000,2023-12-01 13:13:22.797,Clerk-000004,1085,polluted Label(Activity:'Perform checks')
10,1085,Make decision_81kUsbR_20231201 131517696000,2023-12-01 13:15:17.696,Manager-000001,1085,polluted Label(Activity:'Make decision')


### 6. Synonymous Label pattern

In [17]:
# Synonymous Label pattern applied to EL: the `target` activity is replaced by
# one of `syns`; `prob` presumably gives the matching selection weights
# (same length as `syns` — TODO confirm against the pattern code).
EL_polluted = SynonymousLabel(
    EL,
    target="[Activity:('Perform checks')]",
    syns=[
        "Perform checks - Dep1",
        "Perform checks - Dep2",
        "Perform checks - Dep3",
    ],
    prob=[0.1, 0.6, 0.3],
    ratio=0.7,
    tstart="2023-09-26 09:00:00.000",
    tend="2023-12-26 09:00:00.000",
    DecConstraint=(
        "Chain Response[Make decision, Notify accept] "
        "|A.Resource is Manager-000001 "
        "|T.Resource is Manager-000003 |"
    ),
    case_id_key="Case",
    timestamp_key="Timestamp",
)


Filtering step 1 . The number of cases in the time interval ( 2023-09-26 09:00:00.000 , 2023-12-26 09:00:00.000 ):  1505
Filtering step 2 . The number of cases by declare rule:  34
Filtering step 3 . The number of cases to be filtered by defined random portion:  24


In [18]:
# Preview at most four rows whose 'label' is not the empty string.
EL_polluted.loc[EL_polluted['label'].ne("")].head(4)

Unnamed: 0,Case,Activity,Timestamp,Resource,label
58807,97,Perform checks - Dep3,2023-10-04 14:33:28.710,Clerk-000005,synonymous label('Perform checks')
23902,510,Perform checks - Dep1,2023-10-30 01:22:06.896,Clerk-000002,synonymous label('Perform checks')
10406,556,Perform checks - Dep2,2023-11-01 01:51:12.379,Clerk-000005,synonymous label('Perform checks')
53521,575,Perform checks - Dep3,2023-11-01 20:50:44.810,Clerk-000005,synonymous label('Perform checks')


### 7. Homonymous Label pattern

In [19]:
# Homonymous Label pattern applied to EL: the activities in `target` are
# relabelled with the single shared name `hlabel`, as the preview shows.
EL_polluted = HomonymousLabel(
    EL,
    target="[Activity:('Perform checks', 'Check for completeness')]",
    hlabel="Check:homonymous",
    ratio=0.7,
    tstart="2023-09-26 09:00:00.000",
    tend="2023-12-26 09:00:00.000",
    DecConstraint=(
        "Chain Response[Make decision, Notify accept] "
        "|A.Resource is Manager-000001 "
        "|T.Resource is Manager-000003 |"
    ),
    case_id_key="Case",
    timestamp_key="Timestamp",
)

Filtering step 1 . The number of cases in the time interval ( 2023-09-26 09:00:00.000 , 2023-12-26 09:00:00.000 ):  1505
Filtering step 2 . The number of cases by declare rule:  34
Filtering step 3 . The number of cases to be filtered by defined random portion:  24


In [20]:
# Preview at most four rows whose 'label' is not the empty string.
EL_polluted.loc[EL_polluted['label'].ne("")].head(4)

Unnamed: 0,Case,Activity,Timestamp,Resource,label
61800,154,Check:homonymous,2023-10-09 02:50:57.264,Clerk-000005,homonymous Label(Activity:'Check for completen...
61803,154,Check:homonymous,2023-10-09 02:58:29.207,Clerk-000001,homonymous Label(Activity:'Check for completen...
61806,154,Check:homonymous,2023-10-09 03:32:32.364,Clerk-000001,homonymous Label(Activity:'Check for completen...
61807,154,Check:homonymous,2023-10-09 03:56:24.704,Clerk-000002,homonymous Label(Activity:'Perform checks')


### 8. Elusive Case pattern

In [21]:
# Elusive Case pattern, variant-based grouping.
# `method` accepts 'Variant' or 'KMeans'; `gnum` is the number of groups
# (the original note suggests 4 here and 100 for the clustering variant).
EL_polluted = ElusiveCase(
    EL,
    method='Variant',
    gnum=4,
    ratio=0.7,
    tstart="2023-09-26 09:00:00.000",
    tend="2023-12-26 09:00:00.000",
    DecConstraint=(
        "Chain Response[Make decision, Notify accept] "
        "|A.Resource is Manager-000001 "
        "|T.Resource is Manager-000003 |"
    ),
    case_id_key="Case",
    activity_key="Activity",
    timestamp_key="Timestamp",
)

Filtering step 1 . The number of cases in the time interval ( 2023-09-26 09:00:00.000 , 2023-12-26 09:00:00.000 ):  1505
Filtering step 2 . The number of cases by declare rule:  34
Filtering step 3 . The number of cases to be filtered by defined random portion:  24
Generating draft IDs... (the below count is based on variant-level)


100%|██████████| 12/12 [00:00<00:00, 1089.97it/s]


In [22]:
# Preview at most four rows whose 'label' is not the empty string.
EL_polluted.loc[EL_polluted['label'].ne("")].head(4)

Unnamed: 0,Activity,Timestamp,Resource,draft_ID,label
0,Check for completeness,2023-09-29 09:00:00.000,Clerk-000001,draft_0,0
1,New online application received,2023-09-29 09:00:00.000,,draft_0,0
2,Perform checks,2023-09-29 09:08:36.418,Clerk-000003,draft_0,0
3,Make decision,2023-09-29 09:09:43.339,Manager-000001,draft_0,0


In [23]:
# Elusive Case pattern, KMeans-based grouping.
# `method` accepts 'Variant' or 'KMeans'; `gnum` is the number of groups
# (4 draft IDs appear in the cluster sizes printed by this cell; the original
# note also mentions gnum = 100 for KMeans).
EL_polluted = ElusiveCase(
    EL,
    method='KMeans',
    gnum=4,
    ratio=0.7,
    tstart="2023-09-26 09:00:00.000",
    tend="2023-12-26 09:00:00.000",
    DecConstraint=(
        "Chain Response[Make decision, Notify accept] "
        "|A.Resource is Manager-000001 "
        "|T.Resource is Manager-000003 |"
    ),
    case_id_key="Case",
    activity_key="Activity",
    timestamp_key="Timestamp",
)

Filtering step 1 . The number of cases in the time interval ( 2023-09-26 09:00:00.000 , 2023-12-26 09:00:00.000 ):  1505
Filtering step 2 . The number of cases by declare rule:  34
Filtering step 3 . The number of cases to be filtered by defined random portion:  24
The size of each cluster is:
draft_ID
draft_1    12
draft_2    10
draft_0     1
draft_3     1
Name: count, dtype: int64


In [24]:
# Preview at most four rows whose 'label' is not the empty string.
EL_polluted.loc[EL_polluted['label'].ne("")].head(4)

Unnamed: 0,Activity,Timestamp,Resource,draft_ID,label
0,Check for completeness,2023-09-29 09:00:00.000,Clerk-000001,draft_1,0
1,New online application received,2023-09-29 09:00:00.000,,draft_1,0
2,Perform checks,2023-09-29 09:08:36.418,Clerk-000003,draft_1,0
3,Make decision,2023-09-29 09:09:43.339,Manager-000001,draft_1,0


### 9. Unanchored Event pattern

In [25]:
# Unanchored Event pattern.
# Rebuild the log with a 'System' column (nsys=10), then apply the pattern to
# the systems in `syslist`; `TimeFormat` is the format the affected timestamps
# are rendered in (compare the 'Timestamp' column of the preview).
ELsys = generate_system(EL, nsys=10)
EL_polluted = UnanchoredEvent(
    ELsys,
    syslist="[System:('System1', 'System3')]",
    TimeFormat="%Y/%m/%d %H:%M:%S.%f",
    tstart="2023-09-26 09:00:00.000",
    tend="2023-12-26 09:00:00.000",
    DecConstraint=(
        "Chain Response[Make decision, Notify accept] "
        "|A.Resource is Manager-000001 "
        "|T.Resource is Manager-000003 |"
    ),
    case_id_key="Case",
    timestamp_key="Timestamp",
)

Filtering step 1 . The number of cases in the time interval ( 2023-09-26 09:00:00.000 , 2023-12-26 09:00:00.000 ):  1505
Filtering step 2 . The number of cases by declare rule:  34


In [26]:
# Preview at most four rows whose 'label' is not the empty string.
EL_polluted.loc[EL_polluted['label'].ne("")].head(4)

Unnamed: 0,Case,Activity,Timestamp,Resource,System,label
0,0,Check for completeness,2023/09/29 09:00:00.000000,Clerk-000001,System3,unanchored event(2023-09-29 09:00:00)
248,97,Check for completeness,2023/10/04 14:18:44.672000,Clerk-000003,System3,unanchored event(2023-10-04 14:18:44.672000)
425,154,Check for completeness,2023/10/09 02:50:57.264000,Clerk-000005,System3,unanchored event(2023-10-09 02:50:57.264000)
426,154,Check for completeness,2023/10/09 02:58:29.207000,Clerk-000001,System3,unanchored event(2023-10-09 02:58:29.207000)


### 10. Distorted Label pattern

In [3]:
# Distorted Label pattern, random distortions.
# `who` selects the resources making mistakes; alternative from the original
# note: "[Resource:('Manager-000001')]".
# `distortion` picks the kind of typo at random; alternative from the original
# note: "[Activity:({'Check for completeness':'check for completeness'})]".
EL_polluted = DistortedLabel(
    EL,
    who="[Resource:(random{m=0.05, s=0.01})]",
    distortion="[Activity:random(Skip, Insert, Interchange, UpLow, Proximity)]",
    ratio=0.7,
    tstart="2023-09-26 09:00:00.000",
    tend="2023-12-26 09:00:00.000",
    DecConstraint=(
        "Chain Response[Make decision, Notify accept] "
        "|A.Resource is Manager-000001 "
        "|T.Resource is Manager-000003 |"
    ),
    case_id_key="Case",
    timestamp_key="Timestamp",
)

Filtering step 1 . The number of cases in the time interval ( 2023-09-26 09:00:00.000 , 2023-12-26 09:00:00.000 ):  1505
Filtering step 2 . The number of cases by declare rule:  34
Filtering step 3 . The number of cases to be filtered by defined random portion:  24
Total number of events with resource's mistake:  19


In [4]:
# Preview at most four rows whose 'label' is not the empty string.
EL_polluted.loc[EL_polluted['label'].ne("")].head(4)

Unnamed: 0,Case,Activity,Timestamp,Resource,label
0,1203,Request nifo,2023-12-08 05:44:54.933,Manager-000006,distorted label(Interchange)
1,1203,Delive card,2023-12-08 06:29:52.572,Manager-000006,distorted label(Skip)
2,1205,Ceck for completeness,2023-12-08 06:46:49.155,Clerk-000004,distorted label(Skip)
3,1205,info receifed,2023-12-08 07:02:47.763,,distorted label(Proximity)


In [3]:
# Distorted Label pattern, explicit mapping: `distortion` maps an original
# activity name to its distorted spelling; `prob` is presumably the chance
# that a matching event is distorted (TODO confirm against the pattern code).
EL_polluted = DistortedLabel(
    EL,
    distortion="[Activity:({'Check for completeness':'check for completeness'})]",
    prob=0.3,
    ratio=0.7,
    tstart="2023-09-26 09:00:00.000",
    tend="2023-12-26 09:00:00.000",
    DecConstraint=(
        "Chain Response[Make decision, Notify accept] "
        "|A.Resource is Manager-000001 "
        "|T.Resource is Manager-000003 |"
    ),
    case_id_key="Case",
    timestamp_key="Timestamp",
)

Filtering step 1 . The number of cases in the time interval ( 2023-09-26 09:00:00.000 , 2023-12-26 09:00:00.000 ):  1505
Filtering step 2 . The number of cases by declare rule:  34
Filtering step 3 . The number of cases to be filtered by defined random portion:  24


In [4]:
# Preview at most four rows whose 'label' is not the empty string.
EL_polluted.loc[EL_polluted['label'].ne("")].head(4)

Unnamed: 0,Case,Activity,Timestamp,Resource,label
7,801,check for completeness,2023-11-15 22:18:04.905,Clerk-000003,distorted label(Check for completeness:check f...
8,701,check for completeness,2023-11-09 13:11:50.880,Clerk-000001,distorted label(Check for completeness:check f...
9,283,check for completeness,2023-10-13 15:49:53.049,Clerk-000006,distorted label(Check for completeness:check f...
10,695,check for completeness,2023-11-09 05:54:53.445,Clerk-000003,distorted label(Check for completeness:check f...


In [5]:
# Show the first non-empty label in full (the table preview truncates it).
(
    EL_polluted
    .loc[EL_polluted['label'].ne(""), 'label']
    .reset_index(drop=True)[0]
)

'distorted label(Check for completeness:check for completeness)'

### 11. Inadvertent Time Travel pattern

In [29]:
# Inadvertent Time Travel pattern applied to EL.
# `tunit` is the timestamp unit that gets shifted (day, month or year);
# `prob_func` names the distribution used (poisson or exponential).
# Note the wider window here: `tend` is 2024-02-26, and ratio=1.0 keeps every
# eligible case.
EL_polluted = InadvertentTimeTravel(
    EL,
    target="[Activity:('Perform checks', 'Check for completeness')]",
    tunit="year",
    prob_func="poisson",
    tstart="2023-09-26 09:00:00.000",
    tend="2024-02-26 09:00:00.000",
    DecConstraint=(
        "Chain Response[Make decision, Notify accept] "
        "|A.Resource is Manager-000001 "
        "|T.Resource is Manager-000003 |"
    ),
    ratio=1.0,
    case_id_key="Case",
    timestamp_key="Timestamp",
)

Filtering step 1 . The number of cases in the time interval ( 2023-09-26 09:00:00.000 , 2024-02-26 09:00:00.000 ):  2545
Filtering step 2 . The number of cases by declare rule:  54
Filtering step 3 . The number of cases to be filtered by defined random portion:  54


In [30]:
# Preview at most four rows whose 'label' is not the empty string.
EL_polluted.loc[EL_polluted['label'].ne("")].head(4)

Unnamed: 0,Case,Activity,Timestamp,Resource,label
37091,1636,Perform checks,2023-01-03 01:24:39.729,Clerk-000005,inadvertent time(year:2024-01-03 01:24:39.729000)
