In [1]:
import numpy as np
import pandas as pd
import pm4py 
import os
import sys
from copy import deepcopy

In [2]:
data_path = r'D:\VS Code Projects\Seq-xAI\data\BPI Challenge 2017.xes'
data = pm4py.read_xes(data_path)

parsing log, completed traces ::   0%|          | 0/31509 [00:00<?, ?it/s]

In [None]:
data['Action'].value_counts(), data['EventOrigin'].value_counts()

(Action
 statechange    358940
 Obtained       255387
 Created        223608
 Released       215402
 Deleted        148930
 Name: count, dtype: int64,
 EventOrigin
 Workflow       768823
 Application    239595
 Offer          193849
 Name: count, dtype: int64)

: 

In [42]:
data

Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,case:RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID
0,Created,User_1,A_Create Application,Application,Application_652823628,complete,2016-01-01 09:51:15.304000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
1,statechange,User_1,A_Submitted,Application,ApplState_1582051990,complete,2016-01-01 09:51:15.352000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
2,Created,User_1,W_Handle leads,Workflow,Workitem_1298499574,schedule,2016-01-01 09:51:15.774000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1673366067,withdraw,2016-01-01 09:52:36.392000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
4,Created,User_1,W_Complete application,Workflow,Workitem_1493664571,schedule,2016-01-01 09:52:36.403000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1202262,Deleted,User_1,W_Call after offers,Workflow,Workitem_1817549786,ate_abort,2017-01-06 06:33:02.212000+00:00,Home improvement,New credit,Application_1350494635,20000.0,,,,,,,,
1202263,Created,User_1,W_Call after offers,Workflow,Workitem_363876066,schedule,2017-01-06 06:33:02.221000+00:00,Home improvement,New credit,Application_1350494635,20000.0,,,,,,,,
1202264,statechange,User_28,A_Cancelled,Application,ApplState_1869071797,complete,2017-01-16 09:51:21.114000+00:00,Home improvement,New credit,Application_1350494635,20000.0,,,,,,,,
1202265,statechange,User_28,O_Cancelled,Offer,OfferState_420066181,complete,2017-01-16 09:51:21.139000+00:00,Home improvement,New credit,Application_1350494635,20000.0,,,,,,,,Offer_1580299144


In [3]:
case_id_col = "case:concept:name"
activity_col = "concept:name"
resource_col = 'org:resource'
timestamp_col = 'time:timestamp'
label_col = "label"
pos_label = "deviant"
neg_label = "regular"

# These will be the targets of the classification task
relevant_offer_events = ["O_Cancelled", "O_Accepted", "O_Refused"]

In [4]:
resource_freq_threshold = 10
max_category_levels = 10

In [48]:
# features for classifier
# Categorical features 
dynamic_cat_cols = [activity_col, resource_col, 'Action', 'CreditScore', 'EventOrigin', 'lifecycle:transition',
                   "Accepted", "Selected"] # i.e. event attributes
static_cat_cols = ['case:ApplicationType', 'case:LoanGoal'] # i.e. case attributes that are known from the start

# Numeric features
dynamic_num_cols = ['FirstWithdrawalAmount', 'MonthlyCost', 'NumberOfTerms', 'OfferedAmount',
                   "timesincelastevent", "timesincecasestart", "timesincemidnight", "event_nr", "month", "weekday", "hour",
                    "open_cases"]
static_num_cols = ['case:RequestedAmount']

static_cols = static_cat_cols + static_num_cols + [case_id_col, label_col]
dynamic_cols = dynamic_cat_cols + dynamic_num_cols + [timestamp_col]
cat_cols = dynamic_cat_cols + static_cat_cols

In [6]:
encoding_dict = {
    'agg' : ['static', 'agg'],
    'index' : ['static', 'index']
    }

In [7]:
bucketing = 'single'
encoding = 'agg'
method_name = ('_').join([bucketing, encoding])

In [15]:
def extract_timestamp_features(group):
    
    group = group.sort_values(timestamp_col, ascending=False, kind='mergesort')
    
    tmp = group[timestamp_col] - group[timestamp_col].shift(-1)
    # tmp = tmp.fillna(0)
    group["timesincelastevent"] = tmp.apply(lambda x: float(x / np.timedelta64(1, 'm'))) # m is for minutes

    tmp = group[timestamp_col] - group[timestamp_col].iloc[-1]
    tmp = tmp.fillna(0)
    group["timesincecasestart"] = tmp.apply(lambda x: float(x / np.timedelta64(1, 'm'))) # m is for minutes

    group = group.sort_values(timestamp_col, ascending=True, kind='mergesort')
    group["event_nr"] = range(1, len(group) + 1)
    
    return group
    

def get_open_cases(date_):
    return sum((dt_first_last_timestamps["start_time"] <= date_) & (dt_first_last_timestamps["end_time"] > date_))



In [9]:
original_data = deepcopy(data)

In [10]:
# add event duration
data[timestamp_col] = pd.to_datetime(data[timestamp_col])
data["timesincemidnight"] = data[timestamp_col].dt.hour * 60 + data[timestamp_col].dt.minute
data["month"] = data[timestamp_col].dt.month
data["weekday"] = data[timestamp_col].dt.weekday
data["hour"] = data[timestamp_col].dt.hour


In [12]:
from tqdm.notebook import tqdm
tqdm.pandas()

In [13]:
# add features extracted from timestamp
print("Extracting timestamp features...")
sys.stdout.flush()
data = data.groupby(case_id_col, as_index=False).progress_apply(extract_timestamp_features)

Extracting timestamp features...


  0%|          | 0/31509 [00:00<?, ?it/s]

In [14]:
# add inter-case features
print("Extracting open cases...")
sys.stdout.flush()
data = data.sort_values([timestamp_col], ascending=True, kind='mergesort')
dt_first_last_timestamps = data.groupby(case_id_col)[timestamp_col].agg(["min", "max"]) # [min, max]


Extracting open cases...


In [None]:
dt_first_last_timestamps.columns = ["start_time", "end_time"]
data["open_cases"] = data[timestamp_col].progress_apply(get_open_cases)

  0%|          | 0/1202267 [00:00<?, ?it/s]

In [20]:
# assign class labels
print("Assigning class labels...")
sys.stdout.flush()
last_o_events = data[data.EventOrigin == "Offer"].sort_values(timestamp_col, ascending=True, kind='mergesort').groupby(case_id_col).last()[activity_col]
last_o_events = pd.DataFrame(last_o_events)
last_o_events.columns = ["last_o_activity"]
data = data.merge(last_o_events, left_on=case_id_col, right_index=True)
data = data[data.last_o_activity.isin(relevant_offer_events)]

Assigning class labels...


In [None]:
import pickle 

with open('D:\VS Code Projects\Seq-xAI\data\merged_bpic2017.pkl', 'wb') as f:
    pickle.dump(data, f)

In [27]:
output_data_folder = 'D:\VS Code Projects\Seq-xAI\data'
filename = 'bpic2017'

In [52]:
for activity in relevant_offer_events:
    print("Finishing dataset for activity ", activity)
    sys.stdout.flush()
    dt_labeled = data.copy()
    dt_labeled[label_col] = neg_label
    dt_labeled.loc[dt_labeled["last_o_activity"] == activity, label_col] = pos_label

    dt_labeled = dt_labeled[static_cols + dynamic_cols]

    # impute missing values
    grouped = dt_labeled.sort_values(timestamp_col, ascending=True, kind='mergesort').groupby(case_id_col)
    for col in static_cols + dynamic_cols:
        dt_labeled[col] = grouped[col].transform(lambda grp: grp.fillna(method='ffill'))

    dt_labeled[cat_cols] = dt_labeled[cat_cols].fillna('missing')
    dt_labeled = dt_labeled.fillna(0)

    # set infrequent factor levels to "other"
    for col in cat_cols:
        if col == resource_col:
            counts = dt_labeled[col].value_counts()
            mask = dt_labeled[col].isin(counts[counts >= resource_freq_threshold].index)
            dt_labeled.loc[~mask, col] = "other"
        elif col != activity_col:
            counts = dt_labeled[col].value_counts()
            mask = dt_labeled[col].isin(counts.index[max_category_levels:])
            dt_labeled.loc[mask, col] = "other"

    dt_labeled.to_csv(os.path.join(output_data_folder, "%s_%s.csv" % (filename[:-4], activity)), sep=";", index=False)


Finishing dataset for activity  O_Cancelled


  dt_labeled[col] = grouped[col].transform(lambda grp: grp.fillna(method='ffill'))


Finishing dataset for activity  O_Accepted


  dt_labeled[col] = grouped[col].transform(lambda grp: grp.fillna(method='ffill'))


Finishing dataset for activity  O_Refused


  dt_labeled[col] = grouped[col].transform(lambda grp: grp.fillna(method='ffill'))
