In [1]:
import os, sys
import argparse
import pandas as pd
import numpy as np
import pickle

In [2]:
from dateutil.parser import parse
from datetime import datetime
import time

# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook, so that later
# cells which assign into frames derived from other frames run without warning output.
pd.options.mode.chained_assignment = None

In [3]:
# Dataset selector: exactly one `name` assignment should be active.
#name = 'bpi_2012'
name = 'bpi_2013'
#name = 'helpdesk'

# Run configuration:
#   data_dir / data_file - location of the raw event log CSV
#   input_dir            - folder that receives the preprocessed pickle
#   train_pct / val_pct  - share of cases for training / validation (the remainder is the test split)
#   anomaly_pct          - share of values flagged as anomalous in the validation and test splits
#   seed                 - seed for NumPy's global random generator
args = {
    'data_dir': '../data/',
    'data_file': name + '.csv',
    'input_dir': '../input/{}/'.format(name),
    'train_pct': 0.6,
    'val_pct': 0.2,
    'anomaly_pct': 0.1,
    'seed': 42,
}

args = argparse.Namespace(**args)

# The anomaly labels are drawn with np.random; seeding makes a fresh "Run All" reproduce the same labels.
np.random.seed(args.seed)

In [4]:
# Make sure the output folders exist. exist_ok=True makes the call safe to re-run and removes the
# gap between checking for the directory and creating it.
os.makedirs('../input/', exist_ok=True)
os.makedirs(args.input_dir, exist_ok=True)

In [5]:
sys.path.insert(0, './../utils/')
from utils import *

# Load data

In [6]:
# Only the case id, the activity name and the completion timestamp are used.
# `cols` holds the canonical names these three columns get in the rest of the notebook.
cols = ['CaseID', 'Activity', 'CompleteTimestamp']

if name == 'helpdesk':
    # NOTE(review): the helpdesk CSV is assumed to contain exactly these three columns, in this
    # order, because the columns are renamed by position below - confirm.
    data = pd.read_csv(args.data_dir + args.data_file)
else:
    raw_cols = ['Case ID', 'Activity', 'Complete Timestamp']
    data = pd.read_csv(args.data_dir + args.data_file, usecols=raw_cols)
    # read_csv returns the `usecols` columns in file order, not in the order requested; select them
    # explicitly so that the positional rename below cannot attach a name to the wrong column.
    data = data[raw_cols]
    # Keep the second space-separated token of the raw case id (the part after the textual prefix).
    data['Case ID'] = data['Case ID'].apply(lambda x: x.split(' ')[1])

# Format for each column
data.columns = cols
# errors='coerce' turns unparseable timestamps into NaT instead of raising.
data['CompleteTimestamp'] = pd.to_datetime(data['CompleteTimestamp'], errors='coerce')
# Convert the whole column in one call rather than element by element.
data['CaseID'] = pd.to_numeric(data['CaseID'])

In [7]:
data.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp
0,1,Queued-Awaiting Assignment,2006-01-11 23:49:42
1,1,Accepted-In Progress,2012-03-15 19:53:52
2,1,Accepted-Assigned,2012-03-15 19:56:17
3,1,Accepted-In Progress,2012-03-15 20:09:05
4,1,Completed-Closed,2012-03-15 20:11:33


# Split data into train/val/test

In [8]:
# Group the event log by case: whole cases, not single rows, are divided between the splits.
groupByCase = data.groupby(['CaseID'])

# Cases per split: train_pct (60%) for training, val_pct (20%) for validation, the remainder for testing.
n_cases = len(groupByCase)
train_case_num = int(n_cases * args.train_pct)
val_case_num = int(n_cases * args.val_pct)
test_case_num = n_cases - train_case_num - val_case_num

In [9]:
# Assign whole cases to the splits by their position among the sorted case ids: the first
# train_case_num cases are training data, the next val_case_num validation data, the rest test data.
# Ranking the ids (instead of comparing the raw id with a count) keeps the split correct when the
# ids are not exactly 1..N, and boolean masks avoid growing a DataFrame one case at a time.
valid_events = data[data['CaseID'].notna()]  # groupby skips missing keys, so they are skipped here too
case_ids = np.sort(valid_events['CaseID'].unique())
train_ids = case_ids[:train_case_num]
val_ids = case_ids[train_case_num:train_case_num + val_case_num]

# A stable sort puts the cases in ascending id order (the order groupby iterates in) while
# keeping the events of each case in their original order; the original index is preserved.
events_by_case = valid_events.sort_values('CaseID', kind='mergesort')
in_train = events_by_case['CaseID'].isin(train_ids)
in_val = events_by_case['CaseID'].isin(val_ids)

# Copies, so that columns can be added to one split without touching `data` or the other splits.
data_train = events_by_case[in_train].copy()
data_val = events_by_case[in_val].copy()
data_test = events_by_case[~(in_train | in_val)].copy()

In [10]:
# Sanity check: the three splits together must contain every row of the original log.
split_row_total = sum(part.shape[0] for part in (data_train, data_val, data_test))
print('Checking shapes of sub data: ', data.shape[0] == split_row_total)

Checking shapes of sub data:  True


In [11]:
data_test.shape

(1071, 3)

In [12]:
# Number of event rows (not cases) in each split.
train_row_num, val_row_num, test_row_num = (
    len(part) for part in (data_train, data_val, data_test)
)

# Introduce anomalous data

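**TODO:** This step will be modified later. For now it only flags randomly chosen rows as anomalous (`ActivityLabel` / `TimeLabel`); the mutations listed below are not yet applied to the data itself.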

In [13]:
def _anomaly_counts(frame, pct):
    """Return (total, activity, time) anomaly counts for one split.

    The total is rows * (columns - 1) * pct / 2, truncated to an int. Half of it (rounded down)
    goes to activity anomalies and the rest to time anomalies.
    """
    n_rows, n_cols = frame.shape
    total = int(n_rows * (n_cols - 1) * pct / 2)
    n_activity = int(total / 2)
    return total, n_activity, total - n_activity


# The column count enters the formula, so this has to run before the label columns are added.
anomaly_num_val, anomalous_act_num_val, anomalous_time_num_val = _anomaly_counts(data_val, args.anomaly_pct)
anomaly_num_test, anomalous_act_num_test, anomalous_time_num_test = _anomaly_counts(data_test, args.anomaly_pct)

print('Number of anomalous values in validate set: {}'.format(anomaly_num_val))
print('Number of anomalous activities in validate set: {}'.format(anomalous_act_num_val))
print('Number of anomalous time in validate set: {}'.format(anomalous_time_num_val))
print('\n')
print('Number of anomalous values in test set: {}'.format(anomaly_num_test))
print('Number of anomalous activities in test set: {}'.format(anomalous_act_num_test))
print('Number of anomalous time in test set: {}'.format(anomalous_time_num_test))

Number of anomalous values in validate set: 109
Number of anomalous activities in validate set: 54
Number of anomalous time in validate set: 55


Number of anomalous values in test set: 107
Number of anomalous activities in test set: 53
Number of anomalous time in test set: 54


In [14]:
# Every row starts out labelled as normal (0) for both the activity and the time attribute.
for labelled_split in (data_val, data_test):
    labelled_split['ActivityLabel'] = 0
    labelled_split['TimeLabel'] = 0

In [15]:
data_val.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,ActivityLabel,TimeLabel
4493,893,Queued-Awaiting Assignment,2012-01-14 04:03:40,0,0
4494,893,Accepted-In Progress,2012-01-31 22:16:55,0,0
4495,893,Queued-Awaiting Assignment,2012-01-31 22:29:08,0,0
4496,893,Accepted-In Progress,2012-01-31 23:02:59,0,0
4497,893,Completed-Closed,2012-02-18 04:13:20,0,0


In [56]:
data_test.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,ActivityLabel,TimeLabel
5589,1190,Accepted-In Progress,2012-03-07 22:24:22,0,0
5590,1190,Completed-Closed,2012-04-11 21:36:11,0,0
5591,1191,Accepted-In Progress,2012-03-07 22:26:08,0,0
5592,1191,Completed-Closed,2012-04-25 16:42:09,0,0
5593,1192,Accepted-In Progress,2012-03-08 00:06:16,0,0


## Activity

**Mutation:**
- Swap 2 activities within a case
- Duplicate 1 activity

In [16]:
anomaly_val = data_val.copy()

# Pick anomalous_act_num_val distinct rows in a single draw and flag them as activity anomalies.
# Sampling without replacement yields the same kind of result as retrying on already-flagged rows,
# but it cannot loop forever: it raises ValueError if more rows are requested than exist.
act_rows_val = np.random.choice(anomaly_val.shape[0], size=anomalous_act_num_val, replace=False)
anomaly_val.iloc[act_rows_val, anomaly_val.columns.get_loc('ActivityLabel')] = 1

In [17]:
anomaly_test = data_test.copy()

# Pick anomalous_act_num_test distinct rows in a single draw and flag them as activity anomalies.
# Sampling without replacement yields the same kind of result as retrying on already-flagged rows,
# but it cannot loop forever: it raises ValueError if more rows are requested than exist.
act_rows_test = np.random.choice(anomaly_test.shape[0], size=anomalous_act_num_test, replace=False)
anomaly_test.iloc[act_rows_test, anomaly_test.columns.get_loc('ActivityLabel')] = 1

In [18]:
anomaly_val['ActivityLabel'].sum(), anomaly_test['ActivityLabel'].sum()

(54, 53)

## Time

**Mutation:**
- Inaccurate time
- Extreme value

In [19]:
# Pick anomalous_time_num_val distinct rows in a single draw and flag them as time anomalies.
# The draw does not look at ActivityLabel, so a row may carry both labels. Sampling without
# replacement cannot loop forever: it raises ValueError if more rows are requested than exist.
time_rows_val = np.random.choice(anomaly_val.shape[0], size=anomalous_time_num_val, replace=False)
anomaly_val.iloc[time_rows_val, anomaly_val.columns.get_loc('TimeLabel')] = 1

In [20]:
# Pick anomalous_time_num_test distinct rows in a single draw and flag them as time anomalies.
# The draw does not look at ActivityLabel, so a row may carry both labels. Sampling without
# replacement cannot loop forever: it raises ValueError if more rows are requested than exist.
time_rows_test = np.random.choice(anomaly_test.shape[0], size=anomalous_time_num_test, replace=False)
anomaly_test.iloc[time_rows_test, anomaly_test.columns.get_loc('TimeLabel')] = 1

In [21]:
anomaly_val['TimeLabel'].sum(), anomaly_test['TimeLabel'].sum()

(55, 54)

# Prepare input

In [22]:
data_train.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp
0,1,Queued-Awaiting Assignment,2006-01-11 23:49:42
1,1,Accepted-In Progress,2012-03-15 19:53:52
2,1,Accepted-Assigned,2012-03-15 19:56:17
3,1,Accepted-In Progress,2012-03-15 20:09:05
4,1,Completed-Closed,2012-03-15 20:11:33


In [23]:
anomaly_val.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,ActivityLabel,TimeLabel
4493,893,Queued-Awaiting Assignment,2012-01-14 04:03:40,0,0
4494,893,Accepted-In Progress,2012-01-31 22:16:55,0,0
4495,893,Queued-Awaiting Assignment,2012-01-31 22:29:08,0,0
4496,893,Accepted-In Progress,2012-01-31 23:02:59,0,0
4497,893,Completed-Closed,2012-02-18 04:13:20,0,0


In [24]:
anomaly_test.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,ActivityLabel,TimeLabel
5589,1190,Accepted-In Progress,2012-03-07 22:24:22,0,1
5590,1190,Completed-Closed,2012-04-11 21:36:11,0,0
5591,1191,Accepted-In Progress,2012-03-07 22:26:08,0,0
5592,1191,Completed-Closed,2012-04-25 16:42:09,0,0
5593,1192,Accepted-In Progress,2012-03-08 00:06:16,0,0


## Labels

In [25]:
# Keep the per-row labels as separate Series before the label columns are dropped from the frames.
activity_label_val, time_label_val = anomaly_val['ActivityLabel'], anomaly_val['TimeLabel']
activity_label_test, time_label_test = anomaly_test['ActivityLabel'], anomaly_test['TimeLabel']

In [26]:
# Remove the label columns so the validation/test frames have the same columns as the training frame.
label_columns = ['ActivityLabel', 'TimeLabel']
anomaly_val = anomaly_val.drop(columns=label_columns)
anomaly_test = anomaly_test.drop(columns=label_columns)

In [27]:
# Stack train, validation and test rows (in that order) so the encoding and time features are
# computed over one frame. pd.concat replaces DataFrame.append, which newer pandas no longer provides.
combined_data = pd.concat([data_train, anomaly_val, anomaly_test])

In [28]:
combined_data.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp
0,1,Queued-Awaiting Assignment,2006-01-11 23:49:42
1,1,Accepted-In Progress,2012-03-15 19:53:52
2,1,Accepted-Assigned,2012-03-15 19:56:17
3,1,Accepted-In Progress,2012-03-15 20:09:05
4,1,Completed-Closed,2012-03-15 20:11:33


## Activity

In [29]:
# Categorical columns that are handed to OHE for encoding.
cat_var = ['Activity']

In [30]:
# NOTE(review): OHE comes from the star import of utils. Judging by the output displayed below, it
# replaces 'Activity' with one indicator column per activity value - confirm against utils.
enc_data = OHE(combined_data, cat_var)

In [31]:
enc_data.head()

Unnamed: 0,CaseID,CompleteTimestamp,Activity_Accepted-Assigned,Activity_Accepted-In Progress,Activity_Accepted-Wait,Activity_Completed-Cancelled,Activity_Completed-Closed,Activity_Queued-Awaiting Assignment,Activity_Unmatched-Unmatched
0,1,2006-01-11 23:49:42,0,0,0,0,0,1,0
1,1,2012-03-15 19:53:52,0,1,0,0,0,0,0
2,1,2012-03-15 19:56:17,1,0,0,0,0,0,0
3,1,2012-03-15 20:09:05,0,1,0,0,0,0,0
4,1,2012-03-15 20:11:33,0,0,0,0,1,0,0


In [32]:
# Everything except the case id and the timestamp, i.e. the encoded activity columns.
activity_data = enc_data.drop(columns=['CaseID', 'CompleteTimestamp'])

## Time

### Get time

In [33]:
# Case id and timestamp only: the inputs for the per-case duration computation.
time_data = enc_data[['CaseID', 'CompleteTimestamp']]

In [34]:
time_data.head()

Unnamed: 0,CaseID,CompleteTimestamp
0,1,2006-01-11 23:49:42
1,1,2012-03-15 19:53:52
2,1,2012-03-15 19:56:17
3,1,2012-03-15 20:09:05
4,1,2012-03-15 20:11:33


In [35]:
# Group the time columns by case. This rebinds groupByCase, which earlier referred to the grouping
# of the raw event log.
groupByCase = time_data.groupby(['CaseID'])

In [36]:
# Compute the durations case by case. The per-case frames are collected in a list and joined once,
# instead of re-copying a growing DataFrame on every iteration.
# NOTE(review): calculateDuration and convert2seconds come from utils. calculateDuration is expected
# to add a 'Duration' column, and the displayed values look like seconds since the previous event
# of the same case - confirm against utils.
leading_columns = list(time_data) + ['Duration']
duration_frames = []
for case, group in groupByCase:
    group = calculateDuration(group)
    group['Duration'] = group['Duration'].apply(convert2seconds)
    duration_frames.append(group)

if duration_frames:
    temp_time = pd.concat(duration_frames)
    # Keep CaseID, CompleteTimestamp, Duration as the first three columns; later cells read
    # 'Duration' by position.
    trailing_columns = [c for c in temp_time.columns if c not in leading_columns]
    temp_time = temp_time[leading_columns + trailing_columns]
else:
    # No cases at all: keep an empty frame with the expected columns.
    temp_time = pd.DataFrame(columns=leading_columns)

In [37]:
temp_time.head()

Unnamed: 0,CaseID,CompleteTimestamp,Duration
0,1,2006-01-11 23:49:42,0.0
1,1,2012-03-15 19:53:52,194817850.0
2,1,2012-03-15 19:56:17,145.0
3,1,2012-03-15 20:09:05,768.0
4,1,2012-03-15 20:11:33,148.0


### Get divisor

In [38]:
# Normalization bounds come from the training rows only (the first train_row_num rows), so that no
# validation/test information enters the scaling. 'Duration' is the third column of temp_time.
train_durations = temp_time['Duration'].iloc[:train_row_num]
min_value = np.min(train_durations)
max_value = np.max(train_durations)

In [39]:
# Show the bounds that the min-max scaling below will use.
print('Min used for normalization: {}'.format(min_value))
print('Max used for normalization: {}'.format(max_value))

Min used for normalization: 0.0
Max used for normalization: 194817850.0


### Normalize time

In [40]:
# Min-max scale every duration with the training bounds. Values outside the training range
# (possible in validation/test rows) fall outside [0, 1].
duration_span = max_value - min_value
temp_time['NormalizedDuration'] = temp_time['Duration'].map(
    lambda seconds: (seconds - min_value) / duration_span
)

In [41]:
temp_time.head()

Unnamed: 0,CaseID,CompleteTimestamp,Duration,NormalizedDuration
0,1,2006-01-11 23:49:42,0.0,0.0
1,1,2012-03-15 19:53:52,194817850.0,1.0
2,1,2012-03-15 19:56:17,145.0,7.44285e-07
3,1,2012-03-15 20:09:05,768.0,3.942144e-06
4,1,2012-03-15 20:11:33,148.0,7.59684e-07


In [42]:
# Only the case id and the normalized duration are needed from here on.
temp_time = temp_time.drop(columns=['CompleteTimestamp', 'Duration'])

## 0-padding

In [43]:
# Put the time features and the encoded activity columns side by side; with axis=1 pandas matches
# the rows of the two frames on their index.
full_data = pd.concat([temp_time, activity_data], axis=1)

In [44]:
full_data.head()

Unnamed: 0,CaseID,NormalizedDuration,Activity_Accepted-Assigned,Activity_Accepted-In Progress,Activity_Accepted-Wait,Activity_Completed-Cancelled,Activity_Completed-Closed,Activity_Queued-Awaiting Assignment,Activity_Unmatched-Unmatched
0,1,0.0,0,0,0,0,0,1,0
1,1,1.0,0,1,0,0,0,0,0
2,1,7.44285e-07,1,0,0,0,0,0,0
3,1,3.942144e-06,0,1,0,0,0,0,0
4,1,7.59684e-07,0,0,0,0,1,0,0


In [46]:
# Start the mask frame as a copy, so that writing the mask values leaves full_data untouched.
pad_index = full_data.copy()

In [47]:
# The feature columns are every column of full_data except the case identifier. `cols` must be
# rebound here: until now it still holds the raw column names from the loading step ('CaseID',
# 'Activity', 'CompleteTimestamp'). Two of those no longer exist in full_data, and overwriting
# 'CaseID' would destroy the per-case grouping done afterwards.
cols = [c for c in full_data.columns if c != 'CaseID']

# Mark every real (non-padded) feature value with 1.0; 'CaseID' is kept for grouping.
pad_index[cols] = 1.0

In [48]:
pad_index.head()

Unnamed: 0,CaseID,NormalizedDuration,Activity_Accepted-Assigned,Activity_Accepted-In Progress,Activity_Accepted-Wait,Activity_Completed-Cancelled,Activity_Completed-Closed,Activity_Queued-Awaiting Assignment,Activity_Unmatched-Unmatched
0,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Vectorize

In [49]:
# Group the training events by case.
train_groupByCase = data_train.groupby(['CaseID'])

# NOTE(review): findLongestLength comes from utils; presumably it returns the largest number of
# events in any training case, which is then used as the sequence length - confirm. Validation and
# test cases are not looked at here.
maxlen = findLongestLength(train_groupByCase)
print('Maxlen: ', maxlen)

Maxlen:  35


In [50]:
# Turn the per-case feature rows into one array.
# NOTE(review): getInput comes from utils; the shapes printed further down suggest it returns one
# (maxlen, len(cols)) slice per case, stacked along the first axis - confirm, including how cases
# longer than maxlen are handled. `cols` is expected to list the feature columns of full_data.
data_groupByCase = full_data.groupby(['CaseID'])
vectorized_data = getInput(data_groupByCase, cols, maxlen)

# Apply the same vectorization to the mask frame, so both arrays have the same shape.
pad_index_groupByCase = pad_index.groupby(['CaseID'])
vectorized_pad_index = getInput(pad_index_groupByCase, cols, maxlen)

# Split into train/val/test

In [51]:
# Report the array shapes next to the expected number of cases per split; the first dimension of
# the arrays should equal the sum of the three case counts.
print('Shape of vectorized data: {}'.format(vectorized_data.shape))
print('Shape of vectorized pad index: {}'.format(vectorized_pad_index.shape))
print('\n')
print('Number of case for train: {}'.format(train_case_num))
print('Number of case for validate: {}'.format(val_case_num))
print('Number of case for test: {}'.format(test_case_num))

Shape of vectorized data: (1487, 35, 8)
Shape of vectorized pad index: (1487, 35, 8)


Number of case for train: 892
Number of case for validate: 297
Number of case for test: 298


In [52]:
# Cut the arrays along the case axis: training cases first, then validation, then test.
# The test slice starts at an explicit offset; a negative slice such as [-test_case_num:] would
# return the whole array when test_case_num is 0.
val_end = train_case_num + val_case_num

input_train = vectorized_data[:train_case_num]
input_val = vectorized_data[train_case_num:val_end]
input_test = vectorized_data[val_end:]

pad_index_train = vectorized_pad_index[:train_case_num]
pad_index_val = vectorized_pad_index[train_case_num:val_end]
pad_index_test = vectorized_pad_index[val_end:]

In [53]:
# Each line prints True when the split holds exactly the expected number of cases.
print('Check shape of input for training: {}'.format(input_train.shape[0]==train_case_num))
print('Check shape of input for validation: {}'.format(input_val.shape[0]==val_case_num))
print('Check shape of input for testing: {}'.format(input_test.shape[0]==test_case_num))

Check shape of input for training: True
Check shape of input for validation: True
Check shape of input for testing: True


# Save data

In [58]:
# One pickle file per anomaly percentage, stored in the dataset's input folder.
preprocessed_data_name = os.path.join(args.input_dir, 'preprocessed_data_{}.pkl'.format(args.anomaly_pct))

# The objects are written as consecutive pickles in exactly this order; a reader has to call
# pickle.load the same number of times, in the same order.
objects_to_save = (
    input_train, input_val, input_test,
    pad_index_train, pad_index_val, pad_index_test,
    activity_label_val, activity_label_test,
    time_label_val, time_label_test,
    train_case_num, val_case_num, test_case_num,
    train_row_num, val_row_num, test_row_num,
    min_value, max_value,
    cols,
)

with open(preprocessed_data_name, 'wb') as f:
    for saved_object in objects_to_save:
        pickle.dump(saved_object, f, protocol=2)