In [1]:
import os, sys
import argparse
import pandas as pd
import numpy as np
import pickle

In [2]:
from dateutil.parser import parse
from datetime import datetime
import time
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
# Setting this option to None switches off pandas' chained-assignment check, so
# SettingWithCopyWarning is never emitted.
# NOTE(review): this also hides genuine chained-assignment mistakes in later cells.
pd.options.mode.chained_assignment = None #to run loop quicker without warnings

In [3]:
# Dataset selector: used as the CSV file stem under data_dir and as the name of
# the per-dataset output folder under ../input/.
name = 'bpi_2012'
#name = 'bpi_2013'

args = {
    'data_dir': '../data/',
    'data_file': name + '.csv',
    'input_dir': '../input/{}/'.format(name),
    'train_pct': 0.6,              # share of cases used for training
    'val_pct': 0.2,                # share of cases used for validation; the rest is test
    'anomaly_pct': 0.1,            # share of attribute values replaced by anomalies
    'scaler': 'standardization',   # or 'normalization'
    'seed': 42,                    # seed for NumPy's global RNG
}

args = argparse.Namespace(**args)

# The anomaly injection below draws from NumPy's global RNG; seeding it here
# makes a Restart & Run All produce the same anomalies every time.
np.random.seed(args.seed)

In [4]:
# Make sure the shared input root and the per-dataset folder exist before
# anything is written; exist_ok turns the call into a no-op for existing folders.
for directory in ('../input/', args.input_dir):
    os.makedirs(directory, exist_ok=True)

In [5]:
sys.path.insert(0, './../utils/')
from utils import *

# Load data

In [6]:
# Only consider Case, Activity, Timestamp
source_cols = ['Case ID', 'Activity', 'Complete Timestamp']
cols = ['CaseID', 'Activity', 'CompleteTimestamp']

data = pd.read_csv(args.data_dir + args.data_file, usecols=source_cols)

# read_csv keeps the column order of the file (usecols does not reorder), so
# rename by name and then impose the order explicitly instead of assigning
# `data.columns` positionally, which would mislabel columns for a CSV whose
# columns come in a different order.
data = data.rename(columns=dict(zip(source_cols, cols)))[cols]

# Keep the second space-separated token of the case identifier and make it numeric.
# The element-wise split is kept on purpose: an identifier without a space fails
# loudly with IndexError instead of silently becoming NaN.
data['CaseID'] = pd.to_numeric(data['CaseID'].apply(lambda x: x.split(' ')[1]))

# For Timestamp: Convert to time. errors='coerce' turns unparseable values into NaT.
data['CompleteTimestamp'] = pd.to_datetime(data['CompleteTimestamp'], errors='coerce')

In [7]:
# Preview the first rows of the parsed event log (CaseID, Activity, CompleteTimestamp).
data.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp
0,1,Queued-Awaiting Assignment,2006-01-11 23:49:42
1,1,Accepted-In Progress,2012-03-15 19:53:52
2,1,Accepted-Assigned,2012-03-15 19:56:17
3,1,Accepted-In Progress,2012-03-15 20:09:05
4,1,Completed-Closed,2012-03-15 20:11:33


In [8]:
#Calculate duration and cumulative duration
# calculateDuration / calculateCumDuration / convert2seconds come from the
# star-imported utils module. Each case is processed on its own and the pieces
# are concatenated once at the end: growing a frame with DataFrame.append inside
# the loop copies it on every iteration, and append no longer exists in pandas >= 2.0.
groupByCase = data.groupby(['CaseID'])
duration_groups = []

for case, group in groupByCase:
    group = calculateDuration(group)
    group = calculateCumDuration(group)
    group['Duration'] = group['Duration'].apply(convert2seconds)
    group['CumDuration'] = group['CumDuration'].apply(convert2seconds)
    duration_groups.append(group)

# Groups keep their original index labels, exactly as the append-based version did.
duration_df = pd.concat(duration_groups)

In [9]:
# Inspect the per-event Duration and CumDuration columns (passed through convert2seconds).
duration_df.head(10)

Unnamed: 0,CaseID,Activity,CompleteTimestamp,Duration,CumDuration
0,1,Queued-Awaiting Assignment,2006-01-11 23:49:42,0.0,0.0
1,1,Accepted-In Progress,2012-03-15 19:53:52,194817850.0,194817850.0
2,1,Accepted-Assigned,2012-03-15 19:56:17,145.0,194817995.0
3,1,Accepted-In Progress,2012-03-15 20:09:05,768.0,194818763.0
4,1,Completed-Closed,2012-03-15 20:11:33,148.0,194818911.0
5,2,Accepted-In Progress,2006-11-07 18:00:36,0.0,0.0
6,2,Accepted-In Progress,2006-11-07 21:05:44,11108.0,11108.0
7,2,Accepted-Wait,2009-12-02 22:24:32,96859128.0,96870236.0
8,2,Accepted-In Progress,2011-09-03 14:09:09,55266277.0,152136513.0
9,2,Accepted-In Progress,2012-01-20 18:23:24,12024855.0,164161368.0


In [10]:
#get statistics storage for activity
# Group by the plain column name: grouping by the one-element list ['Activity']
# makes recent pandas yield 1-tuples as group keys, and the duration-mutation cell
# looks entries up by the activity string itself.
groupByActivity = duration_df.groupby('Activity')

# {activity: {'mean': mean Duration, 'std': sample std of Duration}}
statistics_storage = {
    act: {
        'mean': act_data['Duration'].mean(),
        'std': act_data['Duration'].std(),
    }
    for act, act_data in groupByActivity
}

In [11]:
# Show the per-activity mean/std of Duration collected in statistics_storage.
print('Descriptive statistics: \n{}'.format(statistics_storage))

Descriptive statistics: 
{'Completed-Cancelled': {'std': 48530.936786068058, 'mean': 28252.333333333332}, 'Accepted-Wait': {'std': 9675577.7800667603, 'mean': 3474376.5730550284}, 'Accepted-Assigned': {'std': 7115575.8771717735, 'mean': 2284207.5146579803}, 'Accepted-In Progress': {'std': 11479844.398664003, 'mean': 3862876.8098499672}, 'Unmatched-Unmatched': {'std': 1613426.676996954, 'mean': 659730.09999999998}, 'Queued-Awaiting Assignment': {'std': 4089422.3905510097, 'mean': 1000673.2914285715}, 'Completed-Closed': {'std': 9756199.0950356536, 'mean': 4487671.0242811497}}


In [12]:
# Distinct activity names; this is the pool replacement activities are drawn from below.
act_list = data['Activity'].unique()
print('Activity: {}'.format(act_list))

Activity: ['Queued-Awaiting Assignment' 'Accepted-In Progress' 'Accepted-Assigned'
 'Completed-Closed' 'Accepted-Wait' 'Unmatched-Unmatched'
 'Completed-Cancelled']


# Introduce anomalous data

In [13]:
# Re-check the raw event log before anomalies are injected.
data.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp
0,1,Queued-Awaiting Assignment,2006-01-11 23:49:42
1,1,Accepted-In Progress,2012-03-15 19:53:52
2,1,Accepted-Assigned,2012-03-15 19:56:17
3,1,Accepted-In Progress,2012-03-15 20:09:05
4,1,Completed-Closed,2012-03-15 20:11:33


In [14]:
# Re-check the clean durations that the mutations below start from.
duration_df.head(10)

Unnamed: 0,CaseID,Activity,CompleteTimestamp,Duration,CumDuration
0,1,Queued-Awaiting Assignment,2006-01-11 23:49:42,0.0,0.0
1,1,Accepted-In Progress,2012-03-15 19:53:52,194817850.0,194817850.0
2,1,Accepted-Assigned,2012-03-15 19:56:17,145.0,194817995.0
3,1,Accepted-In Progress,2012-03-15 20:09:05,768.0,194818763.0
4,1,Completed-Closed,2012-03-15 20:11:33,148.0,194818911.0
5,2,Accepted-In Progress,2006-11-07 18:00:36,0.0,0.0
6,2,Accepted-In Progress,2006-11-07 21:05:44,11108.0,11108.0
7,2,Accepted-Wait,2009-12-02 22:24:32,96859128.0,96870236.0
8,2,Accepted-In Progress,2011-09-03 14:09:09,55266277.0,152136513.0
9,2,Accepted-In Progress,2012-01-20 18:23:24,12024855.0,164161368.0


In [15]:
# Anomaly budget: rows * (columns - 1) * anomaly_pct values in total.
# Half of the budget goes to activities; the remainder (the extra one when the
# total is odd) goes to time.
row_count, col_count = data.shape
anomaly_num = int(row_count*(col_count-1)*args.anomaly_pct)
anomalous_act_num = anomaly_num // 2
anomalous_time_num = anomaly_num - anomalous_act_num

print('Number of anomalous values: {}'.format(anomaly_num))
print('Number of anomalous activities: {}'.format(anomalous_act_num))
print('Number of anomalous time: {}'.format(anomalous_time_num))

Number of anomalous values: 1332
Number of anomalous activities: 666
Number of anomalous time: 666


## Activity

**Mutation:**
- Replace an activity by another

In [16]:
# Working frame for the activity mutation: the original activity, a copy that
# will receive the replacements, and a 0/1 flag (0 = untouched).
temp_act_df = duration_df[['Activity']].copy()
temp_act_df['AnomalousActivity'] = temp_act_df['Activity'].copy()
temp_act_df['ActivityLabel'] = 0

In [17]:
# Before mutation: AnomalousActivity mirrors Activity and every ActivityLabel is 0.
temp_act_df.head()

Unnamed: 0,Activity,ActivityLabel,AnomalousActivity
0,Queued-Awaiting Assignment,0,Queued-Awaiting Assignment
1,Accepted-In Progress,0,Accepted-In Progress
2,Accepted-Assigned,0,Accepted-Assigned
3,Accepted-In Progress,0,Accepted-In Progress
4,Completed-Closed,0,Completed-Closed


In [18]:
# Pick the rows to corrupt in one draw, without replacement. This replaces the
# retry loop, which re-scanned a growing list on every draw and could never
# finish if more anomalies were requested than there are rows; np.random.choice
# raises ValueError in that case instead. Sampling the index labels themselves
# also keeps the .loc lookups valid for any index, not only 0..n-1.
anomalous_act_index = list(np.random.choice(temp_act_df.index,
                                            size=anomalous_act_num,
                                            replace=False))

for row in anomalous_act_index:
    act = temp_act_df.loc[row, 'Activity']
    # Any activity except the true one is a valid replacement.
    anomalous_act_list = [i for i in act_list if i != act]
    idx = np.random.randint(0, len(anomalous_act_list))
    temp_act_df.loc[row, 'AnomalousActivity'] = anomalous_act_list[idx]
    temp_act_df.loc[row, 'ActivityLabel'] = 1

In [19]:
# Inspect the first 50 rows after mutation; rows with ActivityLabel == 1 had their activity replaced.
temp_act_df.head(50)

Unnamed: 0,Activity,ActivityLabel,AnomalousActivity
0,Queued-Awaiting Assignment,0,Queued-Awaiting Assignment
1,Accepted-In Progress,0,Accepted-In Progress
2,Accepted-Assigned,0,Accepted-Assigned
3,Accepted-In Progress,0,Accepted-In Progress
4,Completed-Closed,0,Completed-Closed
5,Accepted-In Progress,0,Accepted-In Progress
6,Accepted-In Progress,0,Accepted-In Progress
7,Accepted-Wait,0,Accepted-Wait
8,Accepted-In Progress,0,Accepted-In Progress
9,Accepted-In Progress,0,Accepted-In Progress


```python
#swap 2 activity within a case
groupByCase = duration_df.groupby(['CaseID'])

anomalous_act_index = []
caseid_list = []
temp_df = duration_df.copy()
temp_df['AnomalousActivity'] = temp_df['Activity'].copy()
temp_df['ActivityLabel'] = 0

while len(anomalous_act_index) < anomalous_act_num:
    caseid = np.random.randint(1, len(groupByCase))
    if caseid not in caseid_list:
        group = groupByCase.get_group(caseid)
        row1 = np.random.randint(0, group.shape[0])
        row2 = np.random.randint(0, group.shape[0])
        index1 = group.index.values[row1]
        index2 = group.index.values[row2]
        act1 = duration_df['Activity'].iloc[index1]
        act2 = duration_df['Activity'].iloc[index2]
        if act1 != act2:
            anomalous_act_index.append(index1)
            anomalous_act_index.append(index2)
            temp_df['AnomalousActivity'].iloc[index1] = act2
            temp_df['AnomalousActivity'].iloc[index2] = act1
            temp_df['ActivityLabel'].iloc[index1] = 1
            temp_df['ActivityLabel'].iloc[index2] = 1
            
temp_act = temp_df[['AnomalousActivity', 'ActivityLabel']]
```

In [20]:
# Keep only the (possibly replaced) activity and its 0/1 anomaly flag for the final join.
temp_act = temp_act_df[['AnomalousActivity', 'ActivityLabel']]

In [21]:
# Preview the activity columns that go into the final frame.
temp_act.head()

Unnamed: 0,AnomalousActivity,ActivityLabel
0,Queued-Awaiting Assignment,0
1,Accepted-In Progress,0
2,Accepted-Assigned,0
3,Accepted-In Progress,0
4,Completed-Closed,0


## Time

**Mutation:**
- Extreme duration

In [22]:
# Working frame for the time mutation: a copy of duration_df plus a duplicate of
# Duration that will receive the extreme values and a 0/1 flag (0 = untouched).
temp_time_df = duration_df.assign(
    AnomalousDuration=duration_df['Duration'].copy(),
    TimeLabel=0,
)

In [23]:
# Before mutation: AnomalousDuration mirrors Duration and every TimeLabel is 0.
temp_time_df.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,Duration,CumDuration,AnomalousDuration,TimeLabel
0,1,Queued-Awaiting Assignment,2006-01-11 23:49:42,0.0,0.0,0.0,0
1,1,Accepted-In Progress,2012-03-15 19:53:52,194817850.0,194817850.0,194817850.0,0
2,1,Accepted-Assigned,2012-03-15 19:56:17,145.0,194817995.0,145.0,0
3,1,Accepted-In Progress,2012-03-15 20:09:05,768.0,194818763.0,768.0,0
4,1,Completed-Closed,2012-03-15 20:11:33,148.0,194818911.0,148.0,0


In [24]:
#get anomalous duration
anomalous_time_index = []

while len(anomalous_time_index) < anomalous_time_num:
    row = np.random.randint(0, temp_time_df.shape[0])
    if row not in anomalous_time_index:
        anomalous_time_index.append(row)
        act = temp_time_df.loc[row, 'Activity']
        if act != 'A_SUBMITTED-COMPLETE':
            anomalous_value = (np.random.random_sample() + 1)*(statistics_storage[act]['mean'] + statistics_storage[act]['std'])
            temp_time_df.loc[row, 'AnomalousDuration'] = anomalous_value
            temp_time_df.loc[row, 'TimeLabel'] = 1

In [25]:
# Inspect the frame after mutation; rows with TimeLabel == 1 carry an injected AnomalousDuration.
temp_time_df.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,Duration,CumDuration,AnomalousDuration,TimeLabel
0,1,Queued-Awaiting Assignment,2006-01-11 23:49:42,0.0,0.0,0.0,0
1,1,Accepted-In Progress,2012-03-15 19:53:52,194817850.0,194817850.0,194817850.0,0
2,1,Accepted-Assigned,2012-03-15 19:56:17,145.0,194817995.0,145.0,0
3,1,Accepted-In Progress,2012-03-15 20:09:05,768.0,194818763.0,768.0,0
4,1,Completed-Closed,2012-03-15 20:11:33,148.0,194818911.0,148.0,0


In [26]:
#get anomalous cumulative duration
# For every case that contains at least one time anomaly, rebuild each event's
# timestamp (from the second event on) as
#     original timestamp of the previous event + AnomalousDuration of this event.
# Cases without anomalies keep their original timestamps.
#
# The per-row chained assignment (group[col].iloc[i] = ...) is replaced by one
# positional assignment on a Series owned by this cell, pd.to_timedelta removes
# the need for a `timedelta` name that this notebook never imports explicitly,
# and the groups are concatenated once instead of using DataFrame.append.
groupByCase = temp_time_df.groupby(['CaseID'])
anomalous_groups = []

for case, group in groupByCase:
    group = group.copy()
    anomalous_timestamp = group['CompleteTimestamp'].copy()
    if group['TimeLabel'].sum() > 0:
        previous_timestamp = group['CompleteTimestamp'].shift(1)
        offset = pd.to_timedelta(group['AnomalousDuration'].astype(float), unit='s')
        recomputed = previous_timestamp + offset
        # The first event has no predecessor and keeps its own timestamp.
        anomalous_timestamp.iloc[1:] = recomputed.iloc[1:].values
    group['AnomalousCompleteTimestamp'] = anomalous_timestamp
    anomalous_groups.append(group)

temp_cum_time_df = pd.concat(anomalous_groups)

In [27]:
# Compare CompleteTimestamp with the rebuilt AnomalousCompleteTimestamp for the first 50 events.
temp_cum_time_df.head(50)

Unnamed: 0,CaseID,Activity,CompleteTimestamp,Duration,CumDuration,AnomalousDuration,TimeLabel,AnomalousCompleteTimestamp
0,1,Queued-Awaiting Assignment,2006-01-11 23:49:42,0.0,0.0,0.0,0,2006-01-11 23:49:42.000000
1,1,Accepted-In Progress,2012-03-15 19:53:52,194817850.0,194817850.0,194817800.0,0,2012-03-15 19:53:52.000000
2,1,Accepted-Assigned,2012-03-15 19:56:17,145.0,194817995.0,145.0,0,2012-03-15 19:56:17.000000
3,1,Accepted-In Progress,2012-03-15 20:09:05,768.0,194818763.0,768.0,0,2012-03-15 20:09:05.000000
4,1,Completed-Closed,2012-03-15 20:11:33,148.0,194818911.0,148.0,0,2012-03-15 20:11:33.000000
5,2,Accepted-In Progress,2006-11-07 18:00:36,0.0,0.0,0.0,0,2006-11-07 18:00:36.000000
6,2,Accepted-In Progress,2006-11-07 21:05:44,11108.0,11108.0,11108.0,0,2006-11-07 21:05:44.000000
7,2,Accepted-Wait,2009-12-02 22:24:32,96859128.0,96870236.0,96859130.0,0,2009-12-02 22:24:32.000000
8,2,Accepted-In Progress,2011-09-03 14:09:09,55266277.0,152136513.0,55266280.0,0,2011-09-03 14:09:09.000000
9,2,Accepted-In Progress,2012-01-20 18:23:24,12024855.0,164161368.0,12024860.0,0,2012-01-20 18:23:24.000000


In [28]:
# Recompute the cumulative duration per case from the anomalous timestamps.
# calculateAnomalousCumDuration and convert2seconds come from the star-imported
# utils module. Results are collected in a list and concatenated once, because
# DataFrame.append copies the frame on every call and no longer exists in pandas >= 2.0.
groupByCase = temp_cum_time_df.groupby(['CaseID'])
cum_duration_groups = []

for case, group in groupByCase:
    group = calculateAnomalousCumDuration(group)
    group['AnomalousCumDuration'] = group['AnomalousCumDuration'].apply(convert2seconds)
    cum_duration_groups.append(group)

temp_time = pd.concat(cum_duration_groups)

In [29]:
# Preview the time columns, now including AnomalousCumDuration.
temp_time.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,Duration,CumDuration,AnomalousDuration,TimeLabel,AnomalousCompleteTimestamp,AnomalousCumDuration
0,1,Queued-Awaiting Assignment,2006-01-11 23:49:42,0.0,0.0,0.0,0,2006-01-11 23:49:42,0.0
1,1,Accepted-In Progress,2012-03-15 19:53:52,194817850.0,194817850.0,194817850.0,0,2012-03-15 19:53:52,194817850.0
2,1,Accepted-Assigned,2012-03-15 19:56:17,145.0,194817995.0,145.0,0,2012-03-15 19:56:17,194817995.0
3,1,Accepted-In Progress,2012-03-15 20:09:05,768.0,194818763.0,768.0,0,2012-03-15 20:09:05,194818763.0
4,1,Completed-Closed,2012-03-15 20:11:33,148.0,194818911.0,148.0,0,2012-03-15 20:11:33,194818911.0


## Get full df

In [30]:
# Column-wise concat of the time and activity mutations; pandas aligns the two
# frames on their index labels (both were derived from duration_df).
full_df = pd.concat([temp_time, temp_act], axis=1)

In [31]:
# Preview the combined frame: clean columns, anomalous columns and both labels.
full_df.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,Duration,CumDuration,AnomalousDuration,TimeLabel,AnomalousCompleteTimestamp,AnomalousCumDuration,AnomalousActivity,ActivityLabel
0,1,Queued-Awaiting Assignment,2006-01-11 23:49:42,0.0,0.0,0.0,0,2006-01-11 23:49:42,0.0,Queued-Awaiting Assignment,0
1,1,Accepted-In Progress,2012-03-15 19:53:52,194817850.0,194817850.0,194817850.0,0,2012-03-15 19:53:52,194817850.0,Accepted-In Progress,0
2,1,Accepted-Assigned,2012-03-15 19:56:17,145.0,194817995.0,145.0,0,2012-03-15 19:56:17,194817995.0,Accepted-Assigned,0
3,1,Accepted-In Progress,2012-03-15 20:09:05,768.0,194818763.0,768.0,0,2012-03-15 20:09:05,194818763.0,Accepted-In Progress,0
4,1,Completed-Closed,2012-03-15 20:11:33,148.0,194818911.0,148.0,0,2012-03-15 20:11:33,194818911.0,Completed-Closed,0


In [32]:
# Clean view: the original attributes of every event.
normal_cols = ['CaseID', 'Activity', 'CompleteTimestamp', 'Duration', 'CumDuration']
# Corrupted view: the mutated attributes plus the two 0/1 anomaly labels.
anomalous_cols = ['CaseID', 'AnomalousActivity', 'AnomalousCompleteTimestamp',
                  'AnomalousDuration', 'AnomalousCumDuration',
                  'ActivityLabel', 'TimeLabel']

normal_df = full_df[normal_cols]
anomalous_df = full_df[anomalous_cols]

In [33]:
print('Saving dataframes...')
# File names embed anomaly_pct. The 'anomolous' spelling is kept as is, because
# other code may open the file under exactly this name.
normal_df_name = os.path.join(args.input_dir, 'normal_df_{}.csv'.format(args.anomaly_pct))
anomalous_df_name = os.path.join(args.input_dir, 'anomolous_df_{}.csv'.format(args.anomaly_pct))

for frame, target in ((normal_df, normal_df_name), (anomalous_df, anomalous_df_name)):
    frame.to_csv(target, index=False)
print('Done!')

Saving dataframes...
Done!


# Preprocess data

In [34]:
groupByCase = anomalous_df.groupby(['CaseID'])

# Split by case: args.train_pct of the cases for training, args.val_pct for
# validation, and whatever remains for testing (0.6 / 0.2 / rest with the
# values configured at the top of this notebook).
case_num = groupByCase.ngroups
train_case_num = int(case_num*args.train_pct)
val_case_num = int(case_num*args.val_pct)
test_case_num = case_num - train_case_num - val_case_num

In [35]:
# Assign whole cases to train / val / test by their position in the (sorted)
# group order rather than by comparing the CaseID value with the split sizes.
# The value comparison is only right when the IDs are exactly 1..N, and it
# disagrees with the positional slicing applied to the labels and the
# vectorized tensors further down. For IDs 1..N both rules give the same split.
split_frames = {'train': [], 'val': [], 'test': []}

for position, (caseid, data_case) in enumerate(groupByCase):
    if position < train_case_num:
        split_frames['train'].append(data_case)
    elif position < train_case_num + val_case_num:
        split_frames['val'].append(data_case)
    else:
        split_frames['test'].append(data_case)

# pd.concat rejects an empty list, so an empty split falls back to an empty
# frame with the right columns.
empty_split = pd.DataFrame(columns=list(anomalous_df))
anomalous_df_train = pd.concat(split_frames['train']) if split_frames['train'] else empty_split.copy()
anomalous_df_val = pd.concat(split_frames['val']) if split_frames['val'] else empty_split.copy()
anomalous_df_test = pd.concat(split_frames['test']) if split_frames['test'] else empty_split.copy()

In [36]:
# The three splits must partition anomalous_df: their row counts have to add up to the total.
print('Checking shapes of sub data: ', anomalous_df.shape[0] == anomalous_df_train.shape[0] + anomalous_df_val.shape[0] + anomalous_df_test.shape[0])

Checking shapes of sub data:  True


In [37]:
# Number of events (rows) per split; the label vectors below are sliced with these.
train_row_num = len(anomalous_df_train)
val_row_num = len(anomalous_df_val)
test_row_num = len(anomalous_df_test)

print('Number of rows for training: {}'.format(train_row_num))
print('Number of rows for val: {}'.format(val_row_num))
print('Number of rows for testing: {}'.format(test_row_num))

Number of rows for training: 4493
Number of rows for val: 1096
Number of rows for testing: 1071


In [38]:
# Count the injected anomalies per split once, then report them.
train_act_count = anomalous_df_train['ActivityLabel'].sum()
train_time_count = anomalous_df_train['TimeLabel'].sum()
val_act_count = anomalous_df_val['ActivityLabel'].sum()
val_time_count = anomalous_df_val['TimeLabel'].sum()
test_act_count = anomalous_df_test['ActivityLabel'].sum()
test_time_count = anomalous_df_test['TimeLabel'].sum()

print('Number of anomalous values in train set: {}'.format(train_act_count + train_time_count))
print('Number of anomalous activities in train set: {}'.format(train_act_count))
print('Number of anomalous time in train set: {}'.format(train_time_count))
print('\n')
print('Number of anomalous values in validate set: {}'.format(val_act_count + val_time_count))
print('Number of anomalous activities in validate set: {}'.format(val_act_count))
print('Number of anomalous time in validate set: {}'.format(val_time_count))
print('\n')
print('Number of anomalous values in test set: {}'.format(test_act_count + test_time_count))
print('Number of anomalous activities in test set: {}'.format(test_act_count))
print('Number of anomalous time in test set: {}'.format(test_time_count))

Number of anomalous values in train set: 922
Number of anomalous activities in train set: 469
Number of anomalous time in train set: 453


Number of anomalous values in validate set: 184
Number of anomalous activities in validate set: 86
Number of anomalous time in validate set: 98


Number of anomalous values in test set: 226
Number of anomalous activities in test set: 111
Number of anomalous time in test set: 115


In [39]:
# NOTE(review): scratch arithmetic on hard-coded numbers. They do not match the
# per-split totals in the saved output of the previous cell (922 / 184 / 226), so
# this looks like a leftover from an earlier run; confirm before relying on it.
(848+226+232)/2

653.0

# Prepare input

In [40]:
# Preview the corrupted event log that the model inputs are built from.
anomalous_df.head()

Unnamed: 0,CaseID,AnomalousActivity,AnomalousCompleteTimestamp,AnomalousDuration,AnomalousCumDuration,ActivityLabel,TimeLabel
0,1,Queued-Awaiting Assignment,2006-01-11 23:49:42,0.0,0.0,0,0
1,1,Accepted-In Progress,2012-03-15 19:53:52,194817850.0,194817850.0,0,0
2,1,Accepted-Assigned,2012-03-15 19:56:17,145.0,194817995.0,0,0
3,1,Accepted-In Progress,2012-03-15 20:09:05,768.0,194818763.0,0,0
4,1,Completed-Closed,2012-03-15 20:11:33,148.0,194818911.0,0,0


## Labels

In [41]:
# Per-event 0/1 ground-truth flags: 1 where the activity / the duration was mutated.
activity_label = anomalous_df['ActivityLabel']
time_label = anomalous_df['TimeLabel']

In [42]:
# Slice the label vectors by row position into consecutive train / val / test
# chunks, using the row counts of the per-case splits.
val_row_end = train_row_num + val_row_num

activity_label_train = activity_label.iloc[:train_row_num]
activity_label_val = activity_label.iloc[train_row_num:val_row_end]
activity_label_test = activity_label.iloc[-test_row_num:]

time_label_train = time_label.iloc[:train_row_num]
time_label_val = time_label.iloc[train_row_num:val_row_end]
time_label_test = time_label.iloc[-test_row_num:]

In [43]:
# Sanity check: should equal test_row_num.
len(time_label_test)

1071

In [44]:
# Model features: the case key plus the (possibly mutated) activity and cumulative duration.
anomaly = anomalous_df[['CaseID', 'AnomalousActivity', 'AnomalousCumDuration']]

## Activity

In [45]:
# Columns handed to OHE in the next cell.
cat_var = ['AnomalousActivity']

In [46]:
# OHE comes from the star-imported utils module. Judging by the preview in the
# next cell it replaces each listed column with one 0/1 indicator column per
# value -- TODO confirm against utils.
enc_data = OHE(anomaly, cat_var)

In [47]:
# Preview the encoded frame.
enc_data.head()

Unnamed: 0,CaseID,AnomalousCumDuration,AnomalousActivity_Accepted-Assigned,AnomalousActivity_Accepted-In Progress,AnomalousActivity_Accepted-Wait,AnomalousActivity_Completed-Cancelled,AnomalousActivity_Completed-Closed,AnomalousActivity_Queued-Awaiting Assignment,AnomalousActivity_Unmatched-Unmatched
0,1,0.0,0,0,0,0,0,1,0
1,1,194817850.0,0,1,0,0,0,0,0
2,1,194817995.0,1,0,0,0,0,0,0
3,1,194818763.0,0,1,0,0,0,0,0
4,1,194818911.0,0,0,0,0,1,0,0


## Time

In [48]:
# Range of the cumulative duration, taken from the training rows only.
train_cum_duration = enc_data['AnomalousCumDuration'].iloc[:train_row_num]
min_value = train_cum_duration.min()
max_value = train_cum_duration.max()

In [49]:
# Report the range computed from the training rows.
print('Min used for normalization: {}'.format(min_value))
print('Max used for normalization: {}'.format(max_value))

Min used for normalization: 0.0
Max used for normalization: 194818911.0


In [50]:
# Mean and population standard deviation (ddof=0, as np.std computes it) of the
# cumulative duration, taken from the training rows only.
train_cum_duration = enc_data['AnomalousCumDuration'].iloc[:train_row_num]
mean_value = train_cum_duration.mean()
std_value = train_cum_duration.std(ddof=0)

In [51]:
# Report the moments computed from the training rows.
print('Mean used for standardization: {}'.format(mean_value))
print('STD used for standardization: {}'.format(std_value))

Mean used for standardization: 12756580.9444277
STD used for standardization: 20330301.773291893


In [52]:
# Scale every row (train, val and test) with the statistics of the training rows:
# min-max normalization and z-score standardization side by side.
cum_duration = enc_data['AnomalousCumDuration']
enc_data['NormalizedCumDuration'] = (cum_duration - min_value)/(max_value - min_value)
enc_data['StandardizedCumDuration'] = (cum_duration - mean_value)/(std_value)

In [53]:
# Preview both scaled variants next to the raw cumulative duration.
enc_data.head()

Unnamed: 0,CaseID,AnomalousCumDuration,AnomalousActivity_Accepted-Assigned,AnomalousActivity_Accepted-In Progress,AnomalousActivity_Accepted-Wait,AnomalousActivity_Completed-Cancelled,AnomalousActivity_Completed-Closed,AnomalousActivity_Queued-Awaiting Assignment,AnomalousActivity_Unmatched-Unmatched,NormalizedCumDuration,StandardizedCumDuration
0,1,0.0,0,0,0,0,0,1,0,0.0,-0.627466
1,1,194817850.0,0,1,0,0,0,0,0,0.999995,8.955168
2,1,194817995.0,1,0,0,0,0,0,0,0.999995,8.955175
3,1,194818763.0,0,1,0,0,0,0,0,0.999999,8.955213
4,1,194818911.0,0,0,0,0,1,0,0,1.0,8.95522


In [54]:
# Keep exactly one scaled duration column, chosen by args.scaler, and drop the
# raw duration together with the other scaled variant.
if args.scaler == 'standardization':
    scaled_enc_data = enc_data.drop(['AnomalousCumDuration', 'NormalizedCumDuration'], axis=1)
elif args.scaler == 'normalization':
    scaled_enc_data = enc_data.drop(['AnomalousCumDuration', 'StandardizedCumDuration'], axis=1)
else:
    # Fail here with a clear message; otherwise scaled_enc_data would stay
    # undefined and a later cell would raise a NameError.
    raise ValueError("Unknown scaler '{}': expected 'standardization' or 'normalization'".format(args.scaler))

In [55]:
# Preview the frame with the single scaled duration column that was kept.
scaled_enc_data.head()

Unnamed: 0,CaseID,AnomalousActivity_Accepted-Assigned,AnomalousActivity_Accepted-In Progress,AnomalousActivity_Accepted-Wait,AnomalousActivity_Completed-Cancelled,AnomalousActivity_Completed-Closed,AnomalousActivity_Queued-Awaiting Assignment,AnomalousActivity_Unmatched-Unmatched,StandardizedCumDuration
0,1,0,0,0,0,0,1,0,-0.627466
1,1,0,1,0,0,0,0,0,8.955168
2,1,1,0,0,0,0,0,0,8.955175
3,1,0,1,0,0,0,0,0,8.955213
4,1,0,0,0,0,1,0,0,8.95522


## 0-padding

In [56]:
#re arrange cols
# New order: CaseID, the last column, then the columns that were in between.
# NOTE(review): this relies on CaseID being the first column and the scaled
# duration being the last one, as in the preview of the previous cell.
cols = list(scaled_enc_data)
cols = ['CaseID', cols[-1]] + cols[1:-1]
scaled_enc_data = scaled_enc_data[cols]

In [57]:
# Preview the reordered frame.
scaled_enc_data.head()

Unnamed: 0,CaseID,StandardizedCumDuration,AnomalousActivity_Accepted-Assigned,AnomalousActivity_Accepted-In Progress,AnomalousActivity_Accepted-Wait,AnomalousActivity_Completed-Cancelled,AnomalousActivity_Completed-Closed,AnomalousActivity_Queued-Awaiting Assignment,AnomalousActivity_Unmatched-Unmatched
0,1,-0.627466,0,0,0,0,0,1,0
1,1,8.955168,0,1,0,0,0,0,0
2,1,8.955175,1,0,0,0,0,0,0
3,1,8.955213,0,1,0,0,0,0,0
4,1,8.95522,0,0,0,0,1,0,0


In [58]:
# Reference values for the last test_row_num rows: column 1 (the scaled duration
# after the reordering above) and columns 2+ (the activity indicator columns).
true_time = scaled_enc_data.iloc[-test_row_num:, 1]
true_act = scaled_enc_data.iloc[-test_row_num:, 2:]

In [59]:
# The same two selections over every row instead of only the test rows.
full_true_time = scaled_enc_data.iloc[:, 1]
full_true_act = scaled_enc_data.iloc[:, 2:]

In [60]:
# Feature columns: everything except the CaseID grouping key. Note that this
# rebinds `cols`, which earlier cells used for other column lists.
cols = [i for i in list(scaled_enc_data) if i != 'CaseID']
cols

['StandardizedCumDuration',
 'AnomalousActivity_Accepted-Assigned',
 'AnomalousActivity_Accepted-In Progress',
 'AnomalousActivity_Accepted-Wait',
 'AnomalousActivity_Completed-Cancelled',
 'AnomalousActivity_Completed-Closed',
 'AnomalousActivity_Queued-Awaiting Assignment',
 'AnomalousActivity_Unmatched-Unmatched']

In [61]:
# Frame with the same layout as scaled_enc_data in which every feature value is 1.0.
# NOTE(review): presumably used, after getInput pads the sequences, to tell real
# steps from padded ones -- confirm against utils.getInput.
pad_index = scaled_enc_data.copy()
pad_index[cols] = 1.0

In [62]:
# Preview the mask frame: CaseID unchanged, every feature column set to 1.0.
pad_index.head()

Unnamed: 0,CaseID,StandardizedCumDuration,AnomalousActivity_Accepted-Assigned,AnomalousActivity_Accepted-In Progress,AnomalousActivity_Accepted-Wait,AnomalousActivity_Completed-Cancelled,AnomalousActivity_Completed-Closed,AnomalousActivity_Queued-Awaiting Assignment,AnomalousActivity_Unmatched-Unmatched
0,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Vectorize

In [63]:
groupByCase = scaled_enc_data.groupby(['CaseID'])

# findLongestLength comes from the star-imported utils module; presumably it
# returns the largest number of events in any case -- TODO confirm.
maxlen = findLongestLength(groupByCase)
print('Maxlen: ', maxlen)

Maxlen:  35


In [64]:
# getInput comes from the star-imported utils module. Judging by the shapes
# printed in the next cell it returns an array of shape
# (cases, maxlen, len(cols)) -- TODO confirm how shorter cases are padded.
vectorized_data = getInput(groupByCase, cols, maxlen)

# Vectorize the all-ones frame the same way, so it has the same shape as the data.
pad_index_groupByCase = pad_index.groupby(['CaseID'])
vectorized_pad_index = getInput(pad_index_groupByCase, cols, maxlen)

# Split into train/val/test

In [65]:
# Report the tensor shapes and the number of cases assigned to each split.
print('Shape of vectorized data: {}'.format(vectorized_data.shape))
print('Shape of vectorized pad index: {}'.format(vectorized_pad_index.shape))
print('\n')
print('Number of case for train: {}'.format(train_case_num))
print('Number of case for validate: {}'.format(val_case_num))
print('Number of case for test: {}'.format(test_case_num))

Shape of vectorized data: (1487, 35, 8)
Shape of vectorized pad index: (1487, 35, 8)


Number of case for train: 892
Number of case for validate: 297
Number of case for test: 298


In [66]:
# Slice both tensors along the case axis into consecutive train / val / test chunks.
val_case_end = train_case_num + val_case_num

input_train = vectorized_data[:train_case_num]
input_val = vectorized_data[train_case_num:val_case_end]
input_test = vectorized_data[-test_case_num:]

pad_index_train = vectorized_pad_index[:train_case_num]
pad_index_val = vectorized_pad_index[train_case_num:val_case_end]
pad_index_test = vectorized_pad_index[-test_case_num:]

In [67]:
# Each chunk must contain exactly as many cases as its split size.
print('Check shape of input for training: {}'.format(input_train.shape[0]==train_case_num))
print('Check shape of input for validation: {}'.format(input_val.shape[0]==val_case_num))
print('Check shape of input for testing: {}'.format(input_test.shape[0]==test_case_num))

Check shape of input for training: True
Check shape of input for validation: True
Check shape of input for testing: True


# Save data

In [68]:
preprocessed_data_name = os.path.join(args.input_dir, 'preprocessed_data_{}.pkl'.format(args.anomaly_pct))

# The objects are pickled back-to-back into a single file, so a reader has to
# call pickle.load once per object, in exactly this order.
objects_to_save = (
    input_train, input_val, input_test,
    pad_index_train, pad_index_val, pad_index_test,
    activity_label_test, time_label_test,
    train_case_num, val_case_num, test_case_num,
    train_row_num, val_row_num, test_row_num,
    min_value, max_value, mean_value, std_value,
    cols,
    statistics_storage,
    true_time, true_act,
    full_true_time, full_true_act,
)

with open(preprocessed_data_name, 'wb') as f:
    for obj in objects_to_save:
        pickle.dump(obj, f, protocol=2)