In [1]:
import os, sys
import argparse
import pandas as pd
import numpy as np
import pickle

In [2]:
from dateutil.parser import parse
from datetime import datetime
import time
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
pd.options.mode.chained_assignment = None #to run loop quicker without warnings

In [3]:
# Dataset selector: the CSV file name and the output folder are derived from it.
name = 'bpi_2012'
#name = 'bpi_2013'

# Notebook-wide settings, exposed below as attributes (args.train_pct, ...).
args = {
    'data_dir': '../data/',
    'data_file': name + '.csv',
    'input_dir': '../input/{}/'.format(name),
    'train_pct': 0.6,              # fraction of cases used for training
    'val_pct': 0.2,                # fraction of cases used for validation; the rest is test
    'anomaly_pct': 0.1,            # multiplier applied to rows * (columns - 1) to get the anomaly count
    'scaler': 'standardization',   # 'standardization' or 'normalization'
    'seed': 42,                    # seed for the random anomaly injection
}

args = argparse.Namespace(**args)

# The anomaly-injection cells draw from NumPy's global RNG. Seeding it here
# makes the generated labels, CSV files and pickle reproducible across runs.
np.random.seed(args.seed)

In [4]:
# Create the output folders up front. exist_ok=True makes both calls safe to
# re-run and removes the gap between the isdir() check and the creation.
os.makedirs('../input/', exist_ok=True)
os.makedirs(args.input_dir, exist_ok=True)

In [5]:
sys.path.insert(0, './../utils/')
from utils import *

# Load data

In [6]:
# Only consider Case, Activity, Timestamp
cols = ['CaseID', 'Activity', 'CompleteTimestamp']

# read_csv returns the usecols columns in file order, not in the order listed,
# so the columns are renamed by name (and then ordered explicitly) instead of
# overwriting data.columns positionally.
data = pd.read_csv(args.data_dir + args.data_file, usecols=['Case ID', 'Activity', 'Complete Timestamp'])
data = data.rename(columns={'Case ID': 'CaseID', 'Complete Timestamp': 'CompleteTimestamp'})[cols]

# Keep the token after the first space of the case id and store it as a number
# (presumably the raw values look like 'Case 1' -- verify against the CSV).
data['CaseID'] = pd.to_numeric(data['CaseID'].apply(lambda x: x.split(' ')[1]))

# For Timestamp: convert to datetime; unparseable values become NaT
# (errors='coerce') instead of raising.
data['CompleteTimestamp'] = pd.to_datetime(data['CompleteTimestamp'], errors='coerce')

In [7]:
data.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp
0,1,A_SUBMITTED-COMPLETE,2011-10-01 07:38:44.546
1,1,A_PARTLYSUBMITTED-COMPLETE,2011-10-01 07:38:44.880
2,1,A_PREACCEPTED-COMPLETE,2011-10-01 07:39:37.906
3,1,W_Completeren aanvraag-SCHEDULE,2011-10-01 07:39:38.875
4,1,W_Completeren aanvraag-START,2011-10-01 18:36:46.437


In [8]:
#Calculate duration and cumulative duration
groupByCase = data.groupby(['CaseID'])

# Collect the per-case frames and concatenate once at the end. Growing a
# DataFrame with .append() inside the loop copies all accumulated rows on
# every iteration, and DataFrame.append no longer exists in pandas 2.x.
case_frames = []
for case, group in groupByCase:
    group = calculateDuration(group)
    group = calculateCumDuration(group)
    # The helpers come from utils; convert2seconds is applied to each value
    # of the two columns they add.
    group['Duration'] = group['Duration'].apply(convert2seconds)
    group['CumDuration'] = group['CumDuration'].apply(convert2seconds)
    case_frames.append(group)

if case_frames:
    duration_df = pd.concat(case_frames)
else:
    # No cases at all: keep an empty frame with the expected columns.
    duration_df = pd.DataFrame(columns=list(data)+['Duration', 'CumDuration'])

In [9]:
duration_df.head(10)

Unnamed: 0,CaseID,Activity,CompleteTimestamp,Duration,CumDuration
0,1,A_SUBMITTED-COMPLETE,2011-10-01 07:38:44.546,0.0,0.0
1,1,A_PARTLYSUBMITTED-COMPLETE,2011-10-01 07:38:44.880,0.334,0.334
2,1,A_PREACCEPTED-COMPLETE,2011-10-01 07:39:37.906,53.026,53.36
3,1,W_Completeren aanvraag-SCHEDULE,2011-10-01 07:39:38.875,0.969,54.329
4,1,W_Completeren aanvraag-START,2011-10-01 18:36:46.437,39427.562,39481.891
5,1,A_ACCEPTED-COMPLETE,2011-10-01 18:42:43.308,356.871,39838.762
6,1,O_SELECTED-COMPLETE,2011-10-01 18:45:09.243,145.935,39984.697
7,1,A_FINALIZED-COMPLETE,2011-10-01 18:45:09.243,0.0,39984.697
8,1,O_CREATED-COMPLETE,2011-10-01 18:45:11.197,1.954,39986.651
9,1,O_SENT-COMPLETE,2011-10-01 18:45:11.380,0.183,39986.834


In [10]:
#get statistics storage for activity
groupByActivity = duration_df.groupby(['Activity'])
statistics_storage = {}

for act, act_data in groupByActivity:
    act_storage = {}
    act_storage[act] = {}
    mean_value = act_data['Duration'].mean()
    std_value = act_data['Duration'].std()
    act_storage[act]['mean'] = mean_value
    act_storage[act]['std'] = std_value
    statistics_storage.update(act_storage)

In [11]:
print('Descriptive statistics: \n{}'.format(statistics_storage))

Descriptive statistics: 
{'A_REGISTERED-COMPLETE': {'mean': 209.03123152270703, 'std': 5203.8654100325557}, 'A_APPROVED-COMPLETE': {'mean': 222.82031923419413, 'std': 672.3596876633876}, 'W_Nabellen incomplete dossiers-SCHEDULE': {'mean': 1095.6721074276124, 'std': 989.6464247749368}, 'W_Beoordelen fraude-COMPLETE': {'mean': 546.15606296296301, 'std': 5785.1894701118226}, 'A_CANCELLED-COMPLETE': {'mean': 22143.708828998937, 'std': 188709.42073353639}, 'W_Nabellen offertes-COMPLETE': {'mean': 12720.037169959962, 'std': 115166.40029792135}, 'A_FINALIZED-COMPLETE': {'mean': 95.104058225324039, 'std': 160.52545167209442}, 'O_SELECTED-COMPLETE': {'mean': 134.31678620199148, 'std': 249.11607254600929}, 'O_DECLINED-COMPLETE': {'mean': 524.3738266832919, 'std': 799.24105124767698}, 'W_Afhandelen leads-SCHEDULE': {'mean': 27.566883462586464, 'std': 19.827023896666763}, 'O_CREATED-COMPLETE': {'mean': 4.2928933143669985, 'std': 11.577286082595871}, 'W_Beoordelen fraude-START': {'mean': 61002.1507

In [12]:
act_list = data['Activity'].unique()
print('Activity: {}'.format(act_list))

Activity: ['A_SUBMITTED-COMPLETE' 'A_PARTLYSUBMITTED-COMPLETE'
 'A_PREACCEPTED-COMPLETE' 'W_Completeren aanvraag-SCHEDULE'
 'W_Completeren aanvraag-START' 'A_ACCEPTED-COMPLETE' 'O_SELECTED-COMPLETE'
 'A_FINALIZED-COMPLETE' 'O_CREATED-COMPLETE' 'O_SENT-COMPLETE'
 'W_Nabellen offertes-SCHEDULE' 'W_Completeren aanvraag-COMPLETE'
 'W_Nabellen offertes-START' 'W_Nabellen offertes-COMPLETE'
 'O_SENT_BACK-COMPLETE' 'W_Valideren aanvraag-SCHEDULE'
 'W_Valideren aanvraag-START' 'A_REGISTERED-COMPLETE' 'A_APPROVED-COMPLETE'
 'O_ACCEPTED-COMPLETE' 'A_ACTIVATED-COMPLETE'
 'W_Valideren aanvraag-COMPLETE' 'O_CANCELLED-COMPLETE'
 'W_Wijzigen contractgegevens-SCHEDULE' 'A_DECLINED-COMPLETE'
 'A_CANCELLED-COMPLETE' 'W_Afhandelen leads-SCHEDULE'
 'W_Afhandelen leads-START' 'W_Afhandelen leads-COMPLETE'
 'O_DECLINED-COMPLETE' 'W_Nabellen incomplete dossiers-SCHEDULE'
 'W_Nabellen incomplete dossiers-START'
 'W_Nabellen incomplete dossiers-COMPLETE' 'W_Beoordelen fraude-SCHEDULE'
 'W_Beoordelen fraude-STA

# Introduce anomalous data

In [13]:
data.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp
0,1,A_SUBMITTED-COMPLETE,2011-10-01 07:38:44.546
1,1,A_PARTLYSUBMITTED-COMPLETE,2011-10-01 07:38:44.880
2,1,A_PREACCEPTED-COMPLETE,2011-10-01 07:39:37.906
3,1,W_Completeren aanvraag-SCHEDULE,2011-10-01 07:39:38.875
4,1,W_Completeren aanvraag-START,2011-10-01 18:36:46.437


In [14]:
duration_df.head(10)

Unnamed: 0,CaseID,Activity,CompleteTimestamp,Duration,CumDuration
0,1,A_SUBMITTED-COMPLETE,2011-10-01 07:38:44.546,0.0,0.0
1,1,A_PARTLYSUBMITTED-COMPLETE,2011-10-01 07:38:44.880,0.334,0.334
2,1,A_PREACCEPTED-COMPLETE,2011-10-01 07:39:37.906,53.026,53.36
3,1,W_Completeren aanvraag-SCHEDULE,2011-10-01 07:39:38.875,0.969,54.329
4,1,W_Completeren aanvraag-START,2011-10-01 18:36:46.437,39427.562,39481.891
5,1,A_ACCEPTED-COMPLETE,2011-10-01 18:42:43.308,356.871,39838.762
6,1,O_SELECTED-COMPLETE,2011-10-01 18:45:09.243,145.935,39984.697
7,1,A_FINALIZED-COMPLETE,2011-10-01 18:45:09.243,0.0,39984.697
8,1,O_CREATED-COMPLETE,2011-10-01 18:45:11.197,1.954,39986.651
9,1,O_SENT-COMPLETE,2011-10-01 18:45:11.380,0.183,39986.834


In [15]:
# Total anomaly budget: rows * (columns - 1) * anomaly_pct, truncated to an
# integer, then divided between activity and time mutations.
n_rows, n_cols = data.shape
anomaly_num = int(n_rows * (n_cols - 1) * args.anomaly_pct)

# Activities get the floor half; time gets whatever is left, so an odd budget
# gives the extra anomaly to time.
anomalous_act_num = anomaly_num // 2
anomalous_time_num = anomaly_num - anomalous_act_num

for message, count in (
    ('Number of anomalous values: {}', anomaly_num),
    ('Number of anomalous activities: {}', anomalous_act_num),
    ('Number of anomalous time: {}', anomalous_time_num),
):
    print(message.format(count))

Number of anomalous values: 52440
Number of anomalous activities: 26220
Number of anomalous time: 26220


## Activity

**Mutation:**
- Replace an activity by another

In [16]:
temp_act_df = pd.DataFrame({'Activity': duration_df['Activity'].copy(),
                            'AnomalousActivity': duration_df['Activity'].copy(),
                            'ActivityLabel': 0})

In [17]:
temp_act_df.head()

Unnamed: 0,Activity,ActivityLabel,AnomalousActivity
0,A_SUBMITTED-COMPLETE,0,A_SUBMITTED-COMPLETE
1,A_PARTLYSUBMITTED-COMPLETE,0,A_PARTLYSUBMITTED-COMPLETE
2,A_PREACCEPTED-COMPLETE,0,A_PREACCEPTED-COMPLETE
3,W_Completeren aanvraag-SCHEDULE,0,W_Completeren aanvraag-SCHEDULE
4,W_Completeren aanvraag-START,0,W_Completeren aanvraag-START


In [18]:
# Pick anomalous_act_num distinct rows at random and replace each one's
# activity with a different activity drawn from act_list.
total_rows = temp_act_df.shape[0]
if anomalous_act_num > total_rows:
    # The loop below can never finish if more distinct rows are requested
    # than exist.
    raise ValueError('Cannot mutate {} activities: only {} rows available'.format(
        anomalous_act_num, total_rows))

anomalous_act_index = []   # selected rows, in selection order
chosen_rows = set()        # same rows, for O(1) membership tests

while len(anomalous_act_index) < anomalous_act_num:
    row = np.random.randint(0, total_rows)
    # randint's upper bound is exclusive, so idx covers exactly the
    # len(act_list)-1 candidates left once the row's own activity is removed.
    idx = np.random.randint(0, len(act_list)-1)
    if row in chosen_rows:
        continue
    chosen_rows.add(row)
    anomalous_act_index.append(row)
    act = temp_act_df.loc[row, 'Activity']
    anomalous_act_list = [i for i in act_list if i != act]
    anomalous_act = anomalous_act_list[idx]
    temp_act_df.loc[row, 'AnomalousActivity'] = anomalous_act
    temp_act_df.loc[row, 'ActivityLabel'] = 1

In [19]:
temp_act_df.head(50)

Unnamed: 0,Activity,ActivityLabel,AnomalousActivity
0,A_SUBMITTED-COMPLETE,0,A_SUBMITTED-COMPLETE
1,A_PARTLYSUBMITTED-COMPLETE,0,A_PARTLYSUBMITTED-COMPLETE
2,A_PREACCEPTED-COMPLETE,1,A_ACTIVATED-COMPLETE
3,W_Completeren aanvraag-SCHEDULE,0,W_Completeren aanvraag-SCHEDULE
4,W_Completeren aanvraag-START,0,W_Completeren aanvraag-START
5,A_ACCEPTED-COMPLETE,0,A_ACCEPTED-COMPLETE
6,O_SELECTED-COMPLETE,0,O_SELECTED-COMPLETE
7,A_FINALIZED-COMPLETE,0,A_FINALIZED-COMPLETE
8,O_CREATED-COMPLETE,0,O_CREATED-COMPLETE
9,O_SENT-COMPLETE,0,O_SENT-COMPLETE


```python
#swap 2 activity within a case
groupByCase = duration_df.groupby(['CaseID'])

anomalous_act_index = []
caseid_list = []
temp_df = duration_df.copy()
temp_df['AnomalousActivity'] = temp_df['Activity'].copy()
temp_df['ActivityLabel'] = 0

while len(anomalous_act_index) < anomalous_act_num:
    caseid = np.random.randint(1, len(groupByCase))
    if caseid not in caseid_list:
        group = groupByCase.get_group(caseid)
        row1 = np.random.randint(0, group.shape[0])
        row2 = np.random.randint(0, group.shape[0])
        index1 = group.index.values[row1]
        index2 = group.index.values[row2]
        act1 = duration_df['Activity'].iloc[index1]
        act2 = duration_df['Activity'].iloc[index2]
        if act1 != act2:
            anomalous_act_index.append(index1)
            anomalous_act_index.append(index2)
            temp_df['AnomalousActivity'].iloc[index1] = act2
            temp_df['AnomalousActivity'].iloc[index2] = act1
            temp_df['ActivityLabel'].iloc[index1] = 1
            temp_df['ActivityLabel'].iloc[index2] = 1
            
temp_act = temp_df[['AnomalousActivity', 'ActivityLabel']]
```

In [20]:
temp_act = temp_act_df[['AnomalousActivity', 'ActivityLabel']]

In [21]:
temp_act.head()

Unnamed: 0,AnomalousActivity,ActivityLabel
0,A_SUBMITTED-COMPLETE,0
1,A_PARTLYSUBMITTED-COMPLETE,0
2,A_ACTIVATED-COMPLETE,1
3,W_Completeren aanvraag-SCHEDULE,0
4,W_Completeren aanvraag-START,0


## Time

**Mutation:**
- Extreme duration

In [22]:
temp_time_df = duration_df.copy()
temp_time_df['AnomalousDuration'] = temp_time_df['Duration'].copy()
temp_time_df['TimeLabel'] = 0

In [23]:
temp_time_df.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,Duration,CumDuration,AnomalousDuration,TimeLabel
0,1,A_SUBMITTED-COMPLETE,2011-10-01 07:38:44.546,0.0,0.0,0.0,0
1,1,A_PARTLYSUBMITTED-COMPLETE,2011-10-01 07:38:44.880,0.334,0.334,0.334,0
2,1,A_PREACCEPTED-COMPLETE,2011-10-01 07:39:37.906,53.026,53.36,53.026,0
3,1,W_Completeren aanvraag-SCHEDULE,2011-10-01 07:39:38.875,0.969,54.329,0.969,0
4,1,W_Completeren aanvraag-START,2011-10-01 18:36:46.437,39427.562,39481.891,39427.562,0


In [24]:
#get anomalous duration
anomalous_time_index = []

while len(anomalous_time_index) < anomalous_time_num:
    row = np.random.randint(0, temp_time_df.shape[0])
    if row not in anomalous_time_index:
        anomalous_time_index.append(row)
        act = temp_time_df.loc[row, 'Activity']
        anomalous_value = (np.random.random_sample() + 1)*(statistics_storage[act]['mean'] + statistics_storage[act]['std'])
        temp_time_df.loc[row, 'AnomalousDuration'] = anomalous_value
        temp_time_df.loc[row, 'TimeLabel'] = 1

In [25]:
temp_time_df.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,Duration,CumDuration,AnomalousDuration,TimeLabel
0,1,A_SUBMITTED-COMPLETE,2011-10-01 07:38:44.546,0.0,0.0,0.0,1
1,1,A_PARTLYSUBMITTED-COMPLETE,2011-10-01 07:38:44.880,0.334,0.334,0.334,0
2,1,A_PREACCEPTED-COMPLETE,2011-10-01 07:39:37.906,53.026,53.36,53.026,0
3,1,W_Completeren aanvraag-SCHEDULE,2011-10-01 07:39:38.875,0.969,54.329,10.723731,1
4,1,W_Completeren aanvraag-START,2011-10-01 18:36:46.437,39427.562,39481.891,39427.562,0


In [26]:
#get anomalous cumulative duration
temp_cum_time_df = pd.DataFrame(columns=list(temp_time_df)+['AnomalousCompleteTimestamp'])
groupByCase = temp_time_df.groupby(['CaseID'])

for case, group in groupByCase:
    group['AnomalousCompleteTimestamp'] = group['CompleteTimestamp'].copy()
    if group['TimeLabel'].sum() > 0:
        for row in range(group.shape[0]-1):
            previous_timestamp = group['CompleteTimestamp'].iloc[row]
            current_duration = group['AnomalousDuration'].iloc[row+1]
            current_timestamp = previous_timestamp + timedelta(seconds=current_duration)
            group['AnomalousCompleteTimestamp'].iloc[row+1] = current_timestamp
    temp_cum_time_df = temp_cum_time_df.append(group)

In [27]:
temp_cum_time_df.head(11)

Unnamed: 0,CaseID,Activity,CompleteTimestamp,Duration,CumDuration,AnomalousDuration,TimeLabel,AnomalousCompleteTimestamp
0,1,A_SUBMITTED-COMPLETE,2011-10-01 07:38:44.546,0.0,0.0,0.0,1,2011-10-01 07:38:44.546000
1,1,A_PARTLYSUBMITTED-COMPLETE,2011-10-01 07:38:44.880,0.334,0.334,0.334,0,2011-10-01 07:38:44.880000
2,1,A_PREACCEPTED-COMPLETE,2011-10-01 07:39:37.906,53.026,53.36,53.026,0,2011-10-01 07:39:37.906000
3,1,W_Completeren aanvraag-SCHEDULE,2011-10-01 07:39:38.875,0.969,54.329,10.723731,1,2011-10-01 07:39:48.629731
4,1,W_Completeren aanvraag-START,2011-10-01 18:36:46.437,39427.562,39481.891,39427.562,0,2011-10-01 18:36:46.437000
5,1,A_ACCEPTED-COMPLETE,2011-10-01 18:42:43.308,356.871,39838.762,1038.213779,1,2011-10-01 18:54:04.650779
6,1,O_SELECTED-COMPLETE,2011-10-01 18:45:09.243,145.935,39984.697,553.388849,1,2011-10-01 18:51:56.696849
7,1,A_FINALIZED-COMPLETE,2011-10-01 18:45:09.243,0.0,39984.697,0.0,0,2011-10-01 18:45:09.243000
8,1,O_CREATED-COMPLETE,2011-10-01 18:45:11.197,1.954,39986.651,1.954,0,2011-10-01 18:45:11.197000
9,1,O_SENT-COMPLETE,2011-10-01 18:45:11.380,0.183,39986.834,0.183,0,2011-10-01 18:45:11.380000


In [28]:
groupByCase = temp_cum_time_df.groupby(['CaseID'])

# Collect the per-case frames and concatenate once; DataFrame.append inside
# the loop re-copies all accumulated rows each time and is gone in pandas 2.x.
case_frames = []
for case, group in groupByCase:
    # calculateAnomalousCumDuration comes from utils; the column it adds is
    # then passed value by value through convert2seconds.
    group = calculateAnomalousCumDuration(group)
    group['AnomalousCumDuration'] = group['AnomalousCumDuration'].apply(convert2seconds)
    case_frames.append(group)

if case_frames:
    temp_time = pd.concat(case_frames)
else:
    # No cases at all: keep an empty frame with the expected columns.
    temp_time = pd.DataFrame(columns=list(temp_cum_time_df)+['AnomalousCumDuration'])

In [29]:
temp_time.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,Duration,CumDuration,AnomalousDuration,TimeLabel,AnomalousCompleteTimestamp,AnomalousCumDuration
0,1,A_SUBMITTED-COMPLETE,2011-10-01 07:38:44.546,0.0,0.0,0.0,1,2011-10-01 07:38:44.546000,0.0
1,1,A_PARTLYSUBMITTED-COMPLETE,2011-10-01 07:38:44.880,0.334,0.334,0.334,0,2011-10-01 07:38:44.880000,0.334
2,1,A_PREACCEPTED-COMPLETE,2011-10-01 07:39:37.906,53.026,53.36,53.026,0,2011-10-01 07:39:37.906000,53.36
3,1,W_Completeren aanvraag-SCHEDULE,2011-10-01 07:39:38.875,0.969,54.329,10.723731,1,2011-10-01 07:39:48.629731,64.083731
4,1,W_Completeren aanvraag-START,2011-10-01 18:36:46.437,39427.562,39481.891,39427.562,0,2011-10-01 18:36:46.437000,39481.891


## Get full df

In [30]:
full_df = pd.concat([temp_time, temp_act], axis=1)

In [31]:
full_df.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,Duration,CumDuration,AnomalousDuration,TimeLabel,AnomalousCompleteTimestamp,AnomalousCumDuration,AnomalousActivity,ActivityLabel
0,1,A_SUBMITTED-COMPLETE,2011-10-01 07:38:44.546,0.0,0.0,0.0,1,2011-10-01 07:38:44.546000,0.0,A_SUBMITTED-COMPLETE,0
1,1,A_PARTLYSUBMITTED-COMPLETE,2011-10-01 07:38:44.880,0.334,0.334,0.334,0,2011-10-01 07:38:44.880000,0.334,A_PARTLYSUBMITTED-COMPLETE,0
2,1,A_PREACCEPTED-COMPLETE,2011-10-01 07:39:37.906,53.026,53.36,53.026,0,2011-10-01 07:39:37.906000,53.36,A_ACTIVATED-COMPLETE,1
3,1,W_Completeren aanvraag-SCHEDULE,2011-10-01 07:39:38.875,0.969,54.329,10.723731,1,2011-10-01 07:39:48.629731,64.083731,W_Completeren aanvraag-SCHEDULE,0
4,1,W_Completeren aanvraag-START,2011-10-01 18:36:46.437,39427.562,39481.891,39427.562,0,2011-10-01 18:36:46.437000,39481.891,W_Completeren aanvraag-START,0


In [32]:
normal_df = full_df[['CaseID', 'Activity', 'CompleteTimestamp', 'Duration', 'CumDuration']]
anomalous_df = full_df[['CaseID', 'AnomalousActivity', 'AnomalousCompleteTimestamp', 'AnomalousDuration', 
                        'AnomalousCumDuration', 'ActivityLabel', 'TimeLabel']]

In [33]:
print('Saving dataframes...')

# Both files go to args.input_dir and carry the anomaly percentage in their
# name. The 'anomolous' spelling is part of the file name and is kept as is.
normal_df_name = os.path.join(args.input_dir, 'normal_df_{}.csv'.format(args.anomaly_pct))
anomalous_df_name = os.path.join(args.input_dir, 'anomolous_df_{}.csv'.format(args.anomaly_pct))

# Written without the index column, clean frame first.
for frame, target_path in ((normal_df, normal_df_name), (anomalous_df, anomalous_df_name)):
    frame.to_csv(target_path, index=False)

print('Done!')

Saving dataframes...
Done!


# Preprocess data

In [34]:
# Group the anomalous rows per case so the split is done on whole cases.
groupByCase = anomalous_df.groupby(['CaseID'])

# Split by number of cases using args.train_pct and args.val_pct (0.6 and 0.2
# in the config cell); whatever remains after truncation is the test set.
train_case_num = int(len(groupByCase)*args.train_pct)
val_case_num = int(len(groupByCase)*args.val_pct)
test_case_num = len(groupByCase) - train_case_num - val_case_num

In [35]:
# Assign whole cases to train / validation / test by their POSITION in the
# grouped order rather than by the CaseID value. The vectorized inputs are
# later sliced by position, so both splits agree even when case ids are not
# exactly 1..N; for ids that are 1..N the result is the same as comparing ids.
train_frames, val_frames, test_frames = [], [], []
val_case_end = train_case_num + val_case_num

for position, (caseid, data_case) in enumerate(groupByCase):
    if position < train_case_num:
        train_frames.append(data_case)
    elif position < val_case_end:
        val_frames.append(data_case)
    else:
        test_frames.append(data_case)

# Concatenate once per split (DataFrame.append in a loop is quadratic and no
# longer exists in pandas 2.x); an empty split keeps the expected columns.
empty_split = pd.DataFrame(columns=list(anomalous_df))
anomalous_df_train = pd.concat(train_frames) if train_frames else empty_split.copy()
anomalous_df_val = pd.concat(val_frames) if val_frames else empty_split.copy()
anomalous_df_test = pd.concat(test_frames) if test_frames else empty_split.copy()

In [36]:
print('Checking shapes of sub data: ', anomalous_df.shape[0] == anomalous_df_train.shape[0] + anomalous_df_val.shape[0] + anomalous_df_test.shape[0])

Checking shapes of sub data:  True


In [37]:
train_row_num = anomalous_df_train.shape[0]
val_row_num = anomalous_df_val.shape[0]
test_row_num = anomalous_df_test.shape[0]

print('Number of rows for training: {}'.format(train_row_num))
print('Number of rows for val: {}'.format(val_row_num))
print('Number of rows for testing: {}'.format(test_row_num))

Number of rows for training: 158706
Number of rows for val: 55671
Number of rows for testing: 47823


In [38]:
# Count the injected anomalies per split first, then report them.
train_act_total = anomalous_df_train['ActivityLabel'].sum()
train_time_total = anomalous_df_train['TimeLabel'].sum()
val_act_total = anomalous_df_val['ActivityLabel'].sum()
val_time_total = anomalous_df_val['TimeLabel'].sum()
test_act_total = anomalous_df_test['ActivityLabel'].sum()
test_time_total = anomalous_df_test['TimeLabel'].sum()

print('Number of anomalous values in train set: {}'.format(train_act_total + train_time_total))
print('Number of anomalous activities in train set: {}'.format(train_act_total))
print('Number of anomalous time in train set: {}'.format(train_time_total))
print('\n')
print('Number of anomalous values in validate set: {}'.format(val_act_total + val_time_total))
print('Number of anomalous activities in validate set: {}'.format(val_act_total))
print('Number of anomalous time in validate set: {}'.format(val_time_total))
print('\n')
print('Number of anomalous values in test set: {}'.format(test_act_total + test_time_total))
print('Number of anomalous activities in test set: {}'.format(test_act_total))
print('Number of anomalous time in test set: {}'.format(test_time_total))

Number of anomalous values in train set: 31843
Number of anomalous activities in train set: 15924
Number of anomalous time in train set: 15919


Number of anomalous values in validate set: 11144
Number of anomalous activities in validate set: 5575
Number of anomalous time in validate set: 5569


Number of anomalous values in test set: 9453
Number of anomalous activities in test set: 4721
Number of anomalous time in test set: 4732


In [39]:
(848+226+232)/2

653.0

# Prepare input

In [40]:
anomalous_df.head()

Unnamed: 0,CaseID,AnomalousActivity,AnomalousCompleteTimestamp,AnomalousDuration,AnomalousCumDuration,ActivityLabel,TimeLabel
0,1,A_SUBMITTED-COMPLETE,2011-10-01 07:38:44.546000,0.0,0.0,0,1
1,1,A_PARTLYSUBMITTED-COMPLETE,2011-10-01 07:38:44.880000,0.334,0.334,0,0
2,1,A_ACTIVATED-COMPLETE,2011-10-01 07:39:37.906000,53.026,53.36,1,0
3,1,W_Completeren aanvraag-SCHEDULE,2011-10-01 07:39:48.629731,10.723731,64.083731,0,1
4,1,W_Completeren aanvraag-START,2011-10-01 18:36:46.437000,39427.562,39481.891,0,0


## Labels

In [41]:
activity_label = anomalous_df['ActivityLabel']
time_label = anomalous_df['TimeLabel']

In [42]:
# Slice the label series by row position into train / validation / test.
# The test slice uses an explicit start offset: a negative slice such as
# [-test_row_num:] silently returns EVERYTHING when test_row_num is 0.
val_row_end = train_row_num + val_row_num

activity_test_start = max(len(activity_label) - test_row_num, 0)
activity_label_train = activity_label.iloc[:train_row_num]
activity_label_val = activity_label.iloc[train_row_num:val_row_end]
activity_label_test = activity_label.iloc[activity_test_start:]

time_test_start = max(len(time_label) - test_row_num, 0)
time_label_train = time_label.iloc[:train_row_num]
time_label_val = time_label.iloc[train_row_num:val_row_end]
time_label_test = time_label.iloc[time_test_start:]

In [43]:
len(time_label_test)

47823

In [44]:
anomaly = anomalous_df[['CaseID', 'AnomalousActivity', 'AnomalousCumDuration']]

## Activity

In [45]:
cat_var = ['AnomalousActivity']

In [46]:
enc_data = OHE(anomaly, cat_var)

In [47]:
enc_data.head()

Unnamed: 0,CaseID,AnomalousCumDuration,AnomalousActivity_A_ACCEPTED-COMPLETE,AnomalousActivity_A_ACTIVATED-COMPLETE,AnomalousActivity_A_APPROVED-COMPLETE,AnomalousActivity_A_CANCELLED-COMPLETE,AnomalousActivity_A_DECLINED-COMPLETE,AnomalousActivity_A_FINALIZED-COMPLETE,AnomalousActivity_A_PARTLYSUBMITTED-COMPLETE,AnomalousActivity_A_PREACCEPTED-COMPLETE,...,AnomalousActivity_W_Nabellen incomplete dossiers-COMPLETE,AnomalousActivity_W_Nabellen incomplete dossiers-SCHEDULE,AnomalousActivity_W_Nabellen incomplete dossiers-START,AnomalousActivity_W_Nabellen offertes-COMPLETE,AnomalousActivity_W_Nabellen offertes-SCHEDULE,AnomalousActivity_W_Nabellen offertes-START,AnomalousActivity_W_Valideren aanvraag-COMPLETE,AnomalousActivity_W_Valideren aanvraag-SCHEDULE,AnomalousActivity_W_Valideren aanvraag-START,AnomalousActivity_W_Wijzigen contractgegevens-SCHEDULE
0,1,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0.334,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,53.36,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,64.083731,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,39481.891,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Time

In [48]:
min_value = np.min(enc_data['AnomalousCumDuration'].iloc[:train_row_num])
max_value = np.max(enc_data['AnomalousCumDuration'].iloc[:train_row_num])

In [49]:
print('Min used for normalization: {}'.format(min_value))
print('Max used for normalization: {}'.format(max_value))

Min used for normalization: 0.0
Max used for normalization: 11855936.012


In [50]:
mean_value = np.mean(enc_data['AnomalousCumDuration'].iloc[:train_row_num])
std_value = np.std(enc_data['AnomalousCumDuration'].iloc[:train_row_num])

In [51]:
print('Mean used for standardization: {}'.format(mean_value))
print('STD used for standardization: {}'.format(std_value))

Mean used for standardization: 704916.7378895849
STD used for standardization: 978966.375592035


In [52]:
# Scale the cumulative duration with the statistics computed from the training
# rows (min/max and mean/std above). Vectorized arithmetic replaces a Python
# lambda called once per row; the per-element results are the same.
# Note: a zero range (max == min) or a zero std gives inf/NaN here.
cum_duration = enc_data['AnomalousCumDuration'].astype(float)
enc_data['NormalizedCumDuration'] = (cum_duration - min_value) / (max_value - min_value)
enc_data['StandardizedCumDuration'] = (cum_duration - mean_value) / std_value

In [53]:
enc_data.head()

Unnamed: 0,CaseID,AnomalousCumDuration,AnomalousActivity_A_ACCEPTED-COMPLETE,AnomalousActivity_A_ACTIVATED-COMPLETE,AnomalousActivity_A_APPROVED-COMPLETE,AnomalousActivity_A_CANCELLED-COMPLETE,AnomalousActivity_A_DECLINED-COMPLETE,AnomalousActivity_A_FINALIZED-COMPLETE,AnomalousActivity_A_PARTLYSUBMITTED-COMPLETE,AnomalousActivity_A_PREACCEPTED-COMPLETE,...,AnomalousActivity_W_Nabellen incomplete dossiers-START,AnomalousActivity_W_Nabellen offertes-COMPLETE,AnomalousActivity_W_Nabellen offertes-SCHEDULE,AnomalousActivity_W_Nabellen offertes-START,AnomalousActivity_W_Valideren aanvraag-COMPLETE,AnomalousActivity_W_Valideren aanvraag-SCHEDULE,AnomalousActivity_W_Valideren aanvraag-START,AnomalousActivity_W_Wijzigen contractgegevens-SCHEDULE,NormalizedCumDuration,StandardizedCumDuration
0,1,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,-0.720062
1,1,0.334,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,2.817154e-08,-0.720062
2,1,53.36,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4.500699e-06,-0.720008
3,1,64.083731,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,5.405202e-06,-0.719997
4,1,39481.891,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.003330137,-0.679732


In [54]:
# Keep only the scaled duration column selected by args.scaler; the raw
# duration and the other scaled variant are dropped.
if args.scaler == 'standardization':
    dropped_cols = ['AnomalousCumDuration', 'NormalizedCumDuration']
elif args.scaler == 'normalization':
    dropped_cols = ['AnomalousCumDuration', 'StandardizedCumDuration']
else:
    # Fail here with a clear message instead of a NameError on
    # scaled_enc_data in a later cell.
    raise ValueError("Unknown scaler '{}': expected 'standardization' or 'normalization'".format(args.scaler))

scaled_enc_data = enc_data.drop(dropped_cols, axis=1)

In [55]:
scaled_enc_data.head()

Unnamed: 0,CaseID,AnomalousActivity_A_ACCEPTED-COMPLETE,AnomalousActivity_A_ACTIVATED-COMPLETE,AnomalousActivity_A_APPROVED-COMPLETE,AnomalousActivity_A_CANCELLED-COMPLETE,AnomalousActivity_A_DECLINED-COMPLETE,AnomalousActivity_A_FINALIZED-COMPLETE,AnomalousActivity_A_PARTLYSUBMITTED-COMPLETE,AnomalousActivity_A_PREACCEPTED-COMPLETE,AnomalousActivity_A_REGISTERED-COMPLETE,...,AnomalousActivity_W_Nabellen incomplete dossiers-SCHEDULE,AnomalousActivity_W_Nabellen incomplete dossiers-START,AnomalousActivity_W_Nabellen offertes-COMPLETE,AnomalousActivity_W_Nabellen offertes-SCHEDULE,AnomalousActivity_W_Nabellen offertes-START,AnomalousActivity_W_Valideren aanvraag-COMPLETE,AnomalousActivity_W_Valideren aanvraag-SCHEDULE,AnomalousActivity_W_Valideren aanvraag-START,AnomalousActivity_W_Wijzigen contractgegevens-SCHEDULE,StandardizedCumDuration
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.720062
1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,-0.720062
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.720008
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.719997
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.679732


## 0-padding

In [56]:
#re arrange cols
cols = list(scaled_enc_data)
cols = ['CaseID', cols[-1]] + cols[1:-1]
scaled_enc_data = scaled_enc_data[cols]

In [57]:
scaled_enc_data.head()

Unnamed: 0,CaseID,StandardizedCumDuration,AnomalousActivity_A_ACCEPTED-COMPLETE,AnomalousActivity_A_ACTIVATED-COMPLETE,AnomalousActivity_A_APPROVED-COMPLETE,AnomalousActivity_A_CANCELLED-COMPLETE,AnomalousActivity_A_DECLINED-COMPLETE,AnomalousActivity_A_FINALIZED-COMPLETE,AnomalousActivity_A_PARTLYSUBMITTED-COMPLETE,AnomalousActivity_A_PREACCEPTED-COMPLETE,...,AnomalousActivity_W_Nabellen incomplete dossiers-COMPLETE,AnomalousActivity_W_Nabellen incomplete dossiers-SCHEDULE,AnomalousActivity_W_Nabellen incomplete dossiers-START,AnomalousActivity_W_Nabellen offertes-COMPLETE,AnomalousActivity_W_Nabellen offertes-SCHEDULE,AnomalousActivity_W_Nabellen offertes-START,AnomalousActivity_W_Valideren aanvraag-COMPLETE,AnomalousActivity_W_Valideren aanvraag-SCHEDULE,AnomalousActivity_W_Valideren aanvraag-START,AnomalousActivity_W_Wijzigen contractgegevens-SCHEDULE
0,1,-0.720062,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,-0.720062,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,-0.720008,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,-0.719997,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,-0.679732,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
# Reference values for the test rows: column 1 is the scaled duration and the
# columns from 2 onwards are the one-hot activity columns.
# An explicit start offset is used because iloc[-test_row_num:] returns the
# whole frame when test_row_num is 0.
test_row_start = max(len(scaled_enc_data) - test_row_num, 0)
true_time = scaled_enc_data.iloc[test_row_start:, 1]
true_act = scaled_enc_data.iloc[test_row_start:, 2:]

In [59]:
cols = [i for i in list(scaled_enc_data) if i != 'CaseID']
cols

['StandardizedCumDuration',
 'AnomalousActivity_A_ACCEPTED-COMPLETE',
 'AnomalousActivity_A_ACTIVATED-COMPLETE',
 'AnomalousActivity_A_APPROVED-COMPLETE',
 'AnomalousActivity_A_CANCELLED-COMPLETE',
 'AnomalousActivity_A_DECLINED-COMPLETE',
 'AnomalousActivity_A_FINALIZED-COMPLETE',
 'AnomalousActivity_A_PARTLYSUBMITTED-COMPLETE',
 'AnomalousActivity_A_PREACCEPTED-COMPLETE',
 'AnomalousActivity_A_REGISTERED-COMPLETE',
 'AnomalousActivity_A_SUBMITTED-COMPLETE',
 'AnomalousActivity_O_ACCEPTED-COMPLETE',
 'AnomalousActivity_O_CANCELLED-COMPLETE',
 'AnomalousActivity_O_CREATED-COMPLETE',
 'AnomalousActivity_O_DECLINED-COMPLETE',
 'AnomalousActivity_O_SELECTED-COMPLETE',
 'AnomalousActivity_O_SENT-COMPLETE',
 'AnomalousActivity_O_SENT_BACK-COMPLETE',
 'AnomalousActivity_W_Afhandelen leads-COMPLETE',
 'AnomalousActivity_W_Afhandelen leads-SCHEDULE',
 'AnomalousActivity_W_Afhandelen leads-START',
 'AnomalousActivity_W_Beoordelen fraude-COMPLETE',
 'AnomalousActivity_W_Beoordelen fraude-SCHEDUL

In [60]:
pad_index = scaled_enc_data.copy()
pad_index[cols] = 1.0

In [61]:
pad_index.head()

Unnamed: 0,CaseID,StandardizedCumDuration,AnomalousActivity_A_ACCEPTED-COMPLETE,AnomalousActivity_A_ACTIVATED-COMPLETE,AnomalousActivity_A_APPROVED-COMPLETE,AnomalousActivity_A_CANCELLED-COMPLETE,AnomalousActivity_A_DECLINED-COMPLETE,AnomalousActivity_A_FINALIZED-COMPLETE,AnomalousActivity_A_PARTLYSUBMITTED-COMPLETE,AnomalousActivity_A_PREACCEPTED-COMPLETE,...,AnomalousActivity_W_Nabellen incomplete dossiers-COMPLETE,AnomalousActivity_W_Nabellen incomplete dossiers-SCHEDULE,AnomalousActivity_W_Nabellen incomplete dossiers-START,AnomalousActivity_W_Nabellen offertes-COMPLETE,AnomalousActivity_W_Nabellen offertes-SCHEDULE,AnomalousActivity_W_Nabellen offertes-START,AnomalousActivity_W_Valideren aanvraag-COMPLETE,AnomalousActivity_W_Valideren aanvraag-SCHEDULE,AnomalousActivity_W_Valideren aanvraag-START,AnomalousActivity_W_Wijzigen contractgegevens-SCHEDULE
0,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Vectorize

In [62]:
groupByCase = scaled_enc_data.groupby(['CaseID'])

maxlen = findLongestLength(groupByCase)
print('Maxlen: ', maxlen)

Maxlen:  175


In [63]:
vectorized_data = getInput(groupByCase, cols, maxlen)

pad_index_groupByCase = pad_index.groupby(['CaseID'])
vectorized_pad_index = getInput(pad_index_groupByCase, cols, maxlen)

# Split in to train/val/test

In [64]:
print('Shape of vectorized data: {}'.format(vectorized_data.shape))
print('Shape of vectorized pad index: {}'.format(vectorized_pad_index.shape))
print('\n')
print('Number of case for train: {}'.format(train_case_num))
print('Number of case for validate: {}'.format(val_case_num))
print('Number of case for test: {}'.format(test_case_num))

Shape of vectorized data: (13087, 175, 37)
Shape of vectorized pad index: (13087, 175, 37)


Number of case for train: 7852
Number of case for validate: 2617
Number of case for test: 2618


In [65]:
# Slice the per-case arrays along the first (case) axis into train /
# validation / test. The test slice uses an explicit start offset: a negative
# slice such as [-test_case_num:] returns EVERYTHING when test_case_num is 0.
val_case_end = train_case_num + val_case_num

input_test_start = max(len(vectorized_data) - test_case_num, 0)
input_train = vectorized_data[0:train_case_num]
input_val = vectorized_data[train_case_num:val_case_end]
input_test = vectorized_data[input_test_start:]

pad_test_start = max(len(vectorized_pad_index) - test_case_num, 0)
pad_index_train = vectorized_pad_index[0:train_case_num]
pad_index_val = vectorized_pad_index[train_case_num:val_case_end]
pad_index_test = vectorized_pad_index[pad_test_start:]

In [66]:
print('Check shape of input for training: {}'.format(input_train.shape[0]==train_case_num))
print('Check shape of input for validation: {}'.format(input_val.shape[0]==val_case_num))
print('Check shape of input for testing: {}'.format(input_test.shape[0]==test_case_num))

Check shape of input for training: True
Check shape of input for validation: True
Check shape of input for testing: True


# Save data

In [67]:
# Output file lives in args.input_dir and carries the anomaly percentage.
preprocessed_data_name = os.path.join(args.input_dir, 'preprocessed_data_{}.pkl'.format(args.anomaly_pct))

with open(preprocessed_data_name, 'wb') as f:
    # The objects are written as consecutive pickles into one file, so a
    # reader has to call pickle.load once per object in exactly this order.
    objects_in_order = (
        input_train, input_val, input_test,
        pad_index_train, pad_index_val, pad_index_test,
        activity_label_test, time_label_test,
        train_case_num, val_case_num, test_case_num,
        train_row_num, val_row_num, test_row_num,
        min_value, max_value,
        mean_value, std_value,
        cols,
        statistics_storage,
        true_time, true_act,
    )
    for obj in objects_in_order:
        pickle.dump(obj, f, protocol=2)