In [1]:
import os, sys
import argparse
import pandas as pd
import numpy as np
import pickle

In [2]:
from dateutil.parser import parse
from datetime import datetime
import time
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole
# notebook so loops do not flood the output with warnings. This only silences
# the warning; it does not change what the assignments do.
pd.options.mode.chained_assignment = None

In [3]:
# Dataset selector: every path below is derived from this name.
name = 'bpi_2012'
#name = 'bpi_2013'

# Notebook-wide settings, exposed as attributes (args.data_dir, args.scaler, ...).
args = argparse.Namespace(
    data_dir='../data/',                     # folder holding the raw CSV
    data_file=name + '.csv',                 # raw CSV file name
    input_dir='../input/{}/'.format(name),   # per-dataset folder
    train_pct=0.6,
    val_pct=0.2,
    anomaly_pct=0.1,
    scaler='standardization',
)

# Load data

In [4]:
# Only consider Case, Activity, Timestamp.
# Map each raw CSV header to the column name used in the rest of the notebook.
# Renaming by header (rather than assigning names positionally) matters because
# read_csv returns the `usecols` columns in file order, not in the order they
# are listed, so a positional rename could silently mislabel columns.
column_map = {
    'Case ID': 'CaseID',
    'Activity': 'Activity',
    'Complete Timestamp': 'CompleteTimestamp',
}
cols = list(column_map.values())

data = pd.read_csv(args.data_dir + args.data_file, usecols=list(column_map))
# Rename, then force the column order to match `cols`.
data = data.rename(columns=column_map)[cols]

# Keep the second space-separated token of each case id and convert it to a
# number (vectorised; non-numeric tokens still raise in pd.to_numeric).
data['CaseID'] = pd.to_numeric(data['CaseID'].str.split(' ').str[1])

# Convert timestamps to datetime; values that cannot be parsed become NaT.
data['CompleteTimestamp'] = pd.to_datetime(data['CompleteTimestamp'], errors='coerce')

In [5]:
# Preview the first rows of the loaded event log (CaseID, Activity, CompleteTimestamp).
data.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp
0,1,A_SUBMITTED-COMPLETE,2011-10-01 07:38:44.546
1,1,A_PARTLYSUBMITTED-COMPLETE,2011-10-01 07:38:44.880
2,1,A_PREACCEPTED-COMPLETE,2011-10-01 07:39:37.906
3,1,W_Completeren aanvraag-SCHEDULE,2011-10-01 07:39:38.875
4,1,W_Completeren aanvraag-START,2011-10-01 18:36:46.437


In [8]:
# Calculate the duration of each case: seconds elapsed between the timestamp of
# the case's first row and the timestamp of its last row (rows are taken in the
# order they appear in `data`).
# Group by the scalar label, not a one-element list: with ['CaseID'] newer
# pandas versions yield 1-tuples such as (1,) as group keys when iterating,
# which would end up as the dictionary keys.
groupByCase = data.groupby('CaseID')
case_dict = {}  # CaseID -> duration in seconds

for case, group in groupByCase:
    # Select the timestamp column by name instead of by position so the
    # result does not depend on the column order of `data`.
    timestamps = group['CompleteTimestamp']
    starting_time = timestamps.iloc[0]
    ending_time = timestamps.iloc[-1]
    duration = (ending_time - starting_time).total_seconds()
    case_dict[case] = duration

In [9]:
# One row per case: the (CaseID, duration) pairs of case_dict become the two columns.
duration_records = list(case_dict.items())
duration_df = pd.DataFrame(duration_records, columns=['CaseID', 'Duration'])

In [10]:
# Preview the per-case durations (in seconds).
duration_df.head()

Unnamed: 0,CaseID,Duration
0,1,1072732.0
1,2,799716.4
2,3,11855940.0
3,4,37.554
4,5,41.143


In [14]:
# Summary statistics of the case durations (in seconds). Only the 25th
# percentile is requested; the output still lists the 50% row alongside it.
duration_df['Duration'].describe(percentiles=[0.25])

count    1.308700e+04
mean     7.451001e+05
std      1.047978e+06
min      1.855000e+00
25%      5.446600e+01
50%      6.985743e+04
max      1.185594e+07
Name: Duration, dtype: float64