**Outline:**

In [1]:
import importlib
import argparse
import os, sys
import argparse
import pandas as pd
import numpy as np
import pickle

In [2]:
from sklearn.preprocessing import Imputer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score, log_loss

In [3]:
# Put ./../utils/ at the front of the module search path so `utils` resolves there.
# NOTE(review): calculateTimeInterval and convert2seconds, used in later cells,
# are not defined or imported by name in the visible cells, so they presumably
# come from this wildcard import — confirm against utils.py.
sys.path.insert(0, './../utils/')
from utils import *

In [4]:
# Dataset selector; every path below is derived from it.
# Alternative values: 'bpi_2012', 'helpdesk'.
name = 'bpi_2013'

# Directory entries keep their trailing '/' because later cells build file
# paths by plain string concatenation (e.g. args.input_dir + 'parameters.pkl').
parser = dict(
    data_file='{}.csv'.format(name),
    input_dir='../input/{}/'.format(name),
    output_dir='./output/{}/'.format(name),
)

# Expose the settings as attributes (args.input_dir, ...).
args = argparse.Namespace(**parser)

In [5]:
# Restore the ten values stored one after another in parameters.pkl.
# Successive pickle.load calls on the same handle return the objects in the
# order they were written, so the target order below must not be changed.
# NOTE(review): pickle.load can execute arbitrary code; only open files that
# were produced by this project's own preprocessing.
with open(args.input_dir + 'parameters.pkl', 'rb') as f:
    (most_frequent_activity,
     first_timestamp,
     avai_instance,
     nan_instance,
     train_size,
     val_size,
     test_size,
     train_row_num,
     val_row_num,
     test_row_num) = (pickle.load(f) for _ in range(10))

# Load data

In [6]:
# Read the complete and the missing-value versions of the normalized log,
# in that order, from the dataset's input directory.
normalized_complete_df, normalized_missing_df = (
    pd.read_csv(args.input_dir + csv_name)
    for csv_name in ('normalized_complete_df.csv', 'normalized_missing_df.csv')
)

In [7]:
# Keep the trailing `test_row_num` rows of each frame and renumber them from 0,
# so that row labels equal row positions in the cells below.
# .tail() replaces the [-test_row_num:] slice: with test_row_num == 0 the slice
# becomes [-0:] == [0:] and silently returns the WHOLE frame, whereas tail(0)
# returns an empty frame. For any non-zero value the two forms are identical.
missing_true_test = normalized_missing_df.tail(test_row_num).reset_index(drop=True)
complete_true_test = normalized_complete_df.tail(test_row_num).reset_index(drop=True)

In [8]:
# Preview the first rows of the slice taken from the missing-value frame.
missing_true_test.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,CumTimeInterval,NormalizedTime
0,1189,,2012-03-07 22:15:19,194135137.0,0.0
1,1189,Completed-Closed,2012-03-09 16:57:30,194288868.0,0.084317
2,1189,,2012-03-26 20:48:52,195771550.0,0.897521
3,1189,,2012-03-26 20:49:58,195771616.0,0.897558
4,1189,Accepted-In Progress,,,


In [9]:
# Build the working frame without touching missing_true_test: assign() returns
# a new frame in which CompleteTimestamp is parsed to datetimes, and that frame
# is passed to the project helper that adds the 'TimeInterval' column.
missing_test = calculateTimeInterval(
    missing_true_test.assign(
        CompleteTimestamp=pd.to_datetime(missing_true_test['CompleteTimestamp'])
    )
)
# NOTE(review): convert2seconds is a project helper; judging by its name and the
# displayed values it turns each interval into a number of seconds — confirm.
missing_test['TimeInterval'] = missing_test['TimeInterval'].apply(convert2seconds)

In [10]:
# Replace the duration of each case's starting activity with NaN.
# A row starts a case when its CaseID differs from the previous row's CaseID.
# shift() supplies that previous value; for the very first row it is NaN, so
# the comparison is True and row 0 is always treated as a case start.
# (The former positional loop compared row 0 against iloc[-1], i.e. the LAST
# row, and therefore left row 0 untouched whenever the first and last rows
# belonged to the same case.)
starts_new_case = missing_test['CaseID'] != missing_test['CaseID'].shift()
missing_test.loc[starts_new_case, 'TimeInterval'] = np.nan

In [11]:
# Number of missing values in each column of the working frame.
missing_test.isnull().sum()

CaseID                 0
Activity             558
CompleteTimestamp    552
CumTimeInterval      552
NormalizedTime       552
TimeInterval         897
dtype: int64

# Impute Time

## Dummy 1: fill missing durations with the global median / mean

In [12]:
# Per-event durations from the working frame (contains NaN, see the null counts above).
missing_time = missing_test['TimeInterval']
# Cumulative times taken from the frame built from normalized_complete_df;
# used as the reference values when the MAE is computed below.
true_time = complete_true_test['CumTimeInterval']

In [13]:
# Row numbers whose CompleteTimestamp is present; these rows are removed later
# so that only the rows with a missing timestamp are scored.
avai_time_index = [
    row
    for row in range(missing_test.shape[0])
    if pd.notnull(missing_test.CompleteTimestamp[row])
]

# Sanity check: the rows NOT collected above must equal the NaN count of the column.
print('Check number of nan Time...')
n_missing_timestamps = missing_test['CompleteTimestamp'].isnull().sum()
print(missing_test.shape[0] - len(avai_time_index) == n_missing_timestamps)

Check number of nan Time...
True


In [14]:
# Scratch copy for this baseline so the columns added below do not end up in missing_test.
temp = missing_test.copy()

In [15]:
# Preview the scratch copy, now including the TimeInterval column.
temp.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,CumTimeInterval,NormalizedTime,TimeInterval
0,1189,,2012-03-07 22:15:19,194135137.0,0.0,
1,1189,Completed-Closed,2012-03-09 16:57:30,194288868.0,0.084317,153731.0
2,1189,,2012-03-26 20:48:52,195771550.0,0.897521,1482682.0
3,1189,,2012-03-26 20:49:58,195771616.0,0.897558,66.0
4,1189,Accepted-In Progress,NaT,,,


In [35]:
# Median and mean of the non-missing durations; these are the two fill values
# used by the next cell.
temp['TimeInterval'].median(), temp['TimeInterval'].mean()

(6449.0, 420754.84699453553)

In [16]:
# Fill every missing duration with a single statistic computed over all
# non-missing durations: once with the median, once with the mean.
median_imputation = temp['TimeInterval'].fillna(temp['TimeInterval'].median())
mean_imputation = temp['TimeInterval'].fillna(temp['TimeInterval'].mean())

In [17]:
# Store the filled duration series as columns; the cumulative loop below reads
# them as temp.Median / temp.Mean.
temp['Median'] = median_imputation
temp['Mean'] = mean_imputation

In [18]:
# Rebuild a cumulative time for every row, once per fill strategy.
# Where CumTimeInterval is known it is taken as-is and becomes the new running
# value; where it is missing, the row's filled duration is added to the running
# value carried over from the previous row. The running values start at 0 and
# are not reset between cases.
CumTimeInterval_Median = []
CumTimeInterval_Mean = []

current_CumMedian = 0
current_CumMean = 0

for known_cum, median_step, mean_step in zip(
        temp['CumTimeInterval'], temp['Median'], temp['Mean']):
    if pd.isnull(known_cum):
        current_CumMedian += median_step
        current_CumMean += mean_step
    else:
        current_CumMedian = known_cum
        current_CumMean = known_cum

    CumTimeInterval_Median.append(current_CumMedian)
    CumTimeInterval_Mean.append(current_CumMean)

In [19]:
# Attach the reconstructed cumulative times (one list entry per row) as columns.
temp['CumTimeInterval_Median'] = CumTimeInterval_Median
temp['CumTimeInterval_Mean'] = CumTimeInterval_Mean

In [20]:
# Keep only the rows whose timestamp was missing: drop every row position listed
# in avai_time_index from the two reconstructed series and from the reference.
median_imputation = temp['CumTimeInterval_Median'].drop(temp.index[avai_time_index])
mean_imputation = temp['CumTimeInterval_Mean'].drop(temp.index[avai_time_index])

# true_time is rebuilt from complete_true_test instead of being filtered in
# place (true_time = true_time.drop(...)). The in-place form dropped additional
# rows every time this cell was re-run; this form gives the same result on the
# first run and on every later run.
true_time = complete_true_test['CumTimeInterval'].drop(
    complete_true_test.index[avai_time_index])

In [21]:
# Mean absolute error of each reconstruction against the reference cumulative
# times, shown in seconds and in days (86400 seconds per day).
mae_median = mean_absolute_error(true_time, median_imputation)
mae_mean = mean_absolute_error(true_time, mean_imputation)

print('Impute with Median')
print('MAE: {:.4f} seconds | {:.4f} days'.format(mae_median, mae_median/86400))
print('\n')
print('Impute with Mean')
print('MAE: {:.4f} seconds | {:.4f} days'.format(mae_mean, mae_mean/86400))

Impute with Median
MAE: 876504.3786 seconds | 10.1447 days


Impute with Mean
MAE: 1277215.6843 seconds | 14.7826 days


## Dummy 2: fill missing durations with the per-activity median / mean

In [22]:
# Per-activity duration statistics, stored in dictionaries keyed by activity name.
# The grouping key is the scalar 'Activity', not the one-element list
# ['Activity']: from pandas 2.0 on, iterating a groupby built from a
# one-element list yields keys as 1-tuples such as ('Completed-Closed',), which
# would make the string lookups in the later cells fail with KeyError.
# Rows whose Activity is NaN are not part of any group.
missing_groupByActivity = missing_test.groupby('Activity')

missing_median_duration_activity = {}
missing_mean_duration_activity = {}
missing_min_duration_activity = {}
missing_max_duration_activity = {}

for activity, group in missing_groupByActivity:
    durations = group['TimeInterval']
    missing_median_duration_activity[activity] = durations.median()
    missing_mean_duration_activity[activity] = durations.mean()
    missing_min_duration_activity[activity] = durations.min()
    missing_max_duration_activity[activity] = durations.max()

In [23]:
# Display the mean duration computed for each activity.
missing_mean_duration_activity

{'Accepted-Assigned': 86805.000000000015,
 'Accepted-In Progress': 452579.86363636365,
 'Accepted-Wait': 150340.60000000001,
 'Completed-Closed': 752528.24242424243,
 'Queued-Awaiting Assignment': 8692.4666666666672}

In [24]:
# Fresh scratch copy for this baseline; replaces the `temp` used in the previous section.
temp = missing_test.copy()

In [25]:
# Preview the fresh scratch copy before any fill columns are added.
temp.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,CumTimeInterval,NormalizedTime,TimeInterval
0,1189,,2012-03-07 22:15:19,194135137.0,0.0,
1,1189,Completed-Closed,2012-03-09 16:57:30,194288868.0,0.084317,153731.0
2,1189,,2012-03-26 20:48:52,195771550.0,0.897521,1482682.0
3,1189,,2012-03-26 20:49:58,195771616.0,0.897558,66.0
4,1189,Accepted-In Progress,NaT,,,


In [26]:
# Start from the raw durations, then overwrite the rows whose cumulative time
# is missing with the median / mean duration of the row's activity.
temp['Median'] = temp['TimeInterval'].copy()
temp['Mean'] = temp['TimeInterval'].copy()

for row in range(temp.shape[0]):
    # Rows with a known cumulative time keep their original duration.
    if not pd.isnull(temp.CumTimeInterval.loc[row]):
        continue

    # A row that is also missing its activity falls back to the most frequent one.
    lookup_activity = temp.Activity.loc[row]
    if pd.isnull(lookup_activity):
        lookup_activity = most_frequent_activity

    temp.loc[row, 'Median'] = missing_median_duration_activity[lookup_activity]
    temp.loc[row, 'Mean'] = missing_mean_duration_activity[lookup_activity]

In [27]:
# Rebuild a cumulative time for every row from the per-activity fills.
# A known CumTimeInterval is taken as-is and becomes the new running value; a
# missing one is replaced by the running value plus the row's filled duration.
# The running values start at 0 and are not reset between cases.
CumTimeInterval_Median = []
CumTimeInterval_Mean = []

current_CumMedian = 0
current_CumMean = 0

for known_cum, median_step, mean_step in zip(
        temp['CumTimeInterval'], temp['Median'], temp['Mean']):
    if pd.isnull(known_cum):
        current_CumMedian += median_step
        current_CumMean += mean_step
    else:
        current_CumMedian = known_cum
        current_CumMean = known_cum

    CumTimeInterval_Median.append(current_CumMedian)
    CumTimeInterval_Mean.append(current_CumMean)

In [28]:
# Attach the reconstructed cumulative times (one list entry per row) as columns.
temp['CumTimeInterval_Median'] = CumTimeInterval_Median
temp['CumTimeInterval_Mean'] = CumTimeInterval_Mean

In [29]:
# Keep only the rows whose timestamp was missing by dropping the row positions
# listed in avai_time_index. The reference series `true_time` is not rebuilt
# here; the next cell uses the one produced in the previous section.
median_imputation = temp['CumTimeInterval_Median'].drop(temp.index[avai_time_index])
mean_imputation = temp['CumTimeInterval_Mean'].drop(temp.index[avai_time_index])

In [30]:
# Mean absolute error of each per-activity reconstruction against the reference
# cumulative times, shown in seconds and in days (86400 seconds per day).
mae_median = mean_absolute_error(true_time, median_imputation)
mae_mean = mean_absolute_error(true_time, mean_imputation)

print('Impute with Median')
print('MAE: {:.4f} seconds | {:.4f} days'.format(mae_median, mae_median/86400))
print('\n')
print('Impute with Mean')
print('MAE: {:.4f} seconds | {:.4f} days'.format(mae_mean, mae_mean/86400))

Impute with Median
MAE: 918151.8641 seconds | 10.6268 days


Impute with Mean
MAE: 1294320.8335 seconds | 14.9806 days


# Impute Activity

In [31]:
# Independent copies of the two Activity columns: the one with gaps (to be
# filled) and the one from the complete frame (used as the reference labels).
missing_activity = missing_test['Activity'].copy()
true_activity = complete_true_test['Activity'].copy()

In [32]:
# Row numbers whose Activity holds an actual string, i.e. is not missing.
avai_activity_index = [
    row
    for row in range(missing_test.shape[0])
    if type(missing_test.Activity[row]) is str
]

# Sanity check: the rows NOT collected above must equal the NaN count of the column.
print('Check number of nan Activity...')
n_missing_activities = missing_test['Activity'].isnull().sum()
print(missing_test.shape[0] - len(avai_activity_index) == n_missing_activities)

Check number of nan Activity...
True


In [33]:
def evalDummyActivity(missing_df_test, true_activity, missing_activity, most_frequent_activity, avai_activity_index):
    """Score the "fill with the most frequent activity" baseline and print it.

    Args:
        missing_df_test: Accepted for the caller's convenience; not read by
            this function.
        true_activity: Series of reference activity labels, one per row.
        missing_activity: Series of activity labels with NaN where missing.
        most_frequent_activity: Label substituted for every NaN.
        avai_activity_index: Row positions whose activity was present; these
            rows are excluded so only the filled rows are scored.

    Returns:
        None. The accuracy (as a percentage) is printed.
    """
    # Substitute the fill label for every missing activity.
    filled = missing_activity.fillna(value=most_frequent_activity)

    # Drop the rows that were available and keep only the originally missing ones.
    predicted = filled.drop(filled.index[avai_activity_index])
    expected = true_activity.drop(true_activity.index[avai_activity_index])

    # Report the accuracy on the filled rows.
    print('Impute missing activities with Most frequent activity...')
    score = accuracy_score(expected, predicted) * 100
    print('Accuracy: {:.2f}%'.format(score))

In [34]:
# Run the most-frequent-activity baseline. The first argument is accepted by
# evalDummyActivity but never read inside its body.
evalDummyActivity(missing_true_test, true_activity, missing_activity, most_frequent_activity, avai_activity_index)

Impute missing activities with Most frequent activity...
Accuracy: 48.21%
