**Outline:**

In [1]:
import importlib
import argparse
import os, sys
import argparse
import pandas as pd
import numpy as np
import pickle
from math import sqrt

In [2]:
from sklearn.preprocessing import Imputer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import accuracy_score, log_loss

In [3]:
sys.path.insert(0, './../utils/')
from utils import *

In [4]:
# Experiment configuration.
# Event logs this notebook has been pointed at: 'bpi_2012', 'bpi_2013', 'helpdesk'.
name = 'bpi_2013'

parser = dict(
    data_file=name + '.csv',
    # Selects which preprocessed files are read (it is part of their names);
    # presumably the fraction of values that were blanked out — confirm.
    nan_pct=0.5,
    input_dir='../input/{}/'.format(name),
    output_dir='./output/{}/'.format(name),
)

# Expose the settings as attributes (args.input_dir, args.nan_pct, ...).
args = argparse.Namespace(**parser)

In [5]:
# Restore the preprocessing parameters stored for the chosen nan_pct.
# The file holds ten objects pickled back to back, so they have to be read
# in exactly the order listed below.
# NOTE(review): pickle.load can run arbitrary code — only open parameter files
# produced by this project's own preprocessing step.
file_name = os.path.join(args.input_dir, 'parameters_{}.pkl'.format(args.nan_pct))
with open(file_name, 'rb') as f:
    (most_frequent_activity,
     first_timestamp,
     avai_instance,
     nan_instance,
     train_size,
     val_size,
     test_size,
     train_row_num,
     val_row_num,
     test_row_num) = [pickle.load(f) for _ in range(10)]

# Load data

In [6]:
# Resolve both input paths first, then read the two frames.
# NOTE(review): the files carry a .pkl suffix but are parsed as CSV — confirm
# that this matches how the preprocessing step wrote them.
normalized_complete_df_name = os.path.join(
    args.input_dir, 'normalized_complete_df_{}.pkl'.format(args.nan_pct))
normalized_missing_df_name = os.path.join(
    args.input_dir, 'normalized_missing_df_{}.pkl'.format(args.nan_pct))

normalized_complete_df = pd.read_csv(normalized_complete_df_name)
normalized_missing_df = pd.read_csv(normalized_missing_df_name)

In [7]:
# Hold out the last `test_row_num` rows of each frame as the test split and
# renumber them from 0, so that label-based and positional access coincide in
# the cells below.
# .tail(n) is used rather than [-n:] because [-0:] selects the WHOLE frame,
# which would silently turn an empty test split into the full dataset.
missing_true_test = normalized_missing_df.tail(test_row_num).reset_index(drop=True)
complete_true_test = normalized_complete_df.tail(test_row_num).reset_index(drop=True)

In [8]:
# Preview the first rows of the test split that contains missing values.
missing_true_test.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,CumTimeInterval,NormalizedTime
0,1190,,2012-03-07 22:24:22,194135680.0,0.0
1,1190,,2012-04-11 21:36:11,197156789.0,1.0
2,1191,Accepted-In Progress,2012-03-07 22:26:08,194135786.0,0.0
3,1191,,2012-04-25 16:42:09,198348747.0,1.0
4,1192,,2012-03-08 00:06:16,194141794.0,0.0


In [9]:
# Build the working frame from a copy of the raw test split (assign() copies),
# with the timestamp strings parsed into datetimes.
parsed_timestamps = pd.to_datetime(missing_true_test['CompleteTimestamp'])
missing_test = calculateTimeInterval(
    missing_true_test.assign(CompleteTimestamp=parsed_timestamps))

# calculateTimeInterval and convert2seconds come from the project's utils
# module; presumably the former adds the per-event 'TimeInterval' column and
# the latter turns each entry into seconds — confirm against utils.
missing_test['TimeInterval'] = missing_test['TimeInterval'].apply(convert2seconds)

In [10]:
# Replace duration of starting activity with nan.
# A row starts a case when its CaseID differs from the row directly above it.
# shift() gives the first row no predecessor (NaN), so row 0 is always treated
# as a case start. Indexing the previous row with iloc[row-1] instead would
# wrap around to the LAST row for row 0 and miss it whenever the first and
# last CaseID are equal (e.g. a log holding a single case).
is_case_start = missing_test['CaseID'] != missing_test['CaseID'].shift()
missing_test.loc[is_case_start, 'TimeInterval'] = np.nan

In [11]:
# Count the missing entries in every column of the working test frame.
pd.isnull(missing_test).sum()

CaseID                 0
Activity             545
CompleteTimestamp    548
CumTimeInterval      548
NormalizedTime       548
TimeInterval         891
dtype: int64

# Impute Time

## Dummy 1

In [12]:
# Per-event durations from the incomplete test log (contains NaN entries).
missing_time = missing_test['TimeInterval']
# Reference cumulative times from the complete test log; used as the ground
# truth when the imputations are scored below.
true_time = complete_true_test['CumTimeInterval']

In [13]:
# Row numbers whose timestamp is present in the incomplete test log.
timestamp_is_missing = pd.isnull(missing_test['CompleteTimestamp'])
avai_time_index = [row for row in range(missing_test.shape[0])
                   if not timestamp_is_missing[row]]

# Sanity check: the rows NOT collected above must match the NaN timestamp count.
print('Check number of nan Time...')
n_missing_timestamps = missing_test['CompleteTimestamp'].isnull().sum()
print(missing_test.shape[0] - len(avai_time_index) == n_missing_timestamps)

Check number of nan Time...
True


In [14]:
# Scratch copy for the "Dummy 1" baseline, so the columns added below do not
# end up in missing_test itself.
temp = missing_test.copy()

In [15]:
# Preview the scratch frame, including the derived TimeInterval column.
temp.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,CumTimeInterval,NormalizedTime,TimeInterval
0,1190,,2012-03-07 22:24:22,194135680.0,0.0,
1,1190,,2012-04-11 21:36:11,197156789.0,1.0,3021109.0
2,1191,Accepted-In Progress,2012-03-07 22:26:08,194135786.0,0.0,
3,1191,,2012-04-25 16:42:09,198348747.0,1.0,4212961.0
4,1192,,2012-03-08 00:06:16,194141794.0,0.0,


In [16]:
# Median and mean of the durations that are present (pandas skips NaN here).
temp['TimeInterval'].median(), temp['TimeInterval'].mean()

(5968.5, 598635.98333333328)

In [17]:
# Dummy 1: every unknown duration is replaced by one global statistic.
known_intervals = temp['TimeInterval']
median_imputation = known_intervals.fillna(known_intervals.median())
mean_imputation = known_intervals.fillna(known_intervals.mean())

In [18]:
# Keep both filled duration series next to the original columns.
temp['Median'] = median_imputation
temp['Mean'] = mean_imputation

In [19]:
# Rebuild a cumulative time for every row, once per imputation variant.
# A row with a known CumTimeInterval resets both running totals to that value;
# a row without one adds its filled-in duration to the running totals.
# The columns are walked in row order, which matches the 0..n-1 labels of temp.
current_CumMedian = 0
current_CumMean = 0

CumTimeInterval_Median = []
CumTimeInterval_Mean = []

for known_cum, median_step, mean_step in zip(temp['CumTimeInterval'],
                                             temp['Median'],
                                             temp['Mean']):
    if pd.isnull(known_cum):
        current_CumMedian += median_step
        current_CumMean += mean_step
    else:
        current_CumMedian = known_cum
        current_CumMean = known_cum

    CumTimeInterval_Median.append(current_CumMedian)
    CumTimeInterval_Mean.append(current_CumMean)

In [20]:
# Attach the rebuilt cumulative times (one value per row) to the scratch frame.
temp['CumTimeInterval_Median'] = CumTimeInterval_Median
temp['CumTimeInterval_Mean'] = CumTimeInterval_Mean

In [21]:
# Keep only the rows whose timestamp was missing: these are the values the
# baseline actually had to impute, so they are the only ones scored.
observed_labels = temp.index[avai_time_index]
median_imputation = temp['CumTimeInterval_Median'].drop(observed_labels)
mean_imputation = temp['CumTimeInterval_Mean'].drop(observed_labels)

# Build the ground truth from the complete log every time instead of filtering
# true_time in place; otherwise re-running this cell would try to drop rows
# from an already shortened series.
true_time = complete_true_test['CumTimeInterval'].drop(
    complete_true_test.index[avai_time_index])

In [22]:
# Report MAE / RMSE in seconds and in days (86400 seconds per day); each
# metric is computed once and reused for both units.
print('Impute with Median')
median_mae = mean_absolute_error(true_time, median_imputation)
print('MAE: {:.4f} seconds | {:.4f} days'.format(median_mae, median_mae/86400))
median_rmse = sqrt(mean_squared_error(true_time, median_imputation))
print('RMSE: {:.4f} seconds | {:.4f} days'.format(median_rmse, median_rmse/86400))
print('\n')
print('Impute with Mean')
mean_mae = mean_absolute_error(true_time, mean_imputation)
print('MAE: {:.4f} seconds | {:.4f} days'.format(mean_mae, mean_mae/86400))
mean_rmse = sqrt(mean_squared_error(true_time, mean_imputation))
print('RMSE: {:.4f} seconds | {:.4f} days'.format(mean_rmse, mean_rmse/86400))

Impute with Median
MAE: 879973.2245 seconds | 10.1849 days
RMSE: 1597075.0141 seconds | 18.4847 days


Impute with Mean
MAE: 1650889.7465 seconds | 19.1075 days
RMSE: 2328316.8964 seconds | 26.9481 days


## Dummy 2

In [23]:
# Group by the column NAME (a scalar) rather than a one-element list: with a
# list key, recent pandas versions return 1-tuples such as
# ('Completed-Closed',) as group keys, and the plain-string lookups into the
# dictionaries below would then fail with KeyError.
missing_groupByActivity = missing_test.groupby('Activity')
duration_by_activity = missing_groupByActivity['TimeInterval']

# One {activity: statistic} dictionary per statistic. Rows without an activity
# are not part of any group, and NaN durations are ignored by each aggregate.
missing_median_duration_activity = duration_by_activity.median().to_dict()
missing_mean_duration_activity = duration_by_activity.mean().to_dict()
missing_min_duration_activity = duration_by_activity.min().to_dict()
missing_max_duration_activity = duration_by_activity.max().to_dict()

In [24]:
# Inspect the mean duration computed for each activity.
missing_mean_duration_activity

{'Accepted-Assigned': 243519.20000000001,
 'Accepted-In Progress': 497692.0,
 'Accepted-Wait': 440961.375,
 'Completed-Closed': 1087193.4318181819,
 'Queued-Awaiting Assignment': 491890.63636363635}

In [25]:
# Start the "Dummy 2" baseline from a fresh copy of missing_test; this
# replaces the scratch frame used for Dummy 1.
temp = missing_test.copy()

In [26]:
# Preview the fresh scratch frame before the per-activity fill.
temp.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,CumTimeInterval,NormalizedTime,TimeInterval
0,1190,,2012-03-07 22:24:22,194135680.0,0.0,
1,1190,,2012-04-11 21:36:11,197156789.0,1.0,3021109.0
2,1191,Accepted-In Progress,2012-03-07 22:26:08,194135786.0,0.0,
3,1191,,2012-04-25 16:42:09,198348747.0,1.0,4212961.0
4,1192,,2012-03-08 00:06:16,194141794.0,0.0,


In [27]:
# Replace NaN duration with median and mean.
# Dummy 2: a row whose timestamp is missing gets the statistic of its own
# activity, or of the most frequent activity when the activity is missing too.
temp['Median'] = temp['TimeInterval'].copy()
temp['Mean'] = temp['TimeInterval'].copy()

rows_without_time = temp.index[temp['CumTimeInterval'].isnull().values]
for row in rows_without_time:
    row_activity = temp.at[row, 'Activity']
    if pd.isnull(row_activity):
        lookup_key = most_frequent_activity
    else:
        lookup_key = row_activity
    temp.loc[row, 'Median'] = missing_median_duration_activity[lookup_key]
    temp.loc[row, 'Mean'] = missing_mean_duration_activity[lookup_key]

In [28]:
# Rebuild a cumulative time for every row from the per-activity fills.
# Known CumTimeInterval values reset both running totals; rows without one
# extend the totals by their filled-in duration.
# The columns are walked in row order, which matches the 0..n-1 labels of temp.
current_CumMedian = 0
current_CumMean = 0

CumTimeInterval_Median = []
CumTimeInterval_Mean = []

for known_cum, median_step, mean_step in zip(temp['CumTimeInterval'],
                                             temp['Median'],
                                             temp['Mean']):
    if pd.isnull(known_cum):
        current_CumMedian += median_step
        current_CumMean += mean_step
    else:
        current_CumMedian = known_cum
        current_CumMean = known_cum

    CumTimeInterval_Median.append(current_CumMedian)
    CumTimeInterval_Mean.append(current_CumMean)

In [29]:
# Attach the rebuilt cumulative times (one value per row) to the scratch frame.
temp['CumTimeInterval_Median'] = CumTimeInterval_Median
temp['CumTimeInterval_Mean'] = CumTimeInterval_Mean

In [30]:
# Score only the rows whose timestamp had to be imputed.
# true_time is not rebuilt here; the series filtered during the Dummy 1
# evaluation is reused as the ground truth.
imputed_rows = temp.drop(temp.index[avai_time_index])
median_imputation = imputed_rows['CumTimeInterval_Median']
mean_imputation = imputed_rows['CumTimeInterval_Mean']

In [31]:
# Report MAE / RMSE in seconds and in days (86400 seconds per day); each
# metric is computed once and reused for both units.
print('Impute with Median')
median_mae = mean_absolute_error(true_time, median_imputation)
print('MAE: {:.4f} seconds | {:.4f} days'.format(median_mae, median_mae/86400))
median_rmse = sqrt(mean_squared_error(true_time, median_imputation))
print('RMSE: {:.4f} seconds | {:.4f} days'.format(median_rmse, median_rmse/86400))

print('\n')

print('Impute with Mean')
mean_mae = mean_absolute_error(true_time, mean_imputation)
print('MAE: {:.4f} seconds | {:.4f} days'.format(mean_mae, mean_mae/86400))
mean_rmse = sqrt(mean_squared_error(true_time, mean_imputation))
print('RMSE: {:.4f} seconds | {:.4f} days'.format(mean_rmse, mean_rmse/86400))

Impute with Median
MAE: 924564.7299 seconds | 10.7010 days
RMSE: 1600738.1950 seconds | 18.5271 days


Impute with Mean
MAE: 1606068.3253 seconds | 18.5888 days
RMSE: 2273180.8845 seconds | 26.3100 days


# Impute Activity

In [32]:
# Activity labels from the incomplete log (with NaN) and from the complete
# log; copies are taken so the source frames are left untouched.
missing_activity = missing_test['Activity'].copy()
true_activity = complete_true_test['Activity'].copy()

In [33]:
# Row numbers whose activity label is present (stored as a str) in the
# incomplete test log.
activity_column = missing_test['Activity']
avai_activity_index = [row for row in range(missing_test.shape[0])
                       if type(activity_column[row]) == str]

# Sanity check: the rows NOT collected above must match the NaN activity count.
print('Check number of nan Activity...')
n_missing_activities = activity_column.isnull().sum()
print(missing_test.shape[0] - len(avai_activity_index) == n_missing_activities)

Check number of nan Activity...
True


In [34]:
def evalDummyActivity(missing_df_test, true_activity, missing_activity, most_frequent_activity, avai_activity_index):
    """Score the "most frequent activity" baseline on rows lacking an activity.

    Every NaN in ``missing_activity`` is replaced by ``most_frequent_activity``;
    the rows listed in ``avai_activity_index`` are then removed from both the
    prediction and the ground truth, and the accuracy on the remaining rows
    is printed as a percentage.

    Parameters
    ----------
    missing_df_test
        Not used by the function body.
    true_activity
        Series holding the reference activity label of every row.
    missing_activity
        Series of activity labels with NaN where the label is unknown.
    most_frequent_activity
        Value substituted for each NaN.
    avai_activity_index
        Positions of the rows to exclude from scoring (rows whose activity
        was available).

    Returns
    -------
    None
        The result is only printed.
    """
    # Fill every unknown label with the constant prediction.
    filled_activity = missing_activity.fillna(most_frequent_activity)

    # Remove the rows whose label was available, keeping only the imputed ones.
    imputed_only = filled_activity.drop(filled_activity.index[avai_activity_index])
    truth_only = true_activity.drop(true_activity.index[avai_activity_index])

    # Report how often the constant prediction matches the reference label.
    print('Impute missing activities with Most frequent activity...')
    accuracy_pct = accuracy_score(truth_only, imputed_only)*100
    print('Accuracy: {:.2f}%'.format(accuracy_pct))

In [35]:
# Evaluate the most-frequent-activity baseline; the first argument is accepted
# by the function but not used in its body.
evalDummyActivity(missing_true_test, true_activity, missing_activity, most_frequent_activity, avai_activity_index)

Impute missing activities with Most frequent activity...
Accuracy: 47.52%
