**Outline:**
- Impute Time
    + Dummy 1: impute Time with the global median / mean duration
    + Dummy 2: impute Time with the per-activity median / mean duration
- Impute Activity

In [1]:
import importlib
import argparse
import os, sys
import argparse
import pandas as pd
import numpy as np
import pickle
from math import sqrt

In [2]:
# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22.
# Fall back to its replacement so this cell still runs on a fresh, modern kernel.
# NOTE(review): SimpleImputer has no `axis` argument — adjust any call that passes one.
try:
    from sklearn.preprocessing import Imputer
except ImportError:
    from sklearn.impute import SimpleImputer as Imputer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import accuracy_score, log_loss

In [3]:
# Put the sibling ../utils/ directory first on the module search path.
sys.path.insert(0, './../utils/')
# NOTE(review): the wildcard import hides where names come from. calculateTimeInterval and
# convert2seconds (used below) are presumably provided by utils — confirm and import them explicitly.
from utils import *

In [4]:
# Experiment configuration: pick the event log, then derive every path from it.
name = 'bpi_2012'
#name = 'bpi_2013'   # alternative dataset; swap with the line above to use it

parser = dict(
    data_file=name + '.csv',
    nan_pct=0.5,
    input_dir='../input/{}/'.format(name),
    output_dir='./output/{}/'.format(name),
)

# Expose the settings as attributes (args.input_dir, args.nan_pct, ...).
args = argparse.Namespace(**parser)

In [5]:
# The parameters file holds ten objects pickled one after another; read them back in
# the same order they are unpacked below.
# NOTE(review): pickle.load executes arbitrary code — only open files produced by this project.
file_name = os.path.join(args.input_dir, 'parameters_{}.pkl'.format(args.nan_pct))
with open(file_name, 'rb') as f:
    (most_frequent_activity,
     first_timestamp,
     avai_instance,
     nan_instance,
     train_size,
     val_size,
     test_size,
     train_row_num,
     val_row_num,
     test_row_num) = [pickle.load(f) for _ in range(10)]

# Load data

In [6]:
# Resolve both CSV paths for the configured nan_pct, then load the complete frame
# and the frame with missing values.
normalized_complete_df_name, normalized_missing_df_name = (
    os.path.join(args.input_dir, pattern.format(args.nan_pct))
    for pattern in ('normalized_complete_df_{}.csv', 'normalized_missing_df_{}.csv')
)

normalized_complete_df, normalized_missing_df = map(
    pd.read_csv, (normalized_complete_df_name, normalized_missing_df_name)
)

In [7]:
# The test split is the last `test_row_num` rows of each frame, re-indexed from 0.
missing_true_test, complete_true_test = (
    frame.iloc[-test_row_num:].reset_index(drop=True)
    for frame in (normalized_missing_df, normalized_complete_df)
)

In [8]:
# Preview the first rows of the test split that contains missing values.
missing_true_test.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,CumTimeInterval,NormalizedTime
0,10470,,2012-02-04 01:17:11.047,10863510.0,0.0
1,10470,,,,
2,10470,A_DECLINED-COMPLETE,2012-02-04 01:17:42.964,10863540.0,0.996877
3,10471,,,,
4,10471,A_PARTLYSUBMITTED-COMPLETE,2012-02-04 01:23:42.504,10863900.0,0.0


In [9]:
# Derive a working frame with parsed timestamps; missing_true_test itself is not modified here.
missing_test = missing_true_test.assign(
    CompleteTimestamp=pd.to_datetime(missing_true_test['CompleteTimestamp'])
)
# Both helpers come from utils: the first adds the TimeInterval column read below,
# the second presumably converts each interval to seconds — confirm against utils.
missing_test = calculateTimeInterval(missing_test)
missing_test['TimeInterval'] = missing_test['TimeInterval'].apply(convert2seconds)

In [10]:
# Replace duration of starting activity with nan.
# A row starts a case when its CaseID differs from the previous row's. shift() yields
# NaN for the very first row, so that row is always flagged. The former row-by-row
# loop compared row 0 with the LAST row (iloc[-1]), which only worked when the first
# and last CaseID happened to differ.
case_ids = missing_test['CaseID']
is_case_start = case_ids.ne(case_ids.shift()).to_numpy()
missing_test.loc[is_case_start, 'TimeInterval'] = np.nan

In [11]:
# Count missing values per column of missing_test.
pd.isnull(missing_test).sum()

CaseID                   0
Activity             23936
CompleteTimestamp    23902
CumTimeInterval      23902
NormalizedTime       23902
TimeInterval         36455
dtype: int64

In [12]:
# Preview the first rows of the complete frame loaded above.
normalized_complete_df.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,CumTimeInterval,NormalizedTime
0,1,A_SUBMITTED-COMPLETE,2011-10-01 07:38:44.546,0.0,0.0
1,1,A_PARTLYSUBMITTED-COMPLETE,2011-10-01 07:38:44.880,0.334,3.113544e-07
2,1,A_PREACCEPTED-COMPLETE,2011-10-01 07:39:37.906,53.36,4.974213e-05
3,1,W_Completeren aanvraag-SCHEDULE,2011-10-01 07:39:38.875,54.329,5.064543e-05
4,1,W_Completeren aanvraag-START,2011-10-01 18:36:46.437,39481.891,0.03680497


# Impute Time

## Dummy 1: global median / mean duration

In [13]:
# Observed durations (with gaps) and the cumulative time from the complete test frame
# that the imputations are scored against.
missing_time, true_time = (
    missing_test['TimeInterval'],
    complete_true_test['CumTimeInterval'],
)

In [14]:
# Positions (0-based) of the rows whose timestamp is present. Computed in one
# vectorized pass; the result is later used positionally (`index[avai_time_index]`),
# so positions rather than index labels are the right thing to collect.
has_timestamp = missing_test['CompleteTimestamp'].notna().to_numpy()
avai_time_index = np.flatnonzero(has_timestamp).tolist()

# Sanity check: rows not listed above must equal the number of missing timestamps.
print('Check number of nan Time...')
print(missing_test.shape[0] - len(avai_time_index) == pd.isnull(missing_test).sum()['CompleteTimestamp'])

Check number of nan Time...
True


In [15]:
# Scratch copy for the Dummy 1 columns, so missing_test keeps its original columns.
temp = missing_test.copy()

In [16]:
# Preview the scratch frame before any imputation columns are added.
temp.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,CumTimeInterval,NormalizedTime,TimeInterval
0,10470,,2012-02-04 01:17:11.047,10863510.0,0.0,
1,10470,,NaT,,,
2,10470,A_DECLINED-COMPLETE,2012-02-04 01:17:42.964,10863540.0,0.996877,
3,10471,,NaT,,,
4,10471,A_PARTLYSUBMITTED-COMPLETE,2012-02-04 01:23:42.504,10863900.0,0.0,


In [17]:
# Global median and mean of the observed durations (NaN values are skipped by pandas).
temp['TimeInterval'].median(), temp['TimeInterval'].mean()

(43.090500000000006, 35549.77530031668)

In [18]:
# Fill every missing duration with one global statistic of the observed durations.
observed_duration = temp['TimeInterval']
median_imputation = observed_duration.fillna(observed_duration.median())
mean_imputation = observed_duration.fillna(observed_duration.mean())

In [19]:
# Attach both imputed duration columns to the scratch frame.
temp = temp.assign(Median=median_imputation, Mean=mean_imputation)

In [20]:
# Rebuild a cumulative-time column under each strategy in a single pass over the rows.
current_CumMedian = 0
current_CumMean = 0

CumTimeInterval_Median = []
CumTimeInterval_Mean = []

for known_cum, median_step, mean_step in zip(temp['CumTimeInterval'], temp['Median'], temp['Mean']):
    if pd.isnull(known_cum):
        # Time missing: extend the running value by the imputed duration.
        current_CumMedian += median_step
        current_CumMean += mean_step
    else:
        # Time observed: reset both running values to the recorded cumulative time.
        current_CumMedian = known_cum
        current_CumMean = known_cum

    CumTimeInterval_Median.append(current_CumMedian)
    CumTimeInterval_Mean.append(current_CumMean)

In [21]:
# Store the reconstructed cumulative times next to the original columns.
temp = temp.assign(
    CumTimeInterval_Median=CumTimeInterval_Median,
    CumTimeInterval_Mean=CumTimeInterval_Mean,
)

In [22]:
# Keep only the rows whose time was missing: drop every row listed in avai_time_index.
median_imputation = temp['CumTimeInterval_Median'].drop(temp['CumTimeInterval_Median'].index[avai_time_index])
mean_imputation = temp['CumTimeInterval_Mean'].drop(temp['CumTimeInterval_Mean'].index[avai_time_index])

# Derive the ground truth from the untouched source column rather than from `true_time`
# itself, so re-running this cell gives the same result instead of dropping rows twice.
true_time = complete_true_test['CumTimeInterval'].drop(complete_true_test.index[avai_time_index])

In [23]:
# Report MAE / RMSE (seconds and days) for both strategies; each metric is computed once.
for position, (heading, imputed) in enumerate([('Impute with Median', median_imputation),
                                               ('Impute with Mean', mean_imputation)]):
    if position:
        print('\n')
    mae = mean_absolute_error(true_time, imputed)
    rmse = sqrt(mean_squared_error(true_time, imputed))
    print(heading)
    print('MAE: {:.4f} seconds | {:.4f} days'.format(mae, mae/86400))
    print('RMSE: {:.4f} seconds | {:.4f} days'.format(rmse, rmse/86400))

Impute with Median
MAE: 123700.8689 seconds | 1.4317 days
RMSE: 370784.0128 seconds | 4.2915 days


Impute with Mean
MAE: 160058.4219 seconds | 1.8525 days
RMSE: 381754.2088 seconds | 4.4185 days


## Dummy 2: per-activity median / mean duration

In [24]:
# Group by the column NAME, not a one-element list. With a list, pandas >= 2.0 yields
# 1-tuple group keys such as ('A_DECLINED-COMPLETE',), and the lookups by plain
# activity label further down would raise KeyError.
missing_groupByActivity = missing_test.groupby('Activity')
duration_by_activity = missing_groupByActivity['TimeInterval']

# Per-activity duration statistics, keyed by activity label. An activity with no
# observed duration maps to NaN. Rows with a missing Activity are not grouped.
missing_median_duration_activity = duration_by_activity.median().to_dict()
missing_mean_duration_activity = duration_by_activity.mean().to_dict()
missing_min_duration_activity = duration_by_activity.min().to_dict()
missing_max_duration_activity = duration_by_activity.max().to_dict()

In [25]:
# Start from a new copy of missing_test and rebind `temp` to it for Dummy 2.
temp = missing_test.copy()

In [26]:
# Preview the new scratch frame before the per-activity columns are added.
temp.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,CumTimeInterval,NormalizedTime,TimeInterval
0,10470,,2012-02-04 01:17:11.047,10863510.0,0.0,
1,10470,,NaT,,,
2,10470,A_DECLINED-COMPLETE,2012-02-04 01:17:42.964,10863540.0,0.996877,
3,10471,,NaT,,,
4,10471,A_PARTLYSUBMITTED-COMPLETE,2012-02-04 01:23:42.504,10863900.0,0.0,


In [27]:
#Replace NaN duration with median and mean
temp['Median'] = temp['TimeInterval'].copy()
temp['Mean'] = temp['TimeInterval'].copy()

for row in range(temp.shape[0]):
    if not pd.isnull(temp.Activity.loc[row]) and pd.isnull(missing_median_duration_activity[temp.Activity.loc[row]]): 
        temp.loc[row, 'Median'] = missing_median_duration_activity[most_frequent_activity]
        temp.loc[row, 'Mean'] = missing_mean_duration_activity[most_frequent_activity]
    elif pd.isnull(temp.CumTimeInterval.loc[row]) and pd.isnull(temp.Activity.loc[row]):
        temp.loc[row, 'Median'] = missing_median_duration_activity[most_frequent_activity]
        temp.loc[row, 'Mean'] = missing_mean_duration_activity[most_frequent_activity]
    elif pd.isnull(temp.CumTimeInterval.loc[row]) and not pd.isnull(temp.Activity.loc[row]):
        temp.loc[row, 'Median'] = missing_median_duration_activity[temp.Activity.loc[row]]
        temp.loc[row, 'Mean'] = missing_mean_duration_activity[temp.Activity.loc[row]]

In [28]:
# Preview the frame with the per-activity Median / Mean duration columns.
temp.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,CumTimeInterval,NormalizedTime,TimeInterval,Median,Mean
0,10470,,2012-02-04 01:17:11.047,10863510.0,0.0,,,
1,10470,,NaT,,,,56.724,4992.785531
2,10470,A_DECLINED-COMPLETE,2012-02-04 01:17:42.964,10863540.0,0.996877,,,
3,10471,,NaT,,,,56.724,4992.785531
4,10471,A_PARTLYSUBMITTED-COMPLETE,2012-02-04 01:23:42.504,10863900.0,0.0,,,


In [29]:
# Walk the rows once, carrying a running cumulative time for each strategy.
current_CumMedian = 0
current_CumMean = 0

CumTimeInterval_Median = []
CumTimeInterval_Mean = []

for recorded_cum, median_duration, mean_duration in zip(temp['CumTimeInterval'], temp['Median'], temp['Mean']):
    if pd.isnull(recorded_cum):
        # No recorded time on this row: add the imputed duration to the running value.
        current_CumMedian += median_duration
        current_CumMean += mean_duration
    else:
        # Recorded time available: both running values restart from it.
        current_CumMedian = recorded_cum
        current_CumMean = recorded_cum

    CumTimeInterval_Median.append(current_CumMedian)
    CumTimeInterval_Mean.append(current_CumMean)

In [30]:
# Add the reconstructed cumulative times as two new columns.
temp = temp.assign(
    CumTimeInterval_Median=CumTimeInterval_Median,
    CumTimeInterval_Mean=CumTimeInterval_Mean,
)

In [31]:
# Keep only the rows whose time was missing: drop the rows listed in avai_time_index.
median_imputation, mean_imputation = (
    temp[column].drop(temp[column].index[avai_time_index])
    for column in ('CumTimeInterval_Median', 'CumTimeInterval_Mean')
)

In [32]:
# Preview the frame including the reconstructed cumulative-time columns.
temp.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,CumTimeInterval,NormalizedTime,TimeInterval,Median,Mean,CumTimeInterval_Median,CumTimeInterval_Mean
0,10470,,2012-02-04 01:17:11.047,10863510.0,0.0,,,,10863510.0,10863510.0
1,10470,,NaT,,,,56.724,4992.785531,10863560.0,10868500.0
2,10470,A_DECLINED-COMPLETE,2012-02-04 01:17:42.964,10863540.0,0.996877,,,,10863540.0,10863540.0
3,10471,,NaT,,,,56.724,4992.785531,10863600.0,10868530.0
4,10471,A_PARTLYSUBMITTED-COMPLETE,2012-02-04 01:23:42.504,10863900.0,0.0,,,,10863900.0,10863900.0


In [33]:
# Report MAE / RMSE (seconds and days) for both strategies; each metric is computed once.
# Note: `true_time` is the series produced in the Dummy 1 section above.
for position, (heading, imputed) in enumerate([('Impute with Median', median_imputation),
                                               ('Impute with Mean', mean_imputation)]):
    if position:
        print('\n')
    mae = mean_absolute_error(true_time, imputed)
    rmse = sqrt(mean_squared_error(true_time, imputed))
    print(heading)
    print('MAE: {:.4f} seconds | {:.4f} days'.format(mae, mae/86400))
    print('RMSE: {:.4f} seconds | {:.4f} days'.format(rmse, rmse/86400))

Impute with Median
MAE: 120054.7370 seconds | 1.3895 days
RMSE: 362400.6705 seconds | 4.1945 days


Impute with Mean
MAE: 123827.9001 seconds | 1.4332 days
RMSE: 361315.8385 seconds | 4.1819 days


# Impute Activity

In [34]:
# Independent copies of the activity column: the one with gaps and the complete one.
missing_activity, true_activity = (
    frame['Activity'].copy() for frame in (missing_test, complete_true_test)
)

In [35]:
# Positions (0-based) of the rows whose activity is present. Uses the same null test
# as the `fillna` / `pd.isnull` calls around it instead of a per-row `type(...) == str`
# check, and runs in one vectorized pass.
has_activity = missing_test['Activity'].notna().to_numpy()
avai_activity_index = np.flatnonzero(has_activity).tolist()

# Sanity check: rows not listed above must equal the number of missing activities.
print('Check number of nan Activity...')
print(missing_test.shape[0] - len(avai_activity_index) == pd.isnull(missing_test).sum()['Activity'])

Check number of nan Activity...
True


In [36]:
def evalDummyActivity(missing_df_test, true_activity, missing_activity, most_frequent_activity, avai_activity_index):
    """Score the "most frequent activity" baseline on the rows whose activity is missing.

    Parameters
    ----------
    missing_df_test
        Not used by this function; kept so existing calls keep working.
    true_activity : pandas.Series
        Ground-truth activity labels.
    missing_activity : pandas.Series
        Activity labels with NaN where the value is missing.
        Assumed to be in the same row order as ``true_activity``.
    most_frequent_activity
        Label used to fill every missing activity.
    avai_activity_index : list of int
        Positions of the rows whose activity is present; these rows are
        excluded from scoring.

    Returns
    -------
    float
        Accuracy as a fraction (as returned by ``accuracy_score``), or NaN
        when no row is left to score. The percentage is also printed.
    """
    # Impute every missing activity with the single most frequent label
    imputed_activity = missing_activity.fillna(value=most_frequent_activity)

    # Drop the available rows and keep only the rows that had to be imputed
    imputed_activity = imputed_activity.drop(imputed_activity.index[avai_activity_index])
    true_activity = true_activity.drop(true_activity.index[avai_activity_index])

    print('Impute missing activities with Most frequent activity...')

    # Nothing was missing: there is no accuracy to report
    if imputed_activity.empty:
        print('No missing activities to evaluate.')
        return float('nan')

    accuracy = accuracy_score(true_activity, imputed_activity)
    print('Accuracy: {:.2f}%'.format(accuracy*100))
    return accuracy

In [37]:
# Evaluate the most-frequent-activity baseline on the test split.
evalDummyActivity(
    missing_df_test=missing_true_test,
    true_activity=true_activity,
    missing_activity=missing_activity,
    most_frequent_activity=most_frequent_activity,
    avai_activity_index=avai_activity_index,
)

Impute missing activities with Most frequent activity...
Accuracy: 10.17%
