**Outline:**
- Impute Time
    + Dummy 1 to impute Time
    + Dummy 2 to impute Time
- Impute Activity

In [1]:
import importlib
import argparse
import os, sys
import argparse
import pandas as pd
import numpy as np
import pickle
from math import sqrt

In [2]:
from sklearn.preprocessing import Imputer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import accuracy_score, log_loss

In [3]:
sys.path.insert(0, './../utils/')
from utils import *

In [4]:
#Define parser
#name = 'bpi_2012'
#name = 'bpi_2013'
name = 'small_log'
#name = 'large_log'

parser = {
    'data_file': name + '.csv',
    'nan_pct': 0.5,
    'input_dir': '../input/{}/'.format(name), 
    'output_dir': './output/{}/'.format(name),
}

args = argparse.Namespace(**parser)

In [5]:
file_name = os.path.join(args.input_dir, 'parameters_{}.pkl'.format(args.nan_pct))
with open(file_name, 'rb') as f:
    most_frequent_activity = pickle.load(f)
    first_timestamp = pickle.load(f)
    avai_instance = pickle.load(f)
    nan_instance = pickle.load(f)
    train_size = pickle.load(f)
    val_size = pickle.load(f)
    test_size = pickle.load(f)
    train_row_num = pickle.load(f)
    val_row_num = pickle.load(f)
    test_row_num = pickle.load(f)

# Load data

In [6]:
normalized_complete_df_name = os.path.join(args.input_dir, 'normalized_complete_df_{}.csv'.format(args.nan_pct))
normalized_complete_df = pd.read_csv(normalized_complete_df_name)

normalized_missing_df_name = os.path.join(args.input_dir, 'normalized_missing_df_{}.csv'.format(args.nan_pct))
normalized_missing_df = pd.read_csv(normalized_missing_df_name)

In [7]:
missing_true_test = normalized_missing_df[-test_row_num:].reset_index(drop=True)
complete_true_test = normalized_complete_df[-test_row_num:].reset_index(drop=True)

In [8]:
missing_true_test.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,CumTimeInterval,NormalizedTime
0,1601,,1970-01-01 09:00:00,0.0,0.0
1,1601,Activity B,,,
2,1601,Activity C,,,
3,1601,,1970-01-01 12:00:00,10800.0,0.42857
4,1601,,1970-01-01 13:00:00,14400.0,0.571426


In [9]:
missing_test = missing_true_test.copy()
missing_test['CompleteTimestamp'] = pd.to_datetime(missing_test['CompleteTimestamp'])
missing_test = calculateTimeInterval(missing_test)
missing_test['TimeInterval'] = missing_test['TimeInterval'].apply(convert2seconds)

In [10]:
#Replace duration of starting activity with nan
for row in range(missing_test.shape[0]):
    if missing_test['CaseID'].iloc[row] != missing_test['CaseID'].iloc[row-1]:
        missing_test.loc[row, 'TimeInterval'] = np.nan

In [11]:
pd.isnull(missing_test).sum()

CaseID                  0
Activity             2780
CompleteTimestamp    2735
CumTimeInterval      2735
NormalizedTime       2735
TimeInterval         4248
dtype: int64

In [12]:
normalized_complete_df.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,CumTimeInterval,NormalizedTime
0,1,Activity A,1970-01-01 09:00:00,0.0,0.0
1,1,Activity B,1970-01-01 10:00:00,3600.0,0.125
2,1,Activity C,1970-01-01 11:00:00,7200.0,0.249999
3,1,Activity D,1970-01-01 12:00:00,10800.0,0.374999
4,1,Activity E,1970-01-01 13:00:00,14400.0,0.499998


# Impute Time

## Dummy 1

In [13]:
missing_time = missing_test['TimeInterval']
true_time = complete_true_test['CumTimeInterval']

In [14]:
avai_time_index = []
for row in range(missing_test.shape[0]):
    if not pd.isnull(missing_test.CompleteTimestamp[row]):
        avai_time_index.append(row)

print('Check number of nan Time...')
print(missing_test.shape[0] - len(avai_time_index) == pd.isnull(missing_test).sum()['CompleteTimestamp'])

Check number of nan Time...
True


In [15]:
temp = missing_test.copy()

In [16]:
temp.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,CumTimeInterval,NormalizedTime,TimeInterval
0,1601,,1970-01-01 09:00:00,0.0,0.0,
1,1601,Activity B,NaT,,,
2,1601,Activity C,NaT,,,
3,1601,,1970-01-01 12:00:00,10800.0,0.42857,
4,1601,,1970-01-01 13:00:00,14400.0,0.571426,3600.0


In [17]:
temp['TimeInterval'].median(), temp['TimeInterval'].mean()

(3600.0, 2226.0355029585799)

In [18]:
median_imputation = temp['TimeInterval'].fillna(value=temp['TimeInterval'].median())
mean_imputation = temp['TimeInterval'].fillna(value=temp['TimeInterval'].mean())

In [19]:
temp['Median'] = median_imputation
temp['Mean'] = mean_imputation

In [20]:
current_CumMedian = 0
current_CumMean = 0

CumTimeInterval_Median = []
CumTimeInterval_Mean = []

for row in range(temp.shape[0]):
    if not pd.isnull(temp.CumTimeInterval[row]):
        current_CumMedian = temp.CumTimeInterval[row]
        current_CumMean =temp.CumTimeInterval[row]
    else:
        current_CumMedian += temp.Median[row]
        current_CumMean += temp.Mean[row]
        
    CumTimeInterval_Median.append(current_CumMedian)
    CumTimeInterval_Mean.append(current_CumMean)

In [21]:
temp['CumTimeInterval_Median'] = CumTimeInterval_Median
temp['CumTimeInterval_Mean'] = CumTimeInterval_Mean

In [22]:
median_imputation = temp['CumTimeInterval_Median'].drop(temp['CumTimeInterval_Median'].index[avai_time_index])
mean_imputation = temp['CumTimeInterval_Mean'].drop(temp['CumTimeInterval_Mean'].index[avai_time_index])

true_time = true_time.drop(true_time.index[avai_time_index])

In [23]:
print('Impute with Median')
print('MAE: {:.4f} seconds | {:.4f} days'.format(mean_absolute_error(true_time, median_imputation), 
                                                 mean_absolute_error(true_time, median_imputation)/86400))
print('RMSE: {:.4f} seconds | {:.4f} days'.format(sqrt(mean_squared_error(true_time, median_imputation)),
                                                  sqrt(mean_squared_error(true_time, median_imputation))/86400))
print('\n')
print('Impute with Mean')
print('MAE: {:.4f} seconds | {:.4f} days'.format(mean_absolute_error(true_time, mean_imputation),
                                                 mean_absolute_error(true_time, mean_imputation)/86400))
print('RMSE: {:.4f} seconds | {:.4f} days'.format(sqrt(mean_squared_error(true_time, mean_imputation)),
                                                  sqrt(mean_squared_error(true_time, mean_imputation))/86400))

Impute with Median
MAE: 6741.9378 seconds | 0.0780 days
RMSE: 12703.7929 seconds | 0.1470 days


Impute with Mean
MAE: 5854.1095 seconds | 0.0678 days
RMSE: 10929.9638 seconds | 0.1265 days


## Dummy 2

In [24]:
missing_groupByActivity = missing_test.groupby(['Activity'])

missing_median_duration_activity = {}
missing_mean_duration_activity = {}
missing_min_duration_activity = {}
missing_max_duration_activity = {}

for activity, group in missing_groupByActivity:
    missing_median_duration_activity[activity] = group['TimeInterval'].median()
    missing_mean_duration_activity[activity] = group['TimeInterval'].mean()
    missing_min_duration_activity[activity] = group['TimeInterval'].min()
    missing_max_duration_activity[activity] = group['TimeInterval'].max()

In [25]:
temp = missing_test.copy()

In [26]:
temp.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,CumTimeInterval,NormalizedTime,TimeInterval
0,1601,,1970-01-01 09:00:00,0.0,0.0,
1,1601,Activity B,NaT,,,
2,1601,Activity C,NaT,,,
3,1601,,1970-01-01 12:00:00,10800.0,0.42857,
4,1601,,1970-01-01 13:00:00,14400.0,0.571426,3600.0


In [27]:
#Replace NaN duration with median and mean
temp['Median'] = temp['TimeInterval'].copy()
temp['Mean'] = temp['TimeInterval'].copy()

for row in range(temp.shape[0]):
    if not pd.isnull(temp.Activity.loc[row]) and pd.isnull(missing_median_duration_activity[temp.Activity.loc[row]]): 
        temp.loc[row, 'Median'] = missing_median_duration_activity[most_frequent_activity]
        temp.loc[row, 'Mean'] = missing_mean_duration_activity[most_frequent_activity]
    elif pd.isnull(temp.CumTimeInterval.loc[row]) and pd.isnull(temp.Activity.loc[row]):
        temp.loc[row, 'Median'] = missing_median_duration_activity[most_frequent_activity]
        temp.loc[row, 'Mean'] = missing_mean_duration_activity[most_frequent_activity]
    elif pd.isnull(temp.CumTimeInterval.loc[row]) and not pd.isnull(temp.Activity.loc[row]):
        temp.loc[row, 'Median'] = missing_median_duration_activity[temp.Activity.loc[row]]
        temp.loc[row, 'Mean'] = missing_mean_duration_activity[temp.Activity.loc[row]]

In [28]:
temp.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,CumTimeInterval,NormalizedTime,TimeInterval,Median,Mean
0,1601,,1970-01-01 09:00:00,0.0,0.0,,,
1,1601,Activity B,NaT,,,,3600.0,3600.0
2,1601,Activity C,NaT,,,,3600.0,3600.0
3,1601,,1970-01-01 12:00:00,10800.0,0.42857,,,
4,1601,,1970-01-01 13:00:00,14400.0,0.571426,3600.0,3600.0,3600.0


In [29]:
current_CumMedian = 0
current_CumMean = 0

CumTimeInterval_Median = []
CumTimeInterval_Mean = []

for row in range(temp.shape[0]):
    if not pd.isnull(temp.CumTimeInterval[row]):
        current_CumMedian = temp.CumTimeInterval[row]
        current_CumMean =temp.CumTimeInterval[row]
    else:
        current_CumMedian += temp.Median[row]
        current_CumMean += temp.Mean[row]
        
    CumTimeInterval_Median.append(current_CumMedian)
    CumTimeInterval_Mean.append(current_CumMean)

In [30]:
temp['CumTimeInterval_Median'] = CumTimeInterval_Median
temp['CumTimeInterval_Mean'] = CumTimeInterval_Mean

In [31]:
median_imputation = temp['CumTimeInterval_Median'].drop(temp['CumTimeInterval_Median'].index[avai_time_index])
mean_imputation = temp['CumTimeInterval_Mean'].drop(temp['CumTimeInterval_Mean'].index[avai_time_index])

In [32]:
temp.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,CumTimeInterval,NormalizedTime,TimeInterval,Median,Mean,CumTimeInterval_Median,CumTimeInterval_Mean
0,1601,,1970-01-01 09:00:00,0.0,0.0,,,,0.0,0.0
1,1601,Activity B,NaT,,,,3600.0,3600.0,3600.0,3600.0
2,1601,Activity C,NaT,,,,3600.0,3600.0,7200.0,7200.0
3,1601,,1970-01-01 12:00:00,10800.0,0.42857,,,,10800.0,10800.0
4,1601,,1970-01-01 13:00:00,14400.0,0.571426,3600.0,3600.0,3600.0,14400.0,14400.0


In [33]:
print('Impute with Median')
print('MAE: {:.4f} seconds | {:.4f} days'.format(mean_absolute_error(true_time, median_imputation), 
                                                 mean_absolute_error(true_time, median_imputation)/86400))
print('RMSE: {:.4f} seconds | {:.4f} days'.format(sqrt(mean_squared_error(true_time, median_imputation)),
                                                  sqrt(mean_squared_error(true_time, median_imputation))/86400))

print('\n')

print('Impute with Mean')
print('MAE: {:.4f} seconds | {:.4f} days'.format(mean_absolute_error(true_time, mean_imputation), 
                                                 mean_absolute_error(true_time, mean_imputation)/86400))
print('RMSE: {:.4f} seconds | {:.4f} days'.format(sqrt(mean_squared_error(true_time, mean_imputation)),
                                                  sqrt(mean_squared_error(true_time, mean_imputation))/86400))

Impute with Median
MAE: 5517.8062 seconds | 0.0639 days
RMSE: 12048.6910 seconds | 0.1395 days


Impute with Mean
MAE: 5755.4955 seconds | 0.0666 days
RMSE: 12133.3601 seconds | 0.1404 days


# Impute Activity

In [34]:
missing_activity = missing_test['Activity'].copy()
true_activity = complete_true_test['Activity'].copy()

In [35]:
avai_activity_index = []
for row in range(missing_test.shape[0]):
    if type(missing_test.Activity[row]) == str:
        avai_activity_index.append(row)

print('Check number of nan Activity...')
print(missing_test.shape[0] - len(avai_activity_index) == pd.isnull(missing_test).sum()['Activity'])

Check number of nan Activity...
True


In [36]:
def evalDummyActivity(missing_df_test, true_activity, missing_activity, most_frequent_activity, avai_activity_index):
    
    # Impute nan
    imputed_activity = missing_activity.fillna(value=most_frequent_activity)
    
    # Drop availabel row and keep nan row
    imputed_activity = imputed_activity.drop(imputed_activity.index[avai_activity_index])
    true_activity = true_activity.drop(true_activity.index[avai_activity_index])
    
    # Check number of nan values
    print('Impute missing activities with Most frequent activity...')
    print('Accuracy: {:.2f}%'.format(accuracy_score(true_activity, imputed_activity)*100))

In [37]:
evalDummyActivity(missing_true_test, true_activity, missing_activity, most_frequent_activity, avai_activity_index)

Impute missing activities with Most frequent activity...
Accuracy: 7.12%
