**Outline:**
- Build model and loss function
- Train model
- Observe validation
- Test

**To do:**
- Hyperparameter tuning
    + lr
    + layer1, layer2
    + betas

**Modification**
- Weight initialization with xavier uniform
- Adam optimization
- LR decay

In [1]:
import importlib
import argparse
import os, sys
import argparse
import pandas as pd
import numpy as np
import pickle
import time

In [2]:
import torch
import torch.utils.data
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler
from torch.autograd import Variable
from torchvision import transforms

In [3]:
sys.path.insert(0, './../utils/')
from utils import *
from models import *

In [4]:
#Define parser
#name = 'bpi_2012'
name = 'bpi_2013'
#name = 'Road_Traffic_Fine_Management_Process'  

parser = {
    'train': True,
    'test': True,
    'model_class': 'AE',
    'model_name': 'model_AE_epoch11_score1.1437.pth',
    'data_dir': '../data/',
    'data_file': name + '.csv',
    'nan_pct': 0.3,
    'input_dir': '../input/{}/'.format(name),
    'batch_size' : 16,
    'epochs' : 20,
    'no_cuda' : False,
    'seed' : 7,
    'layer1': 1000,
    'layer2': 100,
    'lr': 0.002,
    'betas': (0.9, 0.999),   
    'lr_decay': 0.90,
}

args = argparse.Namespace(**parser)
args.output_dir = './output/{0}_{1}_{2}/'.format(name, args.nan_pct, args.model_class)

In [5]:
# Make sure the directory that receives checkpoints and the submission file exists.
if not os.path.isdir(args.output_dir):
    os.makedirs(args.output_dir)

In [6]:
# Use CUDA only when it has not been disabled in the config and a device is available.
args.cuda = not args.no_cuda and torch.cuda.is_available()

In [7]:
# Seed the CPU random number generator and, when CUDA is in use, the CUDA one as well.
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

In [8]:
# DataLoader keyword arguments: worker processes and pinned memory are only requested when CUDA is in use.
kwargs = {'num_workers': 2, 'pin_memory': True} if args.cuda else {}

In [9]:
# Load the preprocessed matrices for the configured nan_pct.
# The objects are read back one pickle.load() at a time from a single file,
# so the order of the statements below must match the order they were dumped in.
# NOTE(review): pickle.load can execute arbitrary code - only open files that
# were produced by a trusted preprocessing run.
preprocessed_data_name = os.path.join(args.input_dir, 'preprocessed_data_{}.pkl'.format(args.nan_pct))
with open(preprocessed_data_name, 'rb') as f:
    # Passed to getDfWithTime() in the test cell.
    min_max_storage = pickle.load(f)
    # Training split.
    complete_matrix_w_normalized_time_train = pickle.load(f)
    missing_matrix_w_normalized_time_train = pickle.load(f)
    avai_matrix_train = pickle.load(f)
    nan_matrix_train = pickle.load(f)
    # Validation split (also carries a pad matrix).
    complete_matrix_w_normalized_time_val = pickle.load(f)
    missing_matrix_w_normalized_time_val = pickle.load(f)
    avai_matrix_val = pickle.load(f)
    nan_matrix_val = pickle.load(f)
    pad_matrix_val = pickle.load(f)
    # Test split (also carries a pad matrix).
    complete_matrix_w_normalized_time_test = pickle.load(f)
    missing_matrix_w_normalized_time_test = pickle.load(f)
    avai_matrix_test = pickle.load(f)
    nan_matrix_test = pickle.load(f)
    pad_matrix_test = pickle.load(f)
    # Column name lists; cols_w_normalized_time is passed to convert2df() in the test cell.
    cols_w_time = pickle.load(f)
    cols_w_normalized_time = pickle.load(f)

In [10]:
# Load the scalar parameters saved next to the preprocessed matrices.
# As above, the values are stored as consecutive pickles, so the read order is fixed.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
file_name = os.path.join(args.input_dir, 'parameters_{}.pkl'.format(args.nan_pct))
with open(file_name, 'rb') as f:
    most_frequent_activity = pickle.load(f)
    # Passed to getSubmission() in the test cell.
    first_timestamp = pickle.load(f)
    avai_instance = pickle.load(f)
    nan_instance = pickle.load(f)
    train_size = pickle.load(f)
    val_size = pickle.load(f)
    test_size = pickle.load(f)
    # Row counts used below to slice the normalized dataframes into splits.
    train_row_num = pickle.load(f)
    val_row_num = pickle.load(f)
    test_row_num = pickle.load(f)

# Load data

## Train

In [11]:
# Options shared by the three training loaders.
# - shuffle=False keeps the loaders aligned: train() zips them and relies on
#   batch i of each loader covering the same rows.
# - The CUDA-specific `kwargs` prepared earlier (pinned memory) are merged in
#   here; two worker processes are used in every case, as before.
_train_loader_options = dict({'num_workers': 2}, **kwargs)
_train_loader_options.update(batch_size=args.batch_size, shuffle=False)

complete_matrix_w_normalized_time_trainLoader = torch.utils.data.DataLoader(
    complete_matrix_w_normalized_time_train, **_train_loader_options)
missing_matrix_w_normalized_time_trainLoader = torch.utils.data.DataLoader(
    missing_matrix_w_normalized_time_train, **_train_loader_options)
avai_matrix_trainLoader = torch.utils.data.DataLoader(
    avai_matrix_train, **_train_loader_options)

## Validate and test

In [12]:
# Read the normalized event log without missing values (ground truth) ...
normalized_complete_df_name = os.path.join(args.input_dir, 'normalized_complete_df_{}.csv'.format(args.nan_pct))
normalized_complete_df = pd.read_csv(normalized_complete_df_name)

# ... and the version of the same log with values removed.
normalized_missing_df_name = os.path.join(args.input_dir, 'normalized_missing_df_{}.csv'.format(args.nan_pct))
normalized_missing_df = pd.read_csv(normalized_missing_df_name)

In [13]:
# Validation rows sit between the training rows and the trailing test rows.
# The end of the slice is an explicit position rather than ``-test_row_num``:
# a negative stop turns into ``-0`` when there are no test rows, which would
# silently produce an empty validation set.
_val_end_missing = max(len(normalized_missing_df) - test_row_num, 0)
_val_end_complete = max(len(normalized_complete_df) - test_row_num, 0)
missing_true_val = normalized_missing_df.iloc[train_row_num:_val_end_missing].reset_index(drop=True)
complete_true_val = normalized_complete_df.iloc[train_row_num:_val_end_complete].reset_index(drop=True)

In [14]:
# The test rows are the last ``test_row_num`` rows of each dataframe.
# The start is an explicit position rather than ``-test_row_num``: with no
# test rows, ``[-0:]`` would return the whole dataframe instead of nothing.
_test_start_missing = max(len(normalized_missing_df) - test_row_num, 0)
_test_start_complete = max(len(normalized_complete_df) - test_row_num, 0)
missing_true_test = normalized_missing_df.iloc[_test_start_missing:].reset_index(drop=True)
complete_true_test = normalized_complete_df.iloc[_test_start_complete:].reset_index(drop=True)

In [15]:
missing_true_val.shape, missing_true_test.shape

((1096, 5), (1071, 5))

In [16]:
nan_time_index_val, nan_activity_index_val = getnanindex(missing_true_val)

In [17]:
nan_time_index_test, nan_activity_index_test = getnanindex(missing_true_test)

In [18]:
pd.isnull(normalized_missing_df).sum()

CaseID                  0
Activity             1987
CompleteTimestamp    2009
CumTimeInterval      2009
NormalizedTime       2009
dtype: int64

In [19]:
pd.isnull(missing_true_val).sum()

CaseID                 0
Activity             345
CompleteTimestamp    330
CumTimeInterval      330
NormalizedTime       330
dtype: int64

In [20]:
pd.isnull(missing_true_test).sum()

CaseID                 0
Activity             315
CompleteTimestamp    304
CumTimeInterval      304
NormalizedTime       304
dtype: int64

# Build model

## Define model

In [21]:
# Build the network selected in the config. The constructor receives the shape
# of the complete training matrix and the two layer sizes.
if args.model_class == 'AE':
    model = AE_1(complete_matrix_w_normalized_time_train.shape, args.layer1, args.layer2)
else:
    # Fail here instead of continuing with an undefined `model` (or, in a
    # notebook session, one left over from an earlier run).
    raise ValueError('Unsupported model_class: {!r}'.format(args.model_class))

if args.cuda:
    model.cuda()

## Define loss

In [22]:
# Define loss

def loss_function(recon_x, x, avai_mask):
    """Return the summed squared error between masked reconstruction and target.

    Both tensors are multiplied element-wise by ``avai_mask`` before they are
    compared, so entries where the mask is zero contribute nothing to the loss
    (presumably the mask is 1 for observed entries - confirm against the
    preprocessing code).

    The sum is computed directly instead of through
    ``F.mse_loss(..., size_average=False)``: that argument is deprecated, and
    its replacement does not exist in older PyTorch releases, so the explicit
    form gives the same value on every version.

    Args:
        recon_x: model output.
        x: target values, same shape as ``recon_x``.
        avai_mask: element-wise weights, same shape as ``recon_x``.

    Returns:
        The total (not averaged) squared error.
    """
    masked_error = recon_x * avai_mask - x * avai_mask
    return masked_error.pow(2).sum()

In [23]:
optimizer = optim.Adam(model.parameters(), lr=args.lr, betas=args.betas)

In [24]:
# Learning-rate schedule.
# Reference: http://pytorch.org/docs/master/optim.html?highlight=adam#torch.optim.Adam

# Method 1 (disabled): decay the learning rate by args.lr_decay every epoch.
#lambda1 = lambda epoch: args.lr_decay ** epoch
#scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=[lambda1])

# Method 2 (active): lower the learning rate when the monitored value stops
# decreasing ('min' mode), after 10 epochs without improvement. The training
# loop feeds it the validation score. args.lr_decay is not used by this method.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=10)

## Utils

In [25]:
def save_model(model, epoch, score):
    """Save the model's state dict under ``args.output_dir``.

    The file name encodes the model class, the epoch and the score
    (formatted with four decimals).
    """
    checkpoint_name = 'model_{}_epoch{}_score{:.4f}.pth'.format(args.model_class, epoch, score)
    checkpoint_path = os.path.join(args.output_dir, checkpoint_name)
    torch.save(model.state_dict(), checkpoint_path)

In [26]:
def load_model(model, model_name):
    """Load a saved state dict from ``args.output_dir`` into ``model``.

    Args:
        model: network whose parameters are overwritten in place.
        model_name: file name of the checkpoint inside ``args.output_dir``.

    Raises:
        AssertionError: if the checkpoint file does not exist.
    """
    model_file = os.path.join(args.output_dir, model_name)
    # Raised explicitly so the check is not stripped when Python runs with -O.
    if not os.path.isfile(model_file):
        raise AssertionError('Error: no model found!')
    # Deserialize onto the CPU first, so a checkpoint written on a GPU machine
    # can still be opened on a CPU-only one; load_state_dict then copies the
    # values into the model's own parameters.
    model_state = torch.load(model_file, map_location=lambda storage, loc: storage)
    model.load_state_dict(model_state)

In [27]:
def val(model, missing_matrix_w_normalized_time_val,
        complete_matrix_w_normalized_time_val,
        avai_matrix_val, nan_matrix_val):
    """Score the model on the validation data; lower is better.

    The model reconstructs the matrix with missing values. Its output is used
    only where ``nan_matrix_val`` is set, and the ground truth is kept where
    ``avai_matrix_val`` is set. Column 0 of the flattened result is scored as
    time with mean absolute error, and the remaining columns are scored as
    activity with accuracy over their arg-max labels.

    Args:
        model: network to evaluate; it is switched to eval mode.
        missing_matrix_w_normalized_time_val: model input.
        complete_matrix_w_normalized_time_val: ground truth, same shape.
        avai_matrix_val: mask selecting the ground-truth entries to keep.
        nan_matrix_val: mask selecting the entries taken from the model.

    Returns:
        ``mae + 1 / acc``, or ``inf`` when no activity is predicted correctly.
    """
    model.eval()

    m_val = Variable(torch.Tensor(missing_matrix_w_normalized_time_val).float())
    c_val = Variable(torch.Tensor(complete_matrix_w_normalized_time_val).float())
    avai_matrix_val = Variable(torch.Tensor(avai_matrix_val).float())
    nan_matrix_val = Variable(torch.Tensor(nan_matrix_val).float())

    if args.cuda:
        m_val = m_val.cuda()
        c_val = c_val.cuda()
        avai_matrix_val = avai_matrix_val.cuda()
        nan_matrix_val = nan_matrix_val.cuda()

    recon = getProbability(model(m_val))

    # Collapse the first two dimensions so every row is one event.
    flat_shape = (recon.size(0) * recon.size(1), recon.size(2))
    recon_2d = recon.view(*flat_shape)
    gt_2d = c_val.view(*flat_shape)
    avai_matrix_2d = avai_matrix_val.view(*flat_shape)
    nan_matrix_2d = nan_matrix_val.view(*flat_shape)

    # Model output where values were missing, ground truth everywhere else.
    submission = torch.add(recon_2d * nan_matrix_2d, gt_2d * avai_matrix_2d)

    # Time
    predicted_time = submission.data[:, 0].cpu().numpy()
    gt_time = gt_2d.data[:, 0].cpu().numpy()
    mae = mean_absolute_error(gt_time, predicted_time)

    # Activity
    predicted_act = submission.data[:, 1:].cpu().numpy()
    gt_act = gt_2d.data[:, 1:].cpu().numpy()

    # Rows whose ground-truth activity vector is all zeros are skipped
    # (presumably padding - confirm). The same row selection is applied to the
    # predictions so the two label arrays always stay aligned row for row;
    # filtering each array by its own non-zero rows could drop different rows.
    labelled_rows = np.any(gt_act != 0, axis=1)
    predicted_labels = np.argmax(predicted_act[labelled_rows], axis=1)
    gt_labels = np.argmax(gt_act[labelled_rows], axis=1)
    acc = accuracy_score(gt_labels, predicted_labels)

    # Zero accuracy would make 1 / acc divide by zero; report the worst score.
    if acc == 0:
        return float('inf')
    return mae + 1 / acc

In [28]:
missing_true_val.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,CumTimeInterval,NormalizedTime
0,893,Queued-Awaiting Assignment,2012-01-14 04:03:40,189490438.0,0.0
1,893,,,,
2,893,,2012-01-31 22:29:08,191025566.0,1.0
3,893,Accepted-In Progress,,,
4,893,,,,


In [29]:
complete_true_val.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,CumTimeInterval,NormalizedTime
0,893,Queued-Awaiting Assignment,2012-01-14 04:03:40,189490438.0,0.0
1,893,Accepted-In Progress,2012-01-31 22:16:55,191024833.0,0.507308
2,893,Queued-Awaiting Assignment,2012-01-31 22:29:08,191025566.0,0.507551
3,893,Accepted-In Progress,2012-01-31 23:02:59,191027597.0,0.508222
4,893,Completed-Closed,2012-02-18 04:13:20,192515018.0,1.0


# Train model

In [30]:
def train(epoch, model, optimizer):
    """Run one pass over the training data and return the mean loss per sample.

    The three module-level training loaders (missing matrix, complete matrix,
    availability mask) are iterated in lockstep; they are built without
    shuffling, so batch i of each loader covers the same rows.

    Args:
        epoch: current epoch number; accepted for the caller's convenience and
            not used inside the function.
        model: network to train; it is switched to train mode.
        optimizer: optimizer that updates the model's parameters.

    Returns:
        The summed batch losses divided by the number of training samples.
    """
    model.train()
    train_loss = 0
    batches = zip(missing_matrix_w_normalized_time_trainLoader,
                  complete_matrix_w_normalized_time_trainLoader,
                  avai_matrix_trainLoader)
    for m_data, c_data, avai_mask in batches:
        c_data = Variable(c_data.float())
        m_data = Variable(m_data.float())
        avai_mask = Variable(avai_mask.float())

        if args.cuda:
            c_data = c_data.cuda()
            m_data = m_data.cuda()
            avai_mask = avai_mask.cuda()

        optimizer.zero_grad()

        recon_data = model(m_data)
        loss = loss_function(recon_data, c_data, avai_mask)
        loss.backward()

        # Newer PyTorch returns a 0-dim loss, for which `loss.data[0]` raises;
        # `.item()` is the supported accessor there. The old indexing is kept
        # as a fallback for releases that predate `.item()`.
        train_loss += loss.item() if hasattr(loss, 'item') else loss.data[0]
        optimizer.step()

    return train_loss / len(complete_matrix_w_normalized_time_trainLoader.dataset)

In [31]:
# Train for args.epochs epochs, or load a saved checkpoint when training is disabled.
if args.train:
    for epoch in range(1, args.epochs + 1):
        init = time.time()
        
        # The method 1 scheduler (disabled) would be stepped here, once per epoch.
        #scheduler.step()
        train_loss = train(epoch, model, optimizer)
        end_train = time.time()
        
        # Score the model on the validation data and save a checkpoint for every epoch.
        val_score = val(model, missing_matrix_w_normalized_time_val, complete_matrix_w_normalized_time_val, 
                        avai_matrix_val, nan_matrix_val)
        save_model(model, epoch, val_score)
        
        # The string below is a disabled alternative that saves only when the
        # validation score improves on the best one seen so far.
        '''
        if epoch == 1:
            current_best = val_score
            save_model(model, epoch, val_score)
        
        else:
            if val_score < current_best:
                current_best = val_score
                save_model(model, epoch, val_score)
        '''
        
        # The method 2 scheduler (active) is driven by the validation score.
        scheduler.step(val_score)
        
        end = time.time()
        # "Train time" covers train() only; "End time" covers the whole epoch
        # including validation and checkpointing. Both are in milliseconds.
        print('====> Epoch {} | Train time: {:.4f} ms| End time: {:.4f} ms | Train loss: {:.4f} | Val score: {:.4f}'.
              format(epoch, (end_train-init)*1000, (end-init)*1000, train_loss, val_score))
else:
    # Skip training and restore the checkpoint named in the config.
    load_model(model, args.model_name)

====> Epoch 1 | Train time: 1044.3749 ms| End time: 1304.9860 ms | Train loss: 1.7358 | Val score: 1.1113
====> Epoch 2 | Train time: 1164.9001 ms| End time: 1393.0230 ms | Train loss: 0.7909 | Val score: 1.1199
====> Epoch 3 | Train time: 1066.3290 ms| End time: 1222.6150 ms | Train loss: 0.4779 | Val score: 1.1226
====> Epoch 4 | Train time: 876.8201 ms| End time: 1002.1620 ms | Train loss: 0.3040 | Val score: 1.1012
====> Epoch 5 | Train time: 795.4371 ms| End time: 942.9579 ms | Train loss: 0.2186 | Val score: 1.1033
====> Epoch 6 | Train time: 785.6879 ms| End time: 904.5229 ms | Train loss: 0.1705 | Val score: 1.1200
====> Epoch 7 | Train time: 802.4929 ms| End time: 959.2369 ms | Train loss: 0.1536 | Val score: 1.1066
====> Epoch 8 | Train time: 813.0341 ms| End time: 974.9999 ms | Train loss: 0.1297 | Val score: 1.1170
====> Epoch 9 | Train time: 918.8330 ms| End time: 1092.1729 ms | Train loss: 0.1165 | Val score: 1.1375
====> Epoch 10 | Train time: 928.7171 ms| End time: 1097

# Predict and evaluate

In [32]:
# Predict the test data, convert the output back to a dataframe with real
# timestamps, evaluate it, and write the result to disk.
if args.test:
    m_test = missing_matrix_w_normalized_time_test
    m_test = Variable(torch.Tensor(m_test).float())

    if args.cuda:
        m_test = m_test.cuda()

    print('Predicting...')
    # Make sure inference runs in evaluation mode. After training the model is
    # already in that mode (val() sets it), but a model restored through
    # load_model() is not switched, so it is set explicitly here.
    model.eval()
    recon_test = model(m_test)

    print('\n')
    print('Converting to dataframe...')
    recon_df_w_normalized_time = convert2df(recon_test, pad_matrix_test, cols_w_normalized_time, test_row_num)

    print('Transforming Normalized Time to Time...')
    recon_df_w_time = getDfWithTime(recon_df_w_normalized_time, missing_true_test, min_max_storage)

    print('Getting submission...')
    submission_df = getSubmission(recon_df_w_time, missing_true_test, complete_true_test, first_timestamp)
    submission = fixTime(submission_df)

    print('Testing...')
    mae_time, rmse_time, acc = evaluation(submission, nan_time_index_test, nan_activity_index_test,show=True)
    print('\n')

    print('Saving submission...')
    # NOTE(review): the metrics above are computed on `submission` (the
    # fixTime() result) while the file is written from `submission_df` -
    # confirm that this is the dataframe that should be saved.
    # os.path.join keeps the path correct even if output_dir loses its trailing slash.
    submission_df.to_csv(os.path.join(args.output_dir, 'submission.csv'), index=False)
    print('Done!')

Predicting...


Converting to dataframe...
Transforming Normalized Time to Time...
Getting submission...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Testing...
Number of missing Time: 304
Mean Absolute Error: 6.7502 day(s)
Root Mean Squared Error: 14.1284 day(s)
Number of missing Activity: 315
Accuracy: 71.11%


Saving submission...
Done!


In [33]:
submission_df.head(10)

Unnamed: 0,CaseID,TrueActivity,PredictedActivity,TrueTime,PredictedTime,TrueCompleteTimestamp,PredictedCompleteTimestamp
0,1190,Accepted-In Progress,Accepted-In Progress,194135680.0,194135700.0,2012-03-07 22:24:22,2012-03-07 22:24:22
1,1190,Completed-Closed,Completed-Closed,197156789.0,197156800.0,2012-04-11 21:36:11,2012-04-11 21:36:11
2,1191,Accepted-In Progress,Accepted-In Progress,194135786.0,194135800.0,2012-03-07 22:26:08,2012-03-07 22:26:08
3,1191,Completed-Closed,Completed-Closed,198348747.0,198348700.0,2012-04-25 16:42:09,2012-04-25 16:42:09
4,1192,Accepted-In Progress,Accepted-In Progress,194141794.0,194158400.0,2012-03-08 00:06:16,2012-03-08 04:43:19.014151
5,1192,Completed-Closed,Completed-Closed,194821332.0,198667100.0,2012-03-15 20:51:54,2012-04-29 09:08:39.291612
6,1193,Accepted-In Progress,Queued-Awaiting Assignment,194163235.0,194163200.0,2012-03-08 06:03:37,2012-03-08 06:03:37
7,1193,Accepted-In Progress,Accepted-In Progress,194319293.0,194319300.0,2012-03-10 01:24:35,2012-03-10 01:24:35
8,1193,Accepted-Wait,Accepted-Wait,195953439.0,195953400.0,2012-03-28 23:20:21,2012-03-28 23:20:21
9,1193,Accepted-In Progress,Accepted-In Progress,199039446.0,199039400.0,2012-05-03 16:33:48,2012-05-03 16:33:48


In [34]:
submission.head(10)

Unnamed: 0,CaseID,TrueActivity,PredictedActivity,TrueTime,PredictedTime,TrueCompleteTimestamp,PredictedCompleteTimestamp
0,1190,Accepted-In Progress,Accepted-In Progress,194135680.0,194135700.0,2012-03-07 22:24:22,2012-03-07 22:24:22
1,1190,Completed-Closed,Completed-Closed,197156789.0,197156800.0,2012-04-11 21:36:11,2012-04-11 21:36:11
2,1191,Accepted-In Progress,Accepted-In Progress,194135786.0,194135800.0,2012-03-07 22:26:08,2012-03-07 22:26:08
3,1191,Completed-Closed,Completed-Closed,198348747.0,198348700.0,2012-04-25 16:42:09,2012-04-25 16:42:09
4,1192,Accepted-In Progress,Accepted-In Progress,194141794.0,194158400.0,2012-03-08 00:06:16,2012-03-08 04:43:19.014151
5,1192,Completed-Closed,Completed-Closed,194821332.0,198667100.0,2012-03-15 20:51:54,2012-04-29 09:08:39.291612
6,1193,Accepted-In Progress,Queued-Awaiting Assignment,194163235.0,194163200.0,2012-03-08 06:03:37,2012-03-08 06:03:37
7,1193,Accepted-In Progress,Accepted-In Progress,194319293.0,194319300.0,2012-03-10 01:24:35,2012-03-10 01:24:35
8,1193,Accepted-Wait,Accepted-Wait,195953439.0,195953400.0,2012-03-28 23:20:21,2012-03-28 23:20:21
9,1193,Accepted-In Progress,Accepted-In Progress,199039446.0,199039400.0,2012-05-03 16:33:48,2012-05-03 16:33:48
