**Outline:**
- Build model and loss function
- Train model
- Observe validation
- Test

**To do:**
- Hyperparameter tuning
    + lr
    + layer1, layer2
    + betas

**Modification**
- Weight initialization with xavier uniform
- Adam optimization
- LR decay

In [1]:
import importlib
import argparse
import os, sys
import argparse
import pandas as pd
import numpy as np
import pickle

In [2]:
import torch
import torch.utils.data
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler
from torch.autograd import Variable
from torchvision import transforms

In [3]:
# Put the sibling utils directory first on the module search path; this has to
# run before the two imports below can resolve.
sys.path.insert(0, './../utils/')
# NOTE(review): star imports hide where names come from. Helpers used later in
# this notebook (getnanindex, convert2df, getDfWithTime, getSubmission,
# evaluation, AE) are presumably defined in these two modules — confirm.
from utils import *
from models import *

In [4]:
# Experiment configuration.
# `name` selects the event log; the other options used so far are
# 'bpi_2012' and 'helpdesk'.
name = 'bpi_2013'

# Settings live in a plain dict and are then exposed through attribute access,
# the same way an argparse result would be.
parser = dict(
    # Which stages of the notebook to run.
    train=True,
    test=True,
    # Model selection; `model_name` is the checkpoint file loaded when
    # `train` is False.
    model_class='AE',
    model_name='',
    # Locations of raw data, preprocessed input and produced output.
    data_dir='../data/',
    data_file=name + '.csv',
    input_dir='../input/{}/'.format(name),
    output_dir='./output/{}/'.format(name),
    # Training loop settings.
    batch_size=16,
    epochs=10,
    no_cuda=False,
    seed=7,
    # Layer sizes handed to the model constructor.
    layer1=800,
    layer2=100,
    # Adam settings and the per-epoch learning-rate decay factor.
    lr=0.001,
    betas=(0.9, 0.999),
    lr_decay=0.95,
)

args = argparse.Namespace(**parser)

In [5]:
# Make sure the directory for checkpoints and the submission file exists.
# exist_ok=True does the existence check and the creation in one call, so there
# is no window in which the directory can appear between the two steps.
os.makedirs(args.output_dir, exist_ok=True)

In [6]:
# Run on the GPU only when it has not been disabled in the configuration and
# CUDA is actually available on this machine.
if args.no_cuda:
    args.cuda = False
else:
    args.cuda = torch.cuda.is_available()

In [7]:
# Seed PyTorch's random number generator with the configured seed.
torch.manual_seed(args.seed)
if args.cuda:
    # The CUDA generator is seeded separately when a GPU is in use.
    torch.cuda.manual_seed(args.seed)

In [8]:
# Extra DataLoader options, populated only when running on a GPU.
if args.cuda:
    kwargs = {'num_workers': 2, 'pin_memory': True}
else:
    kwargs = {}

In [9]:
# Load the preprocessed matrices for the train, validation and test splits.
# Several objects are stored back to back in one pickle file, so the loads
# below must stay in exactly this order.
# NOTE(review): unpickling can execute arbitrary code; only open files that
# were produced by your own preprocessing run.
with open(args.input_dir + 'preprocessed_data.pkl', 'rb') as f:
    # Passed to getDfWithTime() later on — presumably the min/max values needed
    # to undo the time normalisation; confirm against the preprocessing code.
    min_max_storage = pickle.load(f)
    # Training split (note: no pad matrix is stored for it).
    complete_matrix_w_normalized_time_train = pickle.load(f)
    missing_matrix_w_normalized_time_train = pickle.load(f)
    avai_matrix_train = pickle.load(f)
    nan_matrix_train = pickle.load(f)
    # Validation split.
    complete_matrix_w_normalized_time_val = pickle.load(f)
    missing_matrix_w_normalized_time_val = pickle.load(f)
    avai_matrix_val = pickle.load(f)
    nan_matrix_val = pickle.load(f)
    pad_matrix_val = pickle.load(f)
    # Test split.
    complete_matrix_w_normalized_time_test = pickle.load(f)
    missing_matrix_w_normalized_time_test = pickle.load(f)
    avai_matrix_test = pickle.load(f)
    nan_matrix_test = pickle.load(f)
    pad_matrix_test = pickle.load(f)
    # Column name lists; cols_w_normalized_time is handed to convert2df() below.
    cols_w_time = pickle.load(f)
    cols_w_normalized_time = pickle.load(f)

In [10]:
# Load scalar parameters saved by the preprocessing step. As with the matrices,
# the objects are stored sequentially and must be read in this order.
# NOTE(review): unpickling can execute arbitrary code; only open trusted files.
with open(args.input_dir + 'parameters.pkl', 'rb') as f:
    most_frequent_activity = pickle.load(f)
    # Passed to getSubmission() when rebuilding timestamps.
    first_timestamp = pickle.load(f)
    avai_instance = pickle.load(f)
    nan_instance = pickle.load(f)
    train_size = pickle.load(f)
    val_size = pickle.load(f)
    test_size = pickle.load(f)
    # Row counts per split; used below to slice the normalized data frames and
    # passed to convert2df().
    train_row_num = pickle.load(f)
    val_row_num = pickle.load(f)
    test_row_num = pickle.load(f)

# Load data

## Train

In [11]:
# Batch the complete (target) and missing (input) training matrices.
# train() walks both loaders in lockstep with zip(), so shuffling has to stay
# off to keep each input batch aligned with its target batch.
# The worker/pin-memory options come from `kwargs`, which was prepared for this
# purpose but previously never applied: on a GPU this adds pinned memory, on a
# CPU-only machine it falls back to the DataLoader defaults.
complete_matrix_w_normalized_time_trainLoader = torch.utils.data.DataLoader(
    complete_matrix_w_normalized_time_train,
    batch_size=args.batch_size,
    shuffle=False,
    **kwargs
)
missing_matrix_w_normalized_time_trainLoader = torch.utils.data.DataLoader(
    missing_matrix_w_normalized_time_train,
    batch_size=args.batch_size,
    shuffle=False,
    **kwargs
)

## Validate and test

In [12]:
# Read the normalized complete and missing event logs (in that order) from the
# input directory.
normalized_complete_df, normalized_missing_df = (
    pd.read_csv(args.input_dir + csv_name)
    for csv_name in ('normalized_complete_df.csv', 'normalized_missing_df.csv')
)

In [13]:
# Validation rows: everything after the training rows and before the trailing
# test rows, re-indexed from zero.
missing_true_val, complete_true_val = (
    frame[train_row_num:-test_row_num].reset_index(drop=True)
    for frame in (normalized_missing_df, normalized_complete_df)
)

In [14]:
# Test rows: the last `test_row_num` rows of each frame, re-indexed from zero.
missing_true_test, complete_true_test = (
    frame[-test_row_num:].reset_index(drop=True)
    for frame in (normalized_missing_df, normalized_complete_df)
)

In [15]:
# Sanity check: (rows, columns) of the validation and test slices.
missing_true_val.shape, missing_true_test.shape

((601, 5), (1080, 5))

In [16]:
# Index information for the missing time / activity entries of the validation
# slice; both results are passed to evaluation() later.
# NOTE(review): exact return types depend on getnanindex in utils — confirm.
nan_time_index_val, nan_activity_index_val = getnanindex(missing_true_val)

In [17]:
# Same as for validation, computed on the test slice and used by the final
# evaluation() call.
nan_time_index_test, nan_activity_index_test = getnanindex(missing_true_test)

# Build model

## Define model

In [18]:
# Instantiate the network selected by args.model_class. The constructor gets
# the shape of the complete training matrix plus the two configured layer sizes.
if args.model_class == 'AE':
    model = AE(complete_matrix_w_normalized_time_train.shape, args.layer1, args.layer2)
else:
    # Fail here with a clear message rather than with a NameError on `model`
    # in a later cell.
    raise ValueError('Unsupported model_class: {!r}'.format(args.model_class))

# Move the parameters to the GPU when CUDA is in use.
if args.cuda:
    model.cuda()

## Define loss

In [19]:
# Define loss
# Binary cross-entropy summed (not averaged) over all elements of a mini-batch;
# train() divides the accumulated sum by the dataset size afterwards.
# The reduction has to be chosen when the module is constructed: current
# PyTorch reads the `reduction` attribute in forward(), so assigning
# `size_average` on an existing module is silently ignored and the loss would
# stay a mean.
try:
    recon_function = nn.BCELoss(reduction='sum')
except TypeError:
    # Older PyTorch releases that do not know the `reduction` argument.
    recon_function = nn.BCELoss(size_average=False)


def loss_function(recon_x, x):
    """Return the summed binary cross-entropy between a reconstruction and its target.

    Args:
        recon_x: model output; BCELoss expects values in [0, 1].
        x: target tensor of the same shape as ``recon_x``.

    Returns:
        A scalar loss: the sum of the element-wise BCE over the whole batch.
    """
    return recon_function(recon_x, x)

In [20]:
# Adam over all model parameters, using the configured learning rate and betas.
optimizer = optim.Adam(model.parameters(), lr=args.lr, betas=args.betas)

In [21]:
# Per-epoch learning-rate decay, see
# http://pytorch.org/docs/master/optim.html?highlight=adam#torch.optim.Adam
def lambda1(epoch):
    """Return the factor the base learning rate is multiplied by at `epoch`."""
    return args.lr_decay ** epoch


# One factor function per optimizer parameter group (there is a single group).
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=[lambda1])

## Utils

In [22]:
def save_model(model, epoch, score):
    """Store the model's state dict in ``args.output_dir``.

    The file name encodes the model class, the epoch and the score
    (four decimals), e.g. ``model_AE_epoch3_score4.9876.pth``.
    """
    file_name = 'model_{}_epoch{}_score{:.4f}.pth'.format(args.model_class, epoch, score)
    destination = os.path.join(args.output_dir, file_name)
    torch.save(model.state_dict(), destination)

In [23]:
def load_model(model, model_name):
    """Load saved weights from ``args.output_dir`` into ``model`` in place.

    Args:
        model: network whose parameters are overwritten.
        model_name: file name of the checkpoint inside ``args.output_dir``.

    Raises:
        FileNotFoundError: if the checkpoint file does not exist (this is also
            the case for the default empty ``model_name``).
    """
    model_file = os.path.join(args.output_dir, model_name)
    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    if not os.path.isfile(model_file):
        raise FileNotFoundError('Error: no model found! ({})'.format(model_file))
    # Deserialize onto the CPU so that a checkpoint written on a GPU machine
    # also loads where CUDA is unavailable; load_state_dict() then copies the
    # values into the model's own parameters, wherever those live.
    model_state = torch.load(model_file, map_location=lambda storage, loc: storage)
    model.load_state_dict(model_state)

In [24]:
def val(model, missing_matrix_w_normalized_time_val, complete_true_val, missing_true_val,
       pad_matrix_val, cols_w_normalized_time, val_row_num,
       nan_time_index_val, nan_activity_index_val):
    """Score the model on the validation split; lower is better.

    The missing-value matrix is pushed through the model, the reconstruction
    is turned back into a data frame with the project helpers, and the result
    is evaluated against the ground truth.

    Besides its parameters this function reads the notebook globals ``args``,
    ``min_max_storage`` and ``first_timestamp``.

    Args:
        model: network to evaluate; it is switched to eval mode.
        missing_matrix_w_normalized_time_val: validation input with missing values.
        complete_true_val: ground-truth validation frame.
        missing_true_val: validation frame with missing values.
        pad_matrix_val, cols_w_normalized_time, val_row_num: forwarded to
            ``convert2df``.
        nan_time_index_val, nan_activity_index_val: forwarded to ``evaluation``.

    Returns:
        ``time / 86400 + 1 / acc`` where ``time`` and ``acc`` come from
        ``evaluation`` (presumably a time error in seconds, converted to days,
        and an activity accuracy — confirm against utils). ``inf`` is returned
        when the accuracy is zero.
    """
    model.eval()
    m_val = Variable(torch.Tensor(missing_matrix_w_normalized_time_val).float())

    if args.cuda:
        m_val = m_val.cuda()

    recon_val = model(m_val)

    # Reconstruction -> data frame -> real timestamps -> submission layout.
    recon_df_w_normalized_time = convert2df(recon_val, pad_matrix_val, cols_w_normalized_time, val_row_num)
    recon_df_w_time = getDfWithTime(recon_df_w_normalized_time, missing_true_val, min_max_storage)
    submission_df = getSubmission(recon_df_w_time, missing_true_val, complete_true_val, first_timestamp)

    # Evaluate only on the entries that were missing.
    time_error, acc = evaluation(submission_df, nan_time_index_val, nan_activity_index_val)

    # With no correct activity at all, 1/acc is undefined; report the worst
    # possible score instead of aborting the training loop.
    if acc == 0:
        return float('inf')

    return time_error / 86400 + 1 / acc

In [25]:
# Preview the first rows of the validation frame that contains missing values.
missing_true_val.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,CumTimeInterval,NormalizedTime
0,1041,Accepted-In Progress,,,
1,1041,Completed-Closed,,,
2,1042,,,,
3,1042,,,,
4,1042,Accepted-In Progress,2012-02-13 03:52:59,192081797.0,0.0


In [26]:
# Preview the matching ground-truth rows for comparison.
complete_true_val.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,CumTimeInterval,NormalizedTime
0,1041,Accepted-In Progress,2012-02-10 18:12:35,191874173.0,0.0
1,1041,Completed-Closed,2012-03-12 18:15:30,194552748.0,1.0
2,1042,Accepted-In Progress,2012-02-12 04:41:24,191998302.0,0.0
3,1042,Accepted-Wait,2012-02-12 05:37:34,192001672.0,0.001709
4,1042,Accepted-In Progress,2012-02-13 03:52:59,192081797.0,0.042338


# Train model

In [27]:
def train(epoch, model, optimizer):
    """Run one training epoch and return the average loss per training sample.

    Input batches (with missing values) and target batches (complete data) are
    drawn in lockstep from the two module-level training loaders.

    Args:
        epoch: current epoch number; not used inside the function, kept so the
            call signature stays unchanged.
        model: network to train; it is switched to train mode.
        optimizer: optimizer that updates ``model``'s parameters.

    Returns:
        The accumulated batch losses divided by the number of samples in the
        training dataset.
    """
    model.train()
    train_loss = 0
    batches = zip(missing_matrix_w_normalized_time_trainLoader,
                  complete_matrix_w_normalized_time_trainLoader)
    for m_data, c_data in batches:
        # Variable wraps a tensor so operations on it are recorded for autograd.
        c_data = Variable(c_data.float())
        m_data = Variable(m_data.float())

        if args.cuda:
            c_data = c_data.cuda()
            m_data = m_data.cuda()

        optimizer.zero_grad()

        recon_data = model(m_data)

        loss = loss_function(recon_data, c_data)
        loss.backward()
        # On newer PyTorch the loss is a 0-dim tensor and `loss.data[0]` raises;
        # `.item()` is the supported way to get the Python number. Fall back to
        # the old indexing on releases that predate `.item()`.
        train_loss += loss.item() if hasattr(loss, 'item') else loss.data[0]
        optimizer.step()

    return train_loss / len(complete_matrix_w_normalized_time_trainLoader.dataset)

In [28]:
if args.train:
    # A checkpoint is written only when the validation score drops below this
    # bound. The default is the value that used to be hard-coded; add a
    # `save_threshold` entry to the configuration to adapt it per dataset
    # (none of the validation scores logged for this run come close to 5.2,
    # so no model is saved with the default).
    save_threshold = getattr(args, 'save_threshold', 5.2)

    for epoch in range(1, args.epochs + 1):
        # NOTE(review): stepping the scheduler before the optimizer follows the
        # pre-1.1 PyTorch convention; on PyTorch >= 1.1 this order skips the
        # first learning-rate value. Confirm the targeted version before moving it.
        scheduler.step()
        train_loss = train(epoch, model, optimizer)
        val_score = val(model, missing_matrix_w_normalized_time_val, complete_true_val, missing_true_val,
                        pad_matrix_val, cols_w_normalized_time, val_row_num,
                        nan_time_index_val, nan_activity_index_val)

        # Track performance of each epoch
        print('====> Epoch {} | Train loss: {:.4f} | Val score: {:.4f}'.format(
              epoch, train_loss, val_score))
        if val_score < save_threshold:
            save_model(model, epoch, val_score)
else:
    # Skip training and restore previously saved weights instead.
    load_model(model, args.model_name)

====> Epoch 1 | Train loss: 24.0077 | Val score: 21.0745
====> Epoch 2 | Train loss: 11.1873 | Val score: 20.5155
====> Epoch 3 | Train loss: 9.7809 | Val score: 19.5338
====> Epoch 4 | Train loss: 8.5812 | Val score: 19.2014
====> Epoch 5 | Train loss: 8.1589 | Val score: 19.1084
====> Epoch 6 | Train loss: 7.8844 | Val score: 19.0672
====> Epoch 7 | Train loss: 7.6200 | Val score: 19.1345
====> Epoch 8 | Train loss: 7.3230 | Val score: 19.1561
====> Epoch 9 | Train loss: 7.0284 | Val score: 19.1905
====> Epoch 10 | Train loss: 6.7701 | Val score: 19.2032


# Predict and evaluate

In [29]:
if args.test:
    # Put the model in inference mode explicitly. After training this is
    # already the case (val() runs last in every epoch), but when weights are
    # only loaded (train=False) the model would otherwise still be in its
    # default training mode.
    model.eval()

    m_test = missing_matrix_w_normalized_time_test
    m_test = Variable(torch.Tensor(m_test).float())

    if args.cuda:
        m_test = m_test.cuda()

    print('Predicting...')
    recon_test = model(m_test)

    print('\n')
    # Reconstruction -> data frame -> real timestamps -> submission layout.
    recon_df_w_normalized_time = convert2df(recon_test, pad_matrix_test, cols_w_normalized_time, test_row_num)
    recon_df_w_time = getDfWithTime(recon_df_w_normalized_time, missing_true_test, min_max_storage)
    submission_df = getSubmission(recon_df_w_time, missing_true_test, complete_true_test, first_timestamp)

    print('Testing...')
    # show=True makes evaluation() print its report (see the output below).
    time, acc = evaluation(submission_df, nan_time_index_test, nan_activity_index_test,show=True)
    print('\n')

    print('Saving submission...')
    submission_df.to_csv(args.output_dir+'submission.csv', index=False)
    print('Done!')

Predicting...


Testing...
Number of missing Time: 552
Mean Absolute Error: 11.7684 day(s)
Number of missing Activity: 558
Accuracy: 78.49%


Saving submission...
Done!


In [30]:
#bpi2013
#7.6
#75%

In [31]:
#bpi2012
#0.59
#75%

In [32]:
#helpdesk
#3.9
#85