<a href="https://colab.research.google.com/github/l-dietrich/seminar-process-predictions-cnn/blob/main/cnn_pm_seminar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pm4py==2.2.32

In [None]:
from functools import reduce
import pandas as pd
import pm4py
import numpy as np
from enum import Enum
from datetime import timedelta, date, datetime
from math import floor
import math
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, Subset
from torchsummary import summary
from sklearn.metrics import confusion_matrix, classification_report


In [None]:
# Pick the compute device once; the dataset and the model are moved to it later on.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

print(f"Is CUDA supported by this system? {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")

# Device id and name can only be queried when a CUDA device is present.
if torch.cuda.is_available():
  cuda_id = torch.cuda.current_device()
  print(f"ID of current CUDA device: {torch.cuda.current_device()}")
  print(f"Name of current CUDA device: {torch.cuda.get_device_name(cuda_id)}")

In [None]:
# Path of the XES event log to load. Alternatives that were previously kept as
# commented-out calls: "./helpdesk.xes", "bpic_12_a.xes", "bpic_12_o.xes".
EVENT_LOG_PATH = "bpic_12_w.xes"

event_log = pm4py.read_xes(EVENT_LOG_PATH)

In [None]:
print(f"End Activties (w/o artifical END activtiy): {pm4py.get_end_activities(event_log)}")

end_activity = 'END'

# Close every trace with an artificial END event. The END event reuses the
# timestamp of the trace's last event.
for trace in event_log:
  # Skip empty traces (there is no last event to copy a timestamp from) and
  # traces that already end with END, so re-running this cell does not append
  # a second END event to the same in-memory log.
  if len(trace) == 0 or trace[-1]['concept:name'] == end_activity:
    continue
  end_event = pm4py.objects.log.obj.Event()
  end_event['concept:name'] = end_activity
  end_event['time:timestamp'] = trace[-1]['time:timestamp']
  trace.append(end_event)

# Sorted list of all activity names in the log (now including END); the position
# of a name in this list is used as its class index further down.
activity_domain = sorted(pm4py.get_event_attribute_values(event_log, "concept:name").keys())
print(f"Activity Domain: {activity_domain}")
print(f"Start Activites: {pm4py.get_start_activities(event_log)}")
print(f"End Activties:   {pm4py.get_end_activities(event_log)}")

In [None]:
def trace_vector(get_performance, trace, activity_domain=None):
  """Encode a (prefix of a) trace as one float32 vector over the activity domain.

  Args:
    get_performance: If falsy, entry ``k`` holds how often ``activity_domain[k]``
      occurs in ``trace``. If truthy, entry ``k`` holds the time in days between
      the first event of the trace and the last occurrence of
      ``activity_domain[k]``; the first event itself is not written.
    trace: Sequence of events; each event is indexed with ``'concept:name'``
      and, in performance mode, ``'time:timestamp'``.
    activity_domain: Ordered activity names defining the vector positions.
      When omitted or empty it is inferred as the sorted distinct activity
      names of ``trace``.

  Returns:
    ``np.ndarray`` of dtype float32 and length ``len(activity_domain)``.

  Raises:
    ValueError: If both ``trace`` and ``activity_domain`` are empty, or, in
      performance mode, if an event's activity is not in ``activity_domain``.
  """
  if activity_domain is None or len(activity_domain) == 0:
    if len(trace) == 0:
      raise ValueError("For empty traces an non-empty activity domain has to be specified.")
    # Infer the domain from the activity *names*; the events themselves are
    # not usable as domain entries because lookups below are done by name.
    activity_domain = sorted({event['concept:name'] for event in trace})

  vec = np.zeros(len(activity_domain), dtype=np.float32)

  if get_performance:
    trace_start = None
    for event in trace:
      if trace_start is None:
        # The first event only fixes the reference timestamp.
        trace_start = event['time:timestamp']
        continue
      position = activity_domain.index(event['concept:name'])
      # Later occurrences of the same activity overwrite earlier ones.
      vec[position] = (event['time:timestamp'] - trace_start).total_seconds()/(24*60*60)
  else:
    # Count every activity once, then read the counts in domain order.
    # Activities that are not part of the domain are ignored.
    counts = {}
    for event in trace:
      name = event['concept:name']
      counts[name] = counts.get(name, 0) + 1
    for position, activity in enumerate(activity_domain):
      vec[position] = counts.get(activity, 0)
  return vec

# Event Log Preprocessing

In [None]:
class EventLogDataset(Dataset):
  """Next-activity samples built from every sufficiently long prefix of every trace.

  For a trace and a prefix length ``i`` (``min_prefix_size <= i < len(trace)``)
  one sample is created:

  * image: array of shape ``(2, i, len(activity_domain))``. Channel 0 stacks the
    activity-count vectors of the prefixes of length ``1..i``, channel 1 the
    corresponding performance vectors (see ``trace_vector``).
  * label: index in ``activity_domain`` of the activity at position ``i``.

  After construction all images are zero-padded at the front of axis 1 so that
  they share ``max_image_shape``.

  Attributes set by ``__init__``:
    prefix_traces: the prefix (length ``i``) belonging to each sample.
    images / labels: the samples; lists until ``to()`` turns them into tensors.
    trace_idx: maps ``trace.attributes['concept:name']`` to the sample index of
      that trace's longest prefix.
    max_image_shape: shape shared by all images after padding.

  Raises:
    ValueError: If no trace is long enough to produce a sample.
  """

  def __init__(self, event_log, activity_domain, transform=None, target_transform=None, min_prefix_size = 2):
    self.event_log = event_log
    self.activity_domain = activity_domain
    self.transform = transform
    self.target_transform = target_transform
    self.prefix_traces = []
    self.images = []
    self.labels = []
    self.trace_idx = {}
    for trace in event_log:
      # NOTE(review): a trace with exactly min_prefix_size + 1 events could still
      # yield one sample but is skipped by this condition - confirm this is intended.
      if len(trace) <= min_prefix_size + 1:
        continue
      prefix_closure = [trace[:i+1] for i in range(len(trace))]

      # Encode each prefix once per trace. Sample i only needs the first i rows,
      # so the rows are sliced below instead of being recomputed per sample.
      # The full trace (last prefix) is never an input, hence [:-1].
      parikh_rows = np.vstack([trace_vector(False, prefix, self.activity_domain) for prefix in prefix_closure[:-1]])
      performance_rows = np.vstack([trace_vector(True, prefix, self.activity_domain) for prefix in prefix_closure[:-1]])

      for i in range(min_prefix_size, len(prefix_closure)): # Skip prefixes smaller than min_prefix_size
        self.prefix_traces.append(prefix_closure[i-1])
        # np.stack copies, so every image owns its data.
        self.images.append(np.stack([parikh_rows[:i], performance_rows[:i]], axis=0))
        self.labels.append(activity_domain.index(trace[i]['concept:name']))

        # Add trace index to map for finding original full traces
        if i == len(prefix_closure) - 1:
          self.trace_idx[trace.attributes['concept:name']] = len(self.labels) - 1

    if not self.images:
      raise ValueError("No trace in the event log is longer than min_prefix_size + 1 events; the dataset would be empty.")

    # Shapes only differ in the number of rows (axis 1), so the tuple maximum
    # is the shape of the longest prefix image.
    self.max_image_shape = max(map(np.shape, self.images))

    # Zero-pad at the front of the row axis so that all images share one shape.
    for index, image in enumerate(self.images):
      padding_size = self.max_image_shape[1] - image.shape[1]
      self.images[index] = np.pad(
          image,
          [(0, 0), (padding_size, 0), (0, 0)],
          mode='constant',
          constant_values=0
      )

  def __len__(self):
    """Number of samples."""
    return len(self.labels)

  def __getitem__(self, idx):
    """Return ``(image, label)`` for sample ``idx`` after the optional transforms."""
    image = self.images[idx]
    label = self.labels[idx]
    if self.transform:
      image = self.transform(image)
    if self.target_transform:
      label = self.target_transform(label)
    return image, label

  def to(self, device):
    """Store images and labels as tensors on ``device`` and return ``self``.

    The first call converts the Python lists (stacking the image arrays into one
    array first); later calls only move the existing tensors.
    """
    if torch.is_tensor(self.images):
      self.images = self.images.to(device)
    else:
      self.images = torch.from_numpy(np.stack(self.images)).to(device)
    if torch.is_tensor(self.labels):
      self.labels = self.labels.to(device)
    else:
      self.labels = torch.tensor(self.labels, device = device)
    return self

# CNN

In [None]:
BATCH_SIZE = 128

# Sorted activity names of the event log; list position == class index.
activity_domain = sorted(pm4py.get_event_attribute_values(event_log, "concept:name").keys())

# Build all prefix samples and place them on the selected device.
dataset = EventLogDataset(event_log, activity_domain)
dataset.to(device)

# Sequential split without shuffling: the first two thirds of the samples are
# used for training, the remainder for testing.
train_size = floor(len(dataset) * 2/3)
test_size = len(dataset) - train_size

train_dataset = Subset(dataset, range(train_size))
test_dataset = Subset(dataset, range(train_size, len(dataset)))

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
class NextActivityModel(nn.Sequential):
  """CNN classifier for the next activity, sized from the input image shape.

  Up to three blocks of Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d(2, 2) are
  stacked; channels (32, 64, 128) and the kernel size (2, 4, 8) double per
  block. Stacking stops early once another pooling step would shrink the height
  or width to zero. The blocks are followed by Flatten and a Linear layer with
  one output per image column (``w_in``).

  The model outputs raw, unnormalised scores (logits). ``nn.CrossEntropyLoss``
  applies log-softmax itself, so no Softmax layer is appended here; apply
  ``torch.softmax(outputs, dim=1)`` if probabilities are needed. The arg-max
  class is the same for logits and probabilities.

  Args:
    image_shape: ``(channels, height, width)`` of one input image.
  """

  def __init__(self, image_shape):
    super().__init__()

    def _dim_reduction_per_pooling(dim):
      # Output length of MaxPool2d(kernel_size=2, stride=2) along one dimension.
      return floor((dim - 2) / 2) + 1

    c_in, h_in, w_in = image_shape #channels, height, width
    c_out, h_out, w_out = 16, h_in, w_in
    kernel_size = 1

    for _ in range(3):
      # Stop as soon as one more pooling step would leave nothing to pool.
      if _dim_reduction_per_pooling(h_out) <= 0 or _dim_reduction_per_pooling(w_out) <= 0:
        break
      c_out = c_out * 2
      kernel_size = kernel_size * 2
      self.append(nn.Conv2d(c_in, c_out, kernel_size, padding='same'))
      self.append(nn.BatchNorm2d(c_out))
      self.append(nn.ReLU())
      self.append(nn.MaxPool2d(2, 2))
      # 'same' padding keeps height/width in the convolution; only pooling shrinks them.
      c_in = c_out
      h_out = _dim_reduction_per_pooling(h_out)
      w_out = _dim_reduction_per_pooling(w_out)

    self.append(nn.Flatten(start_dim=1))
    # c_in is the channel count that actually reaches Flatten: the last conv's
    # output channels, or the input channels when no conv block was added.
    self.append(nn.Linear(c_in * h_out * w_out, w_in))

In [None]:
# Build the network for the padded image shape, move it to the selected device
# and print a per-layer summary for one batch.
model = NextActivityModel(dataset.max_image_shape)
model = model.to(device)
summary(model, input_size=dataset.max_image_shape, batch_size=BATCH_SIZE)

In [None]:
def accuracy(outputs, labels):
    """Return the fraction of rows of `outputs` whose arg-max column equals `labels`.

    `outputs` holds one row of class scores per sample, `labels` the class index
    per sample. The result is a Python float.
    """
    predicted = outputs.argmax(dim=1)
    n_correct = (predicted == labels).sum()
    return n_correct.div(len(predicted)).item()

def training_epoch(dataloader, model, loss_fn, optimizer, log_interval=10):
  """Train `model` for one pass over `dataloader` and return a recent mean loss.

  Args:
    dataloader: Iterable of `(X, y)` batches.
    model: Callable producing predictions for `X`.
    loss_fn: Callable `loss_fn(pred, y)` returning a scalar tensor.
    optimizer: Optimizer over the model's parameters.
    log_interval: Number of batches per reporting window.

  Returns:
    Mean per-batch loss of the last complete window of `log_interval` batches.
    If the loader yields fewer batches than one window, the mean over all
    batches seen; `0.` for an empty loader.
  """
  running_loss = 0.
  last_loss = 0.
  batches_in_window = 0
  window_completed = False

  for batch, (X, y) in enumerate(dataloader):
    # Compute prediction and loss
    pred = model(X)
    loss = loss_fn(pred, y)

    # Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Gather data
    running_loss += loss.item()
    batches_in_window += 1
    if batches_in_window == log_interval:
      # Divide by the number of batches actually summed in this window.
      last_loss = running_loss / log_interval # loss per batch
      print('\t batch {} loss: {}'.format(batch + 1, last_loss))
      running_loss = 0.
      batches_in_window = 0
      window_completed = True

  # Short loaders never complete a window; report what was seen instead of 0.
  if not window_completed and batches_in_window > 0:
    last_loss = running_loss / batches_in_window
  return last_loss

def test_epoch(dataloader, model, loss_fn):
  """Evaluate `model` on `dataloader` without updating it.

  Args:
    dataloader: Sized iterable of `(X, y)` batches.
    model: Callable producing class scores for `X`.
    loss_fn: Callable `loss_fn(outputs, y)` returning a scalar tensor.

  Returns:
    Tuple `(mean loss per batch, mean accuracy per batch)`.
  """
  loss = 0
  acc = 0
  # Evaluation needs no gradients; skipping them avoids building autograd graphs.
  with torch.no_grad():
    for X, y in dataloader:
      outputs = model(X)
      # Use the loss passed by the caller rather than a module-level global.
      loss += loss_fn(outputs, y).item()
      acc += accuracy(outputs, y)

  return loss/len(dataloader), acc/len(dataloader)

In [None]:
EPOCHS = 100
PATIENCE = 5  # number of consecutive epochs without test-loss improvement before stopping

# NOTE(review): nn.CrossEntropyLoss applies log-softmax to its input itself,
# so it expects unnormalised scores (logits) from the model.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2E-4, weight_decay= 4E-4)
history = pd.DataFrame(columns=["Training Loss","Test Loss","Training Accuracy","Test Accuracy"])

best_test_loss = math.inf
iterations_since_best_test_loss = 0

for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}")
    print("-" * 30)

    model.train(True)
    avg_train_loss = training_epoch(train_loader, model, criterion, optimizer)
    model.train(False)

    # Training accuracy is measured after the epoch, in eval mode and without
    # gradient tracking.
    train_acc = 0
    with torch.no_grad():
      for X, y in train_loader:
        train_acc += accuracy(model(X), y)
    train_acc = train_acc / len(train_loader)

    avg_test_loss, test_acc = test_epoch(test_loader, model, criterion)
    history.loc[len(history)] = [avg_train_loss, avg_test_loss, train_acc, test_acc]

    print('LOSS\t train {} \t test {}'.format(avg_train_loss, avg_test_loss))
    print('ACC \t train {} \t test {}'.format(train_acc, test_acc))

    # Early stopping: count this epoch first, then compare, so that training
    # stops after exactly PATIENCE epochs without improvement.
    if avg_test_loss < best_test_loss:
      best_test_loss = avg_test_loss
      iterations_since_best_test_loss = 0
    else:
      iterations_since_best_test_loss += 1
      if iterations_since_best_test_loss >= PATIENCE:
        print(f'Stopping early as test loss has not improved for {PATIENCE} iterations')
        break

print("Training Done!")

# Bring model and data back to the CPU for the evaluation cells.
if torch.cuda.is_available():
  model.to("cpu")
  dataset.to("cpu")

# Evaluation

In [None]:
# Per-epoch metrics recorded by the training loop (one row per completed epoch).
history

In [None]:
# Training vs. test loss; the x axis is the row index of `history` (one row per epoch).
ax = sns.lineplot(data=history[["Training Loss","Test Loss"]])
ax.set(title="Loss per epoch", xlabel="Epoch", ylabel="Loss");

In [None]:
# Training vs. test accuracy; the x axis is the row index of `history` (one row per epoch).
ax = sns.lineplot(data=history[["Training Accuracy","Test Accuracy"]])
ax.set(title="Accuracy per epoch", xlabel="Epoch", ylabel="Accuracy");

In [None]:
# Distribution of the prediction targets over all samples. Activity names are
# cut to their first 10 characters to keep the tick labels readable.
target_names_short = [activity_domain[i][:10] for i in dataset.labels]
sns.histplot(target_names_short)
plt.xlabel("Prediction Target")
plt.xticks(rotation=45)

In [None]:
# Inference only: make eval mode explicit and skip autograd bookkeeping for the
# forward pass over all samples.
# NOTE(review): this scores the whole dataset, i.e. training and test samples together.
model.eval()
with torch.no_grad():
  outputs = model(dataset.images)
pred = torch.argmax(outputs, dim=1)

# Rows are true activities, columns predicted activities, both in domain order.
cf_matrix = confusion_matrix(dataset.labels, pred, labels=range(len(activity_domain)))
df_cm = pd.DataFrame(cf_matrix, index=activity_domain, columns=activity_domain)
print(accuracy(outputs, dataset.labels))
df_cm

In [None]:
# Per-activity precision / recall / F1 for the predictions computed above.
print(
    classification_report(
        dataset.labels,
        pred,
        labels=range(len(activity_domain)),
        target_names=activity_domain,
    )
)