In [None]:
import keras
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers import Bidirectional, LSTM, Activation, Dropout, Embedding, Input
from keras import regularizers
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

import pandas as pd

import numpy as np
import json
import math

import os.path

def save_log(loglist, filename):
  """Write a log (a list of traces, each trace a list of values) as CSV.

  Every trace becomes one row. The header holds the positional column
  labels assigned by pandas, and no index column is written.
  """
  pd.DataFrame.from_records(loglist).to_csv(filename, index=False)

def remove_nan(lists):
  """Return new lists with NaN entries dropped and the rest cast to int.

  An entry counts as NaN when its string form is exactly 'nan'. The input
  is not modified.
  """
  cleaned = []
  for trace in lists:
    kept = filter(lambda value: str(value) != 'nan', trace)
    cleaned.append(list(map(int, kept)))
  return cleaned

def import_log(filepath):
  """Read a CSV log and return it as a list of integer traces.

  Each CSV row is one trace; NaN cells are stripped by `remove_nan`.
  """
  rows = pd.read_csv(filepath).values.tolist()
  return remove_nan(rows)



def number_to_one_hot_X(X, dict_size):
  """One-hot encode every activity number in every example of `X`.

  Activity numbers start at 1, so number `a` sets position `a - 1`. The
  value 0 is encoded as an all-zero vector of length `dict_size`.
  Returns the nested result wrapped in a numpy array.
  """
  def encode(code):
    vec = [0] * dict_size
    if code != 0:
      vec[code - 1] = 1  # shift by one: activity numbering starts at 1
    return vec

  return np.array([[encode(code) for code in example] for example in X])

def create_XY_prefix(log, mappingsize, prefixlen):
  """Build (prefix, next-activity) training pairs from a log.

  For every position k >= 1 in every trace, the input is the (at most
  `prefixlen`) activities preceding k and the target is a one-hot vector
  of length `mappingsize` for the activity at k. Inputs are left-padded to
  `prefixlen` and then one-hot encoded.
  Returns a tuple (X, Y) of numpy arrays.
  """
  prefixes = []
  targets = []
  for trace in log:
    for k in range(1, len(trace)):
      prefixes.append(trace[max(0, k - prefixlen):k])  # window of 'encoded' activities before k
      target = [0] * mappingsize
      target[int(trace[k]) - 1] = 1
      targets.append(target)
  padded = keras.preprocessing.sequence.pad_sequences(prefixes, maxlen=prefixlen, padding='pre')
  encoded = number_to_one_hot_X(padded, mappingsize)
  return np.array(encoded), np.array(targets)

def get_startend(log):
  """Return the first and last element of the first trace in `log`."""
  first_trace = log[0]
  start, end = first_trace[0], first_trace[-1]
  return start, end

def get_model(maxlen, num_chars, bidirec, n_layers, lstmsize, dropout, l1, l2):
  """Build and compile a stacked (optionally bidirectional) LSTM classifier.

  Parameters:
    maxlen:    number of time steps in each input sequence.
    num_chars: size of the one-hot vector per time step; also the number
               of softmax output classes.
    bidirec:   when equal to False, plain LSTM layers are used; any other
               value wraps every LSTM in `Bidirectional`.
    n_layers:  number of LSTM layers; each one is followed by Dropout.
    lstmsize:  units per LSTM layer.
    dropout:   rate for the Dropout layer after each LSTM.
    l1, l2:    L1/L2 factors applied to both the kernel and the recurrent
               weights of every LSTM.

  Returns the compiled `Sequential` model (Adam with learning rate 0.005,
  categorical cross-entropy loss, accuracy metric).
  """
  def make_lstm(return_sequences):
    # One LSTM layer with the shared initializer and regularizers.
    return LSTM(lstmsize, kernel_initializer='glorot_uniform', return_sequences=return_sequences,
                kernel_regularizer=regularizers.l1_l2(l1, l2),
                recurrent_regularizer=regularizers.l1_l2(l1, l2))

  # Kept as an equality test so that only an explicit False (or 0) disables
  # the bidirectional wrapper, exactly as before.
  unidirectional = (bidirec == False)

  model = Sequential()
  # Without an embedding layer the input must already be one-hot encoded.
  # The explicit Input fixes the input shape, so the LSTM layers do not
  # need their own input_shape argument.
  model.add(Input(shape=(maxlen, num_chars)))
  # At least one LSTM is always added, matching the previous behaviour.
  for i in range(max(1, n_layers)):
    # Only the last LSTM collapses the sequence into a single vector.
    layer = make_lstm(return_sequences=(i + 1 != n_layers))
    model.add(layer if unidirectional else Bidirectional(layer))
    model.add(Dropout(dropout))
  model.add(Dense(num_chars, kernel_initializer='glorot_uniform', activation='softmax'))
  opt = Adam(learning_rate=0.005)
  # `metrics` must be a list (or tuple/dict); a bare string is rejected by
  # newer Keras versions.
  model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
  return model


def train_model(X_train, y_train,batch_size, maxlen, num_chars, bidirec, n_layers, lstmsize, dropout, l1, l2):
  """Create a model with `get_model`, fit it on the training data and return it.

  20% of the data is held out for validation. Training runs for at most
  600 epochs; it stops early after 6 epochs without val_loss improvement
  (best weights restored), and the learning rate is halved after 3 such
  epochs.
  """
  model = get_model(maxlen, num_chars, bidirec, n_layers, lstmsize, dropout, l1, l2)
  model.summary()
  callbacks = [
    EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=0, mode='auto',
                      min_delta=0.0001, cooldown=0, min_lr=0),
  ]
  model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    callbacks=callbacks,
    batch_size=batch_size,
    epochs=600,
    verbose=2,
  )
  return model

def cut_end(log, endact):
  """Truncate every trace of a 2-D log right after its first `endact`.

  The end activity itself is kept. Traces that never contain `endact` are
  returned in full. The shape of `log` is printed.
  Returns a list of lists.
  """
  n_traces, trace_len = log.shape
  print(log.shape)
  trimmed = []
  for row in range(n_traces):
    kept = []
    for col in range(trace_len):
      activity = log[row][col]
      kept.append(activity)
      if activity == endact:
        break  # everything after the end activity is discarded
    trimmed.append(kept)
  return trimmed

def normalize(probs):
  """Rescale every row of a 2-D probability array so it sums to 1.

  Each value is converted to a Python float and multiplied by the
  reciprocal of its row sum. Returns a list of lists of floats.
  """
  def rescale(row):
    scale = 1 / float(sum(row))
    return [float(p) * scale for p in row]

  n_rows, _ = probs.shape
  return [rescale(probs[r]) for r in range(n_rows)]


def choose_act_all(all_y):
  """Randomly draw one activity number per probability distribution.

  `all_y` is a sequence of probability lists. For each one an index is
  sampled with `np.random.choice` using those probabilities, then shifted
  by +1 because activity numbering starts at 1, not 0.
  """
  return [
    np.random.choice(np.arange(0, len(dist)), p=dist) + 1
    for dist in all_y
  ]

def OHget_probabilities(rnnmodel, xlists,  nr_act, maxlen, prefixlen):
  """Predict next-activity probabilities for a batch of prefixes.

  Parameters:
    rnnmodel:  model exposing `predict`; it receives one-hot encoded
               windows of `prefixlen` time steps.
    xlists:    the prefixes generated so far, one per trace.
    nr_act:    size of the one-hot vector per activity.
    maxlen:    kept for interface compatibility; the window width is
               determined by `prefixlen` alone.
    prefixlen: number of most recent activities fed to the model.

  Returns whatever `rnnmodel.predict` returns for the encoded batch.
  """
  # Pad (on the left) or truncate (from the left) straight to `prefixlen`.
  # This keeps the last `prefixlen` activities, which is what padding to
  # `maxlen` and slicing the tail produced, but it also yields the right
  # width when `prefixlen` exceeds `maxlen`; the training inputs in
  # create_XY_prefix are padded to `prefixlen` as well.
  all_x = keras.preprocessing.sequence.pad_sequences(
    xlists, maxlen=prefixlen, padding="pre", truncating="pre")
  all_x = number_to_one_hot_X(all_x, nr_act)
  results = rnnmodel.predict(all_x)
  return results

def OHsimulate_log(RNNmodel, logsize, startact, endact, maxlen, mapping, prefixlen):
  """Simulate `logsize` traces by repeatedly sampling from the RNN.

  Every trace starts with `startact`. For each of the following `maxlen`
  positions the model's next-activity probabilities are normalized and an
  activity is sampled for all traces at once. Finally each trace is cut
  after its first `endact`. Intermediate state is printed along the way.
  """
  log = np.zeros((logsize, maxlen+1), int)
  log[:, 0] = startact  # every trace begins with the start activity
  print(log)
  for step in range(1, maxlen+1):
    print("finding activity nr", step+1)
    prefixes = np.array([trace[0:step] for trace in log])
    print(prefixes)
    probs = OHget_probabilities(RNNmodel, prefixes, len(mapping), maxlen, prefixlen)
    # renormalize, because otherwise the probabilities sum over 1
    sampled = choose_act_all(normalize(probs))
    for trace_idx in range(logsize):
      log[trace_idx][step] = sampled[trace_idx]
  print(log)
  return cut_end(log, endact)


def do_experiment(full_log_location, train_log_location, sim_log_location, map_location, fold, full_prefix, opt_prefixlen, size_sim_log,
                 bidirec=True, n_layers=1, lstmsize=64, dropout=0.4, l1=0.001, l2=0.001):
  """Train an LSTM on a training log and write a simulated log to disk.

  Parameters:
    full_log_location:  CSV of the full log; only used to find the longest
                        trace length.
    train_log_location: CSV of the log the model is trained on.
    sim_log_location:   CSV path the simulated log is written to. If this
                        file already exists the iteration is skipped.
    map_location:       JSON file with the activity mapping; its length is
                        the number of activities.
    fold:               number of iterations of the train/simulate loop.
    full_prefix:        when True, the prefix length is the longest trace
                        length minus one; otherwise `opt_prefixlen` is used.
    opt_prefixlen:      prefix length used when `full_prefix` is not True.
    size_sim_log:       number of traces to simulate.
    bidirec, n_layers, lstmsize, dropout, l1, l2: forwarded to the model.
  """
  # import_log already strips NaN cells, so no second remove_nan is needed.
  full_log = import_log(full_log_location)

  maxlen = len(max(full_log,key=len))
  #if we want to use the full prefix each time or not
  if full_prefix == True:
    prefixlen=maxlen - 1
    print("prefix length:", prefixlen)
  else:
    prefixlen=opt_prefixlen

  #reload mapping
  with open(map_location) as f:
    mapping = json.load(f)

  batch_size = 128

  for i in range(0, fold):
    # Check the path this function actually writes to; the name previously
    # used here was not defined anywhere in this function.
    if os.path.exists(sim_log_location):
      print("Already done: ", i)
      continue
    train_log = import_log(train_log_location)
    start,end = get_startend(train_log)
    X_train, y_train = create_XY_prefix(train_log, len(mapping), prefixlen)
    model = train_model(X_train, y_train,batch_size, maxlen=prefixlen, num_chars=len(mapping), bidirec=bidirec, n_layers=n_layers, lstmsize=lstmsize, dropout=dropout, l1=l1, l2=l2)

    simlog = OHsimulate_log(model, size_sim_log, start, end, maxlen-1, mapping, prefixlen)

    save_log(simlog, sim_log_location)

Mounted at /content/drive
