In [1]:
import glob
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.recurrent import LSTM, GRU
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from datetime import datetime, timedelta
from collections import defaultdict
import os

Using Theano backend.


In [2]:
data_path = "/storage/hpc_irheta/bpic2013/"
folds_dir = os.path.join(data_path, "folds")
n_folds = 5
weights_file_template = "bpic2013/bpic2013_fold%s_weights*.hdf5"
generated_traces_ratios = [1, 2, 5, 10] # ratios to single fold size

In [3]:
case_id_col = "Case ID"
activity_col = "Activity"
timestamp_col = "Complete Timestamp"
cat_cols = [activity_col]
start_event = "START"
end_event = "END"

# LSTM params
lstmsize = 48
dropout = 0.5
optim = 'rmsprop'
loss = 'categorical_crossentropy'
nb_epoch = 10
activation='softmax'

In [4]:
def get_event_as_onehot(event_idx, data_dim):
    event = np.zeros(data_dim)
    event[event_idx] = 1
    return event

def generate_trace(start_idx, data_dim, end_event, time_dim, col_idxs):
    event_idx = start_idx
    events = get_event_as_onehot(event_idx, data_dim)[np.newaxis,:]
    trace = []
    while col_idxs[event_idx] != end_event:# and len(trace) < max_events:
        event_idx = np.random.choice(len(col_idxs), 1, p=model.predict(pad_sequences(events[np.newaxis,:,:], maxlen=time_dim))[0])[0]
        event = get_event_as_onehot(event_idx, data_dim)
        events = np.vstack([events, get_event_as_onehot(event_idx, data_dim)])
        trace.append(col_idxs[event_idx])
    return tuple(trace[:-1])

In [34]:
for fold_nr in range(n_folds):
    lstm_weights_file = glob.glob(weights_file_template%(fold_nr))[-1]
    
    # Read the relevant folds
    fold_files = os.listdir(folds_dir)
    data = pd.DataFrame()
    for file_idx in range(len(fold_files)):
        if file_idx != fold_nr:
            tmp = pd.read_csv(os.path.join(folds_dir, fold_files[file_idx]), sep=";")
            data = pd.concat([data, tmp], axis=0)
        else:
            val_data = pd.read_csv(os.path.join(folds_dir, fold_files[file_idx]), sep=";")

    # which traces exist in the train and val logs
    train_traces = set()
    grouped = data.groupby(case_id_col)
    for name, group in grouped:
        group = group.sort_values(timestamp_col)
        train_traces.add(tuple(group[activity_col]))
        
    val_traces = set()
    grouped_val = val_data.groupby(case_id_col)
    for name, group in grouped_val:
        group = group.sort_values(timestamp_col)
        val_traces.add(tuple(group[activity_col]))

    # prepare data
    cat_data = pd.get_dummies(data[cat_cols])
    dt_final = pd.concat([data[[case_id_col, timestamp_col]], cat_data], axis=1).fillna(0)
    dt_final[start_event] = 0
    dt_final[end_event] = 0
    grouped = dt_final.groupby(case_id_col)
    n_existing_traces = len(grouped)

    # generate dict of activity idxs
    col_idxs = {idx:col.replace("%s_"%activity_col, "") for idx, col in enumerate(cat_data.columns)}
    col_idxs[len(col_idxs)] = start_event
    col_idxs[len(col_idxs)] = end_event
    start_idx = col_idxs.keys()[col_idxs.values().index(start_event)]


    # load LSTM model
    max_events = grouped.size().max()
    data_dim = dt_final.shape[1] - 2
    time_dim = max_events + 1

    model = Sequential()
    model.add(LSTM(lstmsize, input_shape=(time_dim, data_dim)))
    model.add(Dropout(dropout))
    model.add(Dense(data_dim, activation=activation))
    model.compile(loss=loss, optimizer=optim)

    model.load_weights(lstm_weights_file)
    
    print("Fold no %s:"%fold_nr)
    print("Number of distinct traces in train set: %s, val set: %s, total: %s"%(len(train_traces), len(val_traces), len(train_traces.union(val_traces))))
    print("Number of distinct traces in val set not present in train set: %s"%(len(val_traces.difference(train_traces))))
    print("\n")
    
    for generated_traces_ratio in generated_traces_ratios:
        n_generated_traces = len(grouped_val) * generated_traces_ratio
    
        # generate new traces
        n_existing_in_train = defaultdict(int)
        n_existing_in_validation = defaultdict(int)
        n_new = defaultdict(int)
        np.random.seed(22)
        for i in range(n_generated_traces):
            trace = generate_trace(start_idx, data_dim, end_event, time_dim, col_idxs)
            if trace in train_traces:
                n_existing_in_train[trace] += 1
            elif trace in val_traces:
                n_existing_in_validation[trace] += 1
            else:
                n_new[trace] += 1
                
        print("Total traces generated: %s, ratio to fold size: %s"%(n_generated_traces, generated_traces_ratio))
        print("Number of existing traces in training set: %s, distinct: %s"%(sum(n_existing_in_train.values()), len(n_existing_in_train)))
        print("Number of existing traces in validation set: %s, distinct: %s"%(sum(n_existing_in_validation.values()), len(n_existing_in_validation)))
        print("Number of new traces: %s, distinct: %s"%(sum(n_new.values()), len(n_new)))
        print("\n")
        
    print("\n")



Fold no 0:
Number of distinct traces in train set: 1902, val set: 549, total: 2278
Number of distinct traces in val set not present in train set: 376
Total traces generated: 1511, ratio to fold size: 1
Number of existing traces in training set: 1093, distinct: 144
Number of existing traces in validation set: 11, distinct: 8
Number of new traces: 407, distinct: 364


Total traces generated: 3022, ratio to fold size: 2
Number of existing traces in training set: 2163, distinct: 197
Number of existing traces in validation set: 19, distinct: 11
Number of new traces: 840, distinct: 710


Total traces generated: 7555, ratio to fold size: 5
Number of existing traces in training set: 5390, distinct: 269
Number of existing traces in validation set: 55, distinct: 27
Number of new traces: 2110, distinct: 1610


Total traces generated: 15110, ratio to fold size: 10
Number of existing traces in training set: 10677, distinct: 330
Number of existing traces in validation set: 112, distinct: 41
Number o

KeyboardInterrupt: 

In [5]:
generated_log_template = "/storage/hpc_irheta/bpic2013/BPIC13_i_generated_fold%s_sizeratio%s.csv"

# generate log
for fold_nr in range(1):
    lstm_weights_file = glob.glob(weights_file_template%(fold_nr))[-1]
    
    # Read the relevant folds
    fold_files = os.listdir(folds_dir)
    data = pd.DataFrame()
    for file_idx in range(len(fold_files)):
        if file_idx != fold_nr:
            tmp = pd.read_csv(os.path.join(folds_dir, fold_files[file_idx]), sep=";")
            data = pd.concat([data, tmp], axis=0)
        else:
            val_data = pd.read_csv(os.path.join(folds_dir, fold_files[file_idx]), sep=";")

    grouped_val = val_data.groupby(case_id_col)

    # prepare data
    cat_data = pd.get_dummies(data[cat_cols])
    dt_final = pd.concat([data[[case_id_col, timestamp_col]], cat_data], axis=1).fillna(0)
    dt_final[start_event] = 0
    dt_final[end_event] = 0
    grouped = dt_final.groupby(case_id_col)
    n_existing_traces = len(grouped)

    # generate dict of activity idxs
    col_idxs = {idx:col.replace("%s_"%activity_col, "") for idx, col in enumerate(cat_data.columns)}
    col_idxs[len(col_idxs)] = start_event
    col_idxs[len(col_idxs)] = end_event
    start_idx = col_idxs.keys()[col_idxs.values().index(start_event)]


    # load LSTM model
    max_events = grouped.size().max()
    data_dim = dt_final.shape[1] - 2
    time_dim = max_events + 1

    model = Sequential()
    model.add(LSTM(lstmsize, input_shape=(time_dim, data_dim)))
    model.add(Dropout(dropout))
    model.add(Dense(data_dim, activation=activation))
    model.compile(loss=loss, optimizer=optim)

    model.load_weights(lstm_weights_file)
    
    
    for generated_traces_ratio in [5]:
        with open(generated_log_template%(fold_nr, generated_traces_ratio), "w") as fout:
            fout.write("%s,%s,%s\n"%("Case ID", "Activity", "Complete Timestamp"))
            n_generated_traces = len(grouped_val) * generated_traces_ratio

            # generate new traces
            np.random.seed(22)
            for i in range(n_generated_traces):
                trace = generate_trace(start_idx, data_dim, end_event, time_dim, col_idxs)
                start_time = datetime.now()
                for event in trace:
                    timestamp = datetime.strftime(start_time + timedelta(days=1), '%Y/%m/%d %H:%M:%S.%f')
                    fout.write("%s,%s,%s\n"%("new%s"%(i+1), event, timestamp))



In [5]:
for fold_nr in range(1):
    lstm_weights_file = glob.glob(weights_file_template%(fold_nr))[-1]
    
    # Read the relevant folds
    fold_files = os.listdir(folds_dir)
    data = pd.DataFrame()
    for file_idx in range(len(fold_files)):
        if file_idx != fold_nr:
            tmp = pd.read_csv(os.path.join(folds_dir, fold_files[file_idx]), sep=";")
            data = pd.concat([data, tmp], axis=0)
        else:
            val_data = pd.read_csv(os.path.join(folds_dir, fold_files[file_idx]), sep=";")

    # which traces exist in the train and val logs
    train_traces = set()
    grouped = data.groupby(case_id_col)
    for name, group in grouped:
        group = group.sort_values(timestamp_col)
        train_traces.add(tuple(group[activity_col]))
        
    val_traces = set()
    grouped_val = val_data.groupby(case_id_col)
    for name, group in grouped_val:
        group = group.sort_values(timestamp_col)
        val_traces.add(tuple(group[activity_col]))

    # prepare data
    cat_data = pd.get_dummies(data[cat_cols])
    dt_final = pd.concat([data[[case_id_col, timestamp_col]], cat_data], axis=1).fillna(0)
    dt_final[start_event] = 0
    dt_final[end_event] = 0
    grouped = dt_final.groupby(case_id_col)
    n_existing_traces = len(grouped)

    # generate dict of activity idxs
    col_idxs = {idx:col.replace("%s_"%activity_col, "") for idx, col in enumerate(cat_data.columns)}
    col_idxs[len(col_idxs)] = start_event
    col_idxs[len(col_idxs)] = end_event
    start_idx = col_idxs.keys()[col_idxs.values().index(start_event)]


    # load LSTM model
    max_events = grouped.size().max()
    data_dim = dt_final.shape[1] - 2
    time_dim = max_events + 1

    model = Sequential()
    model.add(LSTM(lstmsize, input_shape=(time_dim, data_dim)))
    model.add(Dropout(dropout))
    model.add(Dense(data_dim, activation=activation))
    model.compile(loss=loss, optimizer=optim)

    model.load_weights(lstm_weights_file)
    
    
    for generated_traces_ratio in [5]:
        n_generated_traces = len(grouped_val) * generated_traces_ratio
    
        # generate new traces
        n_existing_in_train = defaultdict(int)
        n_existing_in_validation = defaultdict(int)
        n_new = defaultdict(int)
        np.random.seed(22)
        for i in range(n_generated_traces):
            trace = generate_trace(start_idx, data_dim, end_event, time_dim, col_idxs)
            if trace in train_traces:
                n_existing_in_train[trace] += 1
            elif trace in val_traces:
                n_existing_in_validation[trace] += 1
            else:
                n_new[trace] += 1




In [6]:
len(n_new)

1610

In [11]:
import operator
i = 0
for k, v in sorted(n_new.items(), key=operator.itemgetter(1), reverse=True):
    print(k,v)
    print(" ")
    if i > 10:
        break
    i += 1
(('Accepted\\\\In Progress', 'Accepted\\\\In Progress',

(('Completed\\\\Resolved', 'Completed\\\\Closed'), 37)
 
(('Accepted\\\\In Progress', 'Accepted\\\\In Progress', 'Queued\\\\Awaiting Assignment', 'Accepted\\\\In Progress', 'Completed\\\\In Call'), 36)
 
(('Accepted\\\\In Progress', 'Accepted\\\\In Progress', 'Queued\\\\Awaiting Assignment', 'Accepted\\\\In Progress', 'Queued\\\\Awaiting Assignment', 'Accepted\\\\In Progress', 'Queued\\\\Awaiting Assignment', 'Accepted\\\\In Progress', 'Accepted\\\\In Progress', 'Completed\\\\Resolved', 'Completed\\\\Closed'), 16)
 
(('Accepted\\\\In Progress', 'Accepted\\\\In Progress', 'Queued\\\\Awaiting Assignment', 'Accepted\\\\In Progress', 'Queued\\\\Awaiting Assignment', 'Accepted\\\\In Progress', 'Completed\\\\In Call'), 15)
 
(('Accepted\\\\In Progress', 'Accepted\\\\In Progress', 'Queued\\\\Awaiting Assignment', 'Completed\\\\Resolved', 'Completed\\\\Closed'), 13)
 
(('Queued\\\\Awaiting Assignment', 'Accepted\\\\In Progress', 'Queued\\\\Awaiting Assignment', 'Accepted\\\\In Progress', 'Acce

In [15]:
trie = {}
for trace in n_existing_in_validation.keys():
    current_trie_position = trie
    for i in range(len(trace)):
        if trace[i] not in current_trie_position:
            current_trie_position[trace[i]] = {}
        current_trie_position = current_trie_position[trace[i]]

In [22]:
trace_no = 1
current_trace_no = 0
for trace in n_new.keys():
    if current_trace_no == trace_no:
        path = []
        current_trie_position = trie
        for i in range(len(trace)):
            if trace[i] not in current_trie_position:
                print("Path so far: ", path)
                print("\n")
                print("Good traces: ", current_trie_position)
                print("\n")
                print("Bad trace: ", [trace[j] for j in range(i, len(trace))])
                break
            else:
                path.append(trace[i])
                current_trie_position = current_trie_position[trace[i]]
        break
    current_trace_no += 1

('Path so far: ', ['Accepted\\\\In Progress', 'Accepted\\\\In Progress', 'Queued\\\\Awaiting Assignment', 'Accepted\\\\In Progress', 'Queued\\\\Awaiting Assignment', 'Accepted\\\\In Progress', 'Queued\\\\Awaiting Assignment', 'Accepted\\\\In Progress', 'Accepted\\\\Assigned', 'Accepted\\\\In Progress', 'Queued\\\\Awaiting Assignment', 'Accepted\\\\In Progress', 'Accepted\\\\Assigned', 'Accepted\\\\In Progress'])


('Good traces: ', {'Completed\\\\Resolved': {'Completed\\\\Closed': {}}})


('Bad trace: ', ['Accepted\\\\Assigned', 'Accepted\\\\In Progress', 'Accepted\\\\In Progress', 'Accepted\\\\In Progress', 'Completed\\\\Resolved', 'Completed\\\\Closed'])


In [24]:
from ete2 import Tree

In [25]:
t = Tree( "((a,b),c);" )
t.show()

ImportError: No module named PyQt4.QtGui

In [30]:
import matplotlib.pyplot as plt
import numpy as np
import plotly.plotly as py

py.sign_in('irene.teinemaa', 'O4pvM5Ko2ywAXqfBWQkk')
n = 50
x, y, z, s, ew = np.random.rand(5, n)
c, ec = np.random.rand(2, n, 4)
area_scale, width_scale = 500, 5

fig, ax = plt.subplots()
sc = ax.scatter(x, y, c=c,
                s=np.square(s)*area_scale,
                edgecolor=ec,
                linewidth=ew*width_scale)
ax.grid()

py.iplot_mpl(fig)





High five! You successfuly sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~irene.teinemaa/0 or inside your plot.ly account where it is named 'plot from API'


In [37]:
import plotly.plotly as py
import plotly.graph_objs as go

import igraph
from igraph import *
igraph.__version__

DeprecationWarning: To avoid name collision with the igraph project, this visualization library has been renamed to 'jgraph'. Please upgrade when convenient.

In [35]:
nr_vertices = 25
v_label = map(str, range(nr_vertices))
G = Graph.Tree(nr_vertices, 2) # 2 stands for children number
lay = G.layout('rt')

position = {k: lay[k] for k in range(nr_vertices)}
Y = [lay[k][1] for k in range(nr_vertices)]
M = max(Y)

es = EdgeSeq(G) # sequence of edges
E = [e.tuple for e in G.es] # list of edges

L = len(position)
Xn = [position[k][0] for k in range(L)]
Yn = [2*M-position[k][1] for k in range(L)]
Xe = []
Ye = []
for edge in E:
    Xe+=[position[edge[0]][0],position[edge[1]][0], None]
    Ye+=[2*M-position[edge[0]][1],2*M-position[edge[1]][1], None] 

labels = v_label


NameError: name 'Graph' is not defined