In [1]:
#import sys
#sys.path.append('/home/juan/Research/Codes/Process Mining')

import os
import numpy as np
from sklearn.preprocessing import normalize
import pm4py
import hdbscan
from sklearn import metrics
from PetriNet2Vec import PetriNet2Vec
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import random
# Turn off pm4py's progress bars.
pm4py.util.constants.SHOW_PROGRESS_BAR = False
import warnings
# NOTE(review): this silences every warning category for the whole notebook,
# including deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')
import pandas as pd

# Set the seed for reproducibility
random_seed = 42
# NOTE(review): PYTHONHASHSEED is read by the interpreter at start-up, so assigning
# it here presumably only reaches child processes, not this running kernel -- confirm.
os.environ['PYTHONHASHSEED'] = str(random_seed)
# Seed NumPy's legacy global generator and the stdlib `random` module.
np.random.seed(random_seed)
random.seed(random_seed)

Loading the Petri net models (dataset)

In [2]:
# Read every file found in the dataset folder, in sorted filename order,
# and keep only the net object returned for each one.
models_dir = '../Dataset/Models/'
models = sorted(os.listdir(models_dir))

petriNets = []
for model in models:
    model_path = os.path.join(models_dir, model)
    # read_pnml yields three values; `im` and `fm` are unpacked but not stored
    # (presumably the initial and final markings -- confirm against pm4py docs).
    net, im, fm = pm4py.read_pnml(model_path)
    petriNets.append(net)

Methodology 1

In [3]:
# Methodology 1: for every repetition and embedding dimension, train a PetriNet2Vec
# model on `petriNets`, normalise the resulting net embeddings, cluster them with
# HDBSCAN for min_cluster_size in 2..5 and record the silhouette score of each run.
epochs = 1000
repetitions = 10
result_columns = ['repetition', 'dimension', 'min_cluster_size', 'Silhouette']

# Rows are collected in a plain list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on each call.
records = []

for repetition in range(repetitions):
    for dim in [4, 8, 16, 32]:
        print('embedding_dim:', dim, 'repetition:', repetition)
        # NOTE(review): the same seed is passed on every repetition, so any spread
        # between repetitions must come from elsewhere (e.g. multi-worker training)
        # -- confirm this is intended.
        model = PetriNet2Vec(embedding_dim=dim,
                            seed=random_seed,
                            black_transitions=False, # Use the token 'None' to represent black boxes
                            workers=8)

        model._fitted = False # force model to retrain from zero
        model.fit(petriNets, epochs=epochs)

        # Work on a normalised copy so the model's own embedding matrix is untouched.
        embeddings_vectors = normalize(model.get_net_embeddings().copy())

        for i in range(2,6):
            # HDBSCAN is used with its default metric on the normalised vectors.
            # NOTE(review): the original comment said "cosine similarity"; no metric
            # is passed, so that presumably relies on the normalisation above -- confirm.
            cluster = hdbscan.HDBSCAN(min_cluster_size=i)
            labels = cluster.fit_predict(embeddings_vectors)
            # Number of clusters in labels, ignoring noise if present.
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            # silhouette_score needs at least two distinct labels; score such
            # degenerate clusterings (all noise, or a single label) as 0.
            if n_clusters == 0 or len(set(labels)) < 2:
                Silhouette = 0.0
            else:
                Silhouette = metrics.silhouette_score(embeddings_vectors, labels)

            records.append({'repetition':int(repetition), 'dimension':int(dim), 'min_cluster_size':int(i), 'Silhouette':Silhouette})

# One row per (repetition, dimension, min_cluster_size) run.
df = pd.DataFrame(records, columns=result_columns)

embedding_dim: 4 repetition: 0
embedding_dim: 8 repetition: 0
embedding_dim: 16 repetition: 0
embedding_dim: 32 repetition: 0
embedding_dim: 4 repetition: 1
embedding_dim: 8 repetition: 1
embedding_dim: 16 repetition: 1
embedding_dim: 32 repetition: 1
embedding_dim: 4 repetition: 2
embedding_dim: 8 repetition: 2
embedding_dim: 16 repetition: 2
embedding_dim: 32 repetition: 2
embedding_dim: 4 repetition: 3
embedding_dim: 8 repetition: 3
embedding_dim: 16 repetition: 3
embedding_dim: 32 repetition: 3
embedding_dim: 4 repetition: 4
embedding_dim: 8 repetition: 4
embedding_dim: 16 repetition: 4
embedding_dim: 32 repetition: 4
embedding_dim: 4 repetition: 5
embedding_dim: 8 repetition: 5
embedding_dim: 16 repetition: 5
embedding_dim: 32 repetition: 5
embedding_dim: 4 repetition: 6
embedding_dim: 8 repetition: 6
embedding_dim: 16 repetition: 6
embedding_dim: 32 repetition: 6
embedding_dim: 4 repetition: 7
embedding_dim: 8 repetition: 7
embedding_dim: 16 repetition: 7
embedding_dim: 32 repeti

In [4]:
# Print the mean and standard deviation of the silhouette score over all
# repetitions, for each (embedding dimension, min_cluster_size) combination.
for dim in [4, 8, 16, 32]:
    same_dimension = df['dimension'] == dim
    for i in range(2,6):
        # Select the rows of this configuration once and reuse them for both statistics.
        scores = df[same_dimension & (df['min_cluster_size']==i)]['Silhouette']
        avg_score = np.round(np.mean(scores),2)
        std_score = np.round(np.std(scores),2)
        print('dimensions:', dim, 'min_cluster:', i,
              'avg:', avg_score,
              'stc:', std_score)

dimensions: 4 min_cluster: 2 avg: 0.56 stc: 0.05
dimensions: 4 min_cluster: 3 avg: 0.62 stc: 0.04
dimensions: 4 min_cluster: 4 avg: 0.51 stc: 0.07
dimensions: 4 min_cluster: 5 avg: 0.53 stc: 0.06
dimensions: 8 min_cluster: 2 avg: 0.48 stc: 0.03
dimensions: 8 min_cluster: 3 avg: 0.45 stc: 0.02
dimensions: 8 min_cluster: 4 avg: 0.25 stc: 0.03
dimensions: 8 min_cluster: 5 avg: 0.25 stc: 0.02
dimensions: 16 min_cluster: 2 avg: 0.32 stc: 0.02
dimensions: 16 min_cluster: 3 avg: 0.32 stc: 0.02
dimensions: 16 min_cluster: 4 avg: 0.21 stc: 0.02
dimensions: 16 min_cluster: 5 avg: 0.19 stc: 0.02
dimensions: 32 min_cluster: 2 avg: 0.29 stc: 0.01
dimensions: 32 min_cluster: 3 avg: 0.28 stc: 0.01
dimensions: 32 min_cluster: 4 avg: 0.17 stc: 0.01
dimensions: 32 min_cluster: 5 avg: 0.19 stc: 0.01
