# Connect Google Drive

Connect your personal drive

In [None]:
# Colab helper module used to attach Google Drive to this runtime.
from google.colab import drive
# Mount the drive at /content/drive; the paths configured below live under it.
drive.mount('/content/drive')

Mounted at /content/drive


Copy pnml datasets into your personal drive and set the following paths

In [None]:
# Root folder of the project inside the mounted Google Drive.
path = "/content/drive/MyDrive/ilp-matcher-bert"
# Folder that receives the generated similarity CSVs (keep the trailing slash:
# file names are appended by plain string concatenation).
csv_path = f"{path}/eval-data/ft-csv/"
# Folder that holds one sub-folder of pnml files per dataset (trailing slash required).
dataset_path = f"{path}/eval-data/pnml/"

# BERT Similarity Calculation

Install BERT module for Sentence Similarity

In [None]:
!pip install -U sentence-transformers

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

Load fine-tuned BERT Model

In [None]:
# Locations of the fine-tuned models, one per evaluation dataset.
# NOTE(review): each variable points at the directory named after the *other*
# two datasets (e.g. uni -> bert-birth-sap). This looks like a deliberate
# leave-one-dataset-out setup rather than a mix-up -- confirm before "fixing".
uni_model = f"{path}/eval-data/bert-birth-sap"
birth_model = f"{path}/eval-data/bert-uni-sap"
sap_model = f"{path}/eval-data/bert-uni-birth"

Method for calculating the similarity between two given labels

In [None]:
def calcSim(label1, label2):
    """Return the cosine similarity of the embeddings of two labels.

    Both labels are encoded with the module-level ``model``, which must be
    bound (see the Main section) before this function is called.

    Args:
        label1: First label text.
        label2: Second label text.

    Returns:
        The single entry of the 1x1 cosine-similarity matrix of the two
        embeddings.
    """
    # Encode in argument order and wrap each vector in a list so that
    # cosine_similarity receives two one-row matrices.
    embeddings = [[model.encode(text)] for text in (label1, label2)]
    return cosine_similarity(*embeddings)[0][0]

# Parse Petri-Nets

Install the process mining module (pm4py) and import the necessary packages

In [None]:
!pip install pm4py

from pm4py.objects.petri_net.importer import importer as pnml_importer

import csv
import pandas
import re
import os

Calculate the similarity between all combinations of labels of two different Petri nets and write the similarities to a CSV file

In [None]:
def calcSims(net1, net2, folder_name):
    """Append label similarities for two nets to the dataset's CSV files.

    For every arc entering a place of ``net1`` and every arc entering a place
    of ``net2``, the labels of the two arc sources are compared and a row
    ``(label1, label2, similarity)`` is appended to
    ``<csv_path><folder_name>_dup.csv``. Afterwards that file is read back,
    exact duplicate rows are dropped, and the result is written to
    ``<csv_path><folder_name>.csv`` (overwriting any previous version).

    Args:
        net1: First net; must expose ``places`` whose items have ``in_arcs``.
        net2: Second net, same requirements as ``net1``.
        folder_name: Dataset name used as the CSV file-name stem.

    Returns:
        None. The results are the two CSV files described above.
    """
    dup_path = csv_path + folder_name + "_dup.csv"
    out_path = csv_path + folder_name + ".csv"

    # The source labels of net2 do not depend on the outer loops, so collect
    # them once (one entry per incoming arc, in the original iteration order).
    labels2 = [arc2.source.label for place2 in net2.places for arc2 in place2.in_arcs]

    # The same label pair shows up once per arc combination; remember its
    # similarity so the model is queried only once per distinct pair.
    sims = {}

    # "a": rows accumulate across calls (and across notebook re-runs);
    # the de-duplication below collapses the repeats.
    with open(dup_path, "a", newline="") as f:
        writer = csv.writer(f)
        for place in net1.places:
            for arc in place.in_arcs:
                label1 = arc.source.label
                for label2 in labels2:
                    key = (label1, label2)
                    if key not in sims:
                        # Empty or dummy labels carry no meaning -> similarity 0.
                        if isNull(label1) or isNull(label2):
                            sims[key] = 0.0
                        else:
                            sims[key] = calcSim(str(label1), str(label2))
                    writer.writerow((label1, label2, sims[key]))

    # The file has no header row. Without header=None pandas would promote the
    # first data row to column names, which excludes it from de-duplication and
    # mangles it on write (e.g. "A.1", "Unnamed: 0").
    try:
        rows = pandas.read_csv(dup_path, header=None)
    except pandas.errors.EmptyDataError:
        # Nothing has been written yet (nets without labelled arcs).
        rows = pandas.DataFrame()

    rows.drop_duplicates().to_csv(out_path, index=False, header=False)

Parse all pnml files within one dataset folder. Try out every combination and generate the csv file

In [None]:
def parseFiles(folder_name):
    """Compare every pnml file of one dataset folder with every file in it.

    All files in ``<dataset_path><folder_name>`` are imported, then
    ``calcSims`` is called for every ordered pair of nets (including a net
    paired with itself), which fills the dataset's similarity CSV.

    Args:
        folder_name: Name of the dataset sub-folder below ``dataset_path``;
            also used by ``calcSims`` as the CSV file-name stem.

    Returns:
        None.
    """
    folder = os.path.join(dataset_path, folder_name)
    files = os.listdir(folder)
    print(files)

    # Import each net exactly once. Re-importing inside the nested loop parsed
    # every file len(files) times although calcSims only reads the nets.
    # The importer returns (net, initial_marking, final_marking); only the
    # net itself is needed here.
    nets = [(name, pnml_importer.apply(os.path.join(folder, name))[0]) for name in files]

    for file1, net in nets:
        for file2, net2 in nets:
            print(file1 + " | " + file2)
            calcSims(net, net2, folder_name)

Check whether a label is empty or only a dummy placeholder (e.g. `t12`, `tr3`, `p4`, `n7`)

In [None]:
def isNull(l1):
    """Tell whether a label is empty or an auto-generated dummy name.

    A label counts as null when it is falsy (``None`` or ``""``), when it
    starts with one of the dummy prefixes ``tr``, ``t``, ``p`` or ``n``
    followed by at least one digit, or when it consists only of whitespace.

    Note that the patterns are anchored at the start of the label only, so a
    label such as ``"t1 extra"`` is also reported as null.

    Args:
        l1: The label to inspect (string or ``None``).

    Returns:
        ``True`` if the label is considered null, ``False`` otherwise.
    """
    if not l1:
        return True

    dummy_patterns = ("tr[0-9]+|t[0-9]+", "p[0-9]+", "n[0-9]+")
    if any(re.match(pattern, l1) for pattern in dummy_patterns):
        return True

    return l1.isspace()


Parse all datasets

In [None]:
def parseAll():
    """Run ``parseFiles`` for every entry found directly in ``dataset_path``.

    The list of entries is printed first. All datasets are processed with
    whatever the module-level ``model`` is bound to at call time.

    Returns:
        None.
    """
    dataset_names = os.listdir(dataset_path)
    print(dataset_names)

    for name in dataset_names:
        parseFiles(name)

Combine all generated CSVs to a single big one

In [None]:
def combine():
    """Concatenate the per-dataset CSVs into ``<path>/ft-sims.csv``.

    The files ``birth.csv``, ``uni.csv`` and ``sap.csv`` from ``csv_path``
    are copied, in that order, to the end of the combined file.

    The combined file is opened in append mode, so calling this function
    again adds the same rows a second time; delete ``ft-sims.csv`` first when
    a fresh result is wanted.

    Returns:
        None.

    Raises:
        FileNotFoundError: If one of the three input files does not exist.
    """
    # Context managers make sure every handle is closed, also when a later
    # input file is missing.
    with open(path + "/ft-sims.csv", "a") as fout:
        for name in ("birth.csv", "uni.csv", "sap.csv"):
            with open(csv_path + name) as fin:
                fout.writelines(fin)

# Main

**Option 2:** Parse datasets separately one after another

In [None]:
# calcSim reads the global `model`, so bind the model configured for the
# "birth" dataset before processing that folder.
model = SentenceTransformer(birth_model)
# Writes birth_dup.csv and birth.csv into csv_path.
parseFiles("birth")

In [None]:
# calcSim reads the global `model`, so bind the model configured for the
# "sap" dataset before processing that folder.
model = SentenceTransformer(sap_model)
# Writes sap_dup.csv and sap.csv into csv_path.
parseFiles("sap")

In [None]:
# calcSim reads the global `model`, so bind the model configured for the
# "uni" dataset before processing that folder.
model = SentenceTransformer(uni_model)
# Writes uni_dup.csv and uni.csv into csv_path.
parseFiles("uni")

Combine all CSVs to a single big one

In [None]:
# Requires birth.csv, uni.csv and sap.csv in csv_path (produced by the cells above).
combine()