# Hierarchical clustering program using different distance measures

Uses pm4py to import the event logs found in the `logs` folder.

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import pm4py
import networkx as nx
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
import pprint

# import local files
import Jaccard
import Simrank
import Log_processing
import Clustering

In [None]:
# TESTING

def weighted_simrank(G, node1, node2, C=0.8, max_iterations=100, tolerance=1e-6):
    """Return the weighted SimRank similarity of ``node1`` and ``node2``.

    The score satisfies, for ``a != b``::

        s(a, b) = C / (|I(a)| * |I(b)|)
                  * sum(w(x, a) * w(y, b) * s(x, y) for x in I(a) for y in I(b))

    where ``I(n)`` are the predecessors of ``n`` and ``w(u, v)`` is the
    ``'weight'`` attribute of the edge ``u -> v``.  ``s(a, a)`` is 1.

    The value is obtained by fixed-point iteration (starting from 0 for all
    distinct pairs) restricted to the node pairs that can influence
    ``(node1, node2)``, so the computation terminates on cyclic graphs too.

    Args:
        G: Directed graph exposing ``G.predecessors(node)`` and
            ``G[u][v]['weight']`` for every edge ``u -> v``.
        node1: First node.
        node2: Second node.
        C: Decay factor applied at every step back along the in-edges.
        max_iterations: Upper bound on the number of fixed-point sweeps.
        tolerance: Iteration stops once no score changes by more than this.

    Returns:
        The similarity as a float; 1.0 when both nodes are identical and 0.0
        when either node has no predecessors.
    """
    if node1 == node2:
        return 1.0

    in_neighbors = {}

    def predecessors_of(node):
        # Cache predecessor lists: they are needed once per sweep per pair.
        if node not in in_neighbors:
            in_neighbors[node] = list(G.predecessors(node))
        return in_neighbors[node]

    # Collect every pair of distinct nodes reachable from (node1, node2) by
    # stepping backwards along in-edges; only these pairs affect the result.
    pending = [(node1, node2)]
    pairs = set()
    while pending:
        pair = pending.pop()
        if pair in pairs:
            continue
        pairs.add(pair)
        first, second = pair
        for x in predecessors_of(first):
            for y in predecessors_of(second):
                if x != y and (x, y) not in pairs:
                    pending.append((x, y))

    scores = dict.fromkeys(pairs, 0.0)
    for _ in range(max_iterations):
        updated = {}
        largest_change = 0.0
        for first, second in pairs:
            preds1 = predecessors_of(first)
            preds2 = predecessors_of(second)
            if not preds1 or not preds2:
                # No in-neighbours: similarity is 0 (and avoids dividing by 0).
                updated[(first, second)] = 0.0
                continue
            # x and y are predecessors, so the edges run x -> first, y -> second.
            total = sum(
                G[x][first]['weight']
                * G[y][second]['weight']
                * (1.0 if x == y else scores[(x, y)])
                for x in preds1
                for y in preds2
            )
            value = C / (len(preds1) * len(preds2)) * total
            updated[(first, second)] = value
            largest_change = max(
                largest_change, abs(value - scores[(first, second)])
            )
        scores = updated
        if largest_change <= tolerance:
            break

    return scores[(node1, node2)]

def full_weighted_simrank(G):
    """Compute the weighted SimRank similarity for every ordered node pair.

    Args:
        G: Graph accepted by ``weighted_simrank``; its nodes are read from
            ``G.nodes()``.

    Returns:
        A nested dict ``{node1: {node2: similarity}}`` holding
        ``weighted_simrank(G, node1, node2)`` for every pair of nodes.
    """
    # Materialise once so both loops walk the same node sequence.
    nodes = list(G.nodes())
    return {
        node1: {node2: weighted_simrank(G, node1, node2) for node2 in nodes}
        for node1 in nodes
    }


In [None]:
# Weighted Jaccard on the coselog event log, with verbose output enabled.
# The module is imported as `Jaccard` (capital J), so it must be referenced
# with that exact name.
weighted_jaccard_coselog = Jaccard.Weighted_Jaccard("../logs/coselog.xes")
weighted_jaccard_coselog.perform_clustering(verbose=True)

In [None]:
# Simple SimRank on the sepsis event log: build the Simrank.Simple_Simrank
# object from the XES file, then run its clustering step.
simple_simrank_sepsis = Simrank.Simple_Simrank("../logs/sepsis_event_log.xes")
simple_simrank_sepsis.perform_clustering()

In [None]:
# Weighted SimRank on the sepsis event log: build the
# Simrank.Weighted_Simrank object from the XES file, then run its clustering
# step.
weighted_simrank_sepsis = Simrank.Weighted_Simrank("../logs/sepsis_event_log.xes")
weighted_simrank_sepsis.perform_clustering()

In [None]:
# Simple Jaccard on the sepsis event log, with verbose output enabled.
simple_jaccard = Jaccard.Simple_Jaccard("../logs/sepsis_event_log.xes")
simple_jaccard.perform_clustering(verbose=True)

In [None]:
# Simple Jaccard on the coselog event log.
coselog_simple_jaccard = Jaccard.Simple_Jaccard("../logs/coselog.xes")
coselog_simple_jaccard.perform_clustering()

In [None]:
# Simple Jaccard on the Road Traffic Fine Management Process event log.
bpic_simple_jaccard = Jaccard.Simple_Jaccard("../logs/Road_Traffic_Fine_Management_Process.xes")
bpic_simple_jaccard.perform_clustering()

In [None]:
# Simple Jaccard on the sepsis event log with filtering switched on
# (filtered=True, num_top_k=62), verbose output enabled.
# NOTE(review): presumably num_top_k limits the log to its 62 most frequent
# items before clustering -- confirm against the Jaccard module.
simple_jaccard_filtered = Jaccard.Simple_Jaccard("../logs/sepsis_event_log.xes", filtered=True, num_top_k=62)
simple_jaccard_filtered.perform_clustering(verbose=True)

In [None]:
# Same as the filtered sepsis run, but with a much smaller num_top_k (11),
# verbose output enabled.
# NOTE(review): the exact meaning of num_top_k is defined in the Jaccard
# module -- confirm there.
simple_jaccard_highly_filtered = Jaccard.Simple_Jaccard("../logs/sepsis_event_log.xes", filtered=True, num_top_k=11)
simple_jaccard_highly_filtered.perform_clustering(verbose=True)