In [1]:
def segment_level_events(event_data, segment):
    """
    Computes all segment level events given an event data and segment name.

    The 'start_time' and 'end_time' columns of ``event_data.pf`` are treated as
    hour offsets that are added to ``event_data.first``. They are replaced by
    calendar dates formatted as '%Y-%m-%d'; 'duration' is the number of whole
    days between those two dates.

    :param event_data: event data used (Performance_Spectrum.Eventdata); the
        attributes ``pf`` (pandas DataFrame) and ``first`` (datetime) are read
    :param segment: name of segment to compute events for
    :return: pandas Dataframe containing all segment level events for 'segment' in 'event_data'
    """
    def to_date_string(hours):
        # Split the offset into whole days and remaining hours before adding it.
        moment = event_data.first + datetime.timedelta(days=np.floor(hours / 24), hours=(hours % 24))
        return moment.strftime("%Y-%m-%d")

    frame = event_data.pf
    # Copy *after* filtering so the column assignments below write to an
    # independent frame instead of a slice of the original (no chained-assignment
    # warning, and no copy of rows that are thrown away anyway).
    data = frame[frame["segment_name"] == segment].copy()

    data["case_id"] = data["case_id"].astype(int)
    data["start_time"] = data["start_time"].apply(to_date_string)
    data["end_time"] = data["end_time"].apply(to_date_string)

    # Duration is measured between the (day-granular) date strings, not the raw offsets.
    elapsed = pd.to_datetime(data["end_time"], format='%Y-%m-%d') - pd.to_datetime(data["start_time"], format='%Y-%m-%d')
    data["duration"] = elapsed.dt.days

    # Resource names look like '<prefix>_<number>'; only the numeric part is kept.
    data["start_resource"] = data["start_org:resource"].str.split("_").str[1].astype(int)
    data["end_resource"] = data["end_org:resource"].str.split("_").str[1].astype(int)
    data["resources"] = list(zip(data["start_resource"], data["end_resource"]))

    return data[["segment_id", "segment_name", "start_time", "end_time", "duration", "case_id", "resources"]]

In [2]:
def system_level_events_delayed(segment_level_events, treshold_occurence=1, treshold_duration=0):
    """
    Computes all delayed system level events from the segment level events given.

    Segment level events that share segment name, start date, end date and
    duration are merged into one system level event.

    :param segment_level_events: pandas DataFrame containing all segment level events
    :param treshold_occurence: minimal number of cases (inclusive) a system level event must contain
    :param treshold_duration: minimal duration (inclusive) of a system level event
    :return: pandas Dataframe containing all computed system level events, filtered on both thresholds
    """
    group_keys = ['segment_name', 'start_time', 'end_time', 'duration']
    collected = ['case_id', 'segment_id', 'resources']

    grouped = segment_level_events.copy().groupby(group_keys)[collected].agg(list).reset_index()
    grouped = grouped.rename(columns={'case_id': 'cases', 'segment_id': 'segments'})

    if grouped.empty:
        return grouped

    grouped['nr_cases'] = grouped['cases'].str.len()

    # 'resources' holds (start user, end user) pairs; keep the distinct users per side.
    grouped["start_users"] = grouped["resources"].apply(lambda pairs: list({first for first, _ in pairs}))
    grouped["end_users"] = grouped["resources"].apply(lambda pairs: list({second for _, second in pairs}))

    long_enough = grouped["duration"] >= treshold_duration
    frequent_enough = grouped['nr_cases'] >= treshold_occurence
    selected = grouped[long_enough & frequent_enough]

    return selected[["segment_name", "start_time", "end_time", "duration", "nr_cases", "start_users", "end_users", "cases", "segments"]]

In [3]:
def system_level_events_batching_on_end(segment_level_events, treshold_occurence=1):
    """
    Computes all batching on end system level events with the same end date from the segment level events given.

    Segment level events are grouped per segment name and end date. For every
    group the (start date, duration, case id) triples of its members are kept
    in the 'startpoints' column.

    :param segment_level_events: pandas DataFrame containing all segment level events
    :param treshold_occurence: threshold on the number of cases; only events with
        strictly more cases than this value are kept
    :return: pandas Dataframe containing all computed batching on end system level events,
        filtered on the occurrence threshold (the unmodified empty copy if the input is empty)
    """
    data = segment_level_events.copy()

    if data.empty:
        return data

    data["startpoints"] = list(zip(data.start_time, data.duration, data.case_id))
    data = data.groupby(['segment_name', 'end_time'])[['startpoints', 'segment_id', 'resources']].agg(list).reset_index()

    if data.empty:
        return data

    data = data.rename(columns={"segment_id": "segments"})
    data["cases"] = data["startpoints"].apply(lambda x: [i for _, _, i in x])
    data["nr_cases"] = data["cases"].str.len()
    # Explicit copy: columns are added below, which must not target a slice
    # of the unfiltered frame.
    data = data[data["nr_cases"] > treshold_occurence].copy()

    # 'resources' holds (start user, end user) pairs; keep the distinct users per side.
    data["start_users"] = data["resources"].apply(lambda x: [*set(i for i, _ in x)])
    data["end_users"] = data["resources"].apply(lambda x: [*set(i for _, i in x)])

    return data[["segment_name", "end_time", "nr_cases", "start_users", "end_users", "cases", "segments", "startpoints"]]

In [4]:
def system_level_events_batching_on_start(segment_level_events, treshold_occurence=1):
    """
    Computes all batching on start system level events with the same start date from the segment level events given.

    Segment level events are grouped per segment name and start date. For every
    group the (end date, duration, case id) triples of its members are kept in
    the 'endpoints' column.

    :param segment_level_events: pandas DataFrame containing all segment level events
    :param treshold_occurence: threshold on the number of cases; only events with
        strictly more cases than this value are kept
    :return: pandas Dataframe containing all computed batching on start system level events,
        filtered on the occurrence threshold (the unmodified empty copy if the input is empty)
    """
    data = segment_level_events.copy()

    if data.empty:
        return data

    data["endpoints"] = list(zip(data.end_time, data.duration, data.case_id))
    data = data.groupby(['segment_name', 'start_time'])[
        ['endpoints', 'segment_id', 'resources']].agg(list).reset_index()

    if data.empty:
        return data

    data = data.rename(columns={"segment_id": "segments"})
    data["cases"] = data["endpoints"].apply(lambda x: [i for _, _, i in x])
    data["nr_cases"] = data["cases"].str.len()
    # Explicit copy: columns are added below, which must not target a slice
    # of the unfiltered frame.
    data = data[data["nr_cases"] > treshold_occurence].copy()

    # 'resources' holds (start user, end user) pairs; keep the distinct users per side.
    data["start_users"] = data["resources"].apply(lambda x: [*set(i for i, _ in x)])
    data["end_users"] = data["resources"].apply(lambda x: [*set(i for _, i in x)])

    return data[["segment_name", "start_time", "nr_cases", "start_users", "end_users", "cases", "segments", "endpoints"]]

In [5]:
def system_level_events_high_workload(segment_level_events, treshold_workload=1, same_user=True):
    """
    Computes all high workload system level events with the same end date from the segment level events given.

    Two ratios are computed per (segment name, end date) group:
    'ratio_workload' (number of cases / number of distinct end users) and
    'ratio_start_end' (number of distinct start users / number of distinct end users).

    :param segment_level_events: pandas DataFrame containing all segment level events
    :param treshold_workload: threshold the selected ratio must strictly exceed
    :param same_user: if true filter on 'ratio_workload', otherwise on 'ratio_start_end'
    :return: pandas Dataframe containing all computed high workload system level events
        (the unmodified empty copy if the input is empty)
    """
    events = segment_level_events.copy()
    if events.empty:
        return events

    events["startpoints"] = list(zip(events.start_time, events.duration, events.case_id))
    collected = ['startpoints', 'segment_id', 'resources']
    events = events.groupby(['segment_name', 'end_time'])[collected].agg(list).reset_index()
    if events.empty:
        return events

    events = events.rename(columns={"segment_id": "segments"})
    # Third entry of every startpoint triple is the case id.
    events["cases"] = events["startpoints"].apply(lambda points: [case for _, _, case in points])
    events["nr_cases"] = events["cases"].str.len()

    events["start_users"] = events["resources"].apply(lambda pairs: list({first for first, _ in pairs}))
    events["end_users"] = events["resources"].apply(lambda pairs: list({second for _, second in pairs}))

    nr_end_users = events["end_users"].str.len()
    events["ratio_start_end"] = events["start_users"].str.len() / nr_end_users
    events["ratio_workload"] = events["nr_cases"] / nr_end_users

    ratio_column = "ratio_workload" if same_user else "ratio_start_end"
    events = events[events[ratio_column] > treshold_workload]

    return events[["segment_name", "end_time", "nr_cases", "ratio_workload", "ratio_start_end", "start_users", "end_users", "cases", "segments", "startpoints"]]

In [7]:
def remove_duplicate_events(events):
    """
    Collapses duplicated batching events and relabels instantaneous ones as 'BAT'.

    Events of type 'BST' or 'BEN' that share start activity, end activity, start
    and end are reduced to their first occurrence. A remaining batching event whose
    'start' pair and 'end' pair both consist of two equal values gets type 'BAT'.
    All other event types are passed through unchanged and placed first.

    :param events: pandas DataFrame containing system level events
    :return: pandas DataFrame with the non-batching events followed by the de-duplicated batching events
    """
    data = events.copy()

    is_batching = data["type"].isin(["BST", "BEN"])
    base = data[~is_batching]
    bat = data[is_batching].drop_duplicates(
        subset=["start_activity", "end_activity", "start", "end"]).reset_index(drop=True)

    # 'start' and 'end' are (first, last) pairs; equal entries mean a single point in time.
    bat["type"] = [
        "BAT" if (start[0] == start[1] and end[0] == end[1]) else kind
        for start, end, kind in zip(bat["start"], bat["end"], bat["type"])
    ]

    data = pd.concat([base, bat]).reset_index(drop=True)

    return data[["start_activity", "end_activity", "start", "end", "type", "info", "label", "cases"]]

In [None]:
def system_level_events(event_log, segments, tresholds, event_type, ext_label, jenks_treshold=63):
    """
    Acts as a wrapper around system level events to prepare them for cascade detection.

    For every segment the system level events of the requested type are computed
    and extended with an 'info' value, a 'date_label' and 'start'/'end' pairs
    expressed in whole hours since the module level FIRST timestamp.

    :param event_log: event log used (Performance_Spectrum.EventLog)
    :param segments: name of segments to compute events for
    :param tresholds: thresholds for the events, one entry per segment (same order as 'segments')
    :param event_type: one of 'batching_end', 'batching_start', 'workload', 'workload_hand' or 'delayed'
    :param ext_label: if true the label also contains the date label of the event
    :param jenks_treshold: lower bound for the occurrence threshold of the two batching types
    :return: pandas Dataframe containing all computed system level events; an empty
        DataFrame with the same columns when no segment produced an event
    :raises ValueError: if 'event_type' is not one of the supported types
    """
    output_columns = ["start_activity", "end_activity", "start", "end", "type", "info", "label", "cases"]
    end_grouped_types = ("batching_end", "workload", "workload_hand")
    if event_type not in end_grouped_types + ("batching_start", "delayed"):
        # Previously this surfaced as a NameError on the unbound 'events' variable.
        raise ValueError("unknown event_type: " + repr(event_type))

    def parse_days(day_strings):
        # '%Y-%m-%d' strings -> datetimes
        return day_strings.apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d"))

    def short_days(day_strings):
        # '%Y-%m-%d' strings -> compact '%y%m%d' strings used in labels
        return day_strings.apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d").strftime("%y%m%d"))

    def first_and_last(points):
        # Every point is a (date string, duration, case id) triple; only the date is used.
        stamps = points.apply(lambda x: [datetime.datetime.strptime(date, "%Y-%m-%d") for date, _, _ in x])
        return stamps.apply(lambda x: min(x)), stamps.apply(lambda x: max(x))

    def to_hours(moments):
        # Whole hours elapsed between FIRST and every moment.
        deltas = moments - FIRST
        return (deltas.dt.days * 24 + round(deltas.dt.seconds / 3600)).astype(int)

    sys_events = []

    for i, segment in enumerate(segments):
        seg_events = segment_level_events(event_log, segment)

        if event_type == "batching_end":
            events = system_level_events_batching_on_end(seg_events, max(jenks_treshold, tresholds[i]))
        elif event_type == "batching_start":
            events = system_level_events_batching_on_start(seg_events, max(jenks_treshold, tresholds[i]))
        elif event_type == "workload":
            events = system_level_events_high_workload(seg_events, tresholds[i])
        elif event_type == "workload_hand":
            events = system_level_events_high_workload(seg_events, tresholds[i], same_user=False)
        else:
            # 'delayed': the occurrence threshold is fixed at 10, 'tresholds' holds the minimal duration.
            events = system_level_events_delayed(seg_events, 10, tresholds[i])

        if events.empty:
            continue

        # Columns are added below; work on a private copy rather than on a filtered slice.
        events = events.copy()

        if event_type in end_grouped_types:
            if event_type == "batching_end":
                events["info"] = events["cases"].str.len()
            elif event_type == "workload":
                events["info"] = events["ratio_workload"]
            else:
                events["info"] = events["ratio_start_end"]

            # Events share their end date; the start spans the earliest to the latest start date.
            firstdates, lastdates = first_and_last(events["startpoints"])
            events["start_date"] = list(zip(firstdates.apply(lambda x: (x.strftime("%y%m%d"))),
                                            lastdates.apply(lambda x: (x.strftime("%y%m%d")))))
            events["end_date"] = list(zip(events["end_time"], events["end_time"]))
            events["date_label"] = short_days(events["end_time"]).astype(str)

            end_hours = to_hours(parse_days(events["end_time"]))
            events["start"] = list(zip(to_hours(firstdates), to_hours(lastdates)))
            events["end"] = list(zip(end_hours, end_hours))
        elif event_type == "batching_start":
            events["info"] = events["cases"].str.len()

            # Events share their start date; the end spans the earliest to the latest end date.
            firstdates, lastdates = first_and_last(events["endpoints"])
            events["end_date"] = list(zip(firstdates.apply(lambda x: (x.strftime("%Y-%m-%d"))),
                                          lastdates.apply(lambda x: (x.strftime("%Y-%m-%d")))))
            events["start_date"] = list(zip(events["start_time"], events["start_time"]))
            events["date_label"] = short_days(events["start_time"]).astype(str)

            start_hours = to_hours(parse_days(events["start_time"]))
            events["start"] = list(zip(start_hours, start_hours))
            events["end"] = list(zip(to_hours(firstdates), to_hours(lastdates)))
        else:
            events["info"] = events["duration"]

            # Delayed events have a single start date and a single end date.
            events["start_date"] = list(zip(events["start_time"], events["start_time"]))
            events["end_date"] = list(zip(events["end_time"], events["end_time"]))
            events["date_label"] = (short_days(events["start_time"]).astype(str) + "-"
                                    + short_days(events["end_time"]).astype(str))

            start_hours = to_hours(parse_days(events["start_time"]))
            end_hours = to_hours(parse_days(events["end_time"]))
            events["start"] = list(zip(start_hours, start_hours))
            events["end"] = list(zip(end_hours, end_hours))

        sys_events.append(events)

    if not sys_events:
        # Previously 'data' was unbound here, which raised instead of returning a result.
        return pd.DataFrame(columns=output_columns)

    data = pd.concat(sys_events)
    temp = data["segment_name"].str.split(" - ", n=1, expand=True)
    data["start_activity"] = temp[0].replace(Short.ACTIVITY)
    data["end_activity"] = temp[1].replace(Short.ACTIVITY)
    data["type"] = Short.TYPE[event_type]

    if ext_label:
        data["label"] = data["start_activity"].astype(str) + "|" + data["end_activity"].astype(str) + " " + data["date_label"] + " " + data["type"]
    else:
        data["label"] = data["start_activity"].astype(str) + "|" + data["end_activity"].astype(str) + "|" + data["type"]

    return data[output_columns]

In [8]:
def compute_system_level_events(event_logs, tresholds, ext_label=True):
    """
    Computes, combines and stores the system level events of every event log.

    For each log all five event types are computed, concatenated, de-duplicated,
    numbered and rearranged. The resulting dictionary is also written to disk via
    save_system_level_events.

    :param event_logs: sequence of entries whose first element is the event log and whose second element is its name
    :param tresholds: dictionary with the keys 'batching_end', 'batching_start', 'high_workload',
        'high_workload_hand' and 'delayed'
    :param ext_label: passed on to system_level_events
    :return: dictionary mapping every log name to its pandas DataFrame of events
    """
    events_dict = {}
    for position, log in enumerate(event_logs):
        log_data, log_name = log[0], log[1]
        print("computing events for log:", log_name)

        batching_end = system_level_events(
            log_data, Segments.SEGMENTS_COMPLETE, tresholds["batching_end"][position], "batching_end", ext_label)
        batching_start = system_level_events(
            log_data, Segments.SEGMENTS_COMPLETE, tresholds["batching_start"][position], "batching_start", ext_label)
        high_workload = system_level_events(
            log_data, Segments.SEGMENTS_HIGH_WORKLOAD, tresholds["high_workload"][position], "workload", ext_label)
        high_workload_hand = system_level_events(
            log_data, Segments.SEGMENTS_HIGH_WORKLOAD_HAND, tresholds["high_workload_hand"][position],
            "workload_hand", ext_label)
        # NOTE(review): unlike the other entries, 'delayed' is not indexed per log — confirm this is intended.
        delayed = system_level_events(
            log_data, Segments.SEGMENTS_COMPLETE, tresholds["delayed"], "delayed", ext_label)

        combined = pd.concat([delayed, batching_end, batching_start, high_workload, high_workload_hand])
        combined = remove_duplicate_events(combined)
        combined["id"] = range(len(combined))
        combined = combined[["id", "start_activity", "end_activity", "start", "end", "type", "info", "label", "cases"]]
        events_dict[log_name] = rearrange_events(combined)

    save_system_level_events(events_dict)

    return events_dict

In [9]:
def rearrange_events(events):
    """
    Keeps the events of the cascade segments, orders them by segment and renumbers them.

    The segment order is SEGMENTS_CASCADE0, then SEGMENTS_CASCADE1, then
    SEGMENTS_CASCADE2; the 'id' column is reassigned after sorting.

    :param events: pandas DataFrame containing system level events
    :return: pandas DataFrame with the filtered, reordered and renumbered events
    """
    segments = Segments.SEGMENTS_CASCADE0 + Segments.SEGMENTS_CASCADE1 + Segments.SEGMENTS_CASCADE2
    data = filter_system_level_events(events, segments)

    # Position of every (start activity, end activity) pair in the requested order.
    rank = {}
    for position, segment in enumerate(segments):
        parts = segment.split(" - ")
        rank[(Short.ACTIVITY[parts[0]], Short.ACTIVITY[parts[1]])] = position

    data["segment"] = list(zip(data["start_activity"], data["end_activity"]))
    data = data.sort_values('segment', key=lambda column: column.map(rank))
    data["id"] = range(len(data))
    data = data.reset_index()

    return data[["id", "start_activity", "end_activity", "start", "end", "type", "info", "label", "cases"]]

In [10]:
def save_system_level_events(events):
    """
    Pickles 'events' to the file 'output/dumps/events' using the highest available protocol.

    :param events: object to store
    """
    target = 'output/dumps/events'
    with open(target, 'wb+') as handle:
        # Protocol -1 selects the highest protocol supported by the running interpreter.
        pickle.dump(events, handle, protocol=-1)

In [11]:
def load_system_level_events():
    """
    Loads the object previously pickled to 'output/dumps/events'.

    :return: the unpickled object
    """
    # NOTE: unpickling can execute arbitrary code; only load dumps written by this project.
    with open('output/dumps/events', 'rb') as dump:
        return pickle.load(dump)

In [12]:
def filter_system_level_events(events, segments):
    """
    Keeps only the events that belong to one of the given segments.

    :param events: pandas DataFrame containing system level events (including an 'id' column)
    :param segments: segment names of the form '<start activity> - <end activity>'
    :return: pandas DataFrame with the matching events
    """
    data = events.copy()
    data["segment"] = list(zip(data["start_activity"], data["end_activity"]))

    # Events store shortened activity names, so translate the segment names first.
    wanted = []
    for segment in segments:
        parts = segment.split(" - ")
        wanted.append((Short.ACTIVITY[parts[0]], Short.ACTIVITY[parts[1]]))

    data = data[data["segment"].isin(wanted)]

    return data[["id", "start_activity", "end_activity", "start", "end", "type", "info", "label", "cases"]]

In [13]:
def add_adjacency_list(system_level_events, criteria, case_overlap):
    """
    Adds to every event the ids of the events that may directly follow it.

    An event B follows an event A when B starts at the activity A ends at, both
    share enough cases, and the 'end' interval of A and the 'start' interval of B
    are aligned according to 'criteria':

    * 'loose': the lower bound of one interval lies inside the other interval
    * 'strict': one interval is completely contained in the other
    * 'very_strict': as 'strict', but the required number of shared cases is
      floor(min(len(cases of A), len(cases of B)) / 2) and 'case_overlap' is ignored

    :param system_level_events: pandas DataFrame containing system level events
    :param criteria: 'loose', 'strict' or 'very_strict'
    :param case_overlap: minimal number of shared cases (used by 'loose' and 'strict')
    :return: pandas DataFrame with the columns id, start_activity, end_activity, type, info, adjacency_list
    :raises ValueError: if 'criteria' is not one of the supported values
    """
    if criteria not in ("loose", "strict", "very_strict"):
        # Previously this surfaced as a KeyError on the missing 'adjacency_list' column.
        raise ValueError("unknown criteria: " + repr(criteria))

    def loosely_aligned(end, start):
        return (end[0] <= start[0] <= end[1]) or (start[0] <= end[0] <= start[1])

    def contained(end, start):
        return (end[0] <= start[0] <= start[1] <= end[1]) or (start[0] <= end[0] <= end[1] <= start[1])

    aligned = loosely_aligned if criteria == "loose" else contained

    data = system_level_events.copy()

    # Plain tuples instead of row-wise DataFrame.apply: the latter returns a
    # DataFrame for an empty frame and filters the whole frame once per row.
    rows = [
        (event_id, start_activity, end_activity, start, end, len(cases), set(cases))
        for event_id, start_activity, end_activity, start, end, cases in zip(
            data["id"], data["start_activity"], data["end_activity"],
            data["start"], data["end"], data["cases"])
    ]

    adjacency = []
    for _, _, end_activity, _, end, nr_cases, case_set in rows:
        followers = []
        for other_id, other_start_activity, _, other_start, _, other_nr_cases, other_case_set in rows:
            if other_start_activity != end_activity:
                continue
            if criteria == "very_strict":
                required = math.floor(min(other_nr_cases / 2, nr_cases / 2))
            else:
                required = case_overlap
            if len(other_case_set & case_set) >= required and aligned(end, other_start):
                followers.append(other_id)
        adjacency.append(followers)

    data["adjacency_list"] = pd.Series(adjacency, index=data.index, dtype=object)

    return data[["id", "start_activity", "end_activity", "type", "info", "adjacency_list"]]

In [14]:
def strongly_connected_components(events):
    """
    Builds the event graph and returns its connected components with more than one node.

    Despite the name, the components are computed with
    nx.weakly_connected_components. The number of edges of the full graph is printed.

    :param events: pandas DataFrame with the columns 'id' and 'adjacency_list'
    :return: list of directed subgraphs (copies), ordered by increasing number of nodes
    """
    graph = nx.DiGraph()
    for _, row in events.iterrows():
        source = row["id"]
        graph.add_node(source)
        graph.add_edges_from((source, target) for target in row["adjacency_list"])

    print(len(graph.edges()))

    node_sets = [nodes for nodes in nx.weakly_connected_components(graph) if len(nodes) > 1]
    node_sets.sort(key=len)

    return [graph.subgraph(nodes).copy() for nodes in node_sets]

In [15]:
def paths(graph):
    """
    Collects all simple paths that lead from a source node to a sink node.

    A source is a node without incoming edges, a sink a node without outgoing edges.

    :param graph: directed networkx graph
    :return: list of paths, each path being a list of nodes
    """
    sources = [node for node, indegree in graph.in_degree(graph.nodes()) if indegree == 0]
    sinks = [node for node, outdegree in graph.out_degree(graph.nodes()) if outdegree == 0]

    found = []
    for sink in sinks:
        for source in sources:
            found.extend(nx.all_simple_paths(graph, source=source, target=sink))

    return found

In [16]:
def cascades(events):
    """
    Computes all cascades, i.e. source-to-sink paths in the event graph.

    Every event on a path is represented by the tuple
    (id, '<start activity>|<end activity>', type, info).

    :param events: pandas DataFrame of events including an 'adjacency_list' column
    :return: list of cascades, each cascade being a list of event tuples
    """
    data = events.copy()
    segment_labels = data["start_activity"] + "|" + data["end_activity"]
    data["compact"] = list(zip(data["id"], segment_labels, data["type"], data["info"]))
    lookup = dict(zip(data["id"], data["compact"]))

    result = []
    for component in strongly_connected_components(data):
        for path in paths(component):
            result.append([lookup[node] for node in path])

    return result

In [17]:
def detect_cascades(events, name, segment_groups, criteria="strict", case_overlap=5):
    """
    Detects the cascades of every segment group and stores all of them under 'name'.

    For each group its index and the number of cascades found are printed.

    :param events: pandas DataFrame containing system level events
    :param name: name passed to save_cascades
    :param segment_groups: iterable of segment name collections, processed independently
    :param criteria: alignment criteria passed to add_adjacency_list
    :param case_overlap: minimal case overlap passed to add_adjacency_list
    """
    cascade_list = []
    for position, segments in enumerate(segment_groups):
        selected = filter_system_level_events(events.copy(), segments)
        found = cascades(add_adjacency_list(selected, criteria, case_overlap))
        print(position, len(found))
        cascade_list.extend(found)

    save_cascades(cascade_list, name)

In [18]:
def is_subsequence(query, base):
    """
    Checks whether 'query' occurs in 'base' as a contiguous run of elements.

    :param query: sequence to look for
    :param base: sequence to search in
    :return: True if some slice of 'base' equals 'query', False otherwise
    """
    width = len(query)
    last_offset = len(base) - width
    return any(base[offset:offset + width] == query for offset in range(last_offset + 1))

In [19]:
def filter_cascades(cascades, target):
    """
    Returns the cascades that contain 'target' as a contiguous run.

    Every cascade element is reduced to its first two entries before matching.

    :param cascades: list of cascades, each a list of event tuples
    :param target: list of (first entry, second entry) pairs to look for
    :return: list of the matching, reduced cascades
    """
    reduced = [[(event[0], event[1]) for event in cascade] for cascade in cascades]
    return [cascade for cascade in reduced if is_subsequence(target, cascade)]

In [20]:
def create_lookup(cascades):
    """
    Maps the first entry of every cascade element to '<second entry>|<third entry>'.

    :param cascades: list of cascades, each a list of event tuples
    :return: dictionary from first entry to the combined label; later occurrences overwrite earlier ones
    """
    return {
        event[0]: event[1] + "|" + event[2]
        for cascade in cascades
        for event in cascade
    }

In [21]:
def get_frequencies(cascades, pattern):
    """
    Prints, per subset, how many distinct occurrences of 'pattern' its cascades contain.

    An occurrence is the set of event ids of one cascade whose label (see
    create_lookup) is part of 'pattern'; it counts when it holds exactly
    len(pattern) ids.

    :param cascades: dictionary mapping a subset name to its list of cascades
    :param pattern: collection of event labels that make up the pattern
    """
    print(pattern)
    for subset, subset_cascades in cascades.items():
        print(subset + ":", end=" ")
        lookup = create_lookup(subset_cascades)
        ids = {event_id for event_id, label in lookup.items() if label in pattern}

        # frozenset instead of tuple(set(...)): the iteration order of a set depends
        # on its insertion history, so equal id sets could yield different tuples
        # and be counted more than once.
        occurrences = {
            frozenset(ids.intersection(event[0] for event in cascade))
            for cascade in subset_cascades
        }
        complete = [occurrence for occurrence in occurrences if len(occurrence) == len(pattern)]
        print(len(complete), end=" ")
    print()

In [22]:
def get_clean_cascades(cascades, pattern):
    """
    Returns the distinct occurrences of 'pattern' in the given cascades.

    An occurrence is the set of event ids of one cascade whose label (see
    create_lookup) is part of 'pattern'; it is kept when it holds exactly
    len(pattern) ids.

    :param cascades: list of cascades, each a list of event tuples
    :param pattern: collection of event labels that make up the pattern
    :return: list of occurrences, each a sorted list of event ids (the order of the
        occurrences themselves is not defined)
    """
    lookup = create_lookup(cascades)
    ids = {event_id for event_id, label in lookup.items() if label in pattern}

    # frozenset instead of tuple(set(...)): the iteration order of a set depends on
    # its insertion history, so equal id sets could yield different tuples and
    # survive the de-duplication as separate entries.
    occurrences = {
        frozenset(ids.intersection(event[0] for event in cascade))
        for cascade in cascades
    }

    return [sorted(occurrence) for occurrence in occurrences if len(occurrence) == len(pattern)]

In [23]:
def save_cascades(cascades, name):
    """
    Pickles 'cascades' to the file 'output/dumps/cascades_<name>' using the highest available protocol.

    :param cascades: object to store
    :param name: suffix of the file name
    """
    target = 'output/dumps/cascades_' + name
    with open(target, 'wb+') as handle:
        # Protocol -1 selects the highest protocol supported by the running interpreter.
        pickle.dump(cascades, handle, protocol=-1)

In [24]:
def load_cascades(names):
    """
    Loads the cascades previously pickled to 'output/dumps/cascades_<name>' for every given name.

    :param names: iterable of name suffixes
    :return: dictionary mapping every name to its unpickled object
    """
    cascades = {}
    for name in names:
        # NOTE: unpickling can execute arbitrary code; only load dumps written by this project.
        with open('output/dumps/cascades_' + name, 'rb') as dump:
            cascades[name] = pickle.load(dump)

    return cascades

In [25]:
def patterns(cascades, count=10):
    """
    Groups cascades into patterns and keeps the frequent ones.

    Cascades that run over the same segments with the same event types form one
    pattern. Per pattern the returned frame holds:

    * 'cascades': the activities visited, i.e. the '|'-separated segment labels
      split up with consecutive repetitions merged
    * 'event_types': the list of event types
    * 'count': the number of cascades with this pattern
    * 'len': the number of events in the pattern

    :param cascades: list of cascades, each a list of (id, segment label, event type, value) tuples
    :param count: minimal number of occurrences (inclusive) a pattern needs to be returned
    :return: pandas DataFrame with the columns cascades, event_types, count and len
    """
    # NOTE(review): the result has no 'values' column, although to_graphs reads one from
    # its input — confirm which frame is meant to be passed there.
    df = pd.DataFrame([zip(*row) for row in cascades], columns=["ids", "cascades", "event_types", "values"])

    # Only the group sizes are needed; everything else the result contains derives from the keys.
    df = df.groupby(['cascades', 'event_types']).size().reset_index(name="count")

    # 'A|B', 'B|C' -> A, B, B, C -> A, B, C
    df["cascades"] = df["cascades"].apply(
        lambda x: [v for v, _ in groupby(chain.from_iterable(v.split("|") for v in x))])
    df["event_types"] = df["event_types"].apply(lambda x: list(x))
    df["len"] = df["event_types"].apply(lambda x: len(x)).astype(int)

    df = df[df["count"] >= count].reset_index(drop=True)

    return df[["cascades", "event_types", "count", "len"]]

In [26]:
def to_graphs(patterns):
    """
    Draws every pattern as a graph and writes it to 'patterns2/pattern<i>.png'.

    The nodes are the activities of the pattern (translated back via Short.ACTIVITY),
    consecutive activities are connected, and every edge is labelled with
    '<event type>|<value>'.

    :param patterns: pandas DataFrame with the columns 'cascades', 'event_types' and 'values'
    """
    inv_lookup = {short: full for full, short in Short.ACTIVITY.items()}

    # All graphs are built first; drawing only starts once every pattern converted cleanly.
    rendered = []
    for _, row in patterns.copy().iterrows():
        nodes = [inv_lookup[node] for node in row["cascades"]]
        edges = list(zip(nodes, nodes[1:]))
        edge_labels = [kind + "|" + str(value) for kind, value in zip(row["event_types"], row["values"])]

        graph = nx.DiGraph()
        graph.add_nodes_from(nodes)
        for position, (source, target) in enumerate(edges):
            graph.add_edge(source, target, label=" " + edge_labels[position])

        rendered.append(nx.nx_agraph.to_agraph(graph))

    for position, graph in enumerate(rendered):
        graph.layout('dot')
        graph.draw('patterns2/pattern' + str(position) + '.png')

In [27]:
def detect_N_pattern(events):
    """
    Detects N patterns per segment.

    For a delayed event (duration > 0) of a segment, an N pattern exists when the
    same segment also has an instant event (duration == 0) on the start date and
    one on the end date of the delayed event. Every detection is printed.

    :param events: pandas DataFrame with the columns start_activity, end_activity,
        start_time, end_time and duration
    :return: list of pandas DataFrames, each holding the instant events on the start date,
        the events with the start and end date of the delayed event, and the instant
        events on the end date
    """
    data = events.copy()
    found = []

    for segment in Segments.SEGMENTS_COMPLETE:
        start_activity, end_activity = segment.split(" - ", 1)
        same_start = data["start_activity"] == Short.ACTIVITY[start_activity]
        same_end = data["end_activity"] == Short.ACTIVITY[end_activity]
        subset = data[same_start & same_end]
        if len(subset) == 0:
            continue

        delayed = subset[subset["duration"] > 0]
        instants = subset[subset["duration"] == 0]

        for start, end in zip(delayed["start_time"], delayed["end_time"]):
            instant_start = instants[instants["start_time"] == start]
            instant_end = instants[instants["start_time"] == end]
            if len(instant_start) == 0 or len(instant_end) == 0:
                continue

            print(segment, start, end)
            middle = subset[(subset["start_time"] == start) & (subset["end_time"] == end)]
            found.append(pd.concat([instant_start, middle, instant_end]))

    return found

In [28]:
def N_patterns_graph(patterns):
    """
    Converts N patterns into graphs.

    Every graph has four nodes (start and end activity, each at the start and at the
    end date taken from the second row of the pattern) and three edges labelled with
    the duration and number of cases of the first three rows of the pattern.

    :param patterns: list of pandas DataFrames with the columns start_activity,
        end_activity, start_time, end_time, duration and nr_cases
    :return: list of AGraph objects created by nx.nx_agraph.to_agraph
    """
    inv_lookup = {short: full for full, short in Short.ACTIVITY.items()}
    graphs = []

    for pattern in patterns:
        head = pattern.iloc[0]
        second = pattern.iloc[1]
        start_a = inv_lookup[head["start_activity"]]
        end_a = inv_lookup[head["end_activity"]]
        start_t = second["start_time"]
        end_t = second["end_time"]

        nodes = [
            start_a + " " + start_t,
            end_a + " " + start_t,
            start_a + " " + end_t,
            end_a + " " + end_t,
        ]
        edges = [(nodes[0], nodes[1]), (nodes[0], nodes[3]), (nodes[2], nodes[3])]
        edge_labels = [
            "dur: " + str(duration) + ", #cases: " + str(nr_cases)
            for duration, nr_cases in zip(pattern["duration"].tolist(), pattern["nr_cases"].tolist())
        ]

        graph = nx.DiGraph()
        graph.add_nodes_from(nodes)
        for position, (source, target) in enumerate(edges):
            graph.add_edge(source, target, label=" " + edge_labels[position])

        graphs.append(nx.nx_agraph.to_agraph(graph))

    return graphs

In [29]:
def batches(event_log, segments):
    """
    Collects the large system level events of all segments that contain at least one delay.

    Per segment the delayed system level events with at least 53 cases are computed.
    All of them are kept as soon as one has a duration greater than zero.

    :param event_log: event log used, passed to segment_level_events
    :param segments: name of segments to compute events for
    :return: pandas DataFrame with the columns start_activity, end_activity, start_time,
        end_time, duration and nr_cases; empty with the same columns when no segment qualifies
    """
    output_columns = ["start_activity", "end_activity", "start_time", "end_time", "duration", "nr_cases"]
    sys_events = []

    for segment in segments:
        seg_events = segment_level_events(event_log, segment)
        events = system_level_events_delayed(seg_events, treshold_occurence=53)

        # One real delay is enough to keep every event of the segment, instant ones included.
        if not events.empty and (events["duration"] > 0).any():
            sys_events.append(events)

    if not sys_events:
        # Previously 'data' was unbound here, which raised instead of returning a result.
        return pd.DataFrame(columns=output_columns)

    data = pd.concat(sys_events)
    new = data["segment_name"].str.split(" - ", n=1, expand=True)
    data["start_activity"] = new[0].replace(Short.ACTIVITY)
    data["end_activity"] = new[1].replace(Short.ACTIVITY)
    data["nr_cases"] = data["cases"].str.len()

    return data[output_columns]

In [30]:
def _outcome_counts(case_set):
    """
    Counts the cases of 'case_set' per outcome class.

    The classes are read from the notebook globals successful_cases, not_successful_cases,
    _10days_cases, _1030days_cases and _30days_cases.

    :param case_set: set of case ids
    :return: list [succ, nsucc, fast, avg, slow] with the number of cases per class
    """
    classes = (successful_cases, not_successful_cases, _10days_cases, _1030days_cases, _30days_cases)
    return [len(case_set.intersection(set(members))) for members in classes]


def _print_outcome_line(title, counts, shares, median_duration, nr_cases):
    """
    Prints one summary line (counts, percentages, median duration and number of cases).

    :param title: label printed in front of the first count (includes the "SUCC" label)
    :param counts: list [succ, nsucc, fast, avg, slow] as returned by _outcome_counts
    :param shares: the counts divided by the number of cases
    :param median_duration: median duration printed after "MED"
    :param nr_cases: number of cases printed at the end of the line
    """
    labels = [title, "NSUCC", "FAST", "AVG", "SLOW"]
    ends = ["%)| ", "%)| ", "%)| ", "%)| ", "%) ("]
    for label, count, share, end in zip(labels, counts, shares, ends):
        print(label, count, "(" + str(round(share * 100, 1)), end=end)
    print("MED", median_duration, end=")| ")
    print("(", nr_cases, "CASES )")


def _save_duration_histogram(values, title, path):
    """
    Plots a histogram of 'values' (bins 0..149), saves it to 'path' and clears the figure.

    :param values: list of durations
    :param title: plot title
    :param path: file the figure is saved to
    """
    plt.hist(x=values, bins=range(0, 150), alpha=0.7, rwidth=0.85)
    plt.grid(axis='y', alpha=0.75)
    plt.xlabel('Duration')
    plt.xlim(0, 150)
    plt.ylabel('Frequency')
    plt.title(title)
    plt.savefig(path)
    plt.clf()


def pattern_statistics(event_log, cascade_list, pattern, cases, durations, index, scatter, hist):
    """
    Prints outcome and duration statistics for the cases following a pattern versus the
    cases that contain the pattern's segments but do not follow it.

    For every cascade the cases shared by its first and last event are determined. If in
    every cascade at least 66.6% of the cases of the first event are still present in the
    last event (or the pattern has exactly two elements), the statistics are computed for
    the pattern as a whole. Otherwise the pattern is split into its consecutive pairs and
    each pair is evaluated recursively.

    Side effects for an evaluated pattern: the case sets are pickled to "output/cases2/",
    optional plots are written to "output/scatterplots2/" and "output/histograms2/"
    (the directories must already exist) and the statistics are printed.

    :param event_log: event data passed on to cases_in_trace
    :param cascade_list: list of cascades, each a list of event ids aligned with 'pattern'
    :param pattern: list of pattern elements (strings containing "|")
    :param cases: dict mapping an event id to the cases of that event
    :param durations: dict mapping str(case id) to the duration of the case
    :param index: number used as prefix of the output file names
    :param scatter: if True, save a scatter plot of #successful vs. #unsuccessful cases per cascade
    :param hist: if True, save duration histograms for both case sets
    :return: None
    :raises ZeroDivisionError: if the first event of a cascade has no cases or one of the
                               two case sets is empty
    """
    # Cases shared by the first and last event of every cascade; computed once and reused.
    overlaps = []
    ratios = []
    for cascade in cascade_list:
        start_cases = set(cases[cascade[0]])
        end_cases = set(cases[cascade[-1]])
        overlap = start_cases.intersection(end_cases)
        overlaps.append(overlap)
        ratios.append(len(overlap) / len(start_cases))

    if not (all(r >= 0.666 for r in ratios) or len(pattern) == 2):
        # Too many cases drop out along the cascades: evaluate the consecutive pairs instead.
        for i in range(len(pattern) - 1):
            j = i + 1
            new_pattern = [pattern[i], pattern[j]]
            pattern_statistics(event_log, [[c[i], c[j]] for c in cascade_list], new_pattern, cases, durations, index, scatter, hist)
        return

    print("---------------------------------------")
    print(pattern)
    print("---------------------------------------")

    # Start from all cases containing every segment of the pattern and move the
    # cases that take part in a cascade over to the pattern set.
    not_pattern = cases_in_trace(event_log, pattern)
    _pattern = set()
    for overlap in overlaps:
        not_pattern = not_pattern.difference(overlap)
        _pattern = _pattern.union(overlap)

    label = (pattern[0] + "-" + pattern[1]).replace("|", "-")

    if scatter:
        x = [len(overlap.intersection(set(successful_cases))) for overlap in overlaps]
        y = [len(overlap.intersection(set(not_successful_cases))) for overlap in overlaps]
        plt.scatter(x, y)
        plt.title(label)
        plt.xlabel("#successful")
        plt.xlim(0, 180)
        plt.ylabel("#unsuccessful")
        plt.ylim(0, 180)
        plt.savefig("output/scatterplots2/" + str(index) + label + ".jpg")
        plt.clf()

    # The medians are printed below regardless of 'hist', so they are always computed
    # (previously they only existed when hist was True). Only durations above 30 count.
    pattern_durations = [durations[str(case)] for case in _pattern]
    _pattern_med_dur = median([d for d in pattern_durations if d > 30])
    not_pattern_durations = [durations[str(case)] for case in not_pattern]
    not_pattern_med_dur = median([d for d in not_pattern_durations if d > 30])

    if hist:
        _save_duration_histogram(pattern_durations, label,
                                 "output/histograms2/" + str(index) + label + ".jpg")
        _save_duration_histogram(not_pattern_durations, "NOT" + label,
                                 "output/histograms2/NOT_" + str(index) + label + ".jpg")

    with open("output/cases2/" + str(index) + label, 'wb+') as output:
        pickle.dump(_pattern, output, -1)

    with open("output/cases2/NOT_" + str(index) + label, 'wb+') as output:
        pickle.dump(not_pattern, output, -1)

    pattern_counts = _outcome_counts(_pattern)
    pattern_shares = [count / len(_pattern) for count in pattern_counts]
    not_pattern_counts = _outcome_counts(not_pattern)
    not_pattern_shares = [count / len(not_pattern) for count in not_pattern_counts]

    # First test: successful vs. not successful; second test: fast vs. average vs. slow.
    succ_sign = get_significance(pattern_counts[:2], not_pattern_counts[:2])
    dur_sign = get_significance(pattern_counts[2:], not_pattern_counts[2:])

    print(len(cascade_list), "CASCADES")
    _print_outcome_line("PATTERN:     SUCC", pattern_counts, pattern_shares, _pattern_med_dur, len(_pattern))
    _print_outcome_line("NOT PATTERN: SUCC", not_pattern_counts, not_pattern_shares, not_pattern_med_dur, len(not_pattern))

    print(("SUCC SIGN: YES" if succ_sign < 0.05 else "SUCC SIGN: NO"), end =" ")
    print("(p-value: " + str(succ_sign) + ")")
    print(("DUR SIGN: YES" if dur_sign < 0.05 else "DUR SIGN: NO"), end =" ")
    print("(p-value: " + str(dur_sign) + ")")

In [31]:
def pattern_statistics2(event_log, cascade_list, pattern, cases, durations, index, data, first=True):
    """
    Prints outcome and duration statistics for a pattern and records the result in a copy of 'data'.

    For every cascade the cases shared by its first and last event are determined. If in
    every cascade at least 66.6% of the cases of the first event are shared (or the pattern
    has exactly two elements), the pattern is evaluated as a whole; otherwise each pair of
    consecutive pattern elements is evaluated recursively (with first=False).

    :param event_log: event data passed on to cases_in_trace
    :param cascade_list: list of cascades, each a list of event ids aligned with 'pattern'
    :param pattern: list of pattern elements
    :param cases: dict mapping an event id to the cases of that event
    :param durations: dict mapping str(case id) to the duration of the case
    :param index: row label of 'data' that belongs to this pattern
    :param data: pandas DataFrame with the columns 'maximal' and 'nr_cases'; it is not modified
    :param first: True for the top-level call; only then 'maximal' and 'nr_cases' are written
    :return: copy of 'data', updated at row 'index' if the pattern was evaluated as a whole
             in the top-level call
    """
    df = data.copy()

    # Shared cases of first and last event per cascade, plus the share that is retained.
    shared_per_cascade = []
    ratios = []
    for cascade in cascade_list:
        head = set(cases[cascade[0]])
        tail = set(cases[cascade[-1]])
        shared = head & tail
        shared_per_cascade.append(shared)
        ratios.append(len(shared) / len(head))

    holds_everywhere = all(ratio >= 0.666 for ratio in ratios)
    if not holds_everywhere and len(pattern) != 2:
        # Fall back to the consecutive pairs of the pattern.
        for left in range(len(pattern) - 1):
            right = left + 1
            pair = [pattern[left], pattern[right]]
            pair_cascades = [[cascade[left], cascade[right]] for cascade in cascade_list]
            pattern_statistics2(event_log, pair_cascades, pair, cases, durations, index, df, first=False)
        return df

    if first:
        print("||", index, "||")
        df.at[index, 'maximal'] = True
    print("---------------------------------------")
    print(pattern)
    print("---------------------------------------")

    # Cases containing all segments of the pattern, minus those taking part in a cascade.
    not_pattern = cases_in_trace(event_log, pattern).difference(*shared_per_cascade)
    _pattern = set().union(*shared_per_cascade)

    # Medians only consider durations above 30.
    in_durations = [durations[str(case)] for case in _pattern]
    _pattern_med_dur = median([value for value in in_durations if value > 30])
    out_durations = [durations[str(case)] for case in not_pattern]
    not_pattern_med_dur = median([value for value in out_durations if value > 30])

    def tally(case_set):
        # Counts per class (succ, nsucc, fast, avg, slow) and their share of the case set.
        counts = []
        for members in (successful_cases, not_successful_cases, _10days_cases, _1030days_cases, _30days_cases):
            counts.append(len(case_set.intersection(set(members))))
        shares = [count / len(case_set) for count in counts]
        return counts, shares

    def show(title, counts, shares, med_dur, nr_cases):
        # One summary line: counts with percentages, median duration, number of cases.
        labels = (title, "NSUCC", "FAST", "AVG", "SLOW")
        ends = ("%)| ", "%)| ", "%)| ", "%)| ", "%) (")
        for label, count, share, end in zip(labels, counts, shares, ends):
            print(label, count, "(" + str(round(share * 100, 1)), end=end)
        print("MED", med_dur, end=")| ")
        print("(", nr_cases, "CASES )")

    in_counts, in_shares = tally(_pattern)
    out_counts, out_shares = tally(not_pattern)

    succ_sign = get_significance(in_counts[:2], out_counts[:2])
    dur_sign = get_significance(in_counts[2:], out_counts[2:])

    print(len(cascade_list), "CASCADES")
    show("PATTERN:     SUCC", in_counts, in_shares, _pattern_med_dur, len(_pattern))
    show("NOT PATTERN: SUCC", out_counts, out_shares, not_pattern_med_dur, len(not_pattern))

    succ_verdict = "SUCC SIGN: YES" if succ_sign < 0.05 else "SUCC SIGN: NO"
    print(succ_verdict, end=" ")
    print("(p-value: " + str(succ_sign) + ")")
    dur_verdict = "DUR SIGN: YES" if dur_sign < 0.05 else "DUR SIGN: NO"
    print(dur_verdict, end=" ")
    print("(p-value: " + str(dur_sign) + ")")

    if first:
        df.at[index, "nr_cases"] = len(_pattern)

    return df

In [32]:
def get_significance(pattern_list, not_pattern_list):
    """
    Runs the fisher_test of the R package 'stats' on the two given count lists.

    The two lists form the rows of the tested table. The test is called with
    simulate_p_value=True and conf_level=0.95.
    NOTE(review): importr is presumably rpy2's importr, and passing a numpy array
    presumably relies on a numpy conversion being activated elsewhere - confirm.

    :param pattern_list: counts per class for the cases following the pattern
    :param not_pattern_list: counts per class for the cases not following the pattern
    :return: first entry of the first component of the test result (callers print it as the p-value)
    """
    r_stats = importr('stats')
    table = np.array([pattern_list, not_pattern_list])
    outcome = r_stats.fisher_test(table, simulate_p_value=True, conf_level=0.95)
    first_component = outcome[0]

    return first_component[0]

In [33]:
def compute_patterns_statistics(event_log, events, raw_cascades, patterns, scatter=False, hist=False):
    """
    Prints the statistics of every pattern in 'patterns' (see pattern_statistics).

    The case durations are loaded from the pickle file 'output/dumps/durations'.

    :param event_log: event data passed on to pattern_statistics
    :param events: mapping whose "complete" entry is a pandas DataFrame with the columns "id" and "cases"
    :param raw_cascades: cascades passed on to get_clean_cascades
    :param patterns: iterable of patterns; the position of a pattern is used as its index
    :param scatter: passed on to pattern_statistics
    :param hist: passed on to pattern_statistics
    :return: None
    """
    complete = events["complete"]
    cases = pd.Series(complete["cases"].values, index=complete["id"]).to_dict()

    # Close the dump file deterministically instead of leaving it to the garbage collector.
    # NOTE: unpickling executes arbitrary code; only load dumps created by this notebook.
    with open('output/dumps/durations', 'rb') as dump:
        durations = pickle.load(dump)

    for i, pattern in enumerate(patterns):
        print("_________________________________________")
        print("Complete pattern:", pattern)
        cascade_list = get_clean_cascades(raw_cascades, pattern)
        pattern_statistics(event_log, cascade_list, pattern, cases, durations, i, scatter, hist)

In [34]:
def compute_patterns_statistics2(event_log, events, raw_cascades, patterns):
    """
    Prints the statistics of every pattern and returns the patterns extended by the results.

    The case durations are loaded from the pickle file 'output/dumps/durations'.

    :param event_log: event data passed on to pattern_statistics2
    :param events: mapping whose "complete" entry is a pandas DataFrame with the columns "id" and "cases"
    :param raw_cascades: cascades passed on to get_clean_cascades
    :param patterns: pandas DataFrame of patterns (read by pattern_string); it is not modified
    :return: copy of 'patterns' with the additional columns "nr_cases" (initially 0) and
             "maximal" (initially False), filled in by pattern_statistics2
    """
    complete = events["complete"]
    cases = pd.Series(complete["cases"].values, index=complete["id"]).to_dict()

    # Close the dump file deterministically instead of leaving it to the garbage collector.
    # NOTE: unpickling executes arbitrary code; only load dumps created by this notebook.
    with open('output/dumps/durations', 'rb') as dump:
        durations = pickle.load(dump)

    df = patterns.copy()
    df["nr_cases"] = 0
    df["maximal"] = False

    for i in range(len(patterns)):
        pattern_str = pattern_string(patterns, i)
        print("_________________________________________")
        print(i, "Complete pattern:", pattern_str)

        cascade_list = get_clean_cascades(raw_cascades, pattern_str)
        df = pattern_statistics2(event_log, cascade_list, pattern_str, cases, durations, i, df, True)

    return df

In [35]:
def cases_in_trace(event_log, pattern):
    """
    Computes the cases that contain every segment of the given pattern.

    Each pattern element is split on "|"; its first two parts are translated back through
    the inverted Short.ACTIVITY mapping and joined to a segment name "<start> - <end>".

    :param event_log: event data passed on to segment_level_events
    :param pattern: list of pattern elements (strings with at least two "|"-separated parts)
    :return: set of case ids that occur in the segment level events of all segments
    """
    full_name = {short: long for long, short in Short.ACTIVITY.items()}

    segment_names = []
    for element in pattern:
        parts = element.split("|")
        segment_names.append(full_name[parts[0]] + " - " + full_name[parts[1]])

    cases_per_segment = [
        set(segment_level_events(event_log, name)["case_id"].tolist())
        for name in segment_names
    ]

    return set.intersection(*cases_per_segment)

In [36]:
def check_findings(event_log, subset_cases, segments):
    """
    Prints outcome and duration statistics for the cases that contain all given segments
    and for all remaining cases.

    The outcome classes, the case durations and the set of all cases are read from the
    notebook globals successful_cases, not_successful_cases, _10days_cases, _1030days_cases,
    _30days_cases, durations and all_cases.

    :param event_log: event data passed on to segment_level_events
    :param subset_cases: collection of cases; only its size is printed, it does not restrict the result
    :param segments: list of segment names
    :return: None
    """
    print(segments)
    cases_per_segment = [
        set(segment_level_events(event_log, segment)["case_id"].tolist())
        for segment in segments
    ]
    segments_subset = set.intersection(*cases_per_segment)
    print("#cases in subset:", len(subset_cases))

    def summarise(group):
        # Count and percentage per class, in the order succ, nsucc, fast, av, slow.
        figures = []
        for members in (successful_cases, not_successful_cases, _10days_cases, _1030days_cases, _30days_cases):
            hits = len(group.intersection(members))
            figures.append((hits, round(hits / len(group) * 100, 1)))
        (succ, succ_pct), (nsucc, nsucc_pct), (fast, fast_pct), (av, av_pct), (slow, slow_pct) = figures

        med_dur = median([durations[str(case)] for case in group])
        print('SUCC', succ, "(" + str(succ_pct) + "%) | NSUCC", nsucc, "(" + str(nsucc_pct) + '%) | FAST', fast, "(" + str(fast_pct) + '%) | AV', av, "(" + str(av_pct) + '%) | SLOW', slow, "(" + str(slow_pct) + "%) | MED DUR", med_dur)

    print("#cases containing sequence:", len(segments_subset))
    summarise(segments_subset)

    not_segments_subset = set(all_cases).difference(segments_subset)
    print("#cases not containing sequence:", len(not_segments_subset))
    summarise(not_segments_subset)
    print()

In [37]:
def pattern_string(patterns, index):
    """
    Builds the string representation of the pattern at position 'index'.

    Consecutive activities of the row's cascade are joined to "<a>|<b>" and each of these
    segments is extended by its event type to "<a>|<b>|<event type>". Only the requested
    row is processed; 'patterns' is neither copied nor modified.

    :param patterns: pandas DataFrame with the columns "cascades" (list of activities per row)
                     and "event_types" (list of event types per row)
    :param index: position (not label) of the row to convert
    :return: list of strings "<a>|<b>|<event type>", one per pair of consecutive activities
             (truncated to the shorter of segments and event types)
    """
    activities = patterns["cascades"].iloc[index]
    event_types = patterns["event_types"].iloc[index]

    segments = [start + "|" + end for start, end in zip(activities, activities[1:])]

    return [segment + "|" + event_type for segment, event_type in zip(segments, event_types)]