In [2]:
def duration_treshold(event_log, segment, percentage, slow=True):
    """
    Derives an integer duration threshold for one segment from a percentile of its durations

    For slow=True the (100 - percentage)-th percentile of the "duration" column is taken,
    increased by one and rounded down. For slow=False the percentage-th percentile is taken,
    decreased by one, rounded up and clamped so that it is never negative.

    :param event_log: event log used (Performance_Spectrum.EventLog)
    :param segment: name of the segment for which a threshold is wanted
    :param percentage: percentage of segment level events to include
    :param slow: truthy to get the threshold for unusually slow events, falsy for unusually fast ones
    :return: integer duration threshold to derive system level events from
    """
    durations = segment_level_events(event_log, segment)["duration"].values

    if not slow:
        # Fast side: step just below the low percentile, but never below zero
        cutoff = np.percentile(durations, percentage)
        return max(0, math.ceil(cutoff - 1))

    # Slow side: step just above the high percentile
    cutoff = np.percentile(durations, 100 - percentage)
    return math.floor(cutoff + 1)

In [3]:
def batching_start_treshold(event_log, segment, percentile):
    """
    Derives a batching-on-start threshold for one segment of an event log

    The segment level events are counted per distinct 'start_time', the counts are aligned
    to DATE_RANGE (entries without any event count as 0) and the requested percentile of
    those counts is returned.

    :param event_log: event log used (Performance_Spectrum.EventLog)
    :param segment: name of the segment for which a threshold is wanted
    :param percentile: percentile (0-100) of the per-start-time counts to use as threshold
    :return: the percentile value, or the fixed value 1000 when the segment has no events
    """
    segment_events = segment_level_events(event_log, segment)

    if not segment_events.empty:
        starts_per_moment = segment_events['start_time'].value_counts()
        # Moments of DATE_RANGE without any start must be part of the distribution as zeros
        aligned_counts = starts_per_moment.reindex(DATE_RANGE, fill_value=0)
        return np.percentile(aligned_counts.tolist(), percentile)

    # No events at all for this segment: fall back to the fixed default
    return 1000

In [4]:
def batching_end_treshold(event_log, segment, percentile):
    """
    Derives a batching-on-end threshold for one segment of an event log

    The segment level events are counted per distinct 'end_time', the counts are aligned
    to DATE_RANGE (entries without any event count as 0) and the requested percentile of
    those counts is returned.

    :param event_log: event log used (Performance_Spectrum.EventLog)
    :param segment: name of the segment for which a threshold is wanted
    :param percentile: percentile (0-100) of the per-end-time counts to use as threshold
    :return: the percentile value, or the fixed value 1000 when the segment has no events
    """
    segment_events = segment_level_events(event_log, segment)

    if not segment_events.empty:
        ends_per_moment = segment_events['end_time'].value_counts()
        # Moments of DATE_RANGE without any end must be part of the distribution as zeros
        aligned_counts = ends_per_moment.reindex(DATE_RANGE, fill_value=0)
        return np.percentile(aligned_counts.tolist(), percentile)

    # No events at all for this segment: fall back to the fixed default
    return 1000

In [5]:
def high_workload_treshold(event_log, segment, percentile, same_user=True):
    """
    Derives a high workload threshold for one segment of an event log

    The segment level events are turned into high workload system level events, after which
    the requested percentile of one of their ratio columns is returned.

    :param event_log: event log used (Performance_Spectrum.EventLog)
    :param segment: name of the segment for which a threshold is wanted
    :param percentile: percentile (0-100) of the chosen ratio column to use as threshold
    :param same_user: truthy to use the "ratio_workload" column, falsy to use "ratio_start_end"
    :return: the percentile value, or the fixed value 1000 when there are no high workload events
    """
    workload_events = system_level_events_high_workload(segment_level_events(event_log, segment))

    if workload_events.empty:
        # Nothing to take a percentile of: fall back to the fixed default
        return 1000

    ratio_column = "ratio_workload" if same_user else "ratio_start_end"
    ratios = list(workload_events[ratio_column])

    return np.percentile(ratios, percentile)

In [6]:
def compute_tresholds(event_logs):
    """
    Computes all thresholds for the first event log, saves them and returns them

    Only event_logs[0][0] is used. The duration thresholds use percentage 30, all other
    thresholds use percentile 90. The result is handed to save_tresholds before it is returned.

    :param event_logs: nested collection of event logs; event_logs[0][0] is the log that is analysed
    :return: dictionary with the keys "delayed", "batching_end", "batching_start",
             "high_workload" and "high_workload_hand"; "delayed" maps to a list with one
             threshold per segment, every other key maps to a list containing one such list
    """
    def per_segment(treshold_function, segments, *arguments):
        # One threshold per segment, always computed on the first event log
        return [treshold_function(event_logs[0][0], segment, *arguments) for segment in segments]

    # Duration thresholds for delayed system level events
    delayed = per_segment(duration_treshold, Segments.SEGMENTS_COMPLETE, 30)

    # Thresholds for batching on end / batching on start system level events
    batching_end = per_segment(batching_end_treshold, Segments.SEGMENTS_COMPLETE, 90)
    batching_start = per_segment(batching_start_treshold, Segments.SEGMENTS_COMPLETE, 90)

    # Thresholds for high workload system level events (same user, then handover)
    high_workload = per_segment(high_workload_treshold, Segments.SEGMENTS_HIGH_WORKLOAD, 90)
    high_workload_hand = per_segment(high_workload_treshold, Segments.SEGMENTS_HIGH_WORKLOAD_HAND, 90, False)

    # Every entry except "delayed" is wrapped in an outer list with a single element
    tresholds = {
        "delayed": delayed,
        "batching_end": [batching_end],
        "batching_start": [batching_start],
        "high_workload": [high_workload],
        "high_workload_hand": [high_workload_hand],
    }

    save_tresholds(tresholds)

    return tresholds

In [7]:
def save_tresholds(tresholds):
    """
    Pickles the thresholds to 'output/dumps/tresholds', creating the directory when needed

    :param tresholds: object to persist (compute_tresholds passes its result dictionary)
    """
    import os  # local import so this function does not depend on the notebook's import cell

    path = 'output/dumps/tresholds'

    # Create the dump directory on first use instead of failing with FileNotFoundError
    os.makedirs(os.path.dirname(path), exist_ok=True)

    # Write-only access is sufficient; HIGHEST_PROTOCOL is what the former -1 selected
    with open(path, 'wb') as output:
        pickle.dump(tresholds, output, pickle.HIGHEST_PROTOCOL)

In [8]:
def load_tresholds():
    """
    Loads the object that was pickled to 'output/dumps/tresholds'

    :return: the unpickled object
    """
    # NOTE: unpickling can execute arbitrary code; only load dump files from a trusted source
    # The context manager makes sure the file handle is closed again after reading
    with open('output/dumps/tresholds', 'rb') as dump:
        return pickle.load(dump)

In [None]:
def jenks_buckets(values, start, end):
    """
    Prints the result of jenks for every number of classes from start up to and including end

    :param values: values handed to jenks
    :param start: first number of classes to try
    :param end: last number of classes to try (inclusive)
    """
    stop = end + 1
    for class_count in range(start, stop):
        breaks = jenks(values, class_count)
        print(class_count, ":", breaks)

In [9]:
def goodness_of_variance_fit(array, classes):
    """
    Computes the goodness of variance fit (GVF) of a jenks classification of the input values

    GVF = (SDAM - SDCM) / SDAM, where SDAM is the sum of squared deviations of all values
    from the overall mean and SDCM is the sum, over all classes, of the squared deviations
    of the class members from their own class mean.

    :param array: values to classify; must offer .mean() and integer indexing
                  (presumably a 1-D numpy array, verify against the callers)
    :param classes: number of classes handed to jenks
    :return: the GVF value
    """
    # Break points of the classification, then the class number of every value
    breaks = jenks(array, classes)
    labels = np.array([classify(value, breaks) for value in array])

    # Highest class number that actually occurs
    zone_count = max(labels)

    # SDAM: sum of squared deviations from the mean of all values
    total_deviation = np.sum((array - array.mean()) ** 2)

    # Collect the values of every class in one pass; class numbers below 1 belong to no zone
    members = [[] for _ in range(zone_count)]
    for position, label in enumerate(labels):
        if label >= 1:
            members[label - 1].append(array[position])

    # SDCM: squared deviations from the class mean, summed over all classes
    within_deviation = 0
    for zone_values in members:
        zone = np.array(zone_values)
        within_deviation += np.sum((zone - zone.mean()) ** 2)

    return (total_deviation - within_deviation) / total_deviation

In [10]:
def classify(value, breaks):
    """
    Returns the class number of a value for a sequence of break points

    The class is the first index i >= 1 for which value < breaks[i]. Values that are not
    smaller than any of those break points fall into the last class, len(breaks) - 1.

    :param value: value to classify
    :param breaks: sequence of break points; breaks[0] is never compared against
    :return: class number of the value
    """
    last_class = len(breaks) - 1
    candidates = (number for number in range(1, last_class + 1) if value < breaks[number])
    return next(candidates, last_class)