In [None]:
# Absolute local directories that hold the benchmark run logs, one per experiment.
# In the code visible in this section only SESSION_PATH is passed to get_latencies().
DIST_LOG_PATH = "/Users/law/repos/ma/benchmark-runs/matrix_dist_all"
SINGLE_LOG_PATH = "/Users/law/repos/ma/benchmark-runs/matrix_single_all"
ALL_LOG_PATH = "/Users/law/repos/ma/benchmark-runs/latency_multi_child"
TUMBLING_PATH = "/Users/law/repos/ma/benchmark-runs/latency_tumbling_max_median"
SESSION_PATH = "/Users/law/repos/ma/benchmark-runs/sessions"

In [None]:
import os
import re
from collections import defaultdict

# Patterns for the benchmark log lines. The parsing functions apply them with
# `.match`, so every pattern has to match from the start of a line.

# Benchmark header: group 1 = window spec, group 2 = aggregation function,
# group 4 = optional mode (DISTRIBUTED or SINGLE_NODE).
BENCHMARK_RE = re.compile(r"BENCHMARK: WINDOWS: (.*) - AGG_FNS: (\w+)( - (DISTRIBUTED|SINGLE_NODE))?")
# NOTE(review): the unescaped `.` and the trailing `\)*` look unintended (probably
# meant `events/s\.\)`); the pattern is not used by the code in this section.
THROUGHPUT_RE = re.compile(r"Found sustainable candidate \((\d+) events/s.\)*")
# Run header: group 2 = optional intermediates, group 3 = child count, group 4 = stream count.
RUN_RE = re.compile(r"Running ((.*) intermediates, )?(\d+) child.*, (\d+) stream.*")
ALL_LOGS_RE = re.compile(r"All logs can be found in (.*)")
# Group 1 = directory that holds the latency logs of one load level.
LOG_DIR_RE = re.compile(r"Latencies in dir: (.*)")
# Announcements of the four load levels; group 1 = events/s used for that level.
QUARTER_RE = re.compile(r"Running with quarter events/s: (\d+)")
HALF_RE = re.compile(r"Running with half events/s: (\d+)")
THREE_QUARTER_RE = re.compile(r"Running with three quarter events/s: (\d+)")
FULL_RE = re.compile(r"Running with full events/s: (\d+)")
# Latency line of root.log: group 1 = window start, group 2 = window end, group 3 = latency.
LATENCY_RE = re.compile(r"Latency for window.* windowStartTimestamp=(\d+), windowEndTimestamp=(\d+).* --> (\d+)")


def parse_root_log(log_dir):
    """Read the latency lines of ``<LOG_PATH>/<log_dir>/root.log``.

    Returns a list of ``(window_start, window_end, latency)`` integer tuples in
    file order. Windows that span exactly 1000 timestamp units are left out.
    """
    root_log = os.path.join(LOG_PATH, log_dir, "root.log")
    parsed = []

    with open(root_log) as log:
        for raw_line in log:
            match = LATENCY_RE.match(raw_line)
            if match is None:
                continue
            start, end, latency = (int(match.group(idx)) for idx in (1, 2, 3))
            # Skip the windows whose length is exactly 1000.
            if end - start != 1000:
                parsed.append((start, end, latency))

    return parsed


def parse_latencies(f):
    """Parse the latencies of the four load levels that follow a "Running ..." line.

    Args:
        f: iterator over the lines of a benchmark log (``parse_log_file`` passes
            the open file object). It is advanced in place.

    Returns:
        dict mapping the load level ("25", "50", "75", "100") to the list that
        ``parse_root_log`` returns for it. Levels reported as unsustainable are
        left out, and so are levels that were not reached before the log ended.
    """
    latencies = {}

    runs = [
        ("25", QUARTER_RE),
        ("50", HALF_RE),
        ("75", THREE_QUARTER_RE),
        ("100", FULL_RE),
    ]

    try:
        for percentage, matcher in runs:
            # Skip ahead to the line announcing this load level.
            while matcher.match(next(f)) is None:
                pass
            # The line directly after the announcement is discarded unparsed.
            next(f)

            log_dir_match = None
            while log_dir_match is None:
                log_line = next(f)
                if "counting as unsustainable" in log_line:
                    print(f" '--> {percentage}% unsustainable, no latencies.")
                    break
                log_dir_match = LOG_DIR_RE.match(log_line)

            if log_dir_match is None:
                # The loop was left through the "unsustainable" branch.
                continue

            # Only the directory name is kept; parse_root_log resolves it
            # against LOG_PATH.
            log_dir = os.path.basename(log_dir_match.group(1))
            latencies[percentage] = parse_root_log(log_dir)
            if not latencies[percentage]:
                print(f"LATENCIES EMPTY {percentage}% - {log_dir}")
    except StopIteration:
        # A log that ends in the middle of a run used to abort parsing with a
        # bare StopIteration; keep the levels that were parsed completely.
        print(f" '--> log ended early, parsed load levels: {sorted(latencies)}")

    return latencies


def find_sustainable_run_logs(all_logs_dir, num_events):
    """Pick the run directory below ``all_logs_dir`` that belongs to ``num_events``.

    Every entry of ``<LOG_PATH>/<all_logs_dir>`` whose name contains
    ``str(num_events)`` is a candidate; the lexicographically last one wins.
    Returns the chosen entry joined onto ``all_logs_dir`` (not onto LOG_PATH).
    Fails with an AssertionError when no entry matches.
    """
    needle = str(num_events)
    candidates = sorted(
        os.path.basename(entry)
        for entry in os.listdir(os.path.join(LOG_PATH, all_logs_dir))
        if needle in entry
    )
    assert candidates
    return os.path.join(all_logs_dir, candidates[-1])


def parse_log_file(log_file):
    """Parse one benchmark log into the latencies of all its runs.

    Args:
        log_file: path of the log file to read.

    Returns:
        defaultdict mapping ``(windows, agg_fn, mode)`` to a dict that maps
        ``(children, streams)`` to the result of ``parse_latencies``. ``mode``
        falls back to "DISTRIBUTED" when the benchmark header does not name one.

    Raises:
        ValueError: if a "Running ..." line shows up before any benchmark header.
    """
    all_latencies = defaultdict(dict)
    current_bm = None

    with open(log_file) as f:
        # parse_latencies() consumes further lines from the same iterator, so
        # this loop resumes right after the lines belonging to a run.
        for line in f:
            benchmark_match = BENCHMARK_RE.match(line)
            if benchmark_match is not None:
                curr_windows, curr_agg_fn = benchmark_match.group(1), benchmark_match.group(2)
                curr_mode = benchmark_match.group(4)
                curr_mode = curr_mode if curr_mode is not None else "DISTRIBUTED"
                current_bm = (curr_windows, curr_agg_fn, curr_mode)
                print(current_bm)

            run_match = RUN_RE.match(line)
            if run_match is not None:
                if current_bm is None:
                    # Previously this surfaced as an UnboundLocalError.
                    raise ValueError(
                        f"{log_file}: run line found before any BENCHMARK header: {line!r}"
                    )
                current_run = (int(run_match.group(3)), int(run_match.group(4)))
                print(f"{current_run[0]} child(ren), {current_run[1]} stream(s)")
                all_latencies[current_bm][current_run] = parse_latencies(f)

    return all_latencies
                

def get_latencies(log_path):
    """Parse every ``*.log`` file directly inside ``log_path``.

    Sets the module-level ``LOG_PATH`` to ``log_path`` as a side effect, since
    the parsing helpers resolve run directories against it. Files are handled
    in sorted name order; results of later files are merged over earlier ones
    per benchmark key.

    Returns a defaultdict: benchmark key -> {run key -> latencies per load level}.
    """
    global LOG_PATH
    print(f"\n\n===\nPROCESSING {log_path}\n===")
    LOG_PATH = log_path

    mode_latencies = defaultdict(dict)
    log_names = [name for name in sorted(os.listdir(log_path)) if name.endswith(".log")]
    for name in log_names:
        print(f"Parsing {name}")
        parsed = parse_log_file(os.path.join(log_path, name))
        for benchmark, runs in parsed.items():
            mode_latencies[benchmark].update(runs)
    return mode_latencies

# Parse the logs below SESSION_PATH when this cell runs (prints progress while parsing).
ALL_LATENCIES = get_latencies(SESSION_PATH)
# print(ALL_LATENCIES)

In [None]:
# Only windows that lie completely inside [WINDOW_START, WINDOW_END] are kept.
WINDOW_START = 30000
WINDOW_END = 90000


def filter_latencies(all_unfiltered_latencies):
    """Reduce parsed latencies to plain latency values inside the window bounds.

    Input:  {benchmark: {run: {percentage: [(window_start, window_end, latency), ...]}}}
    Output: {benchmark: {run: [[latency, ...], ...]}} with one inner list per
    percentage, in the iteration order of the percentage dict.

    An entry is kept when its start is >= WINDOW_START and its end is
    <= WINDOW_END. Percentages with few or no remaining values are kept as
    (possibly empty) lists.
    """

    def inside_bounds(entry):
        return entry[0] >= WINDOW_START and entry[1] <= WINDOW_END

    def filter_run(latencies_by_percentage):
        return [
            [entry[2] for entry in entries if inside_bounds(entry)]
            for entries in latencies_by_percentage.values()
        ]

    return {
        benchmark: {run: filter_run(per_percentage) for run, per_percentage in all_runs.items()}
        for benchmark, all_runs in all_unfiltered_latencies.items()
    }
        
# print("Filtering dist latencies")
# DIST_LATENCIES = filter_latencies(DIST_ALL_LATENCIES)

# print("Filtering single latencies")
# SINGLE_LATENCIES = filter_latencies(SINGLE_ALL_LATENCIES)

# Filter the parsed latencies down to the values used by the plotting cells.
# Both the raw and the filtered structure are printed in full.
print("Filtering latencies")
print(ALL_LATENCIES)
LATENCIES = filter_latencies(ALL_LATENCIES)
print(LATENCIES)

# Plots

In [None]:
from matplotlib import rcParams
import numpy as np
import matplotlib.pyplot as plt
rcParams.update({'figure.autolayout': True, 'pgf.rcfonts' : False, 'font.size': 14, 'lines.linewidth': 3})
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and later
# dropped the old names; use the original name where it exists, the new one otherwise.
plt.style.use('seaborn-deep' if 'seaborn-deep' in plt.style.available else 'seaborn-v0_8-deep')
import pprint

In [None]:
def get_single_child_latencies(group_latencies):
    """Return ``(config, latencies)`` pairs for the configs (1, 1), (1, 2), (1, 4), (1, 8).

    ``group_latencies`` must contain all four keys (presumably
    ``(children, streams)`` tuples -- confirm against the caller); a missing
    key raises KeyError.
    """
    pairs = []
    for second in (1, 2, 4, 8):
        config = (1, second)
        pairs.append((config, group_latencies[config]))
    return pairs

def get_multi_child_latencies(group_latencies):
    """Return ``(config, latencies)`` pairs for the configs (1, 1), (2, 2), (4, 4), (8, 8).

    ``group_latencies`` must contain all four keys; a missing key raises KeyError.
    """
    pairs = []
    for count in (1, 2, 4, 8):
        config = (count, count)
        pairs.append((config, group_latencies[config]))
    return pairs

def plot_latencies(group_latencies, title):
    """Draw a grouped bar chart: one group per node configuration, one bar per load level.

    Args:
        group_latencies: one entry per x-axis group (the axis is laid out for
            four groups labelled 1, 2, 4, 8). Each entry is a list of four
            latency lists, for 25 %, 50 %, 75 % and 100 % load. An entry of any
            other length is reported as "BAD CONFIG" and plotted as zeros.
        title: name of the plot; only the commented-out title/savefig lines use it.

    The bar height is the median latency minus 100, the error bar is the
    standard deviation. The y axis is logarithmic.
    """
    num_levels = 4  # 25 %, 50 %, 75 % and 100 % load

    fig, ax = plt.subplots()
    fig.set_tight_layout(False)

    bar_width = 0.20
    x_locations = [0, 1, 2, 3]

    # One bucket per load level. This used to be sized by the number of groups,
    # which only worked because both numbers happen to be four.
    latencies_by_percentage = [[] for _ in range(num_levels)]

    for i, bm_latencies in enumerate(group_latencies):
        if len(bm_latencies) != num_levels:
            print(f"BAD CONFIG:\n{i}, {bm_latencies}")
            bm_latencies = [[0] for _ in range(num_levels)]
        for level, lats in enumerate(bm_latencies):
            latencies_by_percentage[level].append(lats)

    for i, percentage_lats in enumerate(latencies_by_percentage):
        percentage = (i+1) * 25
        # NOTE(review): a fixed 100 is subtracted from every median -- presumably
        # a known constant offset of the measurement; confirm.
        median_latencies = [np.median(lats) - 100 for lats in percentage_lats]
        print(percentage, median_latencies)
        stddev_latencies = [np.std(lats) for lats in percentage_lats]
        # Shift the bars of each load level by one bar width inside a group.
        ith_x_locations = [x + (i*bar_width) for x in x_locations]
        ax.bar(ith_x_locations, median_latencies, bar_width, bottom=0, label=f"{percentage}%", yerr=stddev_latencies)

#     ax.set_title(title)
    ax.set_ylabel("latency in ms")
    ax.set_xlabel("# children")
    # Place each tick in the middle of its group of four bars.
    ax.set_xticks([(x - (bar_width / 2)) + (bar_width * 2) for x in x_locations])
    ax.set_xticklabels(["1", "2", "4", "8"])
    ax.legend()
    ax.set_yscale('log')
    ax.set_ylim(ymin=1, ymax=1300)

#     plt.savefig(f"/tmp/plots/latency_{title}.png", bbox_inches="tight")
#     plt.savefig(f"/tmp/plots/latency_{title}.pdf", bbox_inches="tight")
    plt.show()
    
    
def group_latency_mode(benchmarks):
    """Regroup ``{(group, agg_fn, mode): run_latencies}`` by ``(group, mode)``.

    Returns a defaultdict mapping ``(group, mode)`` to ``{agg_fn: run_latencies}``
    and prints one line per benchmark that is added.
    """
    grouped = defaultdict(dict)
    for benchmark in benchmarks:
        window_group, agg_fn, mode = benchmark
        key = (window_group, mode)
        print(f"Adding benchmark {benchmark} to group {key}")
        grouped[key][agg_fn] = benchmarks[benchmark]
    return grouped
    
def group_latency_benchmarks(bms):
    """Split the ``(group, mode)`` groups into "distributed" and "centralized".

    Groups whose mode is "DISTRIBUTED" go under "distributed", those with mode
    "SINGLE_NODE" under "centralized". A summary line per mode, group and
    aggregation function (with its sorted run keys) is printed.
    """
    all_groups = group_latency_mode(bms)

    def with_mode(wanted):
        return {key: fns for key, fns in all_groups.items() if key[1] == wanted}

    groups = {
        "distributed": with_mode("DISTRIBUTED"),
        "centralized": with_mode("SINGLE_NODE"),
    }

    print("\n\nGROUPS:")
    for mode, members in groups.items():
        for bm, group in members.items():
            for agg_fn, runs in sorted(group.items()):
                print(f"{mode} - {bm} - {agg_fn} - {sorted(runs.keys())}")
    print()
    return groups

In [None]:
def group_latency_mode(bms, num_child_streams):
    """Regroup benchmark latencies by ``(group, mode)`` for the given run configs.

    Args:
        bms: iterable of ``((group, agg_fn, mode), run_latencies)`` pairs.
        num_child_streams: run keys to pick from every ``run_latencies``, in
            plotting order. A key that is missing raises KeyError.

    Returns:
        defaultdict mapping ``(group, mode)`` to ``{agg_fn: [latencies, ...]}``
        with one list element per entry of ``num_child_streams``.
    """
    grouped = defaultdict(dict)
    for benchmark, run_latencies in bms:
        window_group, agg_fn, mode = benchmark
        key = (window_group, mode)
        print(f"Adding benchmark {benchmark} to group {key}")
        grouped[key][agg_fn] = [run_latencies[config] for config in num_child_streams]
    return grouped
    
def group_latency_benchmarks(bms, num_child_streams):
    """Group the benchmarks (in sorted key order) and split them by mode.

    Returns ``{"distributed": {...}, "centralized": {...}}`` where each value
    holds the ``(group, mode)`` groups whose mode is "DISTRIBUTED" or
    "SINGLE_NODE" respectively. Prints the node configuration that was used.
    """
    all_groups = group_latency_mode(sorted(bms.items()), num_child_streams)

    groups = {}
    for label, mode in (("distributed", "DISTRIBUTED"), ("centralized", "SINGLE_NODE")):
        groups[label] = {key: lats for key, lats in all_groups.items() if key[1] == mode}

    print("\n\n")
    print(f"NODE_CONFIG: {num_child_streams}")
    return groups

def plot_latency_benchmarks(bms, num_child_streams):
    """Plot one latency chart per mode, group and aggregation function.

    The plot title passed on is ``"<agg_fn>-<mode>"``.
    """
    groups = group_latency_benchmarks(bms, num_child_streams)
    for mode, benchmarks in groups.items():
        for all_latencies in benchmarks.values():
            # Add a `continue` guard on agg_fn or mode here to plot a subset only.
            for agg_fn, latencies in all_latencies.items():
                plot_latencies(latencies, f"{agg_fn}-{mode}")
        
def plot_multi_child_latencies(bms):
    """Plot the latencies of the run configs (1, 1), (2, 2), (4, 4) and (8, 8)."""
    plot_latency_benchmarks(bms, [(count, count) for count in (1, 2, 4, 8)])

    
# Plot the filtered latencies; every benchmark needs the run keys (1, 1), (2, 2), (4, 4), (8, 8).
plot_multi_child_latencies(LATENCIES)

In [None]:
def plot_latencies(group_latencies, title):
    """Draw a grouped bar chart of tree latencies: one group per tree, one bar per load level.

    Args:
        group_latencies: one entry per x-axis group (the axis is laid out for
            three groups). Each entry is a list of latency lists, one per load
            level (25 %, 50 %, 75 %, 100 %). An entry that does not have exactly
            four levels is reported as "BAD CONFIG": up to its first three
            levels are plotted and the rest is filled with zeros.
        title: name of the plot; only the commented-out savefig line uses it.

    Bar height is the median latency divided by 1000 (labelled as seconds), the
    error bar is the standard deviation on the same scale.
    """
    num_levels = 4  # 25 %, 50 %, 75 % and 100 % load

    fig, ax = plt.subplots()
    fig.set_tight_layout(False)

    bar_width = 0.20
    x_locations = [0, 1, 2]

    latencies_by_percentage = [[] for _ in range(num_levels)]

    for i, bm_latencies in enumerate(group_latencies):
        print("len: ", len(bm_latencies))
        if len(bm_latencies) != num_levels:
            print(f"BAD CONFIG:\n{i}, {bm_latencies}")
            # Indexing the first three levels unconditionally raised IndexError
            # when fewer were present; pad whatever is missing with [0] instead.
            # NOTE(review): this assumes the missing levels are the highest ones.
            kept = list(bm_latencies[:num_levels - 1])
            bm_latencies = kept + [[0] for _ in range(num_levels - len(kept))]
        for level, lats in enumerate(bm_latencies):
            latencies_by_percentage[level].append(lats)

    for i, percentage_lats in enumerate(latencies_by_percentage):
        percentage = (i+1) * 25
        median_latencies = [np.median(lats) / 1000 for lats in percentage_lats]
        print(percentage, median_latencies)
        stddev_latencies = [np.std(lats) / 1000 for lats in percentage_lats]
        # Shift the bars of each load level by one bar width inside a group.
        ith_x_locations = [x + (i*bar_width) for x in x_locations]
        ax.bar(ith_x_locations, median_latencies, bar_width, bottom=0, label=f"{percentage}%", yerr=stddev_latencies)

    ax.set_ylabel("latency in seconds")
    ax.set_xlabel("height of balanced tree (# total nodes)")
    # Place each tick in the middle of its group of four bars.
    ax.set_xticks([(x - (bar_width / 2)) + (bar_width * 2) for x in x_locations])
    ax.set_xticklabels(("2 (3)", "3 (7)", "4 (15)"))
    ax.legend()
    ax.set_ylim(ymin=1, ymax=45)

#     plt.savefig(f"/tmp/plots/session_latency_{title}.pdf", bbox_inches="tight")
    fig.show()
    
def group_latency_mode(bms, num_child_streams):
    """Collect, per ``(group, mode)``, the latencies of the requested run configs.

    ``bms`` yields ``((group, agg_fn, mode), run_latencies)`` pairs. For each
    pair the entries of ``run_latencies`` named by ``num_child_streams`` are
    gathered in that order and stored under ``result[(group, mode)][agg_fn]``.
    A run key that is missing raises KeyError. One line is printed per pair.
    """
    result = defaultdict(dict)
    for benchmark, run_latencies in bms:
        window_group, agg_fn, mode = benchmark
        group_key = (window_group, mode)
        print(f"Adding benchmark {benchmark} to group {group_key}")
        selected = []
        selected.extend(run_latencies[config] for config in num_child_streams)
        result[group_key][agg_fn] = selected
    return result
    
def group_latency_benchmarks(bms, num_child_streams):
    """Group the benchmarks in sorted key order, then separate them by mode.

    The result has the keys "distributed" (groups with mode "DISTRIBUTED") and
    "centralized" (groups with mode "SINGLE_NODE"). The node configuration is
    printed for reference.
    """
    ordered_benchmarks = sorted(bms.items())
    all_groups = group_latency_mode(ordered_benchmarks, num_child_streams)

    distributed = {}
    centralized = {}
    for key, lats in all_groups.items():
        if key[1] == "DISTRIBUTED":
            distributed[key] = lats
        if key[1] == "SINGLE_NODE":
            centralized[key] = lats
    groups = {"distributed": distributed, "centralized": centralized}

    print("\n\n")
    print(f"NODE_CONFIG: {num_child_streams}")
    return groups

def plot_latency_benchmarks(bms, num_child_streams):
    """Plot one latency chart per mode, group and aggregation function.

    Only the mode is passed on as the plot title.
    """
    groups = group_latency_benchmarks(bms, num_child_streams)
    for mode, benchmarks in groups.items():
        for all_latencies in benchmarks.values():
            # Add a `continue` guard on mode here to plot a single mode only.
            for latencies in all_latencies.values():
                plot_latencies(latencies, f"{mode}")
    
def plot_tree_latencies(bms):
    """Plot the latencies of the run configs (2, 2), (4, 4) and (8, 8)."""
    plot_latency_benchmarks(bms, [(count, count) for count in (2, 4, 8)])
    
# plot_multi_child_latencies(LATENCIES)
# Plot the filtered latencies for the run keys (2, 2), (4, 4), (8, 8).
plot_tree_latencies(LATENCIES)