In [None]:
# Directories holding the benchmark logs of the distributed and the single
# (plotted as "centralized") runs; each is passed to get_latencies_for_mode().
# NOTE(review): absolute, machine-specific paths - adjust before running elsewhere.
DIST_LOG_PATH = "/Users/law/repos/ma/benchmark-runs/matrix_dist_all"
SINGLE_LOG_PATH = "/Users/law/repos/ma/benchmark-runs/matrix_single_all"

In [None]:
import os
import re
from collections import defaultdict

# Line patterns for the benchmark log files. All of them are applied with
# ``.match``, i.e. they must match from the very start of a line.

# Benchmark header; groups 1 and 2 (windows, aggregation functions) form the
# benchmark key in parse_log_file().
BENCHMARK_RE = re.compile(r"BENCHMARK: WINDOWS: (.*) - AGG_FNS: (.*)")
# Sustainable throughput; group 1 is read as an integer events/s value.
# NOTE(review): the "." before "\)" is unescaped (matches any character) and the
# trailing "\)*" makes the closing parenthesis optional - confirm this is intended.
THROUGHPUT_RE = re.compile(r"Found sustainable candidate \((\d+) events/s.\)*")
# Start of a run; groups 3 and 4 (child count, stream count) form the run key.
# The optional "intermediates" prefix (groups 1 and 2) is not read by the code shown here.
RUN_RE = re.compile(r"Running ((.*) intermediates, )?(\d+) child.*, (\d+) stream.*")
# Directory that contains all logs of the current run (group 1).
ALL_LOGS_RE = re.compile(r"All logs can be found in (.*)")
# Directory with the latencies of one partial-load run (group 1), see parse_latencies().
LOG_DIR_RE = re.compile(r"Latencies in dir: (.*)")
# Header lines of the 25% / 50% / 75% load runs, checked in parse_latencies().
QUARTER_RE = re.compile(r"Running with quarter events/s: (\d+)")
HALF_RE = re.compile(r"Running with half events/s: (\d+)")
THREE_QUARTER_RE = re.compile(r"Running with three quarter events/s: (\d+)")
# Header line of the full-load run; not referenced by the code shown here.
FULL_RE = re.compile(r"Running with full events/s: (\d+)")
# One latency record of a root.log; groups are window start, window end and
# the value after "-->", all parsed as integers in parse_root_log().
LATENCY_RE = re.compile(r"Latency for window.* windowStartTimestamp=(\d+), windowEndTimestamp=(\d+).* --> (\d+)")


def parse_root_log(log_dir):
    """Collect the window latencies recorded in one run's ``root.log``.

    The file is read from ``<LOG_PATH>/<log_dir>/root.log``; ``LOG_PATH`` is
    the module-level global that ``get_latencies_for_mode`` sets.

    Returns:
        A list of ``(window_start, window_end, latency)`` integer tuples, one
        per line that matches ``LATENCY_RE``, in file order.
    """
    root_log_path = os.path.join(LOG_PATH, log_dir, "root.log")

    with open(root_log_path) as root_log:
        # Lines that are not latency records yield ``None`` and are dropped.
        matches = (LATENCY_RE.match(entry) for entry in root_log)
        return [
            tuple(int(found.group(index)) for index in (1, 2, 3))
            for found in matches
            if found is not None
        ]


def parse_latencies(f):
    """Read the 25%, 50% and 75% load sections that follow in ``f``.

    ``f`` is the line iterator of the benchmark log, positioned right before
    the quarter-load header. For every load level the next line must match the
    level's header pattern, one further line is skipped, and lines are then
    consumed until either the run is reported as unsustainable (the level is
    left out of the result) or a "Latencies in dir" line names the run
    directory, whose ``root.log`` is parsed.

    Returns:
        A dict mapping ``"25"``, ``"50"`` and ``"75"`` to the list returned by
        ``parse_root_log``; unsustainable levels have no entry.
    """
    stages = (
        ("25", QUARTER_RE),
        ("50", HALF_RE),
        ("75", THREE_QUARTER_RE),
    )
    results = {}

    for percentage, matcher in stages:
        percentage_line = next(f)
        assert matcher.match(percentage_line) is not None, \
                    f"{matcher} ({percentage}%) did not match line: {percentage_line}"
        next(f)  # the line directly after the header carries nothing we need

        sustainable = True
        while True:
            log_line = next(f)
            if "counting as unsustainable" in log_line:
                print(f" '--> {percentage}% unsustainable, no latencies.")
                sustainable = False
                break
            log_dir_match = LOG_DIR_RE.match(log_line)
            if log_dir_match is not None:
                break

        if not sustainable:
            continue

        assert log_dir_match is not None, f"line {log_line} did not match ({percentage})"
        # Only the last path component is kept; parse_root_log() resolves it
        # relative to LOG_PATH.
        run_dir_name = os.path.basename(log_dir_match.group(1))
        results[percentage] = parse_root_log(run_dir_name)

    return results


def find_sustainable_run_logs(all_logs_dir, num_events):
    """Locate the log directory of the run with ``num_events`` events/s.

    Lists ``<LOG_PATH>/<all_logs_dir>`` and keeps the entries whose name
    contains ``str(num_events)``; the lexicographically last one is chosen.

    Returns:
        ``all_logs_dir`` joined with the chosen entry, i.e. a path relative to
        ``LOG_PATH``.

    Raises:
        AssertionError: if no entry contains ``str(num_events)``.
    """
    needle = str(num_events)
    candidates = sorted(
        os.path.basename(entry)
        for entry in os.listdir(os.path.join(LOG_PATH, all_logs_dir))
        if needle in entry
    )
    assert candidates
    return os.path.join(all_logs_dir, candidates[-1])


def parse_log_file(log_file):
    """Parse one benchmark log file into nested latency dictionaries.

    The file is scanned line by line for benchmark headers, run headers,
    sustainable-throughput lines and "All logs can be found in" lines. The
    latter trigger the actual latency parsing for the current run.

    Returns:
        ``{(windows, agg_fn): {(children, streams): {percentage: latencies}}}``
        where ``percentage`` is ``"25"``, ``"50"``, ``"75"`` (from
        ``parse_latencies``) or ``"100"`` and ``latencies`` is the list
        returned by ``parse_root_log``.

    Raises:
        AssertionError: if a run header appears while the previous run is
            still pending, or an "All logs" line appears without a run header.
    """
    all_latencies = {}
    current_run = None
    current_throughput = None
    
    with open(log_file) as f:
        # Lines are pulled manually with next() because parse_latencies(f)
        # below advances the same iterator.
        while True:
            try:
                line = next(f)
            except StopIteration:
                break
                
            benchmark_match = BENCHMARK_RE.match(line)
            if benchmark_match is not None:
                curr_windows, curr_agg_fn = (benchmark_match.group(1), benchmark_match.group(2))
                if "-" in curr_agg_fn:
                    # Keep only the text before the first hyphen, also dropping
                    # the single character in front of it.
                    hyphen_pos = curr_agg_fn.find("-")
                    curr_agg_fn = curr_agg_fn[:hyphen_pos - 1]
                # NOTE(review): current_bm is assigned only here; an "All logs"
                # line before the first benchmark header would fail below.
                current_bm = (curr_windows, curr_agg_fn)
                # A repeated header for the same benchmark resets its runs.
                all_latencies[current_bm] = {}
                print(current_bm)
            
            run_match = RUN_RE.match(line)
            if run_match is not None:
                # The previous run must have been completed (reset to None).
                assert current_run is None, f"Did not find logs for {current_run}"
                # Run key: (number of children, number of streams).
                current_run = (int(run_match.group(3)), int(run_match.group(4)))
                print(f"{current_run[0]} child(ren), {current_run[1]} stream(s)")
                continue
                
            throughput_match = THROUGHPUT_RE.match(line)
            if throughput_match is not None:
                # A still-set throughput means the previous value was never
                # consumed by an "All logs" line; it is overwritten.
                if current_throughput is not None:
                    print(f"Did not find run line after {current_run}")
                current_throughput = int(throughput_match.group(1))

            all_logs_match = ALL_LOGS_RE.match(line)
            if all_logs_match is not None:
                assert current_run is not None
                # Consumes the following 25/50/75% sections from f.
                all_latencies[current_bm][current_run] = parse_latencies(f)
                
                abs_all_logs_dir = all_logs_match.group(1)
                # Directory name without any "." characters.
                all_logs_dir = os.path.basename(abs_all_logs_dir).replace(".", "")
                # The full-load latencies come from the run directory that
                # matches the sustainable throughput found earlier.
                full_run_logs = find_sustainable_run_logs(all_logs_dir, current_throughput)
                all_latencies[current_bm][current_run]["100"] = parse_root_log(full_run_logs)
                # Run finished: clear the per-run state.
                current_run = None
                current_throughput = None
                continue

    return all_latencies
                

def get_latencies_for_mode(log_path):
    """Parse every ``*.log`` file directly inside ``log_path``.

    Sets the module-level ``LOG_PATH`` to ``log_path`` as a side effect; the
    parsing helpers resolve run directories relative to it.

    Returns:
        A ``defaultdict(dict)`` mapping each benchmark key to the merged
        per-run latencies of all parsed files (files are processed in sorted
        name order, later files overwrite equal run keys).
    """
    global LOG_PATH

    print(f"\n\n===\nPROCESSING {log_path}\n===")
    LOG_PATH = log_path

    log_file_names = [name for name in sorted(os.listdir(log_path)) if name.endswith(".log")]

    mode_latencies = defaultdict(dict)
    for name in log_file_names:
        print(f"Parsing {name}")
        parsed = parse_log_file(os.path.join(log_path, name))
        for benchmark in parsed:
            mode_latencies[benchmark].update(parsed[benchmark])
    return mode_latencies

# Parse the raw latencies of the distributed and the single (centralized) runs.
DIST_ALL_LATENCIES = get_latencies_for_mode(DIST_LOG_PATH)
# Print what was just parsed. DIST_LATENCIES is only assigned by the filtering
# cell further down, so it must not be referenced here on a fresh kernel.
print(DIST_ALL_LATENCIES)
SINGLE_ALL_LATENCIES = get_latencies_for_mode(SINGLE_LOG_PATH)

In [None]:
# Only windows lying completely inside [WINDOW_START, WINDOW_END] are kept.
# NOTE(review): presumably milliseconds since benchmark start - confirm.
WINDOW_START = 30000
WINDOW_END   = 90000

# Load levels in the positional order in which plot_latencies() indexes them.
PERCENTAGES = ("25", "50", "75", "100")
# A load level with fewer in-range latencies than this is reported as missing.
MIN_LATENCY_COUNT = 60


def filter_latencies(all_unfiltered_latencies):
    """Reduce raw latency tuples to plain latency lists per load level.

    Args:
        all_unfiltered_latencies: Mapping
            ``benchmark -> run -> percentage -> [(window_start, window_end, latency), ...]``
            with percentage keys out of ``PERCENTAGES``.

    Returns:
        Mapping ``benchmark -> run -> [lat_25, lat_50, lat_75, lat_100]``.
        Every run always gets exactly one list per entry of ``PERCENTAGES``, in
        that order, so positions stay aligned with the load levels even when a
        level is absent from the input (e.g. an unsustainable run). A level
        that is absent or has fewer than ``MIN_LATENCY_COUNT`` latencies inside
        the window range is reported and represented by an empty list.
    """
    all_filtered_latencies = {}
    for benchmark, all_runs in all_unfiltered_latencies.items():
        filtered_runs = {}
        for run, all_latencies in all_runs.items():
            filtered_latencies = []
            for percentage in PERCENTAGES:
                percentage_latencies = [
                    entry[2]
                    for entry in all_latencies.get(percentage, ())
                    if entry[0] >= WINDOW_START and entry[1] <= WINDOW_END
                ]
                if len(percentage_latencies) < MIN_LATENCY_COUNT:
                    print(f"Missing latencies for {benchmark} {run} {percentage}%...")
                    percentage_latencies = []
                filtered_latencies.append(percentage_latencies)
            filtered_runs[run] = filtered_latencies
        all_filtered_latencies[benchmark] = filtered_runs
    return all_filtered_latencies
        
# Turn the raw (window_start, window_end, latency) tuples of both modes into
# per-run lists of latency values restricted to the configured window range.
print("Filtering dist latencies")
DIST_LATENCIES = filter_latencies(DIST_ALL_LATENCIES)

print("Filtering single latencies")
SINGLE_LATENCIES = filter_latencies(SINGLE_ALL_LATENCIES)

# Plots

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import rcParams
import pprint
# figure.autolayout turns tight layout on for new figures by default
# (plot_latencies() switches it off again per figure).
# NOTE(review): pgf.rcfonts=False presumably keeps the pgf backend from taking
# its fonts from rcParams - confirm against the Matplotlib pgf documentation.
rcParams.update({'figure.autolayout': True, 'pgf.rcfonts' : False})

In [None]:
def get_single_child_latencies(group_latencies):
    """Pick the runs with one child and 1, 2, 4 and 8 streams.

    Returns:
        A list of ``(run_key, latencies)`` pairs in that stream order.

    Raises:
        KeyError: if one of the four runs is missing from ``group_latencies``.
    """
    selected = []
    for streams in (1, 2, 4, 8):
        run_key = (1, streams)
        selected.append((run_key, group_latencies[run_key]))
    return selected

def get_multi_child_latencies(group_latencies):
    """Pick the runs with equal child and stream counts of 1, 2, 4 and 8.

    Returns:
        A list of ``(run_key, latencies)`` pairs in ascending order.

    Raises:
        KeyError: if one of the four runs is missing from ``group_latencies``.
    """
    run_keys = [(count, count) for count in (1, 2, 4, 8)]
    return [(run_key, group_latencies[run_key]) for run_key in run_keys]

def plot_latencies(group_latencies, title):
    """Draw a grouped bar chart of the latencies of several runs.

    Args:
        group_latencies: Sequence of ``(num_cs, latencies)`` pairs. ``num_cs``
            is used as the tick label of the group; ``latencies`` holds four
            lists of latency values at positions 0..3 for the 25%, 50%, 75%
            and 100% load levels.
        title: Title of the figure.

    Each run becomes one group of four bars. Bar height is the median, the
    error bar the standard deviation of the respective list. The y axis is
    logarithmic with an upper limit of 1000.
    """
    fig, ax = plt.subplots()
    fig.set_tight_layout(False)

    bar_width = 0.20
    percentages = (25, 50, 75, 100)
    # One bar group per run that was actually passed in.
    x_locations = list(range(len(group_latencies)))

    # percentage -> one latency list per run, in run order.
    ordered_latencies = {}
    for num_cs, latencies in group_latencies:
        print(num_cs)
        for position, percentage in enumerate(percentages):
            ordered_latencies.setdefault(percentage, []).append(latencies[position])

    for i, (percentage, latencies) in enumerate(sorted(ordered_latencies.items())):
        # NOTE: an empty list (missing data) yields NaN here.
        median_latencies = [np.median(lats) for lats in latencies]
        print(percentage, median_latencies)
        stddev_latencies = [np.std(lats) for lats in latencies]
        # Shift the i-th load level to the right within every group.
        ith_x_locations = [x + (i * bar_width) for x in x_locations]
        ax.bar(ith_x_locations, median_latencies, bar_width, bottom=0,
               yerr=stddev_latencies, label=f"{percentage}%")

    ax.set_title(title)
    ax.set_ylabel("latency in ms")
    ax.set_xlabel("#children, #streams")
    # Put the tick in the middle of each group of bars.
    group_center_offset = (len(percentages) - 1) * bar_width / 2
    ax.set_xticks([x + group_center_offset for x in x_locations])
    ax.set_xticklabels([cs[0] for cs in group_latencies])
    ax.legend()
    ax.set_yscale('log')
    # Only the upper limit is set: a lower limit of 0 is not valid on a log
    # axis and would be ignored with a warning.
    ax.set_ylim(top=1000)

    plt.show()
    
    
def group_latency_mode(benchmarks):
    """Regroup ``{(group, agg_fn): run_latencies}`` by group.

    Returns:
        A ``defaultdict(dict)`` of the form ``{group: {agg_fn: run_latencies}}``.
    """
    grouped = defaultdict(dict)
    for key in benchmarks:
        window_group, agg_fn = key
        print(f"Adding benchmark {key} to group {window_group}")
        grouped[window_group][agg_fn] = benchmarks[key]
    return grouped
    
def group_latency_benchmarks(dist_bms, single_bms):
    """Group the benchmarks of both modes and print an overview.

    Returns:
        ``{"distributed": ..., "centralized": ...}`` where each value is the
        result of ``group_latency_mode`` for the respective input.
    """
    groups = {
        "distributed": group_latency_mode(dist_bms),
        "centralized": group_latency_mode(single_bms),
    }

    print("\n\nGROUPS:")
    for mode in groups:
        for bm, by_agg_fn in groups[mode].items():
            # One line per aggregation function, listing its sorted run keys.
            for agg_fn in sorted(by_agg_fn):
                print(f"{mode} - {bm} - {agg_fn} - {sorted(by_agg_fn[agg_fn].keys())}")
    print()
    return groups

def plot_mode_latencies(latencies):
    """Plot every run of every benchmark contained in ``latencies``.

    Args:
        latencies: Mapping ``benchmark -> {(num_children, num_streams): run_latencies}``,
            e.g. the result of ``filter_latencies``.
    """
    # NOTE(review): plot_latencies() unpacks ``(num_cs, latencies)`` pairs from
    # its first argument, whereas ``run_latencies`` here is a single run's
    # value - confirm the intended argument shape before relying on this.
    for benchmark, all_runs in latencies.items():
        for (num_children, num_streams), run_latencies in all_runs.items():
            child_str = "Child" if num_children == 1 else "Children"
            stream_str = "Stream" if num_streams == 1 else "Streams"
            plot_latencies(run_latencies, f"{benchmark} {num_children} {child_str}, {num_streams} {stream_str}")

In [None]:
# Group the filtered latencies of both modes by benchmark group and
# aggregation function.
groups = group_latency_benchmarks(DIST_LATENCIES, SINGLE_LATENCIES)
dist_groups = groups['distributed']
single_groups = groups['centralized']

# Plot the distributed and the centralized variant of every combination found
# in the distributed results.
for bm in groups['distributed'].keys():
    for agg_fn in dist_groups[bm]:
        print(f"Plotting {bm} - {agg_fn}")
        dist_bms = dist_groups[bm][agg_fn]
        # Raises KeyError if the centralized results lack this combination.
        single_bms = single_groups[bm][agg_fn]
    
        # Disabled variant: single child with a growing number of streams.
#         dist_single_child = get_single_child_latencies(dist_bms)
#         single_single_child = get_single_child_latencies(single_bms)
#         plot_latencies(dist_single_child, f"Distributed - {bm} - {agg_fn}")
#         plot_latencies(single_single_child, f"Centralized - {bm} - {agg_fn}")

        # Active variant: equal number of children and streams (1, 2, 4, 8).
        dist_multi_child = get_multi_child_latencies(dist_bms)
        single_multi_child = get_multi_child_latencies(single_bms)
        plot_latencies(dist_multi_child, f"Distributed - {bm} - {agg_fn}")
        plot_latencies(single_multi_child, f"Centralized - {bm} - {agg_fn}")