In [None]:
# Root directory of the benchmark run to analyse. The parsing cell reads every
# top-level `*.log` file found here and resolves the per-run log directories
# (each expected to contain a `root.log`) relative to this path.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
LOG_PATH="/Users/law/repos/ma/benchmark-runs/matrix_run"

In [None]:
import os
import re

# Line patterns for the benchmark logs. All of them are applied with `.match()`
# in this notebook, so each pattern must match from the first character of a line.

# Benchmark header; groups: (1) window description, (2) aggregation functions.
BENCHMARK_RE = re.compile(r"BENCHMARK: WINDOWS: (.*) - AGG_FNS: (.*)")
# Sustainable throughput line; group (1) is the events/s value.
THROUGHPUT_RE = re.compile(r"Found sustainable candidate \((\d+) events/s.\)*")
# Start of a run; groups: (1) number of children, (2) number of streams.
RUN_RE = re.compile(r"Running (\d+) child.*, (\d+) stream.*")
# Line naming the directory that holds all logs of a run; group (1) is the path.
ALL_LOGS_RE = re.compile(r"All logs can be found in (.*)")
# Line naming the directory of one partial-load run; group (1) is the path.
LOG_DIR_RE = re.compile(r"Latencies in dir: (.*)")
# Headers of the partial-load runs; group (1) is the events/s used for that run.
QUARTER_RE = re.compile(r"Running with quarter events/s: (\d+)")
HALF_RE = re.compile(r"Running with half events/s: (\d+)")
THREE_QUARTER_RE = re.compile(r"Running with three quarter events/s: (\d+)")
# Header of the full-load run (not referenced by the cells visible here).
FULL_RE = re.compile(r"Running with full events/s: (\d+)")
# One window latency; groups: (1) window start, (2) window end, (3) latency.
LATENCY_RE = re.compile(r"Latency for window.* windowStartTimestamp=(\d+), windowEndTimestamp=(\d+).* --> (\d+)")


def parse_root_log(log_dir):
    """Collect the window latencies recorded in one run's ``root.log``.

    Args:
        log_dir: Run directory, relative to ``LOG_PATH``, that contains
            the ``root.log`` file.

    Returns:
        A list of ``(window_start, window_end, latency)`` int tuples in file
        order, one per line that ``LATENCY_RE`` matches from its first
        character. All other lines are ignored.
    """
    root_log_path = os.path.join(LOG_PATH, log_dir, "root.log")

    with open(root_log_path) as root_log:
        candidates = (LATENCY_RE.match(raw_line) for raw_line in root_log)
        # LATENCY_RE has exactly three numeric groups: start, end, latency.
        return [
            tuple(int(field) for field in found.groups())
            for found in candidates
            if found is not None
        ]


def parse_latencies(f):
    """Read the 25%/50%/75% partial-load sections from an open log iterator.

    ``f`` must be positioned directly before the "quarter" header line. For
    each percentage, in order, the function consumes the header line (which
    must match the corresponding pattern), skips the line after it, and then
    reads forward until it finds either a ``LOG_DIR_RE`` line or a line
    containing "counting as unsustainable".

    Args:
        f: Line iterator over the benchmark log. It is advanced in place, so
            the caller continues reading after the consumed sections.

    Returns:
        A dict mapping "25", "50" and "75" to the result of
        ``parse_root_log`` for that run. Percentages reported as
        unsustainable are left out of the dict.

    Raises:
        AssertionError: if a header line does not match its pattern.
        StopIteration: if the log ends in the middle of a section.
    """
    partial_load_runs = (
        ("25", QUARTER_RE),
        ("50", HALF_RE),
        ("75", THREE_QUARTER_RE),
    )
    latencies_by_percentage = {}

    for percentage, matcher in partial_load_runs:
        header_line = next(f)
        assert matcher.match(header_line) is not None, \
                    f"{matcher} ({percentage}%) did not match line: {header_line}"
        next(f)  # The line right after the header carries nothing we need.

        # Scan forward to the run's log directory, or to the marker saying
        # that this load level could not be sustained.
        log_dir_match = None
        sustainable = True
        while sustainable and log_dir_match is None:
            log_line = next(f)
            if "counting as unsustainable" in log_line:
                print(f" '--> {percentage}% unsustainable, no latencies.")
                sustainable = False
            else:
                log_dir_match = LOG_DIR_RE.match(log_line)

        if not sustainable:
            continue

        assert log_dir_match is not None, f"line {log_line} did not match ({percentage})"
        # Only the directory name is kept; parse_root_log resolves it
        # against LOG_PATH.
        run_dir = os.path.basename(log_dir_match.group(1))
        latencies_by_percentage[percentage] = parse_root_log(run_dir)

    return latencies_by_percentage


def find_sustainable_run_logs(all_logs_dir, num_events):
    """Locate the log directory of the run at the sustainable throughput.

    Looks inside ``LOG_PATH/all_logs_dir`` for entries whose name contains
    ``str(num_events)`` and picks the lexicographically greatest one.

    Args:
        all_logs_dir: Directory (relative to ``LOG_PATH``) holding the
            per-run log directories.
        num_events: Sustainable throughput in events/s used to select the run.

    Returns:
        The chosen entry joined onto ``all_logs_dir``, i.e. still relative to
        ``LOG_PATH``.

    Raises:
        AssertionError: if ``num_events`` is ``None`` or no entry matches.
    """
    # Without this check a missing throughput would be searched for as the
    # literal text "None" and fail later with an unhelpful message.
    assert num_events is not None, \
        f"No sustainable throughput known while looking for run logs in {all_logs_dir}"

    log_dir = os.path.join(LOG_PATH, all_logs_dir)
    needle = str(num_events)
    # NOTE(review): substring match -- 1000 also matches a directory created
    # for 10000 events/s; confirm the directory naming rules this out.
    matching_dirs = [entry for entry in os.listdir(log_dir) if needle in entry]
    assert matching_dirs, f"No run logs for {num_events} events/s found in {log_dir}"

    # Same entry that sorting and taking the last element would select.
    return os.path.join(all_logs_dir, max(matching_dirs))


def parse_log_file(log_file):
    """Parse one benchmark log into a nested latency dictionary.

    The log is read line by line and four kinds of lines drive the parser:

    * ``BENCHMARK_RE`` starts a new benchmark and resets its entry.
    * ``RUN_RE`` starts a new run (number of children, number of streams).
    * ``THROUGHPUT_RE`` records the sustainable throughput; a later match
      overwrites an earlier one.
    * ``ALL_LOGS_RE`` closes the run: the following lines are handed to
      ``parse_latencies`` for the 25/50/75% runs, and the 100% latencies are
      read from the run directory matching the recorded throughput.

    Args:
        log_file: Path of the benchmark log to parse.

    Returns:
        ``{(windows, agg_fns): {(num_children, num_streams):
        {"25" | "50" | "75" | "100": [(window_start, window_end, latency)]}}}``.
        Percentages that were unsustainable are missing from the innermost
        dict.

    Raises:
        AssertionError: if the log lines appear in an unexpected order, e.g.
            a run summary without a preceding benchmark or run line.
    """
    all_latencies = {}
    # Initialised up front so that a malformed log fails with a clear
    # assertion below instead of a NameError.
    current_bm = None
    current_run = None
    current_throughput = None

    with open(log_file) as f:
        # parse_latencies() advances this same iterator, so the loop resumes
        # right after the lines it consumed.
        for line in f:
            benchmark_match = BENCHMARK_RE.match(line)
            if benchmark_match is not None:
                current_bm = (benchmark_match.group(1), benchmark_match.group(2))
                all_latencies[current_bm] = {}
                print(current_bm)

            run_match = RUN_RE.match(line)
            if run_match is not None:
                # The previous run must have been closed by an ALL_LOGS line.
                assert current_run is None, f"Did not find logs for {current_run}"
                current_run = (int(run_match.group(1)), int(run_match.group(2)))
                print(f"{current_run[0]} child(ren), {current_run[1]} stream(s)")
                continue

            throughput_match = THROUGHPUT_RE.match(line)
            if throughput_match is not None:
                if current_throughput is not None:
                    print(f"Did not find run line after {current_run}")
                current_throughput = int(throughput_match.group(1))

            all_logs_match = ALL_LOGS_RE.match(line)
            if all_logs_match is not None:
                assert current_bm is not None, \
                    f"Found run logs before any benchmark line: {line}"
                assert current_run is not None, \
                    f"Found run logs before any run line: {line}"
                all_latencies[current_bm][current_run] = parse_latencies(f)

                abs_all_logs_dir = all_logs_match.group(1)
                # Every "." is stripped from the directory name before lookup.
                all_logs_dir = os.path.basename(abs_all_logs_dir).replace(".", "")
                full_run_logs = find_sustainable_run_logs(all_logs_dir, current_throughput)
                all_latencies[current_bm][current_run]["100"] = parse_root_log(full_run_logs)
                current_run = None
                current_throughput = None

    return all_latencies
                

# Parse every top-level *.log file in LOG_PATH (in sorted name order) and
# combine the results into one {benchmark: {run: {percentage: latencies}}} dict.
ALL_LATENCIES = {}
for log_file in sorted(os.listdir(LOG_PATH)):
    if not log_file.endswith(".log"):
        continue

    print(f"Parsing {log_file}")
    abs_log_file = os.path.join(LOG_PATH, log_file)
    latencies = parse_log_file(abs_log_file)

    # Merge per benchmark: a plain top-level dict merge would drop every run
    # of a benchmark that an earlier file already contributed. A run that
    # occurs in several files is still taken from the last file.
    for benchmark, runs in latencies.items():
        ALL_LATENCIES.setdefault(benchmark, {}).update(runs)

print(f"ALL RUNS: {ALL_LATENCIES.keys()}")
print(f"\nALL_LATENCIES:\n{ALL_LATENCIES}")
        

In [None]:
# Only windows lying completely inside [WINDOW_START, WINDOW_END] are kept.
WINDOW_START = 30000
WINDOW_END   = 90000
# A load level with fewer in-range latencies than this is treated as missing.
MIN_LATENCIES = 60
# Fixed order of the load levels; it defines the slot of each box in the plots.
PERCENTAGES = ("25", "50", "75", "100")

# {benchmark: {run: [latencies@25%, latencies@50%, latencies@75%, latencies@100%]}}
LATENCIES = {}
for benchmark, all_runs in ALL_LATENCIES.items():
    LATENCIES[benchmark] = {}
    for run, all_latencies in all_runs.items():
        filtered_latencies = []
        # Iterate the fixed order rather than the dict: unsustainable load
        # levels have no key at all, and skipping them would shift the
        # remaining levels into the wrong plot position.
        for percentage in PERCENTAGES:
            percentage_latencies = [
                latency
                for window_start, window_end, latency in all_latencies.get(percentage, [])
                if window_start >= WINDOW_START and window_end <= WINDOW_END
            ]

            if len(percentage_latencies) < MIN_LATENCIES:
                print(f"Missing latencies for {benchmark} {run} {percentage}%...")
                filtered_latencies.append([])
            else:
                filtered_latencies.append(percentage_latencies)

        LATENCIES[benchmark][run] = filtered_latencies

print(LATENCIES)

# Plots

In [None]:
import matplotlib.pyplot as plt
from matplotlib import rcParams
# Global matplotlib settings for all plots below: 'figure.autolayout' makes
# figures adjust their layout automatically so labels fit, and
# 'pgf.rcfonts': False keeps the pgf backend from configuring fonts from rcParams.
rcParams.update({'figure.autolayout': True, 'pgf.rcfonts' : False})

In [None]:
def plot_latencies(latencies, title, out_dir="/tmp/plots"):
    """Draw a latency boxplot per load level and save it as a PNG file.

    Args:
        latencies: Sequence of latency lists, one per load level in the order
            25%, 50%, 75%, 100%. An empty list yields an empty slot.
        title: Plot title. It is also used for the file name, with spaces
            replaced by underscores.
        out_dir: Directory the image is written to; created if missing.

    The figure is written to ``<out_dir>/latency_<title>.png`` and closed
    afterwards, so nothing is displayed inline.
    """
    plt.boxplot(latencies, showfliers=False)
    plt.title(title)
    plt.ylabel("latency in ms")
    plt.xlabel("throughput percentage")
    plt.ylim(bottom=0)
    # Boxes sit at x = 1..4. Ticks 0 and 5 only pad the axis and get empty
    # labels; the number of labels has to equal the number of tick positions.
    plt.xticks(range(6), ["", "25%", "50%", "75%", "100%", ""])

    # savefig does not create missing directories.
    os.makedirs(out_dir, exist_ok=True)
    plt.savefig(os.path.join(out_dir, f"latency_{title.replace(' ', '_')}.png"))
    plt.close()

# One boxplot image per (benchmark, run); the title spells out the run's
# number of children and streams with the matching singular/plural noun.
for benchmark, all_runs in LATENCIES.items():
    for run, latencies in all_runs.items():
        num_children, num_streams = run
        child_str = "Children" if num_children != 1 else "Child"
        stream_str = "Streams" if num_streams != 1 else "Stream"
        title = f"{benchmark} {num_children} {child_str}, {num_streams} {stream_str}"
        plot_latencies(latencies, title)

# if save_fig:
#     plt.savefig(f"load_{out_file_name}.svg")
#     plt.savefig(f"load_{out_file_name}.pgf")
#     plt.savefig(f"load_{out_file_name}.png")
# plt.show()