In [None]:
import os
import re
import pprint

# Line announcing a run. Group 2: optional text before " intermediates",
# group 3: number of children, group 4: number of streams.
RUN_RE = re.compile(r"Running ((.*) intermediates, )?(\d+) child.*, (\d+) stream.*")
# Line reporting the result of a run. Group 1: throughput in events/s.
THROUGHPUT_RE = re.compile(r"Found sustainable candidate \((\d+) events/s.\)*")
# Header line of a benchmark. Group 1: window spec, group 2: aggregation functions.
# NOTE(review): group 2 is greedy, so a trailing "- DISTRIBUTED" / "- SINGLE_NODE"
# is captured as part of group 2 and the optional group never matches anything.
# Left unchanged because the result keys depend on it -- confirm whether intended.
BENCHMARK_RE = re.compile(r"BENCHMARK: WINDOWS: (.*) - AGG_FNS: (.*)(- (DISTRIBUTED|SINGLE_NODE))?")

def parse_log_file(log_file):
    """Extract the sustainable throughput of every run from one benchmark log.

    The log is scanned line by line. A BENCHMARK line opens a new (empty) result
    set, a "Running ..." line announces a run, and the following
    "Found sustainable candidate ..." line supplies that run's throughput.

    Inconsistent logs are reported on stdout and the offending line is skipped:
    a run without a candidate line, a candidate line without a preceding run
    line, and a candidate line seen before any BENCHMARK header. Skipping keeps
    ``None`` keys out of the result, which would break consumers that unpack
    ``(num_children, num_streams)``.

    Args:
        log_file: Path of the log file to read.

    Returns:
        dict mapping ``(windows, agg_fns)`` (both strings) to a dict mapping
        ``(num_children, num_streams)`` (ints) to the throughput in events/s (int).
    """
    sustainable_throughputs = {}

    current_bm = None
    # Run that was announced but has not been matched with a candidate line yet.
    current_run = None
    # Last run that received a throughput; only used to make warnings useful.
    last_completed_run = None

    with open(log_file) as f:
        for line in f:
            # The three patterns start with different literals and are anchored
            # at the start of the line, so at most one of them can match.
            benchmark_match = BENCHMARK_RE.match(line)
            if benchmark_match is not None:
                current_bm = (benchmark_match.group(1), benchmark_match.group(2))
                # A repeated header for the same benchmark starts over.
                sustainable_throughputs[current_bm] = {}
                continue

            run_match = RUN_RE.match(line)
            if run_match is not None:
                if current_run is not None:
                    print(f"Did not find candidate line for {current_run}")
                current_run = (int(run_match.group(3)), int(run_match.group(4)))
                continue

            throughput_match = THROUGHPUT_RE.match(line)
            if throughput_match is None:
                continue

            if current_run is None:
                # Candidate without a run line: there is no key to store it under.
                print(f"Did not find run line after {last_completed_run}")
                continue

            if current_bm is None:
                # Candidate before any BENCHMARK header: nothing to attach it to.
                print(f"Did not find benchmark line before {current_run}")
                current_run = None
                continue

            sustainable_throughputs[current_bm][current_run] = int(throughput_match.group(1))
            last_completed_run = current_run
            current_run = None

    if current_run is not None:
        print(f"Did not find candidate line for {current_run}")

    return sustainable_throughputs
                
def get_all_throughputs(log_path):
    """Parse every ``*.log`` file in a directory and merge the results.

    Files are processed in sorted name order. The merge is shallow: when two
    files report the same benchmark key, the whole entry of the later file
    replaces the earlier one.

    Args:
        log_path: Directory containing the log files.

    Returns:
        dict with the merged results of ``parse_log_file`` for all log files.
    """
    all_throughputs = {}
    log_names = [name for name in sorted(os.listdir(log_path)) if name.endswith(".log")]
    for log_name in log_names:
        print(f"Parsing {log_name}")
        full_path = os.path.join(log_path, log_name)
        all_throughputs.update(parse_log_file(full_path))
    return all_throughputs

In [None]:
def merge_paths(paths):
    """Collect the throughputs of several log directories into one dict.

    Directories are processed in the given order; for a benchmark key present
    in more than one directory, the entry of the later directory wins. The
    merged dict is pretty-printed before it is returned.

    Args:
        paths: Iterable of directories to pass to ``get_all_throughputs``.

    Returns:
        dict with the merged results of all directories.
    """
    merged_tp = {}
    for directory in paths:
        merged_tp.update(get_all_throughputs(directory))
    pprint.pprint(merged_tp)
    return merged_tp

# Load the results of the concurrent-window benchmarks.
# Directories are merged in list order, so for a benchmark that occurs in
# several of them the entry from the directory listed last is kept.
# NOTE(review): absolute, machine-specific paths -- the notebook only runs
# where these directories exist.
print("CONCURRENT")
CONCURRENT_PATHS = [
    "/Users/law/repos/ma/benchmark-runs/concurrent_partial",
    "/Users/law/repos/ma/benchmark-runs/concurrent_rest",
    "/Users/law/repos/ma/benchmark-runs/concurrent_tumbling",
    "/Users/law/repos/ma/benchmark-runs/concurrent_tumbling_high_bad",
]
CONCURRENT_TP = merge_paths(CONCURRENT_PATHS)
 
# Load the results of the distributed matrix runs; entries from
# "matrix_median" replace entries with the same benchmark key from "matrix_run".
print("DISTRIBUTED MATRIX")
DIST_MATRIX_PATHS = [
    "/Users/law/repos/ma/benchmark-runs/matrix_run",
    "/Users/law/repos/ma/benchmark-runs/matrix_median"
]
DIST_MATRIX_TP = merge_paths(DIST_MATRIX_PATHS)

# Load the results of the single-node matrix runs (one directory only).
print("SINGLE NODE MATRIX")
SINGLE_MATRIX_PATHS = [
    "/Users/law/repos/ma/benchmark-runs/matrix_median_single"
]
SINGLE_MATRIX_TP = merge_paths(SINGLE_MATRIX_PATHS)

In [None]:
def print_throughputs(all_throughputs):
    """Print a per-child throughput summary for every benchmark.

    Benchmarks and, within each benchmark, runs are listed in sorted key order.
    For each run the reported number is ``throughput * num_streams`` divided
    (integer division) by ``num_children``. A blank line follows each benchmark.

    Args:
        all_throughputs: dict mapping a benchmark key to a dict that maps
            ``(num_children, num_streams)`` to an integer throughput.
    """
    for benchmark in sorted(all_throughputs):
        print(f"Benchmark {benchmark}")
        runs = all_throughputs[benchmark]
        for run in sorted(runs):
            num_children, num_streams = run
            streams_per_child = num_streams // num_children
            per_child_tp = runs[run] * num_streams // num_children
            print(f"Total sustainable throughput for {num_children} child(ren) with "
                  f"{streams_per_child} stream(s) each "
                  f"is {per_child_tp: >7d} events/s per child.")
        print()
    

print_throughputs(CONCURRENT_TP)
# MATRIX_TP is not defined anywhere in this notebook, so referencing it raises a
# NameError on a fresh kernel. Print the two matrix result sets that are loaded.
print_throughputs(DIST_MATRIX_TP)
print_throughputs(SINGLE_MATRIX_TP)

# Plots

In [None]:
from matplotlib import rcParams
# Global matplotlib settings for all plots below. Per matplotlib's rcParams
# documentation: 'figure.autolayout' adjusts subplot parameters automatically so
# labels fit into the figure, and 'pgf.rcfonts' = False stops the pgf backend
# from setting up fonts from the rc parameters.
rcParams.update({'figure.autolayout': True, 'pgf.rcfonts' : False})

### Plot Sustainable Throughput

In [None]:
import matplotlib.pyplot as plt

def plot_single_child_throughputs(child_streams, throughputs, title):
    """Draw a line plot of throughputs and show it.

    Args:
        child_streams: Sequence of run descriptors; ``str()`` of each entry is
            used as the x tick label at the matching position.
        throughputs: Sequence of y values, labelled as millions of events/s.
        title: Title of the plot.
    """
    tick_positions = range(len(child_streams))
    tick_labels = list(map(str, child_streams))

    plt.plot(throughputs)
    plt.xticks(tick_positions, tick_labels)
    plt.ylabel("events/s in mio.")
    plt.xlabel("#children, # streams")
    plt.title(title)
    # Fixed y range so that the plots of different benchmarks are comparable.
    plt.ylim(ymin=0, ymax=1.2)
    plt.show()
#     plt.savefig(f"/tmp/plots/single_child_{title[0].replace(',', '_')}_{title[1]}.png")
#     plt.close()

    
# MATRIX_TP is not defined anywhere in this notebook (NameError on a fresh
# kernel); plot both matrix result sets that are actually loaded instead.
for matrix_label, matrix_tp in (("DISTRIBUTED MATRIX", DIST_MATRIX_TP),
                                ("SINGLE NODE MATRIX", SINGLE_MATRIX_TP)):
    print(matrix_label)
    for benchmark, run_throughputs in sorted(matrix_tp.items()):
        print(f"Benchmark {benchmark}")
        bm_child_streams = []
        bm_throughputs = []
        for (num_children, num_streams), throughput in sorted(run_throughputs.items()):
            # This plot only covers runs with exactly one child.
            if num_children != 1:
                continue
            bm_child_streams.append((num_children, num_streams))
            # Total over all streams, scaled to millions of events/s.
            bm_throughputs.append((num_streams * throughput) / 1_000_000)
        print(bm_child_streams)
        print(bm_throughputs)
        plot_single_child_throughputs(bm_child_streams, bm_throughputs, benchmark)

In [None]:
import matplotlib.pyplot as plt


def plot_multi_child_throghputs(child_streams, throughputs, title):
    """Show the throughputs twice: first as a bar chart, then as a line plot.

    Both figures share the same axis labels, tick labels, title and a y axis
    starting at 0.

    Args:
        child_streams: Sequence of ``(num_children, num_streams)`` pairs, used
            for the x tick labels.
        throughputs: Sequence of y values, labelled as millions of events/s.
        title: Title of both figures.
    """
    tick_positions = range(len(child_streams))
    tick_labels = [f"({n_child}, {n_stream})" for n_child, n_stream in child_streams]

    def _decorate_and_show():
        # Shared decoration of the current figure, followed by displaying it.
        plt.ylabel("events/s in mio.")
        plt.xlabel("(#children, #streams)")
        plt.xticks(tick_positions, tick_labels)
        plt.title(title)
        plt.ylim(ymin=0)
        plt.show()

    plt.bar(range(len(throughputs)), throughputs)
    _decorate_and_show()

    plt.plot(throughputs)
    _decorate_and_show()

# MATRIX_TP is not defined anywhere in this notebook (NameError on a fresh
# kernel); plot both matrix result sets that are actually loaded instead.
for matrix_label, matrix_tp in (("DISTRIBUTED MATRIX", DIST_MATRIX_TP),
                                ("SINGLE NODE MATRIX", SINGLE_MATRIX_TP)):
    print(matrix_label)
    for benchmark, run_throughputs in sorted(matrix_tp.items()):
        print(f"Benchmark {benchmark}")
        bm_child_streams = []
        bm_throughputs = []
        for (num_children, num_streams), throughput in sorted(run_throughputs.items()):
            # This plot only covers runs with as many streams as children.
            if num_children != num_streams:
                continue
            bm_child_streams.append((num_children, num_streams))
            # Total over all streams, scaled to millions of events/s.
            bm_throughputs.append((num_streams * throughput) / 1_000_000)
        print(bm_child_streams)
        print(bm_throughputs)
        plot_multi_child_throghputs(bm_child_streams, bm_throughputs, benchmark)


In [None]:
import matplotlib.pyplot as plt
from collections import defaultdict

def plot_concurrent_throughputs(num_windows, throughputs, title):
    """Print the data points, then plot throughput over the number of windows.

    The x axis is logarithmic and the y axis starts at 0.

    Args:
        num_windows: x values (numbers of concurrent windows).
        throughputs: y values, labelled as millions of events/s.
        title: Title of the plot.
    """
    points = list(zip(num_windows, throughputs))
    print(points)

    plt.plot(num_windows, throughputs)
#     plt.xticks(range(num_windows))
    plt.xlabel("#concurrent windows")
    plt.ylabel("events/s in mio.")
    plt.title(title)
    plt.xscale("log")
    plt.ylim(ymin=0) #, ymax=1.2)
    plt.show()

    
# Collect, per aggregation function, the throughputs of the concurrent
# benchmarks in millions of events/s, visiting the benchmarks in ascending
# order of their window count.
bm_throughputs = defaultdict(list)
bm_num_windows = set()
# The window count is the second comma-separated field of the benchmark's
# window spec (benchmark[0]), parsed as an int.
for benchmark, run_throughputs in sorted(CONCURRENT_TP.items(), key=lambda x: int(x[0][0].split(",")[1])):
    print(f"Benchmark {benchmark}")
    print(run_throughputs)
    num_windows = int(benchmark[0].split(",")[1])
    bm_num_windows.add(num_windows)
    agg_fn = benchmark[1]
    # Every run of the benchmark contributes one value.
    # NOTE(review): the plots below pair these values positionally with the
    # sorted window counts, which presumably assumes exactly one run per
    # benchmark and the same window counts for every aggregation function -- confirm.
    for throughput in run_throughputs.values(): 
        bm_throughputs[agg_fn].append(throughput / 1_000_000)

# Only the first seven window counts / values are plotted for each function.
# Looking up an aggregation function that never occurred yields an empty list
# (and adds that key to bm_throughputs), because bm_throughputs is a defaultdict.
print(bm_throughputs)    
plot_concurrent_throughputs(sorted(bm_num_windows)[:7], bm_throughputs["MAX"][:7], "MAX")
# plt.savefig(f"/tmp/plots/concurrent_max_tumbling.png")
# plt.close()

print(bm_throughputs)    
plot_concurrent_throughputs(sorted(bm_num_windows)[:7], bm_throughputs["M_AVG"][:7], "AVG")
# plt.savefig(f"/tmp/plots/concurrent_avg_tumbling.png")
# plt.close()

plot_concurrent_throughputs(sorted(bm_num_windows)[:7], bm_throughputs["M_MEDIAN"][:7], "MEDIAN")
# plt.savefig(f"/tmp/plots/concurrent_median_tumbling.png")
# plt.close()

# for agg_fn, tps in bm_throughputs.items():
#     plot_concurrent_throughputs(sorted(bm_num_windows), tps, agg_fn)