In [None]:
import os
import re
import pprint
from collections import defaultdict

# "Running [<x> intermediates, ]<children> child..., <streams> stream..." -> groups 3 and 4
# carry the child and stream counts.
RUN_RE = re.compile(r"Running ((.*) intermediates, )?(\d+) child.*, (\d+) stream.*")
# "Found sustainable candidate (<n> events/s..." -> group 1 is the throughput.
THROUGHPUT_RE = re.compile(r"Found sustainable candidate \((\d+) events/s.\)*")
# "BENCHMARK: WINDOWS: <windows> - AGG_FNS: <fn>[ - DISTRIBUTED|SINGLE_NODE]".
BENCHMARK_RE = re.compile(r"BENCHMARK: WINDOWS: (.*) - AGG_FNS: (\w+)( - (DISTRIBUTED|SINGLE_NODE))?")

def parse_log_file(log_file):
    """Extract the sustainable throughput of every run recorded in one log file.

    The file is scanned line by line for three kinds of lines (all matched at
    the start of the line): a benchmark header, a run header and a
    "sustainable candidate" line. A candidate line is attributed to the most
    recent run header, which in turn belongs to the most recent benchmark
    header.

    Args:
        log_file: Path of the log file to read.

    Returns:
        dict mapping ``(windows, agg_fn, mode)`` to a dict that maps
        ``(num_children, num_streams)`` to the throughput (int) found for that
        run. ``mode`` defaults to ``"DISTRIBUTED"`` when the benchmark header
        does not name one. A repeated benchmark header starts that benchmark's
        dict from scratch.

    Candidate lines that cannot be attributed (no run header or no benchmark
    header seen before them) are reported on stdout and skipped instead of
    being stored under a ``None`` key or raising ``KeyError``.
    """
    sustainable_throughputs = {}

    current_bm = None
    current_run = None
    with open(log_file) as f:
        for line in f:
            benchmark_match = BENCHMARK_RE.match(line)
            if benchmark_match is not None:
                curr_windows, curr_agg_fn = benchmark_match.group(1, 2)
                # Group 4 is None when the header carries no explicit mode.
                curr_mode = benchmark_match.group(4) or "DISTRIBUTED"
                current_bm = (curr_windows, curr_agg_fn, curr_mode)
                sustainable_throughputs[current_bm] = {}

            run_match = RUN_RE.match(line)
            if run_match is not None:
                if current_run is not None:
                    # The previous run never reported a candidate.
                    print(f"Did not find candidate line for {current_run}")
                current_run = (int(run_match.group(3)), int(run_match.group(4)))

            throughput_match = THROUGHPUT_RE.match(line)
            if throughput_match is not None:
                throughput = int(throughput_match.group(1))
                if current_run is None:
                    print(f"Did not find run line for candidate ({throughput} events/s), skipping it")
                elif current_bm is None:
                    print(f"Did not find benchmark line for run {current_run}, skipping it")
                else:
                    sustainable_throughputs[current_bm][current_run] = throughput
                # The run (if any) has been consumed by this candidate line.
                current_run = None

    if current_run is not None:
        print(f"Did not find candidate line for {current_run}")

    return sustainable_throughputs
                
def get_all_throughputs(log_path):
    """Parse every ``*.log`` entry of a directory and merge the results.

    Entries are visited in sorted name order; for each one the per-benchmark
    run dicts returned by ``parse_log_file`` are merged with ``dict.update``,
    so a later file overrides an earlier one for the same benchmark and run.

    Args:
        log_path: Directory whose entries are listed.

    Returns:
        ``defaultdict(dict)`` keyed by benchmark, holding the merged run dicts.
    """
    combined = defaultdict(dict)
    log_names = [name for name in sorted(os.listdir(log_path)) if name.endswith(".log")]
    for name in log_names:
        print(f"Parsing {name}")
        per_file = parse_log_file(os.path.join(log_path, name))
        for benchmark_key in per_file:
            combined[benchmark_key].update(per_file[benchmark_key])
    return combined

In [None]:
# Directory that merge_paths joins each relative run folder onto.
# NOTE(review): this is an absolute path on one machine; adjust it before running elsewhere.
BASE_LOG_DIR = "/Users/law/repos/ma/benchmark-runs"

def merge_paths(paths, base_dir=None):
    """Merge the throughputs of several run folders into one mapping.

    Each entry of ``paths`` is joined onto the root directory and handed to
    ``get_all_throughputs``; the results are merged with ``dict.update`` in
    the order given, so later folders override earlier ones for the same
    benchmark and run. The merged mapping is pretty-printed before it is
    returned.

    Args:
        paths: Iterable of folder names relative to the root directory.
        base_dir: Root directory to resolve ``paths`` against. Defaults to
            the module-level ``BASE_LOG_DIR`` when omitted, which keeps
            existing ``merge_paths(paths)`` calls working unchanged.

    Returns:
        ``defaultdict(dict)`` keyed by benchmark, holding the merged run dicts.
    """
    merged_tp = defaultdict(dict)
    for path in paths:
        # Resolved per iteration so BASE_LOG_DIR is only consulted when needed.
        root = BASE_LOG_DIR if base_dir is None else base_dir
        abs_path = os.path.join(root, path)
        for bm, tps in get_all_throughputs(abs_path).items():
            merged_tp[bm].update(tps)
    pprint.pprint(merged_tp)
    return merged_tp

# Run folders (relative to BASE_LOG_DIR) that feed each result set.
CONCURRENT_PATHS = ["concurrent_tumbling_20"]
MATRIX_PATHS = ["matrix_dist_all", "matrix_single_all"]

# Parse and merge each set; merge_paths pretty-prints the merged mapping under its heading.
print("CONCURRENT")
CONCURRENT_TP = merge_paths(CONCURRENT_PATHS)

print("MATRIX")
MATRIX_TP = merge_paths(MATRIX_PATHS)

In [None]:
def print_throughputs(all_throughputs):
    """Print a text summary of every benchmark's runs.

    Benchmarks and, within each benchmark, runs are listed in sorted order.
    For a run keyed ``(num_children, num_streams)`` the line reports
    ``num_streams // num_children`` streams per child and
    ``throughput * num_streams // num_children`` events/s, right-aligned to
    seven characters. A blank line follows each benchmark.

    Args:
        all_throughputs: Mapping of benchmark key to a mapping of
            ``(num_children, num_streams)`` to an integer throughput.
    """
    for bm_key, per_run in sorted(all_throughputs.items()):
        print(f"Benchmark {bm_key}")
        for run_key, tp in sorted(per_run.items()):
            children, streams = run_key
            streams_per_child = streams // children
            per_child_tp = tp * streams // children
            print(
                f"Total sustainable throughput for {children} child(ren) with "
                f"{streams_per_child} stream(s) each "
                f"is {per_child_tp: >7d} events/s per child."
            )
        print()
    

# Text summary of both merged result sets.
print_throughputs(CONCURRENT_TP)
print_throughputs(MATRIX_TP)

# Plots

In [None]:
from matplotlib import rcParams
import matplotlib.pyplot as plt
# Global figure defaults applied to every plot below.
rcParams.update({'figure.autolayout': True, 'pgf.rcfonts' : False, 'font.size': 14, 'lines.linewidth': 3})
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and later
# releases removed the old names, so use whichever spelling this install provides.
# The original name stays as the fallback so a missing style still fails loudly.
plt.style.use(next(
    (style for style in ("seaborn-v0_8-deep", "seaborn-deep") if style in plt.style.available),
    "seaborn-deep",
))
# matplotlib format strings (colour, line style, marker), one per plotted series.
FORMATS = ["b-o", "g--x", "r-^", "c-<", "m-+", "k-*",]

### Plot Sustainable Throughput

In [None]:
def plot_throughput_group(child_streams, throughputs, title):
    """Plot one line per aggregation function for a single benchmark group.

    Args:
        child_streams: Sequence of run configurations; their ``str()`` forms
            label the x ticks, one tick per position.
        throughputs: Mapping of series label (aggregation function) to the
            list of y values, plotted against their list index.
        title: Value passed to ``plt.title``.

    Series are drawn in sorted label order. Formats come from ``FORMATS`` and
    wrap around when there are more series than formats, rather than raising
    ``StopIteration``.
    """
    sorted_throughputs = sorted(throughputs.items())
    print(f"sorted_tps: {sorted_throughputs}")
    for idx, (agg_fn, tp) in enumerate(sorted_throughputs):
        plt.plot(tp, FORMATS[idx % len(FORMATS)], label=agg_fn)

    # A legend without any labelled line only produces a warning.
    if sorted_throughputs:
        plt.legend()
    str_child_streams = [str(cs) for cs in child_streams]
    plt.xticks(range(len(child_streams)), str_child_streams)
    plt.ylabel("events/s in mio.")
    plt.xlabel("#children, # streams")
    plt.title(title)
    plt.ylim(ymin=0)
    plt.show()
#     plt.savefig(f"/tmp/plots/single_child_{title[0].replace(',', '_')}_{title[1]}.png")
#     plt.close()

def group_throughput_mode(bms, num_child_streams):
    """Group benchmark series by ``(windows, mode)``.

    Args:
        bms: Iterable of ``(benchmark, run_throughputs)`` pairs, where
            ``benchmark`` is a ``(windows, agg_fn, mode)`` tuple and
            ``run_throughputs`` maps run configurations to throughputs.
        num_child_streams: Run configurations to extract, in plot order.
            Each must be a key of every ``run_throughputs`` (``KeyError``
            otherwise); its second element is used as the stream count.

    Returns:
        ``defaultdict(dict)`` mapping ``(windows, mode)`` to
        ``{agg_fn: [stream_count * throughput / 1_000_000, ...]}`` with one
        value per entry of ``num_child_streams``.
    """
    grouped = defaultdict(dict)
    for bm_key, per_run in bms:
        windows, fn_name, run_mode = bm_key
        target = (windows, run_mode)
        print(f"Adding benchmark {bm_key} to group {target}")
        grouped[target][fn_name] = [
            (cs[1] * per_run[cs]) / 1_000_000 for cs in num_child_streams
        ]
    return grouped
    
def group_throughput_benchmarks(bms, num_child_streams):
    """Split the grouped benchmark series by execution mode.

    Args:
        bms: Mapping of ``(windows, agg_fn, mode)`` to run throughputs; its
            items are sorted before being grouped.
        num_child_streams: Run configurations forwarded to
            ``group_throughput_mode``.

    Returns:
        dict with the keys ``"distributed"`` and ``"centralized"`` (in that
        order), each holding the ``(windows, mode)`` groups whose mode is
        ``"DISTRIBUTED"`` or ``"SINGLE_NODE"`` respectively.
    """
    per_group = group_throughput_mode(sorted(bms.items()), num_child_streams)
    mode_labels = (("distributed", "DISTRIBUTED"), ("centralized", "SINGLE_NODE"))
    return {
        label: {key: series for key, series in per_group.items() if key[1] == mode}
        for label, mode in mode_labels
    }

In [None]:
def plot_throughput_benchmarks(bms, num_child_streams):
    """Draw one throughput figure per ``(windows, mode)`` group.

    Args:
        bms: Mapping of benchmark key to run throughputs.
        num_child_streams: Run configurations to plot, in x-axis order.
    """
    by_mode = group_throughput_benchmarks(bms, num_child_streams)
    for sub_groups in by_mode.values():
        for group_key in sub_groups:
            # The group key doubles as the figure title.
            plot_throughput_group(num_child_streams, sub_groups[group_key], group_key)
        
def plot_multi_child_throughputs(bms):
    """Plot the runs configured as (1, 1), (2, 2), (4, 4) and (8, 8)."""
    plot_throughput_benchmarks(bms, [(n, n) for n in (1, 2, 4, 8)])
    
def plot_single_child_throughputs(bms):
    """Plot the runs configured as (1, 1), (1, 2), (1, 4) and (1, 8)."""
    plot_throughput_benchmarks(bms, [(1, n) for n in (1, 2, 4, 8)])

           
# Draw the per-group figures for the matrix runs: single-child configurations first,
# then the configurations with one stream per child.
plot_single_child_throughputs(MATRIX_TP)
plot_multi_child_throughputs(MATRIX_TP)

In [None]:
def plot_throughput_by_agg_fn(child_streams, throughputs):
    """Plot distributed against centralized throughput for the MAX aggregation.

    Args:
        child_streams: Run configurations in x-axis order; the second element
            of each one (the stream count) is used as the x value.
        throughputs: Iterable of ``(mode, {agg_fn: series})`` pairs. After
            sorting by mode, the MAX entry must have exactly two series,
            which are read as (distributed, centralized).

    The figure is written to ``/tmp/plots`` as PNG and PDF (the directory is
    created when missing) and then shown. Aggregations other than ``"MAX"``
    are ignored.
    """
    print(throughputs)
    data = defaultdict(list)
    # Sort on the mode string only: comparing whole tuples would fall through to
    # comparing the dicts (TypeError) whenever two entries share a mode.
    for mode, run in sorted(throughputs, key=lambda entry: entry[0]):
        for agg_fn, tps in run.items():
            data[agg_fn].append(tps)

    print(data)
    # Derived from the argument instead of a hard-coded [1, 2, 4, 8].
    x_values = [cs[1] for cs in child_streams]
    for agg_fn, series in data.items():
        if agg_fn != "MAX":
            continue
        # Unpacked only for the plotted aggregation so that other aggregations
        # with a different number of series cannot abort the plot.
        dist, single = series
        plt.plot(x_values, dist, label="distributed max", marker="o", ms=7)
        plt.plot(x_values, single, label="centralized max", marker="^", ms=8)
        plt.legend()
        plt.ylabel("events/s in mio.")
        plt.xlabel("# input streams")
        # Leave 10% headroom above the tallest point of either series.
        plt.ylim(ymin=0, ymax=1.1 * max(max(dist), max(single)))
        plt.xlim(xmin=0.5)
        # savefig does not create missing directories.
        os.makedirs("/tmp/plots", exist_ok=True)
        plt.savefig(f"/tmp/plots/tumbling_single_node_{agg_fn}.png")
        plt.savefig(f"/tmp/plots/tumbling_single_node_{agg_fn}.pdf")
        plt.show()

def plot_tumbling_scale(bms):
    """Plot the TUMBLING benchmarks for the (1, 1) ... (8, 8) run configurations."""
    node_configs = [(1, 1), (2, 2), (4, 4), (8, 8)]
    by_mode = group_throughput_benchmarks(bms, node_configs)
    tumbling = []
    for sub_groups in by_mode.values():
        for group_key, series in sub_groups.items():
            # group_key is (windows, mode); keep only the tumbling-window groups.
            if group_key[0].startswith("TUMBLING"):
                tumbling.append((group_key[1], series))
    plot_throughput_by_agg_fn(node_configs, tumbling)
    
def plot_tumbling_single_node(bms):
    """Plot the TUMBLING benchmarks for the (1, 1) ... (1, 8) run configurations."""
    node_configs = [(1, 1), (1, 2), (1, 4), (1, 8)]
    by_mode = group_throughput_benchmarks(bms, node_configs)
    tumbling = []
    for sub_groups in by_mode.values():
        for group_key, series in sub_groups.items():
            # group_key is (windows, mode); keep only the tumbling-window groups.
            if group_key[0].startswith("TUMBLING"):
                tumbling.append((group_key[1], series))
    plot_throughput_by_agg_fn(node_configs, tumbling)
    

# Only the single-child variant is drawn; the scale-out variant is left disabled.
# Both write to the same output files, so enabling both would overwrite the first figure.
# plot_tumbling_scale(MATRIX_TP)
plot_tumbling_single_node(MATRIX_TP)

In [None]:
def plot_concurrent_throughput_group(num_windows, throughputs):
    """Plot throughput against the number of concurrent windows.

    Args:
        num_windows: x values shared by every series.
        throughputs: Iterable of ``((agg_fn, mode), series)`` pairs. Series
            whose ``agg_fn`` is ``"M_MEDIAN"`` are skipped.

    One line is drawn per remaining pair, in sorted order, on a logarithmic
    x axis. Markers wrap around when there are more series than markers,
    rather than raising ``StopIteration``. The figure is written to
    ``/tmp/plots`` as PNG and PDF (the directory is created when missing)
    and then shown.
    """
    markers = ['o', '^', 'x', '*']
    print(throughputs)
    plotted = 0
    for (agg_fn, mode), tps in sorted(throughputs):
        if agg_fn == "M_MEDIAN":
            continue
        mode_str = "distributed" if mode == "DISTRIBUTED" else "centralized"
        plt.plot(
            num_windows,
            tps,
            marker=markers[plotted % len(markers)],
            ms=8,
            label=f"{mode_str} - {agg_fn.replace('M_', '').lower()}",
        )
        plotted += 1

    plt.ylabel("events/s in mio.")
    plt.xlabel("# concurrent windows")
    plt.legend()
    plt.xscale("log")
    plt.ylim(ymin=0)
    # savefig does not create missing directories.
    os.makedirs("/tmp/plots", exist_ok=True)
    plt.savefig("/tmp/plots/concurrent_decomposable.png")
    plt.savefig("/tmp/plots/concurrent_decomposable.pdf")
    plt.show()
    
# Build one throughput series per (agg_fn, mode) from the concurrent-window runs.
bm_throughputs = defaultdict(list)
bm_num_windows = set()
# The window count is the second comma-separated field of the benchmark's windows
# string; iterating in ascending window count keeps each series in the same order
# as sorted(bm_num_windows), which is used as the x axis below.
for benchmark, run_throughputs in sorted(CONCURRENT_TP.items(), key=lambda x: int(x[0][0].split(",")[1])):
    print(f"Benchmark {benchmark}")
    num_windows = int(benchmark[0].split(",")[1])
    bm_num_windows.add(num_windows)
    agg_fn = benchmark[1]
    mode = benchmark[2]
    # Every run of the benchmark contributes one point (in millions of events/s).
    # NOTE(review): this assumes exactly one run per benchmark; with more, a series
    # gets more points than there are window counts - confirm against the logs.
    for throughput in run_throughputs.values(): 
        bm_throughputs[(agg_fn, mode)].append(throughput / 1_000_000)

# print(bm_throughputs)

plot_concurrent_throughput_group(sorted(bm_num_windows), bm_throughputs.items())

# for (agg_fn, mode), tps in sorted(bm_throughputs.items()):
#     plot_concurrent_throughput_group(sorted(bm_num_windows), (agg_fn.replace('M_', '').lower(), mode, tps), f"{agg_fn.replace('M_', '')} - {mode}")
    # plt.savefig(f"/tmp/plots/concurrent_max_tumbling.png")
    # plt.close()