In [None]:
import os
import re
import pprint
from collections import defaultdict

# Log-line patterns. Each one is applied with ``.match`` below, so it only
# matches when the text appears at the very start of a line.
#
# "Running [<x> intermediates, ]<n> child..., <m> stream...":
# group 3 is the child count, group 4 the stream count.
RUN_RE = re.compile(r"Running ((.*) intermediates, )?(\d+) child.*, (\d+) stream.*")
# "Found sustainable candidate (<n> events/s...": group 1 is the integer rate.
# NOTE(review): the unescaped "." accepts any single character after
# "events/s"; left untouched because the exact log wording is not visible here.
THROUGHPUT_RE = re.compile(r"Found sustainable candidate \((\d+) events/s.\)*")
# "BENCHMARK: WINDOWS: <windows> - AGG_FNS: <agg fns>": groups 1 and 2.
BENCHMARK_RE = re.compile(r"BENCHMARK: WINDOWS: (.*) - AGG_FNS: (.*)")


def parse_log_file(log_file):
    """Extract the sustainable throughput of every run recorded in a log file.

    The file is scanned line by line for three kinds of lines: a benchmark
    header (``BENCHMARK_RE``), a run header (``RUN_RE``) and a throughput
    result (``THROUGHPUT_RE``). Each result is attributed to the most recent
    run header under the most recent benchmark header.

    Args:
        log_file: Path of the log file to read.

    Returns:
        A dict mapping ``(windows, agg_fn)`` to a dict that maps
        ``(num_children, num_streams)`` to the throughput (int, events/s).
        A benchmark header that appears again resets that benchmark's
        entry, exactly as before.

    Inconsistencies are reported with ``print`` and skipped rather than
    stored: a run without a result, a result without a pending run, and a
    result that appears before any benchmark header.
    """
    sustainable_throughputs = {}

    current_bm = None
    current_run = None
    # Most recent run that already received its result; only used in messages.
    last_run = None
    with open(log_file) as f:
        for line in f:
            benchmark_match = BENCHMARK_RE.match(line)
            if benchmark_match is not None:
                curr_windows, curr_agg_fn = benchmark_match.group(1), benchmark_match.group(2)
                if "-" in curr_agg_fn:
                    # Drop everything from the first hyphen on, plus the
                    # whitespace in front of it. Slicing one character before
                    # the hyphen would eat a real character when no space
                    # precedes it.
                    curr_agg_fn = curr_agg_fn.partition("-")[0].rstrip()
                current_bm = (curr_windows, curr_agg_fn)
                sustainable_throughputs[current_bm] = {}

            run_match = RUN_RE.match(line)
            if run_match is not None:
                if current_run is not None:
                    print(f"Did not find candidate line for {current_run}")
                current_run = (int(run_match.group(3)), int(run_match.group(4)))

            throughput_match = THROUGHPUT_RE.match(line)
            if throughput_match is not None:
                if current_bm is None:
                    # Previously a KeyError on sustainable_throughputs[None].
                    print(f"Found candidate line before any benchmark line in {log_file}")
                elif current_run is None:
                    # Previously stored under the key None, which breaks
                    # consumers that sort or unpack the run keys.
                    print(f"Did not find run line after {last_run}")
                else:
                    sustainable_throughputs[current_bm][current_run] = int(throughput_match.group(1))
                if current_run is not None:
                    last_run = current_run
                current_run = None

    if current_run is not None:
        print(f"Did not find candidate line for {current_run}")

    return sustainable_throughputs
                
def get_all_throughputs(log_path):
    """Parse every ``*.log`` file in a directory and merge the results.

    Files are handled in sorted name order. For each benchmark, the run
    entries of a later file overwrite entries with the same run key from an
    earlier file.

    Args:
        log_path: Directory whose direct entries are inspected.

    Returns:
        A ``defaultdict(dict)`` mapping each benchmark key to its merged
        ``{run: throughput}`` dict.
    """
    combined = defaultdict(dict)
    log_names = [name for name in sorted(os.listdir(log_path)) if name.endswith(".log")]
    for name in log_names:
        print(f"Parsing {name}")
        per_file = parse_log_file(os.path.join(log_path, name))
        for benchmark in per_file:
            combined[benchmark].update(per_file[benchmark])
    return combined

In [None]:
# Directory that contains one sub-directory of log files per benchmark run.
# The default is the path this notebook was written with; set the
# BENCHMARK_LOG_DIR environment variable to run it on another machine.
BASE_LOG_DIR = os.environ.get("BENCHMARK_LOG_DIR", "/Users/law/repos/ma/benchmark-runs")


def merge_paths(paths, base_dir=None):
    """Merge the parsed throughputs of several log directories.

    Directories are processed in the given order. For each benchmark, run
    entries from a later directory overwrite entries with the same run key
    from an earlier one.

    Args:
        paths: Iterable of directory names relative to ``base_dir``.
        base_dir: Directory the names are resolved against. Defaults to
            ``BASE_LOG_DIR`` when omitted or ``None``.

    Returns:
        A ``defaultdict(dict)`` mapping each benchmark key to its merged
        ``{run: throughput}`` dict. The merged result is also pretty-printed.
    """
    if base_dir is None:
        base_dir = BASE_LOG_DIR

    merged_tp = defaultdict(dict)
    for path in paths:
        abs_path = os.path.join(base_dir, path)
        for bm, tps in get_all_throughputs(abs_path).items():
            merged_tp[bm].update(tps)
    pprint.pprint(merged_tp)
    return merged_tp

# Directory names (resolved by merge_paths) that make up each experiment.
CONCURRENT_PATHS = [
    "concurrent_partial",
    "concurrent_rest",
    "concurrent_tumbling",
    "concurrent_tumbling_high_bad",
]
DIST_MATRIX_PATHS = ["matrix_dist_all"]
SINGLE_MATRIX_PATHS = ["matrix_single_all"]

# Load each experiment; the heading is printed before merge_paths emits its
# own per-file progress lines and the merged result.
print("CONCURRENT")
CONCURRENT_TP = merge_paths(CONCURRENT_PATHS)

print("DISTRIBUTED MATRIX")
DIST_MATRIX_TP = merge_paths(DIST_MATRIX_PATHS)

print("SINGLE NODE MATRIX")
SINGLE_MATRIX_TP = merge_paths(SINGLE_MATRIX_PATHS)

In [None]:
def print_throughputs(all_throughputs):
    """Print a per-child throughput report for every benchmark.

    Benchmarks and, within each benchmark, runs are listed in sorted key
    order. A blank line follows every benchmark.

    Args:
        all_throughputs: Mapping of benchmark key to a mapping of
            ``(num_children, num_streams)`` to an integer throughput.
    """
    for benchmark in sorted(all_throughputs):
        print(f"Benchmark {benchmark}")
        run_throughputs = all_throughputs[benchmark]
        for run_key in sorted(run_throughputs):
            num_children, num_streams = run_key
            streams_per_child = num_streams // num_children
            # Multiply first, then floor-divide, to keep the original rounding.
            per_child_rate = run_throughputs[run_key] * num_streams // num_children
            print(
                f"Total sustainable throughput for {num_children} child(ren) with "
                f"{streams_per_child} stream(s) each "
                f"is {per_child_rate: >7d} events/s per child."
            )
        print()
    

# Report every loaded experiment, in the order they were loaded.
for experiment_tp in (CONCURRENT_TP, DIST_MATRIX_TP, SINGLE_MATRIX_TP):
    print_throughputs(experiment_tp)

# Plots

In [None]:
from matplotlib import rcParams
import matplotlib.pyplot as plt
# Global matplotlib settings for every figure drawn below.
rcParams["figure.autolayout"] = True
rcParams["pgf.rcfonts"] = False

# Format strings passed to plt.plot, one per plotted series.
FORMATS = ["b-o", "g-x", "r-^", "c-<", "m-+", "k-*"]

### Plot Sustainable Throughput

In [None]:
def plot_throughput_group(child_streams, throughputs, title):
    """Draw one line per aggregation function and show the figure.

    Args:
        child_streams: Sequence of node configurations; ``str()`` of each
            entry labels one x tick, in order.
        throughputs: Mapping of aggregation-function name to the sequence of
            y values for that line. Presumably one value per entry of
            ``child_streams`` - verify against the caller.
        title: Figure title.

    Lines are drawn in sorted key order. Formats are taken from ``FORMATS``
    and reused from the start when there are more lines than formats, rather
    than failing with ``StopIteration``.
    """
    for series_idx, (agg_fn, tp) in enumerate(sorted(throughputs.items())):
        # Wrap around so any number of series can be drawn.
        line_format = FORMATS[series_idx % len(FORMATS)]
        plt.plot(tp, line_format, label=agg_fn)

    plt.legend()
    str_child_streams = [str(cs) for cs in child_streams]
    plt.xticks(range(len(child_streams)), str_child_streams)
    plt.ylabel("events/s in mio.")
    plt.xlabel("#children, # streams")
    plt.title(title)
    plt.ylim(ymin=0)
    plt.show()

def group_throughput_mode(bms, num_child_streams):
    """Group scaled throughputs by the first element of the benchmark key.

    Args:
        bms: Iterable of ``(benchmark, run_throughputs)`` pairs, where
            ``benchmark`` is a ``(group, agg_fn)`` pair and
            ``run_throughputs`` maps a node configuration to a throughput.
        num_child_streams: Sequence of node configurations to look up. The
            second element of each is the stream count used for scaling.

    Returns:
        A ``defaultdict(dict)`` of ``group -> agg_fn -> list``, holding
        ``stream_count * throughput / 1_000_000`` for each requested
        configuration, in the order given.

    Raises:
        KeyError: If a benchmark has no entry for a requested configuration.
    """
    grouped = defaultdict(dict)
    for benchmark, run_throughputs in bms:
        window_group, agg_fn = benchmark
        print(f"Adding benchmark {benchmark} to group {window_group}")
        grouped[window_group][agg_fn] = [
            (config[1] * run_throughputs[config]) / 1_000_000
            for config in num_child_streams
        ]
    return grouped
    
def group_throughput_benchmarks(dist_bms, single_bms, num_child_streams):
    """Group the distributed and single-node benchmarks for plotting.

    Args:
        dist_bms: Mapping of benchmark key to run throughputs, stored under
            the ``"distributed"`` key of the result.
        single_bms: Same shape, stored under the ``"centralized"`` key.
        num_child_streams: Node configurations forwarded to
            ``group_throughput_mode``.

    Returns:
        A dict with the keys ``"distributed"`` and ``"centralized"``, each
        holding the grouping built from the benchmarks in sorted key order.
        The configuration and the result are also printed.
    """
    groups = {
        "distributed": group_throughput_mode(sorted(dist_bms.items()), num_child_streams),
        "centralized": group_throughput_mode(sorted(single_bms.items()), num_child_streams),
    }

    print("\n\n")
    print(f"NODE_CONFIG: {num_child_streams}")
    print("\nGROUPS:")
    pprint.pprint(groups)
    return groups

In [None]:
def plot_throuput_benchmarks(dist_bms, single_bms, num_child_streams):
    """Plot one figure per window group, for both execution modes.

    Args:
        dist_bms: Benchmarks passed as the first argument of
            ``group_throughput_benchmarks``.
        single_bms: Benchmarks passed as its second argument.
        num_child_streams: Node configurations to plot along the x axis.
    """
    grouped = group_throughput_benchmarks(dist_bms, single_bms, num_child_streams)
    # The mode name is not part of the plot, so only the per-mode groups are used.
    for sub_groups in grouped.values():
        for group in sub_groups:
            plot_throughput_group(num_child_streams, sub_groups[group], group)
        
def plot_multi_child_throughputs(dist_bms, single_bms):
    """Plot the configurations (1, 1), (2, 2), (4, 4) and (8, 8)."""
    plot_throuput_benchmarks(dist_bms, single_bms, [(n, n) for n in (1, 2, 4, 8)])
    
def plot_single_child_throughputs(dist_bms, single_bms):
    """Plot the configurations (1, 1), (1, 2), (1, 4) and (1, 8)."""
    plot_throuput_benchmarks(dist_bms, single_bms, [(1, n) for n in (1, 2, 4, 8)])

           
# Draw the single-child figures first, then the multi-child ones.
for plot_matrix in (plot_single_child_throughputs, plot_multi_child_throughputs):
    plot_matrix(DIST_MATRIX_TP, SINGLE_MATRIX_TP)

In [None]:
def plot_concurrent_throughput_group(num_windows, throughputs, title):
    """Plot throughput against the number of concurrent windows and show it.

    Args:
        num_windows: X values, drawn on a logarithmic axis.
        throughputs: Y values; the y axis starts at zero.
        title: Figure title.
    """
    plt.plot(num_windows, throughputs)

    # Titles and axis labels.
    plt.title(title)
    plt.xlabel("#concurrent windows")
    plt.ylabel("events/s in mio.")

    # Axis scaling.
    plt.xscale("log")
    plt.ylim(ymin=0)

    plt.show()
    
# Only this many points, those with the smallest window counts, are plotted
# per aggregation function.
# NOTE(review): 7 is carried over from the original code; confirm the cut-off.
MAX_PLOTTED_POINTS = 7


def _window_count(benchmark):
    """Return the integer that follows the first comma of the benchmark's window spec."""
    return int(benchmark[0].split(",")[1])


bm_throughputs = defaultdict(list)
# X values kept parallel to bm_throughputs, so every aggregation function is
# plotted against the window counts it actually has data for. Taking the x
# values from the global set below misaligns the axes, or makes their lengths
# differ, as soon as one function lacks a window count or a benchmark has
# more than one run.
bm_windows = defaultdict(list)
bm_num_windows = set()
for benchmark, run_throughputs in sorted(CONCURRENT_TP.items(), key=lambda item: _window_count(item[0])):
    print(f"Benchmark {benchmark}")
    num_windows = _window_count(benchmark)
    bm_num_windows.add(num_windows)
    agg_fn = benchmark[1]
    for throughput in run_throughputs.values():
        bm_windows[agg_fn].append(num_windows)
        bm_throughputs[agg_fn].append(throughput / 1_000_000)

print(bm_throughputs)

# Benchmarks were visited in ascending window-count order, so both lists are
# already sorted by x.
for agg_fn, tps in sorted(bm_throughputs.items()):
    plot_concurrent_throughput_group(
        bm_windows[agg_fn][:MAX_PLOTTED_POINTS],
        tps[:MAX_PLOTTED_POINTS],
        agg_fn.replace('M_', ''),
    )