In [None]:
import re
import tempfile
import os
import zipfile
import shutil
import subprocess
from collections import defaultdict
from pprint import pprint

# Benchmark header line. Groups: (1) the window specification, (2) the
# aggregate functions, (3) the execution mode, DISTRIBUTED or SINGLE_NODE.
BENCHMARK_RE = re.compile(r"BENCHMARK: WINDOWS: (.*) - AGG_FNS: (.*) - (DISTRIBUTED|SINGLE_NODE)")
# Run header line. Group 2 is the optional intermediates spec (None when the
# "... intermediates, " prefix is absent); groups 3 and 4 are the child and
# stream counts as digit strings.
RUN_RE = re.compile(r"Running ((.*) intermediates, )?(\d+) child.*, (\d+) stream.*")
# Line naming where a run's logs were written; group 1 is that location.
LOGS_RE = re.compile(r"Writing logs to (.*)")
# "Data size: <n> bytes" line; group 1 is the byte count. Presumably this is a
# line of the capinfos report that analyze_capture_files writes into the log.
DATA_SIZE_RE = re.compile(r"Data size:\s+(\d+) bytes")

In [None]:
def analyze_file(capture_zip):
    """Extract a zipped network capture and return the ``capinfos`` report for it.

    The archive is unpacked into a private temporary directory that is removed
    when the function returns or raises, so neither the capture nor any other
    archive member is left behind (also not when ``capinfos`` times out).

    Args:
        capture_zip: Path to a zip archive that contains ``network_capture.pcap``.

    Returns:
        The text ``capinfos`` wrote to stdout. This may be empty when
        ``capinfos`` failed; a warning with its stderr is printed in that case.

    Raises:
        FileNotFoundError: If the archive has no ``network_capture.pcap`` member
            (or if the ``capinfos`` executable cannot be found).
        subprocess.TimeoutExpired: If ``capinfos`` runs longer than 120 seconds.
    """
    # A private directory avoids clashing with unrelated files (or a concurrent
    # run) in the shared system temp directory and makes cleanup unconditional.
    with tempfile.TemporaryDirectory() as temp_dir:
        print(f"Extracting to {temp_dir}")
        with zipfile.ZipFile(capture_zip) as zf:
            zf.extractall(temp_dir)

        capture_file = os.path.join(temp_dir, "network_capture.pcap")
        # Fail early with a clear message instead of after running capinfos.
        if not os.path.isfile(capture_file):
            raise FileNotFoundError(f"{capture_zip} does not contain network_capture.pcap")

        print(f"Analyzing {capture_file}")
        analyzer = subprocess.run(("capinfos", "-csdizyxuM", capture_file), timeout=120, capture_output=True, text=True)
        if analyzer.returncode != 0:
            # Keep the best-effort behaviour (return whatever was printed) but
            # make the failure visible.
            print(f"capinfos exited with status {analyzer.returncode}: {analyzer.stderr.strip()}")

        print(f"Deleting {capture_file}")
    return analyzer.stdout

def analyze_capture_files(log_file):
    """Rewrite a benchmark log so it holds only headers and capture reports.

    Benchmark and run header lines are copied verbatim. Every "Writing logs to"
    line is replaced by the analysis of the ``network_capture.zip`` found in
    the named location. The result is first written to ``<log_file>.temp``;
    afterwards the original is moved to ``<log_file>.backup`` and the condensed
    file takes its place.

    Args:
        log_file: Path of the benchmark log to condense in place.
    """
    condensed_path = f"{log_file}.temp"
    with open(condensed_path, "w") as condensed, open(log_file) as original:
        for line in original:
            # Header lines pass through unchanged.
            for header_re in (BENCHMARK_RE, RUN_RE):
                if header_re.match(line):
                    condensed.write(line)

            logs_location = LOGS_RE.match(line)
            if logs_location:
                capture_zip = os.path.join(logs_location.group(1), "network_capture.zip")
                condensed.write(analyze_file(capture_zip))

    backup_path = f"{log_file}.backup"
    print(f"Moving {log_file} to {backup_path}")
    shutil.move(log_file, backup_path)
    print(f"Moving {condensed_path} to {log_file}")
    shutil.move(condensed_path, log_file)

In [None]:
def get_network_sizes(log_file):
    """Collect the captured data size of every run listed in a condensed log.

    The log is read line by line. A benchmark header sets the current
    benchmark, a run header sets the current run, and each "Data size" line is
    stored under the benchmark and run that precede it.

    Args:
        log_file: Path of a log containing benchmark headers, run headers and
            "Data size: <n> bytes" lines.

    Returns:
        A ``defaultdict(dict)`` mapping ``(windows, agg_fns, mode)`` to a dict
        that maps ``(intermediates, children, streams)`` to the size in bytes.
        ``intermediates`` is None when the run header does not mention them.
        Sizes seen before any benchmark header are stored under
        ``(None, None, None)``.
    """
    current_bm = (None, None, None)
    # No run header seen yet. Without this initial value a "Data size" line that
    # precedes the first "Running ..." line raised UnboundLocalError.
    current_run = None
    data_sizes = defaultdict(dict)
    with open(log_file) as f:
        for line in f:
            bm_match = BENCHMARK_RE.match(line)
            if bm_match is not None:
                current_bm = (bm_match.group(1), bm_match.group(2), bm_match.group(3))
                continue

            run_match = RUN_RE.match(line)
            if run_match is not None:
                current_run = (run_match.group(2), int(run_match.group(3)), int(run_match.group(4)))
                continue

            data_match = DATA_SIZE_RE.match(line)
            if data_match is not None:
                if current_run is None:
                    # The size cannot be attributed to any run; skip it visibly
                    # rather than crash or file it under a made-up key.
                    print(f"Ignoring data size without a preceding run line in {log_file}: {line.strip()}")
                    continue
                data_sizes[current_bm][current_run] = int(data_match.group(1))

    return data_sizes

In [None]:
def get_all_network_sizes(log_files):
    """Merge the per-run data sizes of several log files into one mapping.

    Args:
        log_files: Iterable of log file paths, processed in order. When two
            files contain the same benchmark and run, the later file wins.

    Returns:
        A ``defaultdict(dict)`` mapping each benchmark key to a dict of
        run key -> data size, as produced by ``get_network_sizes``.
    """
    merged = defaultdict(dict)
    for path in log_files:
        print(f"Analyzing {path}")
        for benchmark, run_sizes in get_network_sizes(path).items():
            merged[benchmark].update(run_sizes)

    return merged

# Directory holding the benchmark logs.
# NOTE(review): this is an absolute, user-specific path; the cell only runs on
# a machine that has these files at this location.
LOG_DIR = "/Users/law/repos/ma/local_bm_runs"
# The three log files whose results are merged below.
LOG_FILE_1 = f"{LOG_DIR}/network_bm_2019_10_01_1011.log"
LOG_FILE_2 = f"{LOG_DIR}/network_bm_2019_10_01_1255.log"
LOG_FILE_3 = f"{LOG_DIR}/network_bm_2019_10_03_1355.log"
# Benchmark key -> {run key -> data size in bytes}, merged in the order given
# (a later file overrides an earlier one for the same benchmark and run).
NETWORK_SIZES = get_all_network_sizes([LOG_FILE_1, LOG_FILE_2, LOG_FILE_3])
pprint(NETWORK_SIZES)

In [None]:
# Flat list of (size_in_mb, setup, node_config) tuples, one per measured run.
ALL_SIZES = []

for setup, runs in NETWORK_SIZES.items():
    for node_config, network_size in runs.items():
        print(f"{setup}, {node_config}")
        # Whole mebibytes, truncated; integer division avoids the float detour.
        size_in_mb = network_size // (1024 * 1024)
        print(f"Total bytes sent through network: {size_in_mb} MB.")
        ALL_SIZES.append((size_in_mb, setup, node_config))

# Sort on the size only. Comparing whole tuples falls through to setup and
# node_config when two sizes tie, and those can contain None (e.g. a run line
# without intermediates), which raises TypeError when compared with a string.
# The sort is stable, so tied entries keep their insertion order.
ALL_SIZES.sort(key=lambda entry: entry[0], reverse=True)
print(ALL_SIZES[0:10])

In [None]:
from matplotlib import rcParams
import matplotlib.pyplot as plt
import numpy as np
# Notebook-wide plot defaults.
rcParams.update({'figure.autolayout': True, 'pgf.rcfonts' : False, 'font.size': 14})
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old aliases. Use whichever name this installation
# offers; fall back to the old name so a missing style still raises the
# original error.
plt.style.use(next(
    (name for name in ("seaborn-deep", "seaborn-v0_8-deep") if name in plt.style.available),
    "seaborn-deep",
))

In [None]:
def plot_network(sizes, bm):
    """Draw a grouped bar chart of distributed vs. single-node network volume.

    The chart is saved to ``/tmp/paper_plots/network_<bm>.pdf`` (the directory
    is created if needed) and then shown.

    Args:
        sizes: List of ``(size, setup, node_config)`` tuples as built in
            ``ALL_SIZES``. ``setup[2]`` must be "SINGLE_NODE" or "DISTRIBUTED",
            and both modes must provide the same number of entries.
        bm: Label used in the output file name.

    Raises:
        AssertionError: If the number of entries is odd or the two modes do not
            have the same number of entries.
    """
    fig, ax = plt.subplots()
    fig.set_tight_layout(False)

    # NOTE(review): these rc settings are applied after the figure was created,
    # so figure-level ones such as figsize only affect figures created later.
    font_size = 16
    plt.rc('font', family='serif', serif='Times')
    plt.rc('xtick', labelsize=font_size)
    plt.rc('ytick', labelsize=font_size)
    plt.rc('axes', labelsize=font_size)
    plt.rc('figure', autolayout=False)
    plt.rc('font', size=font_size)
    plt.rc('lines', linewidth=4)
    plt.rc('lines', markersize=8)
    plt.rc('lines', markeredgewidth=2)
    # Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
    # later releases dropped the old aliases; pick the name that is available.
    style_name = next(
        (name for name in ("seaborn-deep", "seaborn-v0_8-deep") if name in plt.style.available),
        "seaborn-deep",
    )
    plt.style.use(style_name)
    plt.rc('figure', figsize=(4.5, 3))

    bar_width = 0.40
    assert len(sizes) % 2 == 0
    x_locations = list(range(len(sizes) // 2))

    # Reduce each entry to (size, mode, node_config) and split by mode; both
    # lists are ordered by node_config so that bars at the same x belong together.
    clean_sizes = [(s[0], s[1][2], s[2]) for s in sizes]
    single_sizes = sorted([s for s in clean_sizes if s[1] == "SINGLE_NODE"], key=lambda s: s[2])
    dist_sizes = sorted([s for s in clean_sizes if s[1] == "DISTRIBUTED"], key=lambda s: s[2])
    print(single_sizes, dist_sizes)
    assert len(single_sizes) == len(dist_sizes)
    node_configs = [s[2] for s in single_sizes]

    # NOTE(review): the axis is labelled GB, so s[0] is presumably in MB; the
    # extra factor 10 is not explained in this file - confirm what it scales.
    single_sizes_only = [s[0] / 1024 * 10 for s in single_sizes]
    dist_sizes_only = [s[0] / 1024 * 10 for s in dist_sizes]
    print(single_sizes_only, dist_sizes_only)

    ax.bar(x_locations, dist_sizes_only, bar_width, bottom=0, label="distributed", hatch="//")

    single_x_locations = [x + (bar_width) for x in x_locations]
    ax.bar(single_x_locations, single_sizes_only, bar_width, bottom=0, label="centralized", hatch="\\\\")

#     ax.set_title(bm)
    ax.set_ylabel("# bytes sent in GB")
    ax.set_xlabel("height of network tree")
    ax.set_xticks([x + (bar_width / 2) for x in x_locations])
    # Counts the character "1" in the tuple's text, e.g. ('1-1', 1, 1) -> 4.
    # This only yields a tree height for configs whose child and stream counts
    # are both 1 (the chain configurations).
    ax.set_xticklabels([str(nc).count("1") for nc in node_configs])
    # One label per tick: the original passed 13 labels for 14 ticks, which
    # current Matplotlib rejects with a ValueError. Odd ticks (and 13) stay blank.
    y_ticks = list(range(0, 14))
    ax.set_yticks(y_ticks)
    ax.set_yticklabels([str(tick) if tick % 2 == 0 else "" for tick in y_ticks])
    ax.set_ylim(ymin=0, ymax=13)
#     ax.set_xticklabels(node_configs)
#     plt.legend()
    out_path = f"/tmp/paper_plots/network_{bm}.pdf"
    # savefig does not create missing directories.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    plt.savefig(out_path, bbox_inches="tight", pad_inches=0)
    plt.show()

# Groups of run keys, each a tuple like the ones used as node_config in ALL_SIZES.
chain_config = [('0', 1, 1), ('1', 1, 1), ('1-1', 1, 1), ('1-1-1', 1, 1)]
num_child_config = [('0', 1, 1), ('0', 2, 2), ('0', 4, 4), ('0', 8, 8)]
multi_level_config = [('2', 4, 4), ('3', 6, 6), ('4', 8, 8), ('2-4', 8, 8), ('4-8', 16, 16)]

# Only the chain group is plotted; the other groups are currently switched off.
node_configs = [
    chain_config,
    # num_child_config,
    # multi_level_config,
]

# Distinct window specifications present in the data (not used by the loop below).
bms = {entry[1][0] for entry in ALL_SIZES}

for agg_fn in ("MAX", "M_AVG", "M_MEDIAN"):
    for node_config in node_configs:
        # Runs of the "TUMBLING,1000" benchmark for this aggregate function
        # that belong to the current configuration group.
        bm_sizes = [
            entry for entry in ALL_SIZES
            if entry[1][0] == "TUMBLING,1000"
            and entry[1][1] == agg_fn
            and entry[2] in node_config
        ]
        # Only the M_MEDIAN chart is produced.
        if agg_fn == "M_MEDIAN":
            plot_network(bm_sizes, f"{agg_fn}")