In [None]:
# Where the benchmark run logs live and which log this notebook analyses.
# NOTE(review): this is a machine-specific absolute path; adjust it before
# running the notebook on another machine.
LOG_DIR = "/Users/law/repos/ma/local_bm_runs"
LOG_FILE = "/".join((LOG_DIR, "network_bm_2019_10_01_1011.log"))

In [None]:
import re
import tempfile
import os
import zipfile
import shutil
import subprocess
from collections import defaultdict
from pprint import pprint

# Benchmark header line. Groups: (1) text after "WINDOWS:", (2) text after
# "AGG_FNS:", (3) execution mode, either DISTRIBUTED or SINGLE_NODE.
BENCHMARK_RE = re.compile(r"BENCHMARK: WINDOWS: (.*) - AGG_FNS: (.*) - (DISTRIBUTED|SINGLE_NODE)")
# Run header line. Groups: (2) text before " intermediates, " (None when that
# optional part is absent), (3) child count digits, (4) stream count digits.
RUN_RE = re.compile(r"Running ((.*) intermediates, )?(\d+) child.*, (\d+) stream.*")
# Line announcing where a run wrote its logs. Group (1) is the rest of the
# line, which analyze_capture_files uses as a directory path.
LOGS_RE = re.compile(r"Writing logs to (.*)")
# "Data size" line with a byte count in group (1); presumably emitted by
# capinfos, whose output analyze_capture_files copies into the log.
DATA_SIZE_RE = re.compile(r"Data size:\s+(\d+) bytes")

In [None]:
def analyze_file(capture_zip):
    """Unpack a zipped network capture and summarise it with ``capinfos``.

    The archive is extracted into a private temporary directory that is
    removed when the function returns *or* raises. Using a fresh directory
    per call (instead of the shared system temp directory) guarantees that a
    stale ``network_capture.pcap`` left over from an earlier run can never be
    analysed by mistake, and that nothing is leaked when ``capinfos`` times
    out.

    Args:
        capture_zip: Path to a zip archive expected to contain
            ``network_capture.pcap`` at its top level.

    Returns:
        The text ``capinfos`` wrote to standard output.

    Raises:
        FileNotFoundError: If the archive does not exist or does not contain
            ``network_capture.pcap``.
        zipfile.BadZipFile: If ``capture_zip`` is not a valid zip archive.
        subprocess.TimeoutExpired: If ``capinfos`` runs longer than 120 seconds.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        print(f"Extracting to {temp_dir}")
        with zipfile.ZipFile(capture_zip) as zf:
            zf.extractall(temp_dir)

        capture_file = os.path.join(temp_dir, "network_capture.pcap")
        if not os.path.isfile(capture_file):
            # Fail before invoking capinfos on a path that does not exist.
            raise FileNotFoundError(
                f"{capture_zip} does not contain network_capture.pcap"
            )

        print(f"Analyzing {capture_file}")
        analyzer = subprocess.run(
            ("capinfos", "-csdizyxuM", capture_file),
            timeout=120,
            capture_output=True,
            text=True,
        )
        if analyzer.returncode != 0:
            # Keep the best-effort behaviour (still return stdout) but make
            # the failure visible instead of silently yielding no data.
            print(f"capinfos exited with status {analyzer.returncode}: {analyzer.stderr}")

        # The temporary directory (and the capture in it) is removed on exit.
        print(f"Deleting {capture_file}")
    return analyzer.stdout

def analyze_capture_files(log_file):
    """Rewrite a benchmark log so it carries capture summaries.

    A condensed copy of ``log_file`` is written to ``<log_file>.temp``:
    lines matching ``BENCHMARK_RE`` or ``RUN_RE`` are copied verbatim, and
    every line matching ``LOGS_RE`` is replaced by the output of
    ``analyze_file`` for the ``network_capture.zip`` inside the directory
    that line names. All other lines are dropped. Afterwards the original
    log is moved to ``<log_file>.backup`` and the condensed copy takes its
    place.

    Args:
        log_file: Path to the benchmark log to rewrite.
    """
    scratch_path = f"{log_file}.temp"

    with open(scratch_path, "w") as sink, open(log_file) as source:
        for entry in source:
            # Header lines are carried over unchanged.
            for pattern in (BENCHMARK_RE, RUN_RE):
                if pattern.match(entry) is not None:
                    sink.write(entry)

            located = LOGS_RE.match(entry)
            if located is None:
                continue
            capture_zip = os.path.join(located.group(1), "network_capture.zip")
            sink.write(analyze_file(capture_zip))

    print(f"Moving {log_file} to {log_file}.backup")
    shutil.move(log_file, f"{log_file}.backup")
    print(f"Moving {scratch_path} to {log_file}")
    shutil.move(scratch_path, log_file)

In [None]:
def get_network_sizes(log_file):
    """Collect the captured data size for every benchmark run in a log.

    The log is read line by line. A line matching ``BENCHMARK_RE`` sets the
    current benchmark key, a line matching ``RUN_RE`` sets the current run
    key, and a line matching ``DATA_SIZE_RE`` records its byte count under
    the current benchmark and run. A later size for the same benchmark/run
    pair overwrites the earlier one.

    Args:
        log_file: Path to the log to parse.

    Returns:
        A ``defaultdict`` mapping ``(windows, agg_fns, mode)`` to a dict that
        maps ``(intermediates, children, streams)`` to the data size in bytes.
        ``intermediates`` is the raw matched text (or ``None``); ``children``
        and ``streams`` are ints. Sizes seen before any benchmark or run
        header are filed under ``(None, None, None)``.
    """
    current_bm = (None, None, None)
    # Initialise the run key as well: without this a "Data size" line that
    # precedes the first "Running ..." line raised UnboundLocalError.
    current_run = (None, None, None)
    data_sizes = defaultdict(dict)
    with open(log_file) as f:
        for line in f:
            bm_match = BENCHMARK_RE.match(line)
            if bm_match is not None:
                current_bm = bm_match.group(1, 2, 3)
                continue

            run_match = RUN_RE.match(line)
            if run_match is not None:
                current_run = (
                    run_match.group(2),
                    int(run_match.group(3)),
                    int(run_match.group(4)),
                )
                continue

            data_match = DATA_SIZE_RE.match(line)
            if data_match is not None:
                data_sizes[current_bm][current_run] = int(data_match.group(1))

    return data_sizes

In [None]:
# Parse the configured log and pretty-print the nested mapping of
# benchmark key -> run key -> data size in bytes.
NETWORK_SIZES = get_network_sizes(LOG_FILE)
pprint(NETWORK_SIZES)