In [None]:
import datetime
import gzip
import json
import time
import os

import dask
from dask.distributed import Client
import numpy as np
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import tqdm.notebook

# Connect to the Dask scheduler at the hard-coded TLS address below.
# NOTE(review): the address is environment-specific — adjust it if the scheduler runs elsewhere.
client = Client("tls://localhost:8786")

In [None]:
def get_input(max_size_GB=None, metadata_path="file_metadata_v2.json.gz"):
    """Collect output file names and their sizes from a gzipped JSON metadata file.

    The metadata is read as a two-level mapping (category -> container -> metadata
    dict). Each metadata dict must provide ``files_output`` and ``sizes_output_GB``;
    containers whose ``files_output`` is ``None`` are skipped.

    Args:
        max_size_GB: Optional cap on the accumulated size. Collection stops right
            after the file that pushes the running total above this value, so the
            returned total can exceed the cap by up to one file. ``None`` (or any
            other falsy value such as ``0``) disables the limit.
        metadata_path: Path of the gzipped JSON metadata file to read.

    Returns:
        A tuple ``(all_files, all_sizes_GB)`` of two equally long lists holding the
        file names and the matching sizes, in the order they appear in the metadata.
    """
    with gzip.open(metadata_path, "rt", encoding="utf-8") as f:
        dataset_info = json.load(f)

    all_files = []
    all_sizes_GB = []
    # Running total avoids re-summing the whole list for every appended file.
    total_size_GB = 0
    for containers_for_category in dataset_info.values():
        for metadata in containers_for_category.values():
            if metadata["files_output"] is None:
                continue
            for fname, size in zip(metadata["files_output"], metadata["sizes_output_GB"]):
                all_files.append(fname)
                all_sizes_GB.append(size)
                total_size_GB += size
                if max_size_GB and total_size_GB > max_size_GB:
                    return all_files, all_sizes_GB
    return all_files, all_sizes_GB

# Build the list of input files and their sizes. Pass a number as max_size_GB to
# stop collecting once the accumulated size exceeds that many GB (None = no limit).
all_files, all_sizes_GB = get_input(max_size_GB = None)  # limit list to specific total size
print(f"list of {len(all_files)} files with total size {sum(all_sizes_GB):.2f} GB")

In [None]:
# if no filesize metadata exists, parse it from the xrdcp output
# ##################################################

# import pexpect

# def run_xrdcp(fname):
#     t0 = time.time()
#     child = pexpect.spawn(f"xrdcp {fname} /dev/null -f")
#     child.expect(pexpect.EOF, timeout=600)
#     t1 = time.time()
#     res = child.before.decode()
#     size = res.split("\r")[-2].split("/")[0][1:]
#     if "MB" in size:
#         size_in_GB = float(size[:-2]) * 1024**2 / 1000**3
#     elif "GB" in size:
#         size_in_GB = float(size[:-2]) * (1024/1000)**3
#     elif "kB" in size:
#         size_in_GB = float(size[:-2]) * 1024 / 1000**3
#     else:
#         raise ValueError(f"cannot handle size: {size}")
#     return {"t0": t0, "t1": t1, "GBread": size_in_GB}

# t0 = time.time()
# tasks = [dask.delayed(run_xrdcp)(fname) for fname in all_files]
# futures = client.compute(tasks)

# with tqdm.notebook.tqdm(total=len(futures)) as pbar:
#   for future in dask.distributed.as_completed(futures):
#     pbar.update(1)

# res = [f.result() for f in futures]
# t1 = time.time()
# all_sizes_GB_from_xrdcp = [r["GBread"] for r in res]



# if filesize metadata is available, this is simpler
# ##################################################

def run_xrdcp(fname, size):
    """Copy one file to /dev/null with ``xrdcp`` and time the transfer.

    The command is started from an argument list (no shell), so file names or URLs
    containing characters such as ``;``, ``&``, ``?`` or spaces reach ``xrdcp``
    verbatim instead of being interpreted by a shell.

    Args:
        fname: Source file name or URL handed to ``xrdcp``.
        size: Size of the file; passed through unchanged as ``"GBread"``.

    Returns:
        A dict with the keys ``"t0"`` and ``"t1"`` (``time.time()`` right before and
        right after the copy), ``"GBread"`` (the given ``size``, reported whether or
        not the copy succeeded) and ``"returncode"`` (exit status of ``xrdcp``, or
        ``None`` if the command could not be started at all).
    """
    # Local import keeps the function self-contained, independent of the notebook's import cell.
    import subprocess

    t0 = time.time()
    try:
        returncode = subprocess.run(["xrdcp", fname, "/dev/null", "-f"], check=False).returncode
    except OSError:
        # e.g. xrdcp is not installed; keep the best-effort behaviour and do not raise.
        returncode = None
    t1 = time.time()
    return {"t0": t0, "t1": t1, "GBread": size, "returncode": returncode}

# Wall-clock start of the whole run, taken in the notebook before anything is submitted.
t0 = time.time()
# One delayed run_xrdcp task per (file name, size) pair.
tasks = [dask.delayed(run_xrdcp)(fname, size) for fname, size in zip(all_files, all_sizes_GB)]
futures = client.compute(tasks)

# Advance the progress bar once per finished future; the loop only counts
# completions and does not inspect the results.
with tqdm.notebook.tqdm(total=len(futures)) as pbar:
  for future in dask.distributed.as_completed(futures):
    pbar.update(1)

# Gather the per-task result dicts in the same order as `futures`, then take the
# wall-clock end of the whole run.
res = [f.result() for f in futures]
t1 = time.time()

Track egress on the Grafana dashboard: [AF network 200 Gbps challenge](https://grafana.mwt2.org/d/EKefjM-Sz/af-network-200gbps-challenge?orgId=1&from=now-1h&to=now&viewPanel=panel-205&refresh=5s)

In [None]:
# Sum of the individual task durations (t1 - t0 of every result dict).
total_runtime_sum = sum(r["t1"] - r["t0"] for r in res)

# Total size in GB times 8 gives gigabits; dividing by the summed task time
# yields the average rate of a single transfer stream.
print(f"processtime: {total_runtime_sum:.2f} s")
print(f" -> data rate per worker: {sum(all_sizes_GB) * 8 / total_runtime_sum:.2f} Gbps")

# Same volume divided by the wall-clock time measured in the notebook gives the aggregate rate.
print(f"walltime: {t1-t0:.2f} s")
print(f" -> total data rate: {sum(all_sizes_GB) * 8 / (t1-t0):.2f} Gbps")

# Per-task start/end timestamps and sizes as arrays for the rate-vs-time estimate.
# NOTE(review): these timestamps come from time.time() inside run_xrdcp, while t0/t1
# are taken in the notebook; if tasks run on other machines the clocks must agree — confirm.
starts = np.asarray([r["t0"] for r in res])
ends = np.asarray([r["t1"] for r in res])
GBread = [r["GBread"] for r in res]
# Average rate of each task in GB/s (size divided by its duration).
rates_per_chunk = GBread / (ends - starts)

# Sample the aggregate rate at 100 evenly spaced points between t0 and t1: at each
# sample, add up the average rates of all tasks with start <= t < end and convert to Gbps.
t_samples = np.linspace(t0, t1, 100)
rate_samples = []
for t in t_samples:
    mask = np.logical_and((starts <= t), (t < ends))
    rate_samples.append(float(sum(rates_per_chunk[mask]) * 8))

# Cross-check: rectangle-rule integral of the sampled rate (sample spacing times rate),
# divided by 8 to convert gigabits back to GB.
print(f"total data read from data rate integral: {sum((t_samples[1] - t_samples[0]) * np.asarray(rate_samples)) / 8:.2f} GB")
# From here on t_samples holds datetime objects (not floats) so the x axis shows clock time.
t_samples = [datetime.datetime.fromtimestamp(t) for t in t_samples.tolist()]

# Plot the sampled aggregate rate as markers only (no connecting line) and save it as a PNG.
fig, ax = plt.subplots(constrained_layout=True)
ax.plot(t_samples, rate_samples, marker="v", linewidth=0)
ax.set_xlabel("time")
ax.tick_params(axis="x", labelrotation=45)
ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S'))
ax.set_ylabel("data rate [Gbps]")
# Start the y axis at zero and add 10% headroom above the automatically chosen upper limit.
ax.set_ylim([0, ax.get_ylim()[1] * 1.1])
fig.savefig("xrdcp_rate.png")