In [49]:
# Directory holding the per-job log files; the helpers below read and delete
# `job_<id>.err` / `job_<id>.out` here (presumably written by slurm-script.sh — confirm).
LOGS_DIR = "/depot/cms/sonic/dkondra/saturation_logs"
# Directory that receives the throughput files written by save_measurements().
# Exactly one MEAS_DIR line should be active; the alternatives presumably correspond
# to different hardware configurations — confirm before switching.
MEAS_DIR = "/depot/cms/sonic/dkondra/saturation_measurements/1xT4"
# MEAS_DIR = "/depot/cms/sonic/dkondra/saturation_measurements/T4"
# MEAS_DIR = "/depot/cms/sonic/dkondra/saturation_measurements/A100"


In [50]:
def get_throughput(job_id, start=100, end=900):
    """Compute a job's processing rate (records per second) from its ``.err`` log.

    The log ``{LOGS_DIR}/job_{job_id}.err`` is scanned for the lines containing
    ``"Begin processing the {start}th record"`` and
    ``"Begin processing the {end}th record"``. The second-to-last
    whitespace-separated token of each matching line is read as an
    ``HH:MM:SS[.fff]`` wall-clock time. If a marker appears more than once,
    the last occurrence wins.

    Parameters
    ----------
    job_id : int or str
        Identifier used to build the log file name.
    start, end : int
        Record numbers that bound the measured window. The marker text always
        uses the ``th`` suffix, so only record numbers logged with that suffix
        can be matched.

    Returns
    -------
    float
        ``(end - start) / elapsed_seconds``.

    Raises
    ------
    Exception
        If the ``start`` or ``end`` marker is not present in the log
        (typically because the job has not progressed that far yet).
    OSError
        If the log file cannot be opened (e.g. it does not exist yet).
    ZeroDivisionError
        If both markers carry the same timestamp, so no rate can be computed.
    """
    log_file = f"{LOGS_DIR}/job_{job_id}.err"
    start_marker = f"Begin processing the {start}th record"
    end_marker = f"Begin processing the {end}th record"

    def seconds_of_day(clock):
        """Convert an ``HH:MM:SS[.fff]`` string to seconds since midnight."""
        hours, minutes, seconds = (float(part) for part in clock.split(":"))
        return hours * 3600. + minutes * 60. + seconds

    start_time = end_time = None
    # `with` releases the handle on every path, including the "marker missing"
    # raises below, which is the common case while this function is being polled.
    # Undecodable bytes are replaced rather than aborting the scan.
    with open(log_file, 'r', errors="replace") as log:
        for line in log:
            if start_marker in line:
                start_time = line.split()[-2]
            if end_marker in line:
                end_time = line.split()[-2]

    if start_time is None:
        raise Exception(f"No entry for {start}th event. Maybe the job is still running?")
    if end_time is None:
        raise Exception(f"No entry for {end}th event. Maybe the job is still running?")

    # Only the time of day is parsed, so a window that crosses midnight yields a
    # negative raw difference; the modulo folds it back into [0, 24h).
    # This assumes the measured window is shorter than 24 hours.
    elapsed = (seconds_of_day(end_time) - seconds_of_day(start_time)) % 86400.
    if elapsed == 0:
        raise ZeroDivisionError(
            f"Records {start} and {end} carry the same timestamp in {log_file}; "
            "cannot compute a throughput."
        )
    return (end - start) / elapsed

In [42]:
import subprocess
import os

def cancel_job(job_d):
    """Cancel a single Slurm job with ``scancel``.

    Parameters
    ----------
    job_d : int or str
        Id of the job to cancel. The parameter name is kept unchanged so that
        existing callers keep working.

    Raises
    ------
    FileNotFoundError
        If the ``scancel`` executable is not on ``PATH``.

    The exit status of ``scancel`` itself is not checked.
    """
    # Use the argument itself: the body used to read a `job_id` name that is not
    # a parameter, so it only worked when a same-named global happened to exist.
    # An argument list (no shell) keeps the id from being re-parsed by a shell.
    subprocess.run(["scancel", str(job_d)])

def cancel_jobs():
    """Cancel every Slurm job owned by the current user (``scancel -u <user>``).

    The user name is taken from the ``USER`` environment variable; when that is
    unset or empty, ``getpass.getuser()`` is used instead so the command is
    never issued for a literal ``None`` user.

    Raises
    ------
    FileNotFoundError
        If the ``scancel`` executable is not on ``PATH``.

    The exit status of ``scancel`` itself is not checked.
    """
    username = os.environ.get("USER")
    if not username:
        import getpass
        username = getpass.getuser()
    # Argument list instead of a shell string: the user name is passed verbatim.
    subprocess.run(["scancel", "-u", username])

def submit_slurm_job():
    """Submit ``slurm-script.sh`` through ``sbatch`` and return the new job id.

    The id is read from the last space-separated token of what ``sbatch``
    prints on stdout. A non-zero exit status raises
    ``subprocess.CalledProcessError`` (``check=True``).
    """
    completed = subprocess.run(
        "sbatch slurm-script.sh",
        shell=True,
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    reply = completed.stdout.decode("utf-8")
    # int() tolerates the trailing newline left on the final token.
    *_, last_token = reply.split(" ")
    return int(last_token)

def get_n_slurm_jobs(username=None):
    """Return how many jobs ``squeue`` currently lists for a user.

    Parameters
    ----------
    username : str, optional
        User whose jobs are counted. Defaults to the current user (``USER``
        environment variable, falling back to ``getpass.getuser()``), which
        matches how ``cancel_jobs`` picks the user instead of a fixed name.

    Returns
    -------
    int
        Number of non-empty lines printed by ``squeue --noheader -u <user>``.

    Raises
    ------
    subprocess.CalledProcessError
        If ``squeue`` exits with a non-zero status (``check=True``).
    FileNotFoundError
        If the ``squeue`` executable is not on ``PATH``.
    """
    if username is None:
        username = os.environ.get("USER")
        if not username:
            import getpass
            username = getpass.getuser()
    result = subprocess.run(
        ["squeue", "--noheader", "-u", username],
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    listing = result.stdout.decode("utf-8")
    # With the header suppressed, every non-blank line is one listed job.
    return sum(1 for line in listing.splitlines() if line.strip())

def submit_n_jobs(n):
    """Submit ``n`` jobs one after another and return their ids in submission order."""
    return [submit_slurm_job() for _ in range(n)]


def delete_logs(job_id):
    """Remove a job's ``.err`` and ``.out`` log files from ``LOGS_DIR``.

    Deletion is best-effort: a file that is already gone is skipped silently,
    and any other OS-level failure is reported with ``print`` without raising.

    Parameters
    ----------
    job_id : int or str
        Identifier used to build the two log file names.
    """
    for suffix in ("err", "out"):
        log_path = f"{LOGS_DIR}/job_{job_id}.{suffix}"
        # os.remove takes the path verbatim: no shell is started and nothing in
        # the path is re-interpreted (spaces, globs, ...).
        try:
            os.remove(log_path)
        except FileNotFoundError:
            pass
        except OSError as err:
            print(f"Could not delete {log_path}: {err}")

def save_measurements(throughputs):
    """Write the measured throughputs to ``{MEAS_DIR}/{n_jobs}.txt``, one per line.

    Parameters
    ----------
    throughputs : dict
        Mapping whose values are written in iteration order; its length
        (``n_jobs``) names the output file. An existing file of the same name
        is overwritten.
    """
    n_jobs = len(throughputs)
    # Create the target directory on demand so results gathered over a long
    # polling run are not lost to a missing folder.
    os.makedirs(MEAS_DIR, exist_ok=True)
    out_file = f"{MEAS_DIR}/{n_jobs}.txt"
    with open(out_file, 'w') as file:
        file.writelines(f"{value}\n" for value in throughputs.values())

In [47]:
# Optional: uncomment to cancel all of the current user's jobs before submitting new ones.
# cancel_jobs()
# Number of jobs to submit for this measurement run; save_measurements() names
# its output file after the number of measured jobs.
n = 20
# Submit the jobs and keep their ids; the polling cell iterates over `job_ids`.
job_ids = submit_n_jobs(n)

In [48]:
import time

# Poll the job logs once a minute until a throughput has been extracted for every
# submitted job, then save the results and cancel the jobs.
# 0 is the "not measured yet" marker for a job.
throughputs = {job_id: 0 for job_id in job_ids}
# Number of jobs whose throughput has been extracted so far (progress display only).
done = 0
    
while True:
    all_ready = True
    for job_id in job_ids:
        # Skip jobs whose throughput has already been calculated
        if throughputs[job_id] > 0:
            continue

        # Try to extract throughput
        try:
            tp_value = get_throughput(job_id)
            throughputs[job_id] = tp_value
            done += 1
            # delete_logs(job_id)
        except Exception:
            # Any failure is treated as "log not ready yet" and retried on the next pass.
            # NOTE(review): a job that dies before logging its end record never
            # becomes ready, so this loop would then run until interrupted.
            # print(job_id)
            all_ready = False
            continue

    # Exit loop when all throughputs are extracted
    if all_ready:
        print("Done!")
        save_measurements(throughputs)
        # The measurement window is complete, so the jobs are cancelled rather
        # than left to finish (presumably they would keep running — confirm).
        for job_id in job_ids:
            cancel_job(job_id)
        # cancel_jobs()
        # subprocess.run(f"rm {LOGS_DIR}/*", shell=True)
        # print(throughputs)
        break

    # Queue size as reported by squeue; informational only, it does not affect the loop.
    njobs = get_n_slurm_jobs()
    print(f"{njobs} jobs still running. Done: {done}/{len(job_ids)}")
    # If we are still in the loop - wait
    time.sleep(60)


40 jobs still running. Done: 0/20
40 jobs still running. Done: 0/20
40 jobs still running. Done: 0/20
40 jobs still running. Done: 0/20
40 jobs still running. Done: 0/20
40 jobs still running. Done: 0/20
40 jobs still running. Done: 0/20
40 jobs still running. Done: 0/20
41 jobs still running. Done: 0/20
41 jobs still running. Done: 0/20
41 jobs still running. Done: 0/20
41 jobs still running. Done: 0/20
41 jobs still running. Done: 0/20
41 jobs still running. Done: 0/20
41 jobs still running. Done: 0/20
41 jobs still running. Done: 0/20
41 jobs still running. Done: 0/20
41 jobs still running. Done: 0/20
41 jobs still running. Done: 0/20
41 jobs still running. Done: 0/20
41 jobs still running. Done: 0/20
41 jobs still running. Done: 5/20
41 jobs still running. Done: 5/20
41 jobs still running. Done: 5/20
41 jobs still running. Done: 9/20
41 jobs still running. Done: 19/20
41 jobs still running. Done: 19/20
41 jobs still running. Done: 19/20
Done!


In [44]:
# Manual cleanup: cancel all of the current user's Slurm jobs and delete every
# file in LOGS_DIR.
# NOTE(review): destructive and not limited to the jobs submitted by this
# notebook — run only when no other measurement is in progress.
cancel_jobs()
subprocess.run(f"rm {LOGS_DIR}/*", shell=True)

CompletedProcess(args='rm /depot/cms/sonic/dkondra/saturation_logs/*', returncode=0)

In [None]:
# Print each job id next to its throughput (a value of 0 means it was never measured).
for key, value in throughputs.items():
    print(key,value)