In [40]:
import uuid
import mlflow
import subprocess
import pandas as pd
import os
from loguru import logger

def get_git_commit():
    try:
        return (
            subprocess.check_output(["git", "rev-parse", "HEAD"])
            .strip()
            .decode("utf-8")
        )
    except subprocess.CalledProcessError:
        return "unknown"



#REMOTE_PATH = "~/projects/cuda-spatial-filtering/build"
REMOTE_HOST = "nt"

logger.info("Setting up MlFlow")
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("2025-beamforming-cuda-optimization-test")


job_id = str(uuid.uuid4())
logger.info(f"job_id: {job_id}")

LOCAL_OUTPUT_DIR = job_id
subprocess.run(["mkdir", job_id])

git_commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode().strip()
SLURM_FILE_NAME = "submit_job.sh"

#benchmark_dataset = "~/projects/output_5000.pcap"
ncu_dataset = "~/projects/output_10.pcap"
benchmark_file_name = f"benchmarks_{job_id}.json"

run_description = ""


params = {}

params['NR_CHANNELS'] = 16
params['NR_RECEIVERS'] = 32
params['NR_BITS'] = 8
params['NR_POLARIZATIONS'] = 2
params['NR_PACKETS_FOR_CORRELATION'] = 16
params['NR_TIME_STEPS_PER_PACKET'] = 64
params['NR_RECEIVERS_PER_BLOCK'] = 32
params['NR_BUFFERS'] = 2
params['NR_CORRELATIONS_BLOCKS_TO_INTEGRATE'] = 10
params['NR_PACKETS_TOTAL'] = 500

params['GENERATED_INPUT_FILE_NAME'] = f"input_{job_id}.pcap"
params['REMOTE_PATH'] = '/home/jsmallwo/projects/cuda-spatial-filtering/build'
params['REMOTE_EXEC'] = f"./beamform_spatial {params['REMOTE_PATH']}/{job_id}/{params['GENERATED_INPUT_FILE_NAME']} {benchmark_file_name}" 
params['REMOTE_EXEC_NCU'] = f"./beamform_spatial {ncu_dataset} {benchmark_file_name}.ncu" 
params['PROFILE_OUTPUT'] = f"profile_output_{job_id}"
params['NSYS_PROFILE_OUTPUT'] = f"nsys_profile_output_{job_id}"
params['JOB_OUTPUT_FILE_NAME'] = f"output_{job_id}.txt"

logger.info(f"PARAMS:\n")
logger.info(params)

logger.info("Creating input dataset....")
cmd = (
    'source .venv/bin/activate && '
    f'python create_pcap.py --output {job_id}/{params["GENERATED_INPUT_FILE_NAME"]} '
    f'--number_receivers {params["NR_RECEIVERS"]} '
    f'--number_packets {params["NR_PACKETS_TOTAL"]} '
    f'--number_channels {params["NR_CHANNELS"]}'
)
subprocess.run(cmd, shell=True, check=True)


subprocess.run(['rsync', '-avz', os.path.join(job_id, params["GENERATED_INPUT_FILE_NAME"]), f"{REMOTE_HOST}:{REMOTE_PATH}/{job_id}/"])

logger.info("Creating slurm script")
slurm_script = f"""
#!/bin/bash
#
#SBATCH --job-name=profile
#SBATCH --output={params["JOB_OUTPUT_FILE_NAME"]}
#
#SBATCH --ntasks=1
#SBATCH --time=10:00
#SBATCH --mem=4g
#SBATCH --gres=gpu:1

srun apptainer exec --nv /fred/oz002/jsmallwo/apptainer.sif /bin/bash -c "cd {params["REMOTE_PATH"]} && \\
    cmake -DBUILD_TESTING=OFF -DBENCHMARKING=1 -DBUILD_TARGET=LAMBDA -DNR_CHANNELS={params["NR_CHANNELS"]} -DNR_RECEIVERS={params["NR_RECEIVERS"]} .. && cmake --build . && \\
    cd apps && {params['REMOTE_EXEC']}  &&
    ncu -f --set full --target-processes all --export {params["PROFILE_OUTPUT"]} {params["REMOTE_EXEC_NCU"]} && \\
    ncu --import {params["PROFILE_OUTPUT"]}.ncu-rep --csv --page details > {params["PROFILE_OUTPUT"]}.csv && \\
    nsys profile -t cuda,nvtx -o {params["NSYS_PROFILE_OUTPUT"]} --stats=true --force-overwrite true {params["REMOTE_EXEC"]}"
"""
logger.info("Writing to file")
# Write to file
with open("submit_job.sh", "w") as f:
    f.write(slurm_script)

logger.info("Slurm script written to 'submit_job.sh'")


# === Step 6: Sync slurm script to server ===
logger.info("Syncing slurm script to remote...")
subprocess.run(["rsync", "-avz", SLURM_FILE_NAME, f"{REMOTE_HOST}:{params['REMOTE_PATH']}"])


# === Step 7: Compile and profile remotely ===
logger.info("Submitting slurm job...")
subprocess.run(
    f'ssh {REMOTE_HOST} -t "cd {params["REMOTE_PATH"]} && sbatch -W submit_job.sh"',
    shell=True,
)

logger.info("Pulling back results...")
os.makedirs(LOCAL_OUTPUT_DIR, exist_ok=True)
subprocess.run(
    [
        "rsync",
        "-avz",
        f"{REMOTE_HOST}:{params['REMOTE_PATH']}/apps/{params['PROFILE_OUTPUT']}.csv",
        LOCAL_OUTPUT_DIR,
    ]
)

subprocess.run(
    [
        "rsync",
        "-avz",
        f"{REMOTE_HOST}:{params['REMOTE_PATH']}/apps/{params['PROFILE_OUTPUT']}.ncu-rep",
        LOCAL_OUTPUT_DIR,
    ]
)

subprocess.run(
    [
        "rsync",
        "-avz",
        f"{REMOTE_HOST}:{params['REMOTE_PATH']}/apps/{params['NSYS_PROFILE_OUTPUT']}.nsys-rep",
        LOCAL_OUTPUT_DIR,
    ]
)

subprocess.run(
    [
        "rsync",
        "-avz",
        f"{REMOTE_HOST}:{params['REMOTE_PATH']}/{params['JOB_OUTPUT_FILE_NAME']}",
        LOCAL_OUTPUT_DIR,
    ]
)

subprocess.run(
    [
        "rsync",
        "-avz",
        f"{REMOTE_HOST}:{params['REMOTE_PATH']}/apps/{benchmark_file_name}",
        LOCAL_OUTPUT_DIR,
    ]
)


local_rep_path = os.path.join(LOCAL_OUTPUT_DIR, f"{params['PROFILE_OUTPUT']}.csv")


logger.info("Starting MLFlow run...")
description = f"""
**Description**
{run_description}

"""

with mlflow.start_run(description=description) as run:
    profile_path = f"{os.path.join(LOCAL_OUTPUT_DIR, params['PROFILE_OUTPUT'])}.csv"
    data = pd.read_csv(profile_path)
    #model_params = extract_parameters_from_csv(data)
    logger.info("Logging parameters...")
    #mlflow.log_params(model_params)
    mlflow.log_param("git_commit_hash", get_git_commit())
    mlflow.log_params(params)
    #mlflow.log_param("benchmark_dataset", benchmark_dataset)
    
    
    logger.info("Logging metrics...")
    #metrics = extract_metrics_from_csv(data)
    #mlflow.log_metrics(metrics)
    with open(os.path.join(LOCAL_OUTPUT_DIR,benchmark_file_name), 'r') as f:
        timings = json.load(f)
        timings['beamforming_duration_us'] = timings['checkpoint_end_beamforming'] - timings['checkpoint_begin_beamforming']
        mlflow.log_metrics(timings) 

    mlflow.log_artifact(profile_path)
    # log the original ncu-rep file as well.
    mlflow.log_artifact(profile_path.replace(".csv", ".ncu-rep"))
    mlflow.log_artifact(os.path.join(LOCAL_OUTPUT_DIR,f"{params['NSYS_PROFILE_OUTPUT']}.nsys-rep"))
    mlflow.log_artifact(os.path.join(LOCAL_OUTPUT_DIR,benchmark_file_name))
    mlflow.log_artifact(os.path.join(LOCAL_OUTPUT_DIR, params["JOB_OUTPUT_FILE_NAME"]))

    logger.info(f"✅ MLflow run completed: {run.info.run_id}")

logger.info("Cleaning up....")
subprocess.run(
f"ssh {REMOTE_HOST} rm -r {params['REMOTE_PATH']}/{job_id}/", shell=True
)
subprocess.run(f"rm -r {job_id}", shell=True)




[32m2025-07-25 16:50:07.673[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSetting up MlFlow[0m
[32m2025-07-25 16:50:07.741[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m29[0m - [1mjob_id: f520387e-af43-4b52-8bee-56c207d0df5e[0m
[32m2025-07-25 16:50:07.786[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m65[0m - [1mPARAMS:
[0m
[32m2025-07-25 16:50:07.787[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m66[0m - [1m{'NR_CHANNELS': 16, 'NR_RECEIVERS': 32, 'NR_BITS': 8, 'NR_POLARIZATIONS': 2, 'NR_PACKETS_FOR_CORRELATION': 16, 'NR_TIME_STEPS_PER_PACKET': 64, 'NR_RECEIVERS_PER_BLOCK': 32, 'NR_BUFFERS': 2, 'NR_CORRELATIONS_BLOCKS_TO_INTEGRATE': 10, 'NR_PACKETS_TOTAL': 500, 'GENERATED_INPUT_FILE_NAME': 'input_f520387e-af43-4b52-8bee-56c207d0df5e.pcap', 'REMOTE_PATH': '/home/jsmallwo/projects/cuda-spatial-filtering/build', 'REMOTE_EXEC': './beamform_spatial /home/jsmallwo/projects/cuda-spatial-fil

✅ PCAP file 'f520387e-af43-4b52-8bee-56c207d0df5e/input_f520387e-af43-4b52-8bee-56c207d0df5e.pcap' written.
building file list ... done
created directory /home/jsmallwo/projects/cuda-spatial-filtering/build/f520387e-af43-4b52-8bee-56c207d0df5e
input_f520387e-af43-4b52-8bee-56c207d0df5e.pcap


[32m2025-07-25 16:51:48.689[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m81[0m - [1mCreating slurm script[0m
[32m2025-07-25 16:51:48.690[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m99[0m - [1mWriting to file[0m
[32m2025-07-25 16:51:48.696[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m104[0m - [1mSlurm script written to 'submit_job.sh'[0m
[32m2025-07-25 16:51:48.697[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m108[0m - [1mSyncing slurm script to remote...[0m



sent 15681575 bytes  received 42 bytes  2090882.27 bytes/sec
total size is 33728024  speedup is 2.15


[32m2025-07-25 16:51:51.104[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m113[0m - [1mSubmitting slurm job...[0m


building file list ... done
submit_job.sh

sent 628 bytes  received 54 bytes  194.86 bytes/sec
total size is 1393  speedup is 2.04


Pseudo-terminal will not be allocated because stdin is not a terminal.


Submitted batch job 2628206


[32m2025-07-25 16:54:12.870[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m119[0m - [1mPulling back results...[0m


receiving file list ... done
profile_output_f520387e-af43-4b52-8bee-56c207d0df5e.csv

sent 38 bytes  received 24119 bytes  6902.00 bytes/sec
total size is 345185  speedup is 14.29
receiving file list ... done
profile_output_f520387e-af43-4b52-8bee-56c207d0df5e.ncu-rep

sent 38 bytes  received 2373810 bytes  678242.29 bytes/sec
total size is 19186476  speedup is 8.08
receiving file list ... done
nsys_profile_output_f520387e-af43-4b52-8bee-56c207d0df5e.nsys-rep

sent 38 bytes  received 432808 bytes  173138.40 bytes/sec
total size is 935920  speedup is 2.16
receiving file list ... done
output_f520387e-af43-4b52-8bee-56c207d0df5e.txt

sent 38 bytes  received 30842 bytes  8822.86 bytes/sec
total size is 2080784  speedup is 67.38


[32m2025-07-25 16:54:26.622[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m170[0m - [1mStarting MLFlow run...[0m
[32m2025-07-25 16:54:26.723[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m181[0m - [1mLogging parameters...[0m


receiving file list ... done
benchmarks_f520387e-af43-4b52-8bee-56c207d0df5e.json

sent 38 bytes  received 308 bytes  98.86 bytes/sec
total size is 216  speedup is 0.62


[32m2025-07-25 16:54:26.789[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m188[0m - [1mLogging metrics...[0m
[32m2025-07-25 16:54:26.960[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m203[0m - [1m✅ MLflow run completed: 6e3f7aeb2bd641d4a6634c2013c68046[0m
[32m2025-07-25 16:54:26.971[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m205[0m - [1mCleaning up....[0m


🏃 View run nosy-colt-73 at: http://localhost:5000/#/experiments/242493091591921745/runs/6e3f7aeb2bd641d4a6634c2013c68046
🧪 View experiment at: http://localhost:5000/#/experiments/242493091591921745


CompletedProcess(args='rm -r f520387e-af43-4b52-8bee-56c207d0df5e', returncode=0)

In [38]:
subprocess.run(
f"ssh {REMOTE_HOST} rm -r {REMOTE_PATH}/{job_id}/", shell=True
)
subprocess.run(f"rm -r {job_id}", shell=True)

rm: cannot remove '/Users/jsmallwood/projects/cuda-spatial-filtering/build/b76287e5-2265-4061-8e53-f0b9d0eb4363/': No such file or directory
rm: b76287e5-2265-4061-8e53-f0b9d0eb4363: No such file or directory


CompletedProcess(args='rm -r b76287e5-2265-4061-8e53-f0b9d0eb4363', returncode=1)