In [2]:
# Generate the job scripts to be run on the cluster.
#
# The location indices 0 .. N_LOCATIONS - 1 are split into batches of
# BATCH_SIZE, and one bash script per batch is written to JOBS_DIR/job<N>.sh.
# Each script copies the project to /scratch on the node, installs the
# requirements, starts one background `python3 __MainT__.py <index>` process
# per index in its batch, waits for all of them, and rsyncs the results back
# to /nethome.

import os

N_LOCATIONS = 380   # location indices run from 0 to N_LOCATIONS - 1
BATCH_SIZE = 16     # indices (= parallel processes) handled by one job script
DEMO = 5            # value substituted for {demo} in the results path
SOLIS_ID = 6958680  # value substituted for {solisid} in the /scratch and /nethome paths
JOBS_DIR = "jobs"   # output directory for the generated scripts

# Template for one job. Placeholders: {solisid}, {demo}, {loop_start}, {loop_end}.
# The doubled braces around the loop range render as a literal bash brace
# expansion, e.g. {0..15}, which is inclusive on BOTH ends.
script = """#!/bin/bash

#exit the job if there is something wrong with /scratch
cd /scratch || exit
mkdir -p {solisid} || exit
cd {solisid} || exit

echo "Copying model to /scratch..."

#recursively copy all the project files from homedir to node /scratch/solisid
rsync -r --info=progress2 /nethome/{solisid}/src .
#exit if the copy did not produce the model directory
cd src/TransmissionModel || exit

echo "Installing requirements"
pip install -r requirements.txt


echo "Running models..."
for i in {{{loop_start}..{loop_end}}} ; do
  python3 __MainT__.py $i &
done

wait

echo "Completed models, copying results to /nethome/solisid..."

cd ../Data || exit
mkdir -p /nethome/{solisid}/results/{demo}
rsync -r --info=progress2 ./Model_V1/Data/High /nethome/{solisid}/results/{demo}

echo "Done..."
#end of job"""

# open() below does not create missing directories.
os.makedirs(JOBS_DIR, exist_ok=True)

# Generate jobs in batches of BATCH_SIZE
for job, job_start in enumerate(range(0, N_LOCATIONS, BATCH_SIZE)):
    # Last index of this batch. The bash range is inclusive, so the final
    # (shorter) batch must stop at N_LOCATIONS - 1, not at N_LOCATIONS.
    job_end = min(job_start + BATCH_SIZE, N_LOCATIONS) - 1
    job_script = script.format(
        loop_start=job_start, loop_end=job_end, demo=DEMO, solisid=SOLIS_ID
    )
    # newline="\n" keeps LF line endings even when generated on Windows;
    # a CRLF after the shebang would break the script on the cluster.
    with open(os.path.join(JOBS_DIR, f"job{job}.sh"), "w", newline="\n") as file:
        file.write(job_script)


In [1]:
# Check whether all results were correctly generated. 
# Prints which jobs are missing files (so they can be rerun)
# Missing files might happen due to memory issues on the cluster sometimes, or node reboots.

import os

demo = 1

# Root folder (relative to the current working directory) that is scanned.
path = os.getcwd() + f'/results/{demo}/High/'

# Indices of the job scripts whose output is incomplete.
jobs_to_rerun = set()

for seed in range(5):
    seed_dir = path + f'Seed_{seed}/'
    for loc in range(380):
        # Job scripts cover 16 consecutive locations each.
        job = loc // 16
        loc_dir = seed_dir + f'Runs_ref_4.6_100_5_risk{loc}_{demo}/'

        # The whole location folder is absent: report it and move on.
        if not os.path.isdir(loc_dir):
            print(f'Seed: {seed} | Loc: {loc} | Job: {job}')
            jobs_to_rerun.add(job)
            continue

        # Folder exists: every one of the five Status_<run>.npz files must be there.
        for run in range(5):
            status_file = loc_dir + f'Status_{run}.npz'
            if os.path.isfile(status_file):
                continue
            print(f'Seed: {seed} | Loc: {loc} | Job: {job} | Run: {run}')
            jobs_to_rerun.add(job)

print(jobs_to_rerun)

set()
