# Maestro Tutorial:

In [14]:
import os, json

## 1) Preparation:

First, add the database URL to your environment.

In [15]:
# Export the database URL so that the shell commands in later cells can read it
# (the service launch passes it as `--database-url ${DATABASE_SERVER_URL}`).
# NOTE(review): the URL embeds a username and password in plain text; consider
# supplying it from outside the notebook instead of committing it here.
os.environ['DATABASE_SERVER_URL']="postgresql://postgres:postgres@postgres-server.lps.ufrj.br:5432/joao.pinto"

Then, install the maestro package into your Python environment. This must be installed OUTSIDE of your image. 

In [16]:
#!pip install git+https://github.com/jodafons/maestro.git

In [17]:
!maestro

[1;32mUsage:[0m [1;38;5;244mmaestro[0m [[32m-h[0m] [32m{init,task,run,slurm}[0m ...

[1;32mPositional Arguments:[0m
  [32m{init,task,run,slurm}[0m

[1;32mOptional Arguments:[0m
  [32m-h[0m, [32m--help[0m            [38;5;244mshow this help message and exit[0m
None


## 2) Start the service:

In [18]:
!maestro run slurm -h

[1;32mUsage:[0m [1;38;5;244mmaestro slurm[0m [[32m-h[0m] [[32m--device[0m [34mDEVICE[0m] [[32m--partition[0m [34mPARTITION[0m]
                     [32m--max-procs[0m [34mMAX_PROCS[0m [[32m--message-level[0m [34mMESSAGE_LEVEL[0m]
                     [[32m--master-port[0m [34mMASTER_PORT[0m] [[32m--disable-resources-policy[0m]
                     [[32m--runner-port[0m [34mRUNNER_PORT[0m]
                     [[32m--tracking-port[0m [34mTRACKING_PORT[0m]
                     [[32m--tracking-location[0m [34mTRACKING_LOCATION[0m]
                     [[32m--tracking-enable[0m]
                     [[32m--tracking-email-from[0m [34mTRACKING_EMAIL_FROM[0m]
                     [[32m--tracking-email-password[0m [34mTRACKING_EMAIL_PASSWORD[0m]
                     [[32m--database-url[0m [34mDATABASE_URL[0m] [[32m--database-recreate[0m]
                     [[32m--slurm-reservation[0m [34mSLURM_RESERVATION[0m] [32m--slurm-partition

In [19]:
# Launch the maestro service through SLURM. As the output below shows, this
# generates an sbatch script that runs `maestro run master` and submits it as a
# batch job. ${DATABASE_SERVER_URL} is expanded by the shell from the
# environment variable exported earlier in this notebook.
# NOTE(review): --database-recreate presumably resets the database contents on
# start-up; confirm before pointing this at a database that holds real tasks.
!maestro run slurm --device auto\
                      --message-level INFO\
                      --max-procs 4\
                      --slurm-partition gpu-large\
                      --database-recreate \
                      --database-url ${DATABASE_SERVER_URL} \
                      --slurm-nodes 1

#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --exclusive
#SBATCH --cpus-per-task=8
#SBATCH --account=joao.pinto
#SBATCH --partition=gpu-large
#SBATCH --job-name=maestro-master
#SBATCH --output=maestro-master.job_%j.out
export LOGURO_LEVEL='INFO'
echo Node: $SLURM_JOB_NODELIST
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
echo OMP_NUM_HTREADS: $SLURM_CPUS_PER_TASK
maestro run master --device=auto --partition=gpu-large --max-procs=4 --message-level=INFO --master-port=5000 --runner-port=6000 --tracking-port=4000 --tracking-location=/home/joao.pinto/TESTE_DO_MAESTRO/tracking --tracking-email-from= --tracking-email-password= --database-url=postgresql://postgres:postgres@postgres-server.lps.ufrj.br:5432/joao.pinto --database-recreate
wait

Submitted batch job 2321


## 3) List services:

In [21]:
!squeue

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
              2202 cpu-large  jupyter marina.j PD       0:00      1 (Priority)
              2128 cpu-large  jupyter leandro. PD       0:00      1 (Nodes required for job are DOWN, DRAINED or reserved for jobs in higher priority partitions)
              2313       gpu interact otto.tav  R    1:05:32      1 caloba73
              2306       gpu interact regina.a  R    1:30:42      1 caloba72
              2305       gpu interact regina.a  R    1:33:07      1 caloba78
            2301_7       gpu cycle_nt otto.tav  R    1:39:39      1 caloba79
            2301_8       gpu cycle_nt otto.tav  R    1:39:39      1 caloba70
            2304_2       gpu cycle_nt otto.tav  R    1:35:53      1 caloba71
              2307 gpu-large interact joao.pin  R    1:16:55      1 caloba93
              2232 gpu-large mc23c_tr luiz.fil  R    2:02:32      1 caloba91
              2297 gpu-large  jupyter sarita.r  R    2:03:

## 4) Prepare my Jobs:

In [22]:
import json
import os

# Number of job configuration files to generate, and the directory they live under.
number_of_jobs = 100
basepath = os.getcwd()
os.makedirs(basepath+'/jobs', exist_ok=True)

# Build every job configuration first: each job carries its index ('sort')
# and a seed derived from that index.
job_configs = [
    {'sort': index, 'seed': 512*(index+1)}
    for index in range(number_of_jobs)
]

# Write one JSON file per job, named after the job index.
for config in job_configs:
    target = basepath + '/jobs/job.sort_%d.json'%(config['sort'])
    with open(target, 'w') as handle:
        json.dump(config, handle)

In [23]:
# Source of the example job script. It is kept in a string and written to
# program.py so that the task created later in this tutorial can execute it as
# `python program.py -j <job config file>`.
program = """

import argparse
import os, sys, json
from time import sleep


def main():
    '''Run one example job described by a JSON job configuration file.'''
    parser = argparse.ArgumentParser()

    parser.add_argument('-j','--job', action='store',
            dest='job', required = True,
                help = "The job config file.")

    # With no arguments at all, show the usage text instead of an argparse error.
    if len(sys.argv)==1:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args()

    print('Starting job...')

    # getting parameters from the job configuration
    # (the context manager closes the file as soon as it has been parsed)
    with open(args.job, 'r') as job_file:
        job = json.load(job_file)
    seed            = job['seed']

    # getting parameters from the server
    # (read here to show which environment variables the job relies on)
    device       = int(os.environ['CUDA_VISIBLE_DEVICES'])
    workarea     = os.environ['JOB_WORKAREA']
    job_id       = os.environ['JOB_ID']
    run_id       = os.environ['TRACKING_RUN_ID']
    tracking_url = os.environ['TRACKING_URL']
    dry_run      = os.environ['JOB_DRY_RUN'] == 'true'
    print(run_id)
    # The parentheses matter: without them the conditional expression applies to
    # the whole concatenation and only "No" is printed when dry_run is false.
    print("dry run? " + ("Yes" if dry_run else "No"))

    sleep(5)

    print('Finish job...')
    sys.exit(0)


if __name__ == '__main__':
    main()

"""

# Materialise the script next to the notebook.
with open("program.py",'w') as f:
    f.write(program)

## 5) Create my task:

In [24]:
!maestro task create -h

[1;32mUsage:[0m [1;38;5;244mmaestro create[0m [[32m-h[0m] [32m-t[0m [34mTASKNAME[0m [32m-i[0m [34mINPUTFILE[0m [[32m--image[0m [34mIMAGE[0m]
                      [[32m--virtualenv[0m [34mVIRTUALENV[0m] [32m--exec[0m [34mCOMMAND[0m [[32m--dry_run[0m]
                      [[32m--binds[0m [34mBINDS[0m] [32m-p[0m [34mPARTITION[0m [[32m--contact_to[0m [34mCONTACT_TO[0m]
                      [[32m--parents[0m [34mPARENTS[0m] [[32m--envs[0m [34mENVS[0m] [[32m--priority[0m [34mPRIORITY[0m]
                      [[32m--test_bypass[0m] [[32m--database-url[0m [34mDATABASE_URL[0m]

[1;32mOptional Arguments:[0m
  [32m-h[0m, [32m--help[0m            [38;5;244mshow this help message and exit[0m
  [32m-t[0m, [32m--task[0m [34mTASKNAME[0m   [38;5;244mThe name of the task to be included into the maestro.[0m
  [32m-i[0m, [32m--inputfile[0m [34mINPUTFILE[0m
                        [38;5;244mThe input config file that will 

In [27]:
# Container image the jobs run in (the log below reports the singularity engine).
image="/mnt/cern_data/joao.pinto/images/torch_base_latest.sif"
# Shell template that registers the task: {BASE} is the working directory that
# holds jobs/ and program.py, {IMAGE} is the container image above.
command = (
    "maestro task create -i {BASE}/jobs -t user.test_2 "
    "--exec 'python {BASE}/program.py -j %IN' "
    "-p gpu-large --image {IMAGE}"
)
os.system(command.format(BASE=os.getcwd(), IMAGE=image))


Creating... : 100it [00:00, 48410.71it/s]


[32m2024-05-10T12:07:57.440399-0300[0m | [1m    INFO    [0m | [36m            parser            [0m | [34mJob will use /mnt/cern_data/joao.pinto/images/torch_base_latest.sif as image...[0m
[32m2024-05-10T12:07:57.440492-0300[0m | [1m    INFO    [0m | [36m            parser            [0m | [34mSetting all environs into the singularity envs...[0m
[32m2024-05-10T12:07:57.440525-0300[0m | [1m    INFO    [0m | [36m            parser            [0m | [34mrunning job using singularity engine... /mnt/cern_data/joao.pinto/images/torch_base_latest.sif[0m
[32m2024-05-10T12:07:58.444739-0300[0m | [1m    INFO    [0m | [36m            parser            [0m | [34mcd /home/joao.pinto/TESTE_DO_MAESTRO/user.test_2/job.sort_27
[0m
[32m2024-05-10T12:07:58.444864-0300[0m | [1m    INFO    [0m | [36m            parser            [0m | [34mpython /home/joao.pinto/TESTE_DO_MAESTRO/program.py -j /home/joao.pinto/TESTE_DO_MAESTRO/jobs/job.sort_27.json
[0m
Starting job...


0

In [28]:
!maestro task list

┏━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┓
┃   ID ┃ Task        ┃   Registered ┃   Assigned ┃   Pending ┃   Running ┃   Completed ┃   Failed ┃   kill ┃   killed ┃   Broken ┃ Status   ┃
┣━━━━━━╋━━━━━━━━━━━━━╋━━━━━━━━━━━━━━╋━━━━━━━━━━━━╋━━━━━━━━━━━╋━━━━━━━━━━━╋━━━━━━━━━━━━━╋━━━━━━━━━━╋━━━━━━━━╋━━━━━━━━━━╋━━━━━━━━━━╋━━━━━━━━━━┫
┃    0 ┃ user.test_2 ┃            0 ┃         91 ┃         0 ┃         1 ┃           8 ┃        0 ┃      0 ┃        0 ┃        0 ┃ Running  ┃
┗━━━━━━┻━━━━━━━━━━━━━┻━━━━━━━━━━━━━━┻━━━━━━━━━━━━┻━━━━━━━━━━━┻━━━━━━━━━━━┻━━━━━━━━━━━━━┻━━━━━━━━━━┻━━━━━━━━┻━━━━━━━━━━┻━━━━━━━━━━┻━━━━━━━━━━┛


## 6) Cancel all jobs:

In [29]:
!maestro slurm cancel

[32m2024-05-10T12:09:39.156508-0300[0m | [1m    INFO    [0m | [36m            parser            [0m | [34mcancel job 2321...[0m
