In [1]:
# --- SLURM allocation --------------------------------------------------------
# Both values are interpolated into the `#SBATCH --partition` / `#SBATCH --account`
# lines of every submit script generated by this notebook.
partition = "plgrid-gpu-a100"
account = "plgllmparamgr-gpu-a100"

# --- Behaviour switches ------------------------------------------------------
# True  -> generated scripts pass `--is_plgrid` to main.py (use the downloaded c4 dataset)
# False -> generated scripts pass `--no-is_plgrid`
is_plgrid = True

# True -> generated scripts pass `--log` to main.py and define the Neptune
# variables below; False -> neither is emitted.
do_log = False

# --- Neptune (placeholder values) --------------------------------------------
# The token string deliberately contains its own double quotes: it is pasted
# verbatim into a shell assignment. Do not commit a real token.
NEPTUNE_PROJECT = "bml/bml"
NEPTUNE_API_TOKEN = '"token here (leave double quotes)"'

In [2]:
# Registry of generated submit scripts: file name -> full script text.
files = {}

# Dataset switch forwarded to main.py on every command line.
plgrid = "--is_plgrid" if is_plgrid else "--no-is_plgrid"

# Shell lines that hand the Neptune settings to the job (empty when logging
# is off). They must be `export`ed: a plain `VAR=value` assignment in the
# batch script stays local to that shell and is not inherited by the processes
# srun starts, unless the variable happened to be exported already in the
# submitting environment.
neptune = (
    f"export NEPTUNE_PROJECT={NEPTUNE_PROJECT}\n"
    f"export NEPTUNE_API_TOKEN={NEPTUNE_API_TOKEN}\n"
    if do_log
    else ""
)

# Logging switch forwarded to main.py (omitted entirely when logging is off).
log = "--log" if do_log else ""

In [3]:
# submit_begin.sub -- baseline job: one node, one task, one GPU, 10-minute limit.
# main.py is started directly through srun with `--no-is_dist` (no torchrun).
# Inside the f-string, `{{`/`}}` are escapes that come out as single braces in
# the script; `{partition}`, `{account}`, `{neptune}`, `{plgrid}` and `{log}`
# are filled in from the notebook variables.
files["submit_begin.sub"] = f"""#!/bin/bash
#SBATCH --job-name=basic-submit
#SBATCH --cpus-per-gpu=8
#SBATCH --gpus-per-task=1
#SBATCH --mem-per-gpu=64G
#SBATCH --ntasks-per-node=1
#SBATCH --nodes=1

#SBATCH --output=R-%x.%j.out
#SBATCH --error=R-%x.%j.err

#SBATCH --partition={partition}
#SBATCH --account={account}
#SBATCH --time=00:10:00

#export OMP_NUM_THREADS=2
#export NCCL_NSOCKS_PERTHREAD=4
#export NCCL_SOCKET_NTHREADS=2
#export NCCL_MIN_NCHANNELS=32

{neptune}

# https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904
echo "NODELIST="${{SLURM_NODELIST}}
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR=$master_addr
echo "MASTER_ADDR="$MASTER_ADDR
export MASTER_PORT=$((10000 + $RANDOM))
echo "MASTER_PORT="$MASTER_PORT

srun python main.py --no-is_dist {plgrid} {log}

"""

# newline="\n" turns off platform newline translation, so the script keeps
# LF line endings (which bash/sbatch need) whatever OS runs this notebook.
with open("submit_begin.sub", "w", newline="\n") as f:
    f.write(files["submit_begin.sub"])

In [4]:
# submit_torchrun.sub -- same single-node, single-GPU, 10-minute allocation as
# the baseline job, but main.py is launched through
# `torchrun --standalone --nnodes=1 --nproc_per_node=1`.
# Inside the f-string, `{{`/`}}` come out as single braces and `\\` comes out
# as a single backslash (shell line continuation).
files["submit_torchrun.sub"] = f"""#!/bin/bash
#SBATCH --job-name=torchrun-submit
#SBATCH --cpus-per-gpu=8
#SBATCH --gpus-per-task=1
#SBATCH --mem-per-gpu=64G
#SBATCH --ntasks-per-node=1
#SBATCH --nodes=1

#SBATCH --output=R-%x.%j.out
#SBATCH --error=R-%x.%j.err

#SBATCH --partition={partition}
#SBATCH --account={account}
#SBATCH --time=00:10:00


#export NCCL_NSOCKS_PERTHREAD=4
#export NCCL_SOCKET_NTHREADS=2
#export NCCL_MIN_NCHANNELS=32

{neptune}

# https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904
echo "NODELIST="${{SLURM_NODELIST}}
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR=$master_addr
echo "MASTER_ADDR="$MASTER_ADDR
export MASTER_PORT=$((10000 + $RANDOM))
echo "MASTER_PORT="$MASTER_PORT

srun torchrun \\
--standalone \\
--nnodes=1 \\
--nproc_per_node=1 \\
main.py {plgrid} {log}

"""

# newline="\n" turns off platform newline translation, so the script keeps
# LF line endings (which bash/sbatch need) whatever OS runs this notebook.
with open("submit_torchrun.sub", "w", newline="\n") as f:
    f.write(files["submit_torchrun.sub"])

In [5]:
# submit_2gpu.sub -- one node, one task holding two GPUs, 10-minute limit.
# torchrun starts two processes on that node (`--nproc_per_node=2`) and uses
# the c10d rendezvous backend at $MASTER_ADDR:$MASTER_PORT.
# Inside the f-string, `{{`/`}}` come out as single braces and `\\` comes out
# as a single backslash (shell line continuation).
files["submit_2gpu.sub"] = f"""#!/bin/bash
#SBATCH --job-name=2gpu-submit
#SBATCH --cpus-per-gpu=8
#SBATCH --gpus-per-task=2
#SBATCH --mem-per-gpu=64G
#SBATCH --ntasks-per-node=1
#SBATCH --nodes=1

#SBATCH --output=R-%x.%j.out
#SBATCH --error=R-%x.%j.err

#SBATCH --partition={partition}
#SBATCH --account={account}
#SBATCH --time=00:10:00

#export OMP_NUM_THREADS=2
#export NCCL_NSOCKS_PERTHREAD=4
#export NCCL_SOCKET_NTHREADS=2
#export NCCL_MIN_NCHANNELS=32

{neptune}

# https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904
echo "NODELIST="${{SLURM_NODELIST}}
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR=$master_addr
echo "MASTER_ADDR="$MASTER_ADDR
export MASTER_PORT=$((10000 + $RANDOM))
echo "MASTER_PORT="$MASTER_PORT

srun torchrun \\
--nnodes=1 \\
--nproc_per_node=2 \\
--rdzv_id $RANDOM \\
--rdzv_backend c10d \\
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \\
main.py {plgrid} {log}

"""

# newline="\n" turns off platform newline translation, so the script keeps
# LF line endings (which bash/sbatch need) whatever OS runs this notebook.
with open("submit_2gpu.sub", "w", newline="\n") as f:
    f.write(files["submit_2gpu.sub"])

In [6]:
# sbatch_grid_search.sub -- SLURM job array (task ids 2-4), each element using
# two nodes with one GPU task per node and a 1-hour limit. The array task id
# is used as the learning-rate exponent (`-lr 1e-$SLURM_ARRAY_TASK_ID`), so
# the three elements run with 1e-2, 1e-3 and 1e-4.
# The rendezvous id is drawn once in the batch script (`id=$RANDOM`) before
# srun, so every torchrun launched by this job is given the same value.
# Inside the f-string, `{{`/`}}` come out as single braces and `\\` comes out
# as a single backslash (shell line continuation).
files["sbatch_grid_search.sub"] = f"""#!/bin/bash
#SBATCH --array=2-4
#SBATCH --job-name=grid-search-submit
#SBATCH --cpus-per-gpu=8
#SBATCH --gpus-per-task=1
#SBATCH --mem-per-gpu=64G
#SBATCH --ntasks-per-node=1
#SBATCH --nodes=2

#SBATCH --output=R-%x.%j.out
#SBATCH --error=R-%x.%j.err

#SBATCH --partition={partition}
#SBATCH --account={account}
#SBATCH --time=01:00:00

#export OMP_NUM_THREADS=2
#export NCCL_NSOCKS_PERTHREAD=4
#export NCCL_SOCKET_NTHREADS=2
#export NCCL_MIN_NCHANNELS=32

{neptune}

# https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904
echo "NODELIST="${{SLURM_NODELIST}}
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR=$master_addr
echo "MASTER_ADDR="$MASTER_ADDR
export MASTER_PORT=$((10000 + $RANDOM))
echo "MASTER_PORT="$MASTER_PORT
id=$RANDOM

srun torchrun \\
--nnodes=2 \\
--nproc_per_node=1 \\
--rdzv_id $id \\
--rdzv_backend c10d \\
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \\
main.py {plgrid} {log} \\
--num_layers 4 --num_heads 4 --d_model 256 --seq_length 256 -bs 128 --dropout 0.0 \\
-lr 1e-$SLURM_ARRAY_TASK_ID -n 8835
# batch size is per gpu so 128

"""

# newline="\n" turns off platform newline translation, so the script keeps
# LF line endings (which bash/sbatch need) whatever OS runs this notebook.
with open("sbatch_grid_search.sub", "w", newline="\n") as f:
    f.write(files["sbatch_grid_search.sub"])

In [7]:
# sbatch_save.sub -- one node, one task holding two GPUs, 10-minute limit.
# torchrun starts two processes and main.py is called with
# `--save_dir saved-model`, the same directory name that sbatch_load.sub
# passes as `--load_dir`.
# Inside the f-string, `{{`/`}}` come out as single braces and `\\` comes out
# as a single backslash (shell line continuation).
files["sbatch_save.sub"] = f"""#!/bin/bash
#SBATCH --job-name=save-submit
#SBATCH --cpus-per-gpu=8
#SBATCH --gpus-per-task=2
#SBATCH --mem-per-gpu=64G
#SBATCH --ntasks-per-node=1
#SBATCH --nodes=1

#SBATCH --output=R-%x.%j.out
#SBATCH --error=R-%x.%j.err

#SBATCH --partition={partition}
#SBATCH --account={account}
#SBATCH --time=00:10:00

#export OMP_NUM_THREADS=2
#export NCCL_NSOCKS_PERTHREAD=4
#export NCCL_SOCKET_NTHREADS=2
#export NCCL_MIN_NCHANNELS=32

{neptune}

# https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904
echo "NODELIST="${{SLURM_NODELIST}}
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR=$master_addr
echo "MASTER_ADDR="$MASTER_ADDR
export MASTER_PORT=$((10000 + $RANDOM))
echo "MASTER_PORT="$MASTER_PORT
id=$RANDOM

srun torchrun \\
--nnodes=1 \\
--nproc_per_node=2 \\
--rdzv_id $id \\
--rdzv_backend c10d \\
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \\
main.py {plgrid} -n 100 {log} --save_dir saved-model \\
--log_valid_loss_freq 200 --log_train_loss_freq 5 --early_stop 50

"""

# newline="\n" turns off platform newline translation, so the script keeps
# LF line endings (which bash/sbatch need) whatever OS runs this notebook.
with open("sbatch_save.sub", "w", newline="\n") as f:
    f.write(files["sbatch_save.sub"])

In [8]:
# sbatch_load.sub -- one node, one task holding two GPUs, 10-minute limit.
# torchrun starts two processes and main.py is called with
# `--load_dir saved-model`, the same directory name that sbatch_save.sub
# passes as `--save_dir`.
# Inside the f-string, `{{`/`}}` come out as single braces and `\\` comes out
# as a single backslash (shell line continuation).
files["sbatch_load.sub"] = f"""#!/bin/bash
#SBATCH --job-name=load-submit
#SBATCH --cpus-per-gpu=8
#SBATCH --gpus-per-task=2
#SBATCH --mem-per-gpu=64G
#SBATCH --ntasks-per-node=1
#SBATCH --nodes=1

#SBATCH --output=R-%x.%j.out
#SBATCH --error=R-%x.%j.err

#SBATCH --partition={partition}
#SBATCH --account={account}
#SBATCH --time=00:10:00

#export OMP_NUM_THREADS=2
#export NCCL_NSOCKS_PERTHREAD=4
#export NCCL_SOCKET_NTHREADS=2
#export NCCL_MIN_NCHANNELS=32

{neptune}

# https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904
echo "NODELIST="${{SLURM_NODELIST}}
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR=$master_addr
echo "MASTER_ADDR="$MASTER_ADDR
export MASTER_PORT=$((10000 + $RANDOM))
echo "MASTER_PORT="$MASTER_PORT
id=$RANDOM

srun torchrun \\
--nnodes=1 \\
--nproc_per_node=2 \\
--rdzv_id $id \\
--rdzv_backend c10d \\
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \\
main.py {plgrid} -n 100 {log} --load_dir saved-model \\
--log_valid_loss_freq 200 --log_train_loss_freq 5

"""

# newline="\n" turns off platform newline translation, so the script keeps
# LF line endings (which bash/sbatch need) whatever OS runs this notebook.
with open("sbatch_load.sub", "w", newline="\n") as f:
    f.write(files["sbatch_load.sub"])

In [9]:
# (Re)write every registered submit script into the current working directory,
# one file per entry of `files` (file name -> script text).
for name, body in files.items():
    # newline="\n" turns off platform newline translation: the scripts are
    # bash/SLURM batch files and must keep LF line endings whatever OS runs
    # this notebook.
    with open(name, "w", newline="\n") as f:
        f.write(body)