In [10]:
#| default_exp run_bench

In [11]:
#|export
from fastcore.script import call_parse, Param, store_true
from random import randint
import os, subprocess

In [15]:
#|export
@call_parse
def gen_cli(debug:Param('Print command instead of running it', store_true),
            ds_stg:Param('The deepspeed stage', int, choices=[0,1,2,3])=3,
            n_gpu:Param('number of GPUs', int, choices=[1,2,3])=1,
            gc:Param('Toggle gradient checkpointing', choices=['True', 'False'])='True',  
            seq_len:Param('Sequence length', int, choices=[64, 200,512, 1024, 2048])=256,
            bs:Param('Batch size', int, choices=[1,3,4,6,8,16,32,64,100,200])=1,
            model_sz:Param('Model size in Billions', int, choices=[3, 7, 13, 34])=7,
            n_epochs:Param('# of epochs', int) = 1,
            qaunt4:Param('Enable 4bit quantization', choices=['True', 'False'])='False'):
    "Generate Training CLI Command"

    model_id = {3:'pankajmathur/orca_mini_3b', 7:'NousResearch/Llama-2-7b-hf', 13: 'NousResearch/Llama-2-13b-hf', 34: 'NousResearch/CodeLlama-34b-hf'}[model_sz]
    nr = randint(10000000,99999999)
    env_values = [('WANDB_ENTITY', 'hamelsmu'), ('WANDB_PROJECT', 'deepspeed-data'),
                  ('WANDB_RUN_ID', f'z{ds_stg}-n_gpu{n_gpu}-gc{gc}-seq_len{seq_len}-bs{bs}-model_sz{model_sz}-quant4{qaunt4}-{nr}')]
    env_str = ''
    for v in env_values:
        env_str  += f'{v[0]}={v[1]} '
    
    cmd = f"""torchrun --nproc_per_node {n_gpu} run_lora.py \
  --model_id {model_id} \
  --dataset_path data_{seq_len} \
  --quant4 {qaunt4} \
  --output_dir {model_id}-fa \
  --num_train_epochs {n_epochs} \
  --per_device_train_batch_size {bs} \
  --learning_rate 4e-3 \
  --save_strategy no \
  --gradient_checkpointing {gc} \
  --bf16 True \
  --tf32 True \
  --lr_scheduler_type constant_with_warmup \
  --logging_steps 25 \
  --report_to wandb \
  --deepspeed z{ds_stg}.json"""

    full_cmd = env_str + ' ' + cmd

    if debug:
        print(full_cmd)
    else:
        env_vars = os.environ.copy()
        for v in env_values:
            env_vars[v[0]] = v[1]
        print(f"running command:\n{full_cmd}")
        return subprocess.run(cmd.split(), env=env_vars, check=True)

In [16]:
from nbdev.export import nb_export
nb_export('run_bench.ipynb', lib_path='.', name='run_bench')

In [17]:
!python run_bench.py --help

usage: run_bench.py [-h] [--debug] [--ds_stg {0,1,2,3}] [--n_gpu {1,2,3}]
                    [--gc {True,False}] [--seq_len {64,200,512,1024,2048}]
                    [--bs {1,3,4,6,8,16,32,64,100,200}] [--model_sz {3,7,13,34}]
                    [--n_epochs N_EPOCHS] [--qaunt4 {True,False}]

Generate Training CLI Command

options:
  -h, --help                         show this help message and exit
  --debug                            Print command instead of running it
                                     (default: False)
  --ds_stg {0,1,2,3}                 The deepspeed stage (default: 3)
  --n_gpu {1,2,3}                    number of GPUs (default: 1)
  --gc {True,False}                  Toggle gradient checkpointing (default:
                                     True)
  --seq_len {64,200,512,1024,2048}   Sequence length (default: 256)
  --bs {1,3,4,6,8,16,32,64,100,200}  Batch size (default: 1)
  --model_sz {3,7,13,34}             Model size in Billions (default: 7)
  --n_epoc

In [20]:
%%bash
python run_bench.py --ds_stg 0 --n_gpu 1 --gc True \
--seq_len 64 --bs 1 --model_sz 13 --debug

WANDB_ENTITY=hamelsmu WANDB_PROJECT=deepspeed-data WANDB_RUN_ID=z0-n_gpu1-gcTrue-seq_len64-bs1-model_sz13-quant4False-59016160  torchrun --nproc_per_node 1 run_lora.py   --model_id NousResearch/Llama-2-13b-hf   --dataset_path data_64   --quant4 False   --output_dir NousResearch/Llama-2-13b-hf-fa   --num_train_epochs 1   --per_device_train_batch_size 1   --learning_rate 4e-3   --save_strategy no   --gradient_checkpointing True   --bf16 True   --tf32 True   --lr_scheduler_type constant_with_warmup   --logging_steps 25   --report_to wandb   --deepspeed z0.json


In [22]:
%%bash
# z0-n_gpu1-gcFalse-seq_len512-bs1-model_sz7

python run_bench.py --ds_stg 0 --n_gpu 1 --gc False \
--seq_len 512 --bs 1 --model_sz 7 --qaunt4 True --debug

WANDB_ENTITY=hamelsmu WANDB_PROJECT=deepspeed-data WANDB_RUN_ID=z0-n_gpu1-gcFalse-seq_len512-bs1-model_sz7-quant4True-83868131  torchrun --nproc_per_node 1 run_lora.py   --model_id NousResearch/Llama-2-7b-hf   --dataset_path data_512   --quant4 True   --output_dir NousResearch/Llama-2-7b-hf-fa   --num_train_epochs 1   --per_device_train_batch_size 1   --learning_rate 4e-3   --save_strategy no   --gradient_checkpointing False   --bf16 True   --tf32 True   --lr_scheduler_type constant_with_warmup   --logging_steps 25   --report_to wandb   --deepspeed z0.json


In [26]:
%%bash
# z0-n_gpu1-gcFalse-seq_len64-bs1-model_sz7-quant4True-36346237

python run_bench.py --ds_stg 0 --n_gpu 1 --gc True \
--seq_len 64 --bs 1 --model_sz 7 --qaunt4 True --debug

WANDB_ENTITY=hamelsmu WANDB_PROJECT=deepspeed-data WANDB_RUN_ID=z0-n_gpu1-gcTrue-seq_len64-bs1-model_sz7-quant4True-88368670  torchrun --nproc_per_node 1 run_lora.py   --model_id NousResearch/Llama-2-7b-hf   --dataset_path data_64   --quant4 True   --output_dir NousResearch/Llama-2-7b-hf-fa   --num_train_epochs 1   --per_device_train_batch_size 1   --learning_rate 4e-3   --save_strategy no   --gradient_checkpointing True   --bf16 True   --tf32 True   --lr_scheduler_type constant_with_warmup   --logging_steps 25   --report_to wandb   --deepspeed z0.json
