RL training (#2206)

- Code to run RL
- Custom PPO trainer with correct formatting for RM model
- Change imports until they work
- Some refactoring

---------

Co-authored-by: Andreas Koepf <andreas.koepf@provisio.com>
Co-authored-by: Dimitri <dimitrivr@icloud.com>
3 people committed Apr 24, 2023
1 parent a9f17de commit 4f02095
Showing 33 changed files with 1,489 additions and 221 deletions.
27 changes: 27 additions & 0 deletions .vscode/launch.json
@@ -79,6 +79,33 @@
"env": {
"CUDA_VISIBLE_DEVICES": "0"
}
},
{
"name": "Debug RLHF",
"type": "python",
"request": "launch",
"cwd": "${workspaceFolder}/model/model_training",
"module": "accelerate.commands.launch",
"args": [
"--main_process_port",
"29506",
"--config_file",
"configs/accelerate_config.yaml",
"--num_processes",
"5",
"trainer_rl.py",
"--configs",
"defaults",
"defaults_rlhf",
"pythia_rlhf",
"oasst_export_latin_cyrillic_rlhf"
],
"console": "integratedTerminal",
"justMyCode": false,
"env": {
"CUDA_VISIBLE_DEVICES": "1,2,3,4,5",
"OMP_NUM_THREADS": "1"
}
}
]
}
8 changes: 8 additions & 0 deletions model/model_training/.gitignore
@@ -6,4 +6,12 @@ saved_model*
.save_model*
llama_model
models-*
model_store_*
singularity
tritonserver-pyt.sif
ckpts_pythia12b
model_rl
.saved
.triton_models
triton_models
ckpts
44 changes: 37 additions & 7 deletions model/model_training/README.md
@@ -111,12 +111,43 @@ other default options (as given by `defaults_rm`) with the model specific ones.

## Training with RL

To train using trlx you first need to install Singularity from
https://github.com/sylabs/singularity/blob/main/INSTALL.md.

The following assumes access to a server with 8 GPUs.

Then build the Triton server image:

```bash
singularity build --sandbox tritonserver-pyt.sif docker://nvcr.io/nvidia/tritonserver:22.08-pyt-python-py3
```

Process the trained RM and SFT models for use in a Triton server:

```bash
python to_triton.py --configs pythia_rlhf --triton_mode rm
python to_triton.py --configs pythia_rlhf --triton_mode sft
```

We can now launch the container instances that serve the RM and the SFT (reference) model, each on a specified GPU:

```bash
SINGULARITYENV_CUDA_VISIBLE_DEVICES=7 singularity run --nv --bind .triton_models/model_store_rm:/model_store tritonserver-pyt.sif tritonserver --model-repository=/model_store --http-port 8001 --grpc-port 8002 --metrics-port 8003
SINGULARITYENV_CUDA_VISIBLE_DEVICES=6 singularity run --nv --bind .triton_models/model_store_sft:/model_store tritonserver-pyt.sif tritonserver --model-repository=/model_store --http-port 8004 --grpc-port 8005 --metrics-port 8006
```
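
Before kicking off training, it can be worth verifying that both server instances came up and loaded their models. Below is a minimal sketch using the `tritonclient` gRPC API, with the model names taken from the `pythia_rlhf` section of `config_rl.yaml` below; adjust them to whatever `to_triton.py` put into your model store.

```python
# Hypothetical readiness check for the two Triton instances launched above.
import tritonclient.grpc as triton_grpc

for url, model_name in [
    ("localhost:8002", "andreaskoepf-oasst-rm-2-pythia-1.4b-10000"),     # RM gRPC port
    ("localhost:8005", "andreaskoepf-oasst-sft-4-pythia-12b-epoch-3.5"),  # SFT gRPC port
]:
    client = triton_grpc.InferenceServerClient(url=url)
    assert client.is_server_ready(), f"server at {url} is not ready"
    assert client.is_model_ready(model_name), f"model {model_name} not loaded at {url}"
    print(f"{url}/{model_name}: ready")
```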

Finally, we can train using PPO:

```bash
export TRITON_HOST_RM=localhost:8002/<RM_MODEL_NAME>
export TRITON_HOST_REF=localhost:8005/<REF_MODEL_NAME>


CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 OMP_NUM_THREADS=1 accelerate launch --main_process_port 29501 --config_file configs/accelerate_config.yaml --num_processes 6 trainer_rl.py --configs defaults defaults_rlhf pythia_rlhf oasst_export_latin_cyrillic_rlhf
```

Note: `--num_processes` must be equal to the number of GPUs used for training.
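
Training only talks to the RM and reference SFT model through these Triton endpoints. As a rough sketch of that protocol, the snippet below scores a single formatted sample against the RM server; the input/output tensor names (`input_ids`, `rewards`) and the exact prompt formatting are assumptions, so check the model configuration that `to_triton.py` writes into the model store.

```python
import os

import numpy as np
import tritonclient.grpc as triton_grpc
from transformers import AutoTokenizer

# TRITON_HOST_RM has the form "<host>:<grpc_port>/<model_name>", e.g.
# "localhost:8002/andreaskoepf-oasst-rm-2-pythia-1.4b-10000".
url, model_name = os.environ["TRITON_HOST_RM"].split("/", 1)

# Tokenizer matching the RM (same model name as in the pythia_rlhf rank_config).
tokenizer = AutoTokenizer.from_pretrained("andreaskoepf/oasst-rm-2-pythia-1.4b-10000")
ids = tokenizer(
    "<|prompter|>Hi!<|endoftext|><|assistant|>Hello!<|endoftext|>",
    return_tensors="np",
)["input_ids"].astype(np.int32)

client = triton_grpc.InferenceServerClient(url=url)
inp = triton_grpc.InferInput("input_ids", list(ids.shape), "INT32")  # assumed tensor name/dtype
inp.set_data_from_numpy(ids)
result = client.infer(model_name, inputs=[inp])
print("reward:", result.as_numpy("rewards"))  # assumed output name
```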

## Test your model

You can interactively test your model like this:
@@ -225,9 +256,8 @@ python check_dataset_counts.py --datasets webgpt squad_v2 --mode sft
Experimental results in wandb
[here](https://wandb.ai/sanagnos/supervised-finetuning?workspace=user-sanagnos).

## TODOs

- Decide on a model
- Merge utils etc with reward model
- Causal modelling for GPT-JT does not leverage the bidirectional mask for the
prompt? (https://huggingface.co/togethercomputer/GPT-JT-6B-v1)
- Recreate init in the trainer so that it does not load the ref_model
  (currently hard-coded)
- Same for not loading `self.tokenizer` in `AccelerateRLTrainer`
2 changes: 1 addition & 1 deletion model/model_training/check_dataset_counts.py
@@ -5,7 +5,7 @@

import pandas as pd
import yaml
from model_training.utils import _strtobool, get_dataset
from model_training.utils.utils import _strtobool, get_dataset


class Mode(Enum):
16 changes: 16 additions & 0 deletions model/model_training/configs/accelerate_config.yaml
@@ -0,0 +1,16 @@
compute_environment: LOCAL_MACHINE
deepspeed_config:
deepspeed_config_file: /mnt/data/Open-Assistant-RLHF/model/model_training/configs/deepspeed_rl.json
zero3_init_flag: false
distributed_type: DEEPSPEED
downcast_bf16: "no"
machine_rank: 0
main_training_function: main
num_machines: 1
num_processes: 3
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
102 changes: 79 additions & 23 deletions model/model_training/configs/config_rl.yaml
@@ -1,30 +1,86 @@
defaults_rlhf:
rng_seed: 0xa1221f97
dataset:
rank_model: TODO
sft_model: TODO
eval_prompts:
batch_size: 18
epochs: 10
datasets: []
batch_size: 1
chunk_size: 1
num_rollouts: 16
total_steps: 10000
datasets_extra: []
cache_dir: /home/ubuntu/data_cache
output_dir: model_rl
eval_size: 500
num_eval_prompts: 64
rank_config:
sft_config:
debug: false
dtype: fp16

oasst_export_latin_cyrillic_rlhf:
datasets:
- oa_private:
data_path: .cache
split: rl
val_split: 0.0
fraction: 1
file: 2023-02-12_oasst_prod.jsonl
cache_dir: .cache
quantization: false
seq2seqmodel: false
- oasst_export:
#lang: "bg,ca,cs,da,de,en,es,fr,hr,hu,it,nl,pl,pt,ro,ru,sl,sr,sv,uk"
lang: "en"
#top_k: 2
#input_file_path: 2023-03-25_oasst_research_ready_synth_labels.jsonl.gz
input_file_path: 2023-04-08_ready_with_prompt_lottery_en.jsonl.gz
sort_by_length: false
use_custom_sampler: false

pythia_rlhf:
triton_host_rm: localhost:8002/andreaskoepf-oasst-rm-2-pythia-1.4b-10000
triton_host_sft: localhost:8005/andreaskoepf-oasst-sft-4-pythia-12b-epoch-3.5
rank_config:
is_reward_model: true
model_name: andreaskoepf/oasst-rm-2-pythia-1.4b-10000 # just for tokenizer...andreaskoepf/oasst-rm-1-pythia-1b
cache_dir: /mnt/data/Open-Assistant-RLHF/model/model_training/.cache
pooling: last
residual_dropout: 0.01
use_flash_attention: false
dtype: fp16
batch_size: 8

sft_config:
is_reward_model: false
model_name: andreaskoepf/oasst-sft-4-pythia-12b-epoch-3.5
# model_name: andreaskoepf/pythia-1.4b-gpt4all-pretrain
# model_name: OpenAssistant/llama_30b_sft-2_test
cache_dir: /mnt/data/Open-Assistant-RLHF/model/model_training/.cache
quantization: false
seq2seqmodel: false
freeze_layer:
num_layers_unfrozen: -1
residual_dropout: 0.2
use_flash_attention: false
dtype: fp16
batch_size: 1

llama_rlhf:
triton_host_rm: localhost:8002/OpenAssistant-oasst-rm-2-pythia-6.9b-epoch-1
triton_host_sft: localhost:8005/OpenAssistant-oasst-sft-7e2-llama-30b
rank_config:
is_reward_model: true
model_name: OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1
cache_dir: /mnt/data/Open-Assistant-RLHF/model/model_training/.cache
pooling: last
residual_dropout: 0.0
use_flash_attention: false
dtype: fp16
batch_size: 8

pythia_2b_rlhf:
model_name: EleutherAI/pythia-1b-deduped-base-finetuned/checkpoint-2000
rank_model: ../reward/instructor/microsoft/deberta-v3-large-finetuned/checkpoint-4500
sft_model: EleutherAI/pythia-1b-deduped-base-finetuned/checkpoint-2000
batch_size: 18
sft_config:
is_reward_model: false
model_name: OpenAssistant/oasst-sft-7e2-llama-30b
cache_dir: /mnt/data/Open-Assistant-RLHF/model/model_training/.cache
quantization: false
seq2seqmodel: false
freeze_layer: 52
num_layers_unfrozen: -1 # we don't use this, trlx has its own implementation
residual_dropout: 0.0
use_flash_attention: true
dtype: fp16

debug_rlhf:
model_name: gpt2
rank_model: /local/home/sanagnos/general/Open-Assistant/model/reward/instructor/facebook/galactica-125m-finetuned/checkpoint-500/
sft_model: /local/home/sanagnos/general/Open-Assistant/model/model_training/EleutherAI/pythia-70m-deduped-base-finetuned/checkpoint-20/
rank_model: pythia_reward_model/checkpoint-50
sft_model: pythia_sft/checkpoint-10/
batch_size: 2
log_dir: test
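
The `--configs defaults defaults_rlhf pythia_rlhf oasst_export_latin_cyrillic_rlhf` flags shown in the README select named sections like the ones above, with later sections overriding keys from earlier ones. A simplified, hypothetical sketch of that merge (the actual argument parsing lives in `model_training/utils` and also pulls sections from other config files):

```python
import yaml

def merge_named_configs(names, path="configs/config_rl.yaml"):
    """Shallow left-to-right merge of named config sections; later names win."""
    with open(path) as f:
        sections = yaml.safe_load(f)
    merged = {}
    for name in names:
        merged.update(sections[name])
    return merged

# e.g. merge_named_configs(["defaults_rlhf", "pythia_rlhf", "oasst_export_latin_cyrillic_rlhf"])
```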
29 changes: 29 additions & 0 deletions model/model_training/configs/deepspeed_rl.json
@@ -0,0 +1,29 @@
{
"fp16": {
"enabled": "true",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 2e8,
"overlap_comm": false,
"reduce_scatter": true,
"reduce_bucket_size": 2e8,
"contiguous_gradients": true,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
}
},
"gradient_accumulation_steps": "8",
"train_micro_batch_size_per_gpu": "auto",
"train_batch_size": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
33 changes: 33 additions & 0 deletions model/model_training/configs/deepspeed_rl_zero3.json
@@ -0,0 +1,33 @@
{
"fp16": {
"enabled": "true",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"zero_optimization": {
"stage": 3,
"allgather_partitions": true,
"allgather_bucket_size": 1e9,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 1e9,
"contiguous_gradients": true,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
}
},
"gradient_accumulation_steps": "8",
"train_micro_batch_size_per_gpu": "auto",
"train_batch_size": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
49 changes: 28 additions & 21 deletions model/model_training/configs/ppo_config.yaml
@@ -1,54 +1,61 @@
train:
seq_length: 500
epochs: 100
seq_length: 2032
epochs: 10000
total_steps: 10000
batch_size: 18
checkpoint_interval: 10000
eval_interval: 500
pipeline: "PromptPipeline"
trainer: "AcceleratePPOTrainer"
checkpoint_interval: 1000
eval_interval: 250
pipeline: "CustomPromptPipeline"
trainer: "CustomPPOTrainer"
tracker: wandb
project_name: rlhf
entity_name: open-assistant

model:
model_path: "EleutherAI/pythia-1b-deduped-base-finetuned/checkpoint-500"
num_layers_unfrozen: 2
model_path:
model_arch_type: causal

tokenizer:
tokenizer_path: "EleutherAI/pythia-1b-deduped-base-finetuned/checkpoint-500"
truncation_side: "right"
tokenizer_path:
truncation_side: "left" # AKo: changed this to "left", otherwise it sees only the beginning of conversations
padding_side: "left" # use with rotary positional embeddings

optimizer:
name: "adamw"
kwargs:
lr: 1.0e-5
lr: 1.0e-6
betas: [0.9, 0.95]
eps: 1.0e-8
weight_decay: 1.0e-6

scheduler:
name: "cosine_annealing"
kwargs:
T_max: 10000 # train.total_steps
eta_min: 1.0e-4
#T_max: 10000 # train.total_steps
#eta_min: 1.0e-4
T_max: 10000
eta_min: 1.0e-6

method:
name: "ppoconfig"
num_rollouts: 384
chunk_size: 8
num_rollouts: 1024
chunk_size: 4
ppo_epochs: 4
init_kl_coef: 0.05
init_kl_coef: 0.1
target: 6
horizon: 10000
gamma: 1
lam: 0.95
cliprange: 0.2
cliprange_value: 0.2
cliprange: 0.4
cliprange_value: 0.4
vf_coef: 1
scale_reward: False
ref_mean: null
ref_std: null
cliprange_reward: 10
cliprange_reward: 4
gen_kwargs:
max_new_tokens: 40
max_new_tokens: 512
top_k: 0
top_p: 1.0
top_p: 0.7
do_sample: True
temperature: 1.0
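
To make the `truncation_side` / `padding_side` change above concrete: with left-side truncation an over-long conversation keeps its most recent turns instead of only its opening, which is what the reward and reference models need to see. A small illustrative sketch, assuming a stand-in Pythia tokenizer and indicative OA-style special tokens:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m-deduped")  # stand-in tokenizer
tok.pad_token = tok.eos_token   # GPT-NeoX tokenizers have no pad token by default
tok.truncation_side = "left"    # keep the end of the conversation
tok.padding_side = "left"       # pad on the left for batched generation

prompt = "<|prompter|>" + "earlier context " * 300 + "final question?<|endoftext|><|assistant|>"
enc = tok(prompt, truncation=True, max_length=64, padding="max_length", return_tensors="pt")

# The decoded tail shows that the final question survived truncation.
print(tok.decode(enc["input_ids"][0])[-120:])
```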
