## Load Llama-3-8B

In [None]:
# Warning: using the transformers version from the DPO setup leads to errors when loading Llama 3.
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
# Restrict transformers log output to errors before the remaining imports run.
transformers.logging.set_verbosity_error()
from trl import setup_chat_format

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Meta-Llama-3-8B-Instruct"  # Model identifier passed to from_pretrained below
# save_directory = "/home/hanyang/Models/"  # Unused: no save directory is passed to the calls below

# Load the tokenizer for `model_name` (no explicit save/cache directory is given here)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the causal-LM for the same `model_name` (again without an explicit save directory)
model = AutoModelForCausalLM.from_pretrained(model_name)


In [None]:
# Display the chat template attached to the tokenizer loaded above.
tokenizer.chat_template

In [None]:
from typing import Optional, Literal

# Jinja chat template for Llama 3: each message is wrapped as
# <|start_header_id|>{role}<|end_header_id|>\n\n{trimmed content}<|eot_id|>,
# the BOS token is prepended to the first message, and an assistant header is
# appended when `add_generation_prompt` is set.
# The value is a plain `str`: the original assignment ended with a trailing comma,
# which silently turned it into a 1-tuple.
LLaMa3_CHAT_template = (
    "{% set loop_messages = messages %}"
    "{% for message in loop_messages %}"
    "{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}"
    "{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}"
    "{{ content }}"
    "{% endfor %}"
    "{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
)


# RLHF baselines (Deepspeed Zero)

## RLHF on Llama-3-Instruct (IPO $\beta$=0.1)

In [None]:
# IPO on Llama-3-Instruct
# Launches dpo_zero3.py on 8 processes with the deepspeed_zero3.yaml accelerate config:
# per-device train batch 2 with 16 gradient-accumulation steps, lr 1.0e-6, max_length 1024.
# NOTE(review): no loss-type or beta flag is passed on this command line, and the output
# directory says beta_0.001 while the section heading says beta=0.1 -- presumably both are
# configured inside dpo_zero3.py; confirm before relying on the directory name.
!accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero3.yaml --num_processes 8 examples/scripts/dpo_zero3.py \
    --dataset_name="<some path>/trl/examples/datasets/UltraFeedback_armorm_trl" \
    --model_name_or_path="<some path>/model_zoo/Meta-Llama-3-8B-Instruct" \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 1 \
    --learning_rate 1.0e-6 \
    --gradient_accumulation_steps 16 \
    --gradient_checkpointing True \
    --logging_steps 10 \
    --save_steps 50000 \
    --output_dir="<some path>/trl/models_rlhf/Llama3-Instruct_armorm_ipo_beta_0.001" \
    --optim adamw_torch \
    --max_length 1024 \
    --max_prompt_length 1000 \
    --seed 42 \
    --bf16 \
    --warmup_steps 150 \
    --logging_first_step \
    --no_remove_unused_columns \
    --attn_implementation 'flash_attention_2'

In [None]:
# IPO with length normalization
# Command-line flags match the baseline IPO launch; only --output_dir differs
# (..._ipo_beta_10_normalized).
# NOTE(review): nothing on this command line enables length normalization -- presumably it is
# toggled inside dpo_zero3.py; confirm.
!accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero3.yaml --num_processes 8 examples/scripts/dpo_zero3.py \
    --dataset_name="<some path>/trl/examples/datasets/UltraFeedback_armorm_trl" \
    --model_name_or_path="<some path>/model_zoo/Meta-Llama-3-8B-Instruct" \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 1 \
    --learning_rate 1.0e-6 \
    --gradient_accumulation_steps 16 \
    --gradient_checkpointing True \
    --logging_steps 10 \
    --save_steps 50000 \
    --output_dir="<some path>/trl/models_rlhf/Llama3-Instruct_armorm_ipo_beta_10_normalized" \
    --optim adamw_torch \
    --max_length 1024 \
    --max_prompt_length 1000 \
    --seed 42 \
    --bf16 \
    --warmup_steps 150 \
    --logging_first_step \
    --no_remove_unused_columns \
    --attn_implementation 'flash_attention_2'

In [None]:
# IPO with half learning rate
# --learning_rate is 5e-7 here (versus 1.0e-6 in the baseline IPO launch); apart from
# --output_dir, the remaining flags match the baseline IPO command.
!accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero3.yaml --num_processes 8 examples/scripts/dpo_zero3.py \
    --dataset_name="<some path>/trl/examples/datasets/UltraFeedback_armorm_trl" \
    --model_name_or_path="<some path>/model_zoo/Meta-Llama-3-8B-Instruct" \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 1 \
    --learning_rate 5e-7 \
    --gradient_accumulation_steps 16 \
    --gradient_checkpointing True \
    --logging_steps 10 \
    --save_steps 50000 \
    --output_dir="<some path>/trl/models_rlhf/Llama3-Instruct_armorm_ipo_beta_0.001_lr_0.5" \
    --optim adamw_torch \
    --max_length 1024 \
    --max_prompt_length 1000 \
    --seed 42 \
    --bf16 \
    --warmup_steps 150 \
    --logging_first_step \
    --no_remove_unused_columns \
    --attn_implementation 'flash_attention_2'

In [None]:
# IPO with max_length 2048 (max_prompt_length 1800)
# Per-device train batch is 1 with 32 gradient-accumulation steps (the 1024-token IPO launches
# use 2 with 16). Every continued line ends in " \" so the flag list is formatted uniformly.
!accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero3.yaml --num_processes 8 examples/scripts/dpo_zero3.py \
    --dataset_name="<some path>/trl/examples/datasets/UltraFeedback_armorm_trl" \
    --model_name_or_path="<some path>/model_zoo/Meta-Llama-3-8B-Instruct" \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --learning_rate 1e-6 \
    --gradient_accumulation_steps 32 \
    --gradient_checkpointing True \
    --logging_steps 10 \
    --save_steps 50000 \
    --output_dir="<some path>/trl/models_rlhf/Llama3-Instruct_armorm_ipo_beta_10_LN_hadv_0.2_max_token_2048" \
    --optim adamw_torch \
    --max_length 2048 \
    --max_prompt_length 1800 \
    --seed 42 \
    --bf16 \
    --warmup_steps 150 \
    --logging_first_step \
    --no_remove_unused_columns \
    --attn_implementation 'flash_attention_2'

## RLHF on Llama-3-Instruct (ORPO $\beta$=0.1)

In [None]:
# ORPO on Llama-3-Instruct
# Launches orpo_zero3.py (not dpo_zero3.py) on 8 processes with the deepspeed_zero3.yaml config.
# NOTE(review): this command passes --dataset= where the dpo_zero3.py launches pass
# --dataset_name=; verify the spelling against orpo_zero3.py's argument parser.
!accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero3.yaml --num_processes 8 examples/scripts/orpo_zero3.py \
    --dataset="<some path>/trl/examples/datasets/UltraFeedback_armorm_trl" \
    --model_name_or_path="<some path>/model_zoo/Meta-Llama-3-8B-Instruct" \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 2 \
    --learning_rate 1.0e-6 \
    --gradient_accumulation_steps 16 \
    --gradient_checkpointing True \
    --logging_steps 10 \
    --save_steps 50000 \
    --output_dir="<some path>/trl/models_rlhf/Llama3-Instruct_armorm_orpo_beta_0.1" \
    --optim adamw_torch \
    --max_length 1024 \
    --max_prompt_length 1000 \
    --seed 42 \
    --bf16 \
    --warmup_steps 150 \
    --logging_first_step \
    --no_remove_unused_columns \
    --attn_implementation 'flash_attention_2'

## RLHF on Llama-3-Instruct (CPO $\beta$=0.1)

In [None]:
# CPO on Llama-3-Instruct
# Uses the same dpo_zero3.py entry point as the IPO/SimPO launches, so the flags are spelled the
# same way as in those cells: --dataset_name (not --dataset) and --no_remove_unused_columns
# (not the truncated --no_remove_unused). The shortened spellings only work if the parser's
# prefix matching happens to resolve them unambiguously.
# NOTE(review): nothing on this command line selects the CPO loss -- presumably it is configured
# inside dpo_zero3.py; confirm.
!accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero3.yaml --num_processes 8 examples/scripts/dpo_zero3.py \
    --dataset_name="<some path>/trl/examples/datasets/UltraFeedback_armorm_trl" \
    --model_name_or_path="<some path>/model_zoo/Meta-Llama-3-8B-Instruct" \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 2 \
    --learning_rate 1.0e-6 \
    --gradient_accumulation_steps 16 \
    --gradient_checkpointing True \
    --logging_steps 10 \
    --save_steps 50000 \
    --output_dir="<some path>/trl/models_rlhf/Llama3-Instruct_armorm_cpo_beta_0.1" \
    --optim adamw_torch \
    --max_length 1024 \
    --max_prompt_length 1000 \
    --seed 42 \
    --bf16 \
    --warmup_steps 150 \
    --logging_first_step \
    --no_remove_unused_columns \
    --attn_implementation 'flash_attention_2'

## RLHF on Llama-3-Instruct (SimPO $\beta$=10)

In [None]:
# SimPO on Llama-3-Instruct
# max_length 2048 / max_prompt_length 1800, per-device train batch 1 with 32
# gradient-accumulation steps; warm-up is given as --warmup_ratio 0.1 rather than --warmup_steps.
!accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero3.yaml --num_processes 8 examples/scripts/dpo_zero3.py \
    --dataset_name="<some path>/trl/examples/datasets/UltraFeedback_armorm_trl" \
    --model_name_or_path="<some path>/model_zoo/Meta-Llama-3-8B-Instruct" \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --learning_rate 1.0e-6 \
    --gradient_accumulation_steps 32 \
    --gradient_checkpointing True \
    --logging_steps 10 \
    --save_steps 50000 \
    --output_dir="<some path>/trl/models_rlhf/Llama3-Instruct_armorm_simpo_beta_10_hadv_0_max_token_2048_wr_0.1" \
    --optim adamw_torch \
    --max_length 2048 \
    --max_prompt_length 1800 \
    --seed 42 \
    --bf16 \
    --warmup_ratio 0.1 \
    --logging_first_step \
    --no_remove_unused_columns \
    --attn_implementation 'flash_attention_2'

In [None]:
# SimPO on Llama-3-Instruct (output directory: ..._hadv_0.2_max_token_2048)
# Uses --warmup_steps 150 and max_length 2048. Every continued line ends in " \" so the flag
# list is formatted uniformly.
# NOTE(review): --max_prompt_length is 1000 here while the warmup_ratio SimPO launch uses 1800
# with the same max_length -- confirm this is intentional.
!accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero3.yaml --num_processes 8 examples/scripts/dpo_zero3.py \
    --dataset_name="<some path>/trl/examples/datasets/UltraFeedback_armorm_trl" \
    --model_name_or_path="<some path>/model_zoo/Meta-Llama-3-8B-Instruct" \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --learning_rate 1.0e-6 \
    --gradient_accumulation_steps 32 \
    --gradient_checkpointing True \
    --logging_steps 10 \
    --save_steps 50000 \
    --output_dir="<some path>/trl/models_rlhf/Llama3-Instruct_armorm_simpo_beta_10_hadv_0.2_max_token_2048" \
    --optim adamw_torch \
    --max_length 2048 \
    --max_prompt_length 1000 \
    --seed 42 \
    --bf16 \
    --warmup_steps 150 \
    --logging_first_step \
    --no_remove_unused_columns \
    --attn_implementation 'flash_attention_2'

# Mallows

In [None]:
# Mallows-DPO run (per the output directory name): max_length 2048, max_prompt_length 1000,
# per-device train batch 1 with 32 gradient-accumulation steps, --warmup_steps 150.
# NOTE(review): no flag on this command line selects the Mallows variant -- presumably it is
# configured inside dpo_zero3.py; confirm.
!accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero3.yaml --num_processes 8 examples/scripts/dpo_zero3.py \
    --dataset_name="<some path>/trl/examples/datasets/UltraFeedback_armorm_trl" \
    --model_name_or_path="<some path>/model_zoo/Meta-Llama-3-8B-Instruct" \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --learning_rate 1.0e-6 \
    --gradient_accumulation_steps 32 \
    --gradient_checkpointing True \
    --logging_steps 10 \
    --save_steps 50000 \
    --output_dir="<some path>/trl/models_rlhf/Llama3-Instruct_armorm_mallows_dpo_beta_10_normalized_max_token_2048" \
    --optim adamw_torch \
    --max_length 2048 \
    --max_prompt_length 1000 \
    --seed 42 \
    --bf16 \
    --warmup_steps 150 \
    --logging_first_step \
    --no_remove_unused_columns \
    --attn_implementation 'flash_attention_2'
    

In [None]:
# Mallows-SimPO run (per the output directory name): max_length 2048, max_prompt_length 1800,
# --warmup_ratio 0.1.
# This launch uses the deepspeed_zero3_offload.yaml accelerate config, whereas the other
# launches visible in this notebook use deepspeed_zero3.yaml.
!accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero3_offload.yaml --num_processes 8 examples/scripts/dpo_zero3.py \
    --dataset_name="<some path>/trl/examples/datasets/UltraFeedback_armorm_trl" \
    --model_name_or_path="<some path>/model_zoo/Meta-Llama-3-8B-Instruct" \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --learning_rate 1.0e-6 \
    --gradient_accumulation_steps 32 \
    --gradient_checkpointing True \
    --logging_steps 10 \
    --save_steps 50000 \
    --output_dir="<some path>/trl/models_rlhf/Llama3-Instruct_armorm_mallows_simpo_beta_10_normalized_hadv_0_max_token_2048_wr_0.1" \
    --optim adamw_torch \
    --max_length 2048 \
    --max_prompt_length 1800 \
    --seed 42 \
    --bf16 \
    --warmup_ratio 0.1 \
    --logging_first_step \
    --no_remove_unused_columns \
    --attn_implementation 'flash_attention_2'

## Cauchy

In [None]:
# Cauchy-PO run (per the output directory name): lr 5e-7, max_length 2048,
# max_prompt_length 1800, --warmup_ratio 0.1.
# The leading "!" is required: without it the code cell is parsed as Python and fails with a
# SyntaxError instead of running the shell command.
!accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero3.yaml --num_processes 8 examples/scripts/dpo_zero3.py \
    --dataset_name="<some path>/trl/examples/datasets/UltraFeedback_armorm_trl" \
    --model_name_or_path="<some path>/model_zoo/Meta-Llama-3-8B-Instruct" \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --learning_rate 5e-7 \
    --gradient_accumulation_steps 32 \
    --gradient_checkpointing True \
    --logging_steps 10 \
    --save_steps 50000 \
    --output_dir="<some path>/trl/models_rlhf/Llama3-Instruct_armorm_cauchypo_beta_10_normalized_max_2048" \
    --optim adamw_torch \
    --max_length 2048 \
    --max_prompt_length 1800 \
    --seed 42 \
    --bf16 \
    --warmup_ratio 0.1 \
    --logging_first_step \
    --no_remove_unused_columns \
    --attn_implementation 'flash_attention_2'