## Load Llama-3-8B

# RLHF baselines (DeepSpeed ZeRO-3)

## RLHF on Llama-3-Instruct (DPO $\beta$=0.05)

In [None]:
# DPO baseline on Llama-3-Instruct: sigmoid loss, beta 0.05, 8 processes under
# the DeepSpeed ZeRO-3 accelerate config. --learning_rate is the value to edit
# when sweeping this run.
!accelerate launch \
    --config_file examples/accelerate_configs/deepspeed_zero3.yaml \
    --num_processes 8 \
    examples/scripts/dpo_zero3.py \
    --model_name_or_path "<some path>/model_zoo/Meta-Llama-3-8B-Instruct" \
    --dataset_name "<some path>/trl/examples/datasets/UltraFeedback_armorm_trl" \
    --output_dir "<some path>/trl/models_rlhf/Llama3-Instruct_armorm_dpo_beta_0.05_lr_0.5_max_token_2048" \
    --loss_type sigmoid \
    --beta 0.05 \
    --learning_rate 5e-7 \
    --warmup_steps 150 \
    --optim adamw_torch \
    --num_train_epochs 1 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 32 \
    --gradient_checkpointing True \
    --max_length 2048 \
    --max_prompt_length 1800 \
    --attn_implementation flash_attention_2 \
    --bf16 \
    --seed 42 \
    --logging_steps 10 \
    --logging_first_step \
    --save_steps 50 \
    --no_remove_unused_columns

## RLHF on Llama-3-Instruct (IPO $\beta$=0.01)

In [None]:
# IPO baseline on Llama-3-Instruct with a 2048-token max length: same script
# and accelerate config as the DPO run, with --loss_type ipo and beta 0.01.
!accelerate launch \
    --config_file examples/accelerate_configs/deepspeed_zero3.yaml \
    --num_processes 8 \
    examples/scripts/dpo_zero3.py \
    --model_name_or_path "<some path>/model_zoo/Meta-Llama-3-8B-Instruct" \
    --dataset_name "<some path>/trl/examples/datasets/UltraFeedback_armorm_trl" \
    --output_dir "<some path>/trl/models_rlhf/Llama3-Instruct_armorm_ipo_beta_0.01_lr_0.5_max_token_2048" \
    --loss_type ipo \
    --beta 0.01 \
    --learning_rate 5e-7 \
    --warmup_steps 150 \
    --optim adamw_torch \
    --num_train_epochs 1 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 32 \
    --gradient_checkpointing True \
    --max_length 2048 \
    --max_prompt_length 1800 \
    --attn_implementation flash_attention_2 \
    --bf16 \
    --seed 42 \
    --logging_steps 10 \
    --logging_first_step \
    --save_steps 50 \
    --no_remove_unused_columns

## RLHF on Llama-3-Instruct (ORPO $\beta$=0.1)

In [None]:
# ORPO on Llama-3-Instruct (8 processes, DeepSpeed ZeRO-3 accelerate config).
# --beta is 0.1 so the run matches the value encoded in --output_dir
# (..._orpo_beta_0.1); the previous --beta 0.005 contradicted the directory name.
# NOTE(review): confirm 0.1 is the intended ORPO weight before launching.
!accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero3.yaml --num_processes 8 examples/scripts/orpo_zero3.py \
    --dataset="<some path>/trl/examples/datasets/UltraFeedback_armorm_trl" \
    --model_name_or_path="<some path>/model_zoo/Meta-Llama-3-8B-Instruct" \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 2 \
    --learning_rate 1.0e-6 \
    --gradient_accumulation_steps 16 \
    --gradient_checkpointing True \
    --logging_steps 10 \
    --save_steps 50 \
    --output_dir="<some path>/trl/models_rlhf/Llama3-Instruct_armorm_orpo_beta_0.1" \
    --optim adamw_torch \
    --max_length 2048 \
    --max_prompt_length 1800 \
    --seed 42 \
    --bf16 \
    --warmup_steps 150 \
    --logging_first_step \
    --no_remove_unused_columns \
    --attn_implementation 'flash_attention_2' \
    --beta 0.1 \
    --num_train_epochs 1

## RLHF on Llama-3-Instruct (SimPO $\beta$=10)

In [None]:
# SimPO on Llama-3-Instruct, run through the DPO script: reference-free sigmoid
# loss with length normalization enabled, beta 10 and home_advantage 0.3.
!accelerate launch \
    --config_file examples/accelerate_configs/deepspeed_zero3.yaml \
    --num_processes 8 \
    examples/scripts/dpo_zero3.py \
    --model_name_or_path "<some path>/model_zoo/Meta-Llama-3-8B-Instruct" \
    --dataset_name "<some path>/trl/examples/datasets/UltraFeedback_armorm_trl" \
    --output_dir "<some path>/trl/models_rlhf/Llama3-Instruct_armorm_simpo_beta_10_hadv_0.3_max_token_2048" \
    --loss_type sigmoid \
    --reference_free True \
    --length_normalization True \
    --beta 10 \
    --home_advantage 0.3 \
    --learning_rate 1.0e-6 \
    --warmup_ratio 0.1 \
    --optim adamw_torch \
    --num_train_epochs 1 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 32 \
    --gradient_checkpointing True \
    --max_length 2048 \
    --max_prompt_length 1800 \
    --attn_implementation flash_attention_2 \
    --bf16 \
    --seed 42 \
    --logging_steps 10 \
    --logging_first_step \
    --save_steps 50 \
    --no_remove_unused_columns

# Mallows

In [None]:
# Mallows-DPO on Llama-3-Instruct (8 processes, DeepSpeed ZeRO-3 accelerate config).
# The output directory name advertises "beta_10_normalized", so --beta 10 and
# --length_normalization True are passed explicitly instead of relying on the
# script defaults. The loss type uses the underscore spelling "mallows_dpo",
# the same spelling the RainbowPO launch in this notebook passes.
# NOTE(review): confirm that "normalized" means length normalization and that
# dpo_zero3.py accepts "mallows_dpo" (not "mallows-dpo") before launching.
!accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero3.yaml --num_processes 8 examples/scripts/dpo_zero3.py \
    --dataset_name="<some path>/trl/examples/datasets/UltraFeedback_armorm_trl" \
    --model_name_or_path="<some path>/model_zoo/Meta-Llama-3-8B-Instruct" \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --learning_rate 1.0e-6 \
    --gradient_accumulation_steps 32 \
    --gradient_checkpointing True \
    --logging_steps 10 \
    --save_steps 50 \
    --output_dir="<some path>/trl/models_rlhf/Llama3-Instruct_armorm_mallows_dpo_beta_10_normalized_max_token_2048" \
    --optim adamw_torch \
    --max_length 2048 \
    --max_prompt_length 1800 \
    --seed 42 \
    --bf16 \
    --warmup_steps 150 \
    --logging_first_step \
    --no_remove_unused_columns \
    --attn_implementation 'flash_attention_2' \
    --beta 10 \
    --length_normalization True \
    --loss_type "mallows_dpo" \
    --num_train_epochs 1

# KTO

In [None]:
# KTO on Llama-3-Instruct: beta 0.05, learning rate 5e-7, launched on 7
# processes with NCCL_P2P_DISABLE=1 set in the command's environment.
# Checkpoints are saved every 10 steps into a path relative to the working directory.
!NCCL_P2P_DISABLE=1 accelerate launch \
    --config_file examples/accelerate_configs/deepspeed_zero3.yaml \
    --num_processes 7 \
    examples/scripts/kto_zero3.py \
    --model_name_or_path "meta-llama/Meta-Llama-3-8B-Instruct" \
    --dataset_name "/home/hanyang/RainbowPO/examples/datasets/UltraFeedback_armorm_kto_trl" \
    --output_dir "models_rlhf/Llama3-Instruct_armorm_kto_beta_0.05_lr_5e-7_max_token_2048" \
    --beta 0.05 \
    --learning_rate 5e-7 \
    --warmup_ratio 0.1 \
    --optim adamw_torch \
    --num_train_epochs 1 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 32 \
    --gradient_checkpointing True \
    --max_length 2048 \
    --max_prompt_length 1800 \
    --attn_implementation flash_attention_2 \
    --bf16 \
    --seed 42 \
    --logging_steps 10 \
    --logging_first_step \
    --save_steps 10 \
    --no_remove_unused_columns

# CPO

In [None]:
# CPO on Llama-3-Instruct (7 processes, NCCL_P2P_DISABLE=1, beta 0.05, lr 1.0e-6).
# --output_dir is CPO-specific. It previously reused the KTO run's directory
# name (..._kto_beta_0.05_lr_5e-7...), so CPO checkpoints would have been
# written into the KTO run directory under a name with the wrong learning rate.
# NOTE(review): the dataset path is machine-specific; adjust before running elsewhere.
!NCCL_P2P_DISABLE=1 accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero3.yaml --num_processes 7 examples/scripts/cpo_zero3.py \
    --dataset="/home/hanyang/RainbowPO/examples/datasets/UltraFeedback_armorm_trl" \
    --model_name_or_path="meta-llama/Meta-Llama-3-8B-Instruct" \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --learning_rate 1.0e-6 \
    --gradient_accumulation_steps 32 \
    --gradient_checkpointing True \
    --logging_steps 10 \
    --save_steps 10 \
    --output_dir="models_rlhf/Llama3-Instruct_armorm_cpo_beta_0.05_lr_1e-6_max_token_2048" \
    --optim adamw_torch \
    --max_length 2048 \
    --max_prompt_length 1800 \
    --seed 42 \
    --bf16 \
    --warmup_ratio 0.1 \
    --logging_first_step \
    --no_remove_unused_columns \
    --attn_implementation 'flash_attention_2' \
    --beta 0.05 \
    --num_train_epochs 1

# RainbowPO

In [None]:
# RainbowPO on Llama-3-Instruct: mallows_dpo loss with length normalization,
# home_advantage 0.3, and reference mixing enabled (mixing_alpha 0.5,
# reference_free False), beta 10.
# --output_dir is RainbowPO-specific. It previously duplicated the SimPO run's
# directory (..._simpo_beta_10_hadv_0.3...), so the two runs would have shared
# one checkpoint directory.
!accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero3.yaml --num_processes 8 examples/scripts/dpo_zero3.py \
    --dataset_name="<some path>/trl/examples/datasets/UltraFeedback_armorm_trl" \
    --model_name_or_path="<some path>/model_zoo/Meta-Llama-3-8B-Instruct" \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --learning_rate 1.0e-6 \
    --gradient_accumulation_steps 32 \
    --gradient_checkpointing True \
    --logging_steps 10 \
    --save_steps 50 \
    --output_dir="<some path>/trl/models_rlhf/Llama3-Instruct_armorm_rainbowpo_beta_10_hadv_0.3_alpha_0.5_max_token_2048" \
    --optim adamw_torch \
    --max_length 2048 \
    --max_prompt_length 1800 \
    --seed 42 \
    --bf16 \
    --warmup_ratio 0.1 \
    --logging_first_step \
    --no_remove_unused_columns \
    --attn_implementation 'flash_attention_2' \
    --beta 10 \
    --length_normalization True \
    --home_advantage 0.3 \
    --loss_type "mallows_dpo" \
    --if_mixing_alpha True \
    --mixing_alpha 0.5 \
    --reference_free False \
    --num_train_epochs 1