# Source
# https://huggingface.co/blog/mlabonne/orpo-llama-3

In [1]:
import gc
import os

import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from trl import ORPOConfig, ORPOTrainer, setup_chat_format

  from .autonotebook import tqdm as notebook_tqdm


# Configuration

In [2]:
access_token = "hf_XXX"

In [3]:
# Flash attention
attn_implementation = "flash_attention_2"
torch_dtype = torch.bfloat16

N_EPOCHS = 1

# Model
base_model = "meta-llama/Meta-Llama-3-8B"
new_model = "Custom-OrpoLlama-3-8B"

# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)

# Model and Tokenizer setup

In [4]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    token=access_token)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation,
    token=access_token
)

model, tokenizer = setup_chat_format(model, tokenizer)

model = prepare_model_for_kbit_training(model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.57s/it]


# Dataset preparation

In [5]:
dataset_name = "mlabonne/orpo-dpo-mix-40k"
dataset = load_dataset(dataset_name, split="all")
dataset = dataset.shuffle(seed=42)
dataset = dataset.select(range(1000))

def format_chat_template(row):
    row["chosen"] = tokenizer.apply_chat_template(row["chosen"], tokenize=False)
    row["rejected"] = tokenizer.apply_chat_template(row["rejected"], tokenize=False)
    return row

# Process entries when called to fit the correct template
dataset = dataset.map(
    format_chat_template,
    num_proc= os.cpu_count(),
)
dataset = dataset.train_test_split(test_size=0.001)

# Training

## Configure Trainer

In [6]:
orpo_args = ORPOConfig(
    learning_rate=8e-6,
    beta=0.1,
    lr_scheduler_type="linear",
    max_length=1024,
    max_prompt_length=512,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",
    num_train_epochs=N_EPOCHS,
    evaluation_strategy="steps",
    eval_steps=10,
    logging_steps=10,
    warmup_steps=10,
    output_dir="./results/",
    remove_unused_columns=False,
)

trainer = ORPOTrainer(
    model=model,
    args=orpo_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    tokenizer=tokenizer
)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 999/999 [00:02<00:00, 419.78 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 203.72 examples/s]


## Train model

In [7]:
trainer.train()
trainer.save_model(new_model)

The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.float16.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Runtime,Samples Per Second,Steps Per Second,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen,Nll Loss,Log Odds Ratio,Log Odds Chosen
10,3.3631,1.389156,0.8474,1.18,1.18,-0.11868,-0.133936,1.0,0.015255,-1.339357,-1.186802,-1.159615,-0.996206,1.329918,-0.592375,0.212849
20,2.6328,1.36172,0.7771,1.287,1.287,-0.11606,-0.130229,1.0,0.014169,-1.302286,-1.160596,-1.217011,-1.018003,1.301916,-0.598041,0.200217
30,1.4413,1.330022,0.7634,1.31,1.31,-0.113001,-0.126362,1.0,0.013361,-1.26362,-1.13001,-1.245288,-1.041515,1.269828,-0.60194,0.191575
40,1.1339,1.279973,0.8153,1.226,1.226,-0.108497,-0.121349,1.0,0.012852,-1.213489,-1.084972,-1.268364,-1.072785,1.219628,-0.603446,0.188249
50,1.2197,1.205585,0.82,1.22,1.22,-0.103205,-0.116395,1.0,0.013191,-1.163955,-1.032045,-1.270852,-1.074382,1.145684,-0.599009,0.198069
60,1.063,1.156024,0.7945,1.259,1.259,-0.098424,-0.11252,1.0,0.014097,-1.125205,-0.984238,-1.253186,-1.053992,1.096949,-0.590745,0.2165
70,1.0659,1.113467,0.9099,1.099,1.099,-0.094146,-0.108651,1.0,0.014505,-1.086506,-0.941457,-1.249596,-1.03954,1.054898,-0.585689,0.227868
80,1.0635,1.089055,0.7708,1.297,1.297,-0.091766,-0.10596,1.0,0.014193,-1.059599,-0.917665,-1.243919,-1.027465,1.030416,-0.58639,0.226288
90,1.0731,1.074691,0.8269,1.209,1.209,-0.09068,-0.104449,1.0,0.013769,-1.044486,-0.906799,-1.241005,-1.026939,1.015827,-0.588641,0.221223
100,1.0048,1.056785,0.7825,1.278,1.278,-0.089487,-0.103486,1.0,0.013999,-1.034861,-0.894873,-1.241864,-1.032764,0.998151,-0.586332,0.226419



Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3-8B is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3-8B.


# Inference

In [8]:
# Flush memory
del trainer, model
gc.collect()
torch.cuda.empty_cache()

# Reload tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(base_model)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
model, tokenizer = setup_chat_format(model, tokenizer)

# Merge adapter with base model
model = PeftModel.from_pretrained(model, new_model)
model = model.merge_and_unload()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.51s/it]


In [None]:
_ = model.push_to_hub(new_model, use_temp_dir=False, token=access_token)
_ = tokenizer.push_to_hub(new_model, use_temp_dir=False, token=access_token)