In [1]:
from datasets import load_dataset

dataset = load_dataset("Intel/orca_dpo_pairs")
dataset['train']

Dataset({
    features: ['system', 'question', 'chosen', 'rejected'],
    num_rows: 12859
})

In [2]:
original_columns = dataset['train'].column_names
original_columns

['system', 'question', 'chosen', 'rejected']

In [3]:
from transformers import AutoTokenizer

model_name = 'teknium/OpenHermes-2.5-Mistral-7B'

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [4]:
def chatml_format(example):
    # Format system
    if len(example['system']) > 0:
        message = {"role": "system", "content": example['system']}
        system = tokenizer.apply_chat_template([message], tokenize=False)
    else:
        system = ""

    # Format instruction
    message = {"role": "user", "content": example['question']}
    prompt = tokenizer.apply_chat_template([message], tokenize=False, add_generation_prompt=True)

    # Format chosen answer
    chosen = example['chosen'] + "<|im_end|>\n"

    # Format rejected answer
    rejected = example['rejected'] + "<|im_end|>\n"

    return {
        "prompt": system + prompt,
        "chosen": chosen,
        "rejected": rejected,
    }


train_dataset = dataset['train']
train_dataset = train_dataset.map(
    chatml_format,
    remove_columns=original_columns
)
train_dataset

Dataset({
    features: ['chosen', 'rejected', 'prompt'],
    num_rows: 12859
})

In [5]:
print(train_dataset[0])

{'chosen': '[\n  ["AFC Ajax (amateurs)", "has ground", "Sportpark De Toekomst"],\n  ["Ajax Youth Academy", "plays at", "Sportpark De Toekomst"]\n]<|im_end|>\n', 'rejected': " Sure, I'd be happy to help! Here are the RDF triplets for the input sentence:\n\n[AFC Ajax (amateurs), hasGround, Sportpark De Toekomst]\n[Ajax Youth Academy, playsAt, Sportpark De Toekomst]\n\nExplanation:\n\n* AFC Ajax (amateurs) is the subject of the first triplet, and hasGround is the predicate that describes the relationship between AFC Ajax (amateurs) and Sportpark De Toekomst.\n* Ajax Youth Academy is the subject of the second triplet, and playsAt is the predicate that describes the relationship between Ajax Youth Academy and Sportpark De Toekomst.\n\nNote that there may be other possible RDF triplets that could be derived from the input sentence, but the above triplets capture the main relationships present in the sentence.<|im_end|>\n", 'prompt': "<|im_start|>user\nYou will be given a definition of a task

In [6]:
from peft import LoraConfig

# LoRA configuration
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['k_proj', 'gate_proj', 'v_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj']
)

In [7]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch
from accelerate import cpu_offload

quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
from trl import DPOTrainer, DPOConfig

output_dir = 'NeuralHermes-2.5-Mistral-7B'
# Training arguments
training_args = DPOConfig(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    # num_train_epochs=3,  # use this for full training
    max_steps=100,
    save_strategy="no",
    logging_steps=10,
    output_dir=output_dir,
    optim="adamw_8bit",
    warmup_steps=5,
    report_to="none",
    beta=0.1,
    max_prompt_length=384,
    max_length=384,
)

# Create DPO trainer
dpo_trainer = DPOTrainer(
    model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    peft_config=peft_config,
)

[2025-02-02 14:31:24,968] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio
collect2: error: ld returned 1 exit status
/usr/bin/ld: /usr/local/cuda-12.1/lib64/libcufile.so: undefined reference to `dlopen'
/usr/bin/ld: /usr/local/cuda-12.1/lib64/libcufile.so: undefined reference to `dlclose'
/usr/bin/ld: /usr/local/cuda-12.1/lib64/libcufile.so: undefined reference to `dlerror'
/usr/bin/ld: /usr/local/cuda-12.1/lib64/libcufile.so: undefined reference to `dlsym'
collect2: error: ld returned 1 exit status
  dpo_trainer = DPOTrainer(


In [9]:
dpo_trainer.train()

  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
10,1.9117
20,0.5391
30,0.6307
40,0.624
50,0.5545
60,0.6931
70,0.5545
80,0.486
90,0.5545
100,0.9306


TrainOutput(global_step=100, training_loss=0.7478863000869751, metrics={'train_runtime': 2197.5087, 'train_samples_per_second': 0.182, 'train_steps_per_second': 0.046, 'total_flos': 0.0, 'train_loss': 0.7478863000869751, 'epoch': 0.03110661793296524})