## Using Direct Preference Optimization on a preference dataset


In [1]:
#%pip install --quiet transformers trl

In [1]:
# MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"
MODEL_ID = "Qwen/Qwen2-0.5B-Instruct"   # has to match generate_synthetic_data so that data is in-distribution

import os
from dotenv import load_dotenv
load_dotenv("../keys.env")
assert os.environ["HF_TOKEN"][:2] == "hf",\
       "Please sign up for access to the specific Llama model via HuggingFace and provide access token in keys.env file"

In [2]:
from datasets import load_dataset
from trl import DPOConfig, DPOTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer

train_dataset = load_dataset('json', data_files="ad_preference_dataset.jsonl", split='train')

Generating train split: 0 examples [00:00, ? examples/s]

In [3]:
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

In [4]:
if 'llama' in MODEL_ID:
    tokenizer.pad_token = tokenizer.eos_token

In [5]:
training_args = DPOConfig(output_dir="ClassifiedAds-DPO", logging_steps=10, 
                          per_device_train_batch_size=1)
trainer = DPOTrainer(model=model, args=training_args, processing_class=tokenizer, train_dataset=train_dataset)
trainer.train()

Extracting prompt in train dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Step,Training Loss
10,0.6424
20,0.8231
30,0.8356
40,1.0348
50,0.366
60,1.148
70,1.3475
80,0.3357
90,1.697
100,2.4234


TrainOutput(global_step=300, training_loss=0.49373050103895366, metrics={'train_runtime': 127.5862, 'train_samples_per_second': 2.351, 'train_steps_per_second': 2.351, 'total_flos': 0.0, 'train_loss': 0.49373050103895366, 'epoch': 3.0})

In [6]:
trainer.save_model(training_args.output_dir)

## Try out the trained model

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("ClassifiedAds-DPO")

In [10]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
if 'llama' in MODEL_ID:
    tokenizer.pad_token = tokenizer.eos_token

In [11]:
from transformers import pipeline

generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [12]:
system_prompt = "You are writing a one paragraph classified ad for used items in a neighborhood online forum. Write an ad to sell a "
for_sale = "used copy of the book Pachinko by Jin Min Lee for $5"
generator(system_prompt + for_sale, 
          max_length=100)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': 'You are writing a one paragraph classified ad for used items in a neighborhood online forum. Write an ad to sell a used copy of the book Pachinko by Jin Min Lee for $5. The price includes shipping.\n"Find used copy of Pachinko by Jin Min Lee for $5! Shipping included. Available through our community forum. Need it by September 1st." \n\nIn this ad, you must have also provided details like title, author, genre, and any relevant'}]