In [45]:
import argilla as rg
from dotenv import load_dotenv
import os

load_dotenv()

True

In [46]:
ARGILLA_SPACE_URL = (
    "https://dibt-demo-argilla-space.hf.space"  # The URL for the Argilla space
)
ARGILLA_DATASET_NAME = "aya_dutch_dpo"  # The dataset name in the Argilla space
ARGILLA_WORKSPACE_NAME = "admin"
ARGILLA_API_KEY = os.getenv("ARGILLA_API_KEY")

# Check if the API key is set
assert (
    ARGILLA_API_KEY is not None
), "Please set the ARGILLA_API_KEY environment variable or pass it as a parameter"

In [47]:
rg.init(
    api_url=ARGILLA_SPACE_URL, api_key=ARGILLA_API_KEY, workspace=ARGILLA_WORKSPACE_NAME
)

In [48]:
argilla_ds = rg.FeedbackDataset.from_argilla(ARGILLA_DATASET_NAME)

In [49]:
argilla_ds

RemoteFeedbackDataset(
   id=6a6fdd31-6ce5-4f32-9131-af1c8e33987c
   name=aya_dutch_dpo
   workspace=Workspace(id=2d3f0ffc-b5f5-43e6-a36b-352d0685df7b, name=admin, inserted_at=2024-04-19 10:17:00.524980, updated_at=2024-04-19 10:17:00.524980)
   url=https://dibt-demo-argilla-space.hf.space/dataset/6a6fdd31-6ce5-4f32-9131-af1c8e33987c/annotation-mode
   fields=[RemoteTextField(id=UUID('eae01c1b-acc9-482b-a1ba-f4addcd86b1c'), client=None, name='id', title='id', required=True, type='text', use_markdown=False), RemoteTextField(id=UUID('c03820ea-d6a3-499e-89e6-13cb92cf49e4'), client=None, name='instruction', title='instruction', required=True, type='text', use_markdown=False), RemoteTextField(id=UUID('8ae32a6a-9389-4b86-bb88-16982597373e'), client=None, name='generations-0', title='generations-0', required=True, type='text', use_markdown=False), RemoteTextField(id=UUID('6dcf775a-bea7-4d97-bd76-7db971e4680a'), client=None, name='generations-1', title='generations-1', required=False, type='te

In [78]:
ds = argilla_ds.format_as('datasets')

In [79]:
ds

Dataset({
    features: ['id', 'instruction', 'generations-0', 'generations-1', 'generations-0-rating', 'generations-0-rating-suggestion', 'generations-0-rating-suggestion-metadata', 'generations-0-rationale', 'generations-0-rationale-suggestion', 'generations-0-rationale-suggestion-metadata', 'generations-1-rating', 'generations-1-rating-suggestion', 'generations-1-rating-suggestion-metadata', 'generations-1-rationale', 'generations-1-rationale-suggestion', 'generations-1-rationale-suggestion-metadata', 'improved_response', 'improved_response-suggestion', 'improved_response-suggestion-metadata', 'external_id', 'metadata'],
    num_rows: 1360
})

In [80]:
ds[0]

{'id': 'fd8a365561542e91728ceea4c987a8ae63144d9a5dda20dac0a93c71d2916d1d',
 'instruction': 'Schrijf een voortzetting van de paragraaf: Op de Olympische Zomerspelen van 1992 in Barcelona, Spanje strandde Heyns in de series van de 100 meter schoolslag. Samen met Jill Brukman, Jeanine Steenkamp en Marianne Kriel werd ze uitgeschakeld in de series van de 4x100 meter wisselslag. Tijdens de Gemenebestspelen 1994',
 'generations-0': 'in Victoria, Canada sleepte de Zuid-Afrikaanse de bronzen medaille in de wacht op de 100 meter schoolslag. In Rome, Italië nam Heyns deel aan de Wereldkampioenschappen zwemmen 1994, op dit toernooi eindigde ze als zesde op de 100 meter schoolslag en als dertiende op de 200 meter schoolslag. Op de Pan Pacific kampioenschappen zwemmen 1995 in Atlanta, Verenigde Staten veroverde de Zuid-Afrikaanse de gouden medaille op de 100 meter schoolslag en de zilveren medaille op de 200 meter schoolslag. Tijdens de Olympische Zomerspelen 1996 in Atlanta, Verenigde Staten verov

In [81]:
ds = ds.filter(lambda x: len(x['generations-1'])>0)
ds = ds.filter(lambda x: x['generations-0-rating-suggestion'] is not None or [])
ds = ds.filter(lambda x: x['generations-1-rating-suggestion'] is not None or [])
ds

Filter:   0%|          | 0/1360 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1360 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1356 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'instruction', 'generations-0', 'generations-1', 'generations-0-rating', 'generations-0-rating-suggestion', 'generations-0-rating-suggestion-metadata', 'generations-0-rationale', 'generations-0-rationale-suggestion', 'generations-0-rationale-suggestion-metadata', 'generations-1-rating', 'generations-1-rating-suggestion', 'generations-1-rating-suggestion-metadata', 'generations-1-rationale', 'generations-1-rationale-suggestion', 'generations-1-rationale-suggestion-metadata', 'improved_response', 'improved_response-suggestion', 'improved_response-suggestion-metadata', 'external_id', 'metadata'],
    num_rows: 1352
})

In [85]:
def formatted_as_messages(prompt, completion):
    return [{"role": "user", "content": prompt}, {"role": "assistant", "content": completion}]

If we don't have any additional annotations for our preferences and we're just relying on the feedback LM then we can use this function to format the dataset for DPO/ORPO. This function will grab the highest rated generation and set that as chosen. If there is a tie in the ratings then the `tie` value will be set as `True`. We can then decide to filter out the ties or keep them in the dataset.

In [86]:
def format_for_dpo(row, format_messages=True):
    prompt = row["instruction"]
    generation_0 = row.get("generations-0")
    generation_1 = row.get("generations-1")
    generation_0_ranking = row.get("generations-0-rating-suggestion")
    generation_1_ranking = row.get("generations-1-rating-suggestion")
    tie = False
    # take max of generations-0-rating-suggestion and generations-1-rating-suggestion
    if generation_0_ranking > generation_1_ranking:
        chosen = generation_0
        rejected = generation_1
    if generation_0_ranking < generation_1_ranking:
        chosen = generation_1
        rejected = generation_0
    if generation_0_ranking == generation_1_ranking:
        chosen = generation_0
        rejected = generation_1
        tie = True

    if not format_messages:
        return {"chosen": chosen, "rejected": rejected, "tie": tie, "prompt": prompt}
    chosen = formatted_as_messages(prompt, chosen)
    rejected = formatted_as_messages(prompt, rejected)
    return {"chosen": chosen, "rejected": rejected, "tie": tie}


In [87]:
ds_messages = ds.map(
    format_for_dpo,
    remove_columns=[
        column
        for column in ds.column_names
        if column not in {"chosen", "rejected", "id", "tie"}
    ],
    fn_kwargs={"format_messages": True},
)
ds

Map:   0%|          | 0/1352 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'instruction', 'generations-0', 'generations-1', 'generations-0-rating', 'generations-0-rating-suggestion', 'generations-0-rating-suggestion-metadata', 'generations-0-rationale', 'generations-0-rationale-suggestion', 'generations-0-rationale-suggestion-metadata', 'generations-1-rating', 'generations-1-rating-suggestion', 'generations-1-rating-suggestion-metadata', 'generations-1-rationale', 'generations-1-rationale-suggestion', 'generations-1-rationale-suggestion-metadata', 'improved_response', 'improved_response-suggestion', 'improved_response-suggestion-metadata', 'external_id', 'metadata'],
    num_rows: 1352
})

In [89]:
ds_messages[0]

{'id': 'fd8a365561542e91728ceea4c987a8ae63144d9a5dda20dac0a93c71d2916d1d',
 'chosen': [{'content': 'Schrijf een voortzetting van de paragraaf: Op de Olympische Zomerspelen van 1992 in Barcelona, Spanje strandde Heyns in de series van de 100 meter schoolslag. Samen met Jill Brukman, Jeanine Steenkamp en Marianne Kriel werd ze uitgeschakeld in de series van de 4x100 meter wisselslag. Tijdens de Gemenebestspelen 1994',
   'role': 'user'},
  {'content': 'in Victoria, Canada sleepte de Zuid-Afrikaanse de bronzen medaille in de wacht op de 100 meter schoolslag. In Rome, Italië nam Heyns deel aan de Wereldkampioenschappen zwemmen 1994, op dit toernooi eindigde ze als zesde op de 100 meter schoolslag en als dertiende op de 200 meter schoolslag. Op de Pan Pacific kampioenschappen zwemmen 1995 in Atlanta, Verenigde Staten veroverde de Zuid-Afrikaanse de gouden medaille op de 100 meter schoolslag en de zilveren medaille op de 200 meter schoolslag. Tijdens de Olympische Zomerspelen 1996 in Atlanta

In [91]:
ds_not_formatted = ds.map(
    format_for_dpo,
    remove_columns=[
        column
        for column in ds.column_names
        if column not in {"chosen", "rejected", "id", "tie", "prompt"}
    ],
    fn_kwargs={"format_messages": False},
)
ds_not_formatted

Map:   0%|          | 0/1352 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'chosen', 'rejected', 'tie', 'prompt'],
    num_rows: 1352
})

In [92]:
ds_messages = ds_messages.filter(lambda row: row["tie"] is False)
ds_not_formatted = ds_not_formatted.filter(lambda row: row["tie"] is False)

Filter:   0%|          | 0/1352 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1352 [00:00<?, ? examples/s]

In [93]:
ds_messages = ds_messages.remove_columns("tie")
ds_not_formatted = ds_not_formatted.remove_columns("tie")

In [95]:
ds_messages.push_to_hub("DIBT/aya_dutch_dpo")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/DIBT/aya_dutch_dpo/commit/99ff668463c4af929865a387c6eae6326cc1d0f2', commit_message='Upload dataset', commit_description='', oid='99ff668463c4af929865a387c6eae6326cc1d0f2', pr_url=None, pr_revision=None, pr_num=None)

## Formatting using additional preference annotations

If we have additional annotations for our preferences then we can use this function to format the dataset for DPO/ORPO. This function will look to see if there are any human ratings and use that if it's present. If either or both ratings have not been completed by the human raters it will use the LLM rating. If there is a tie in the ratings then the `tie` value will be set as `True`. We can then decide to filter out the ties or keep them in the dataset.


In [44]:
def process_dataset(row):
    prompt = row['instruction']
    generation_0 = row.get('generations-0')
    generation_1 = row.get('generations-1')
    # If we have non empty rating from the user, we calculate the average rating and use this as the rating
    if row["generations-0-rating"]:
        generations_0_rating = sum(row["generations-0-rating"]) / len(row["generations-0-rating"])
    else:
        generations_0_rating = row['generations-0-rating-suggestion']
    if row["generations-1-rating"]:
        generations_1_rating = sum(row["generations-1-rating"]) / len(row["generations-1-rating"])
    else:
        generations_1_rating = row['generations-1-rating-suggestion']
    tie = False
    # take max of generations-0-rating-suggestion and generations-1-rating-suggestion
    if generations_0_rating > generations_1_rating:
        chosen = generation_0
        rejected = generation_1
    if generations_0_rating < generations_1_rating:
        chosen = generation_1
        rejected = generation_0
    if generations_0_rating == generations_1_rating:
        chosen = generation_0
        rejected = generation_1
        tie = True
    # Format the data using messages format
    prompt = row["instruction"]
    result = {"chosen": formatted_as_messages(prompt, chosen)}
    result["rejected"] = formatted_as_messages(prompt,rejected)
    result["tie"] = tie
    return result