In [1]:
from datasets import load_dataset, Dataset, DatasetDict

In [2]:
# Load the "openbmb/UltraFeedback" dataset; later cells index the result by
# split name (dataset["train"]).
dataset = load_dataset("openbmb/UltraFeedback")

In [3]:
from pprint import pprint
# Pretty-print the first training example to inspect the raw nested schema.
pprint(dataset["train"][0])

{'completions': [{'annotations': {'helpfulness': {'Rating': '2',
                                                  'Rationale': 'The response '
                                                               'is clear and '
                                                               'not lengthy, '
                                                               'but it lacks '
                                                               'useful and '
                                                               'comprehensive '
                                                               'information.',
                                                  'Rationale For Rating': 'The '
                                                                          'code '
                                                                          'is '
                                                                          'partially '
                                                     

In [4]:
# Restrict the data to three sources ("evol_instruct", "sharegpt", "false_qa")
# and discard any row that carries no completions at all.
dataset = dataset.filter(
    lambda x: (
        x["source"] in ("evol_instruct", "sharegpt", "false_qa")
        and len(x["completions"]) > 0
    )
)

len(dataset["train"])

32287

In [5]:
# Show the first training example that survived the source filter.
dataset["train"][0]

{'source': 'evol_instruct',
 'instruction': 'Can you write a C++ program that prompts the user to enter the name of a country and checks if it borders the Mediterranean Sea? Here\'s some starter code to help you out:\n#include <iostream>\n#include <string>\nusing namespace std;\nint main() {\n    string country;\n    // prompt user for input\n    cout << "Enter the name of a country: ";\n    cin >> country;\n    // check if country borders the Mediterranean Sea\n    // [C++ code]\n    return 0;\n}',
 'models': ['alpaca-7b', 'pythia-12b', 'starchat', 'vicuna-33b'],
 'completions': [{'annotations': {'helpfulness': {'Rating': '2',
     'Rationale': 'The response is clear and not lengthy, but it lacks useful and comprehensive information.',
     'Rationale For Rating': 'The code is partially incorrect as it checks if the country name ends with "Mediterranean" instead of checking if it borders the Mediterranean Sea, which may cause confusion.',
     'Type': ['1', '3']},
    'honesty': {'Rat

In [6]:
# Drop the columns that `expand` does not read; it only uses "instruction",
# "models" and "completions".
dataset = dataset.remove_columns(["source", "correct_answers", "incorrect_answers"])

In [7]:
def expand(rows):
    """Flatten a batch so every (model, completion) pair becomes its own row.

    Intended for ``dataset.map(expand, batched=True)``: ``rows`` maps each
    column name to a list holding one entry per input example.

    Args:
        rows: Batch with the keys ``"instruction"``, ``"models"`` and
            ``"completions"``. For each example, ``models`` and
            ``completions`` are sequences that must have the same length.

    Returns:
        A dict with the same three keys. The instruction is repeated once per
        pair, and ``models`` / ``completions`` are concatenated in order, so
        all three lists have equal length and stay aligned index by index.

    Raises:
        ValueError: If an example's ``models`` and ``completions`` differ in
            length. Extending the columns anyway would silently attach
            instructions and model names to the wrong completions.
    """
    result = dict(
        instruction=[],
        models=[],
        completions=[],
    )

    for instruction, models, completions in zip(
        rows["instruction"],
        rows["models"],
        rows["completions"],
    ):
        # The instruction is repeated len(models) times while completions
        # contribute len(completions) entries; the two counts must agree or
        # the output columns drift out of alignment.
        if len(models) != len(completions):
            raise ValueError(
                "models and completions must have the same length, got "
                f"{len(models)} and {len(completions)}"
            )

        result["instruction"].extend([instruction] * len(models))
        result["models"].extend(models)
        result["completions"].extend(completions)

    return result

In [8]:
# Run `expand` in batched mode: it returns more rows than it receives, one per
# (model, completion) pair.
dataset = dataset.map(expand, batched=True)

In [9]:
# Collect the distinct model names present after expansion.
all_models = set(dataset["train"]["models"])
all_models

{'alpaca-7b',
 'bard',
 'falcon-40b-instruct',
 'gpt-3.5-turbo',
 'gpt-4',
 'llama-2-13b-chat',
 'llama-2-70b-chat',
 'llama-2-7b-chat',
 'mpt-30b-chat',
 'pythia-12b',
 'starchat',
 'ultralm-13b',
 'ultralm-65b',
 'vicuna-33b',
 'wizardlm-13b',
 'wizardlm-70b',
 'wizardlm-7b'}

In [10]:
# Model names accepted by the filter in the following cell. The commented-out
# entries are the remaining names from `all_models`, i.e. the excluded models.
models_to_keep = {
    # 'alpaca-7b',
    'bard',
    'falcon-40b-instruct',
    'gpt-3.5-turbo',
    'gpt-4',
    # 'llama-2-13b-chat',
    'llama-2-70b-chat',
    # 'llama-2-7b-chat',
    # 'mpt-30b-chat',
    # 'pythia-12b',
    # 'starchat',
    # 'ultralm-13b',
    'ultralm-65b',
    # 'vicuna-33b',
    # 'wizardlm-13b',
    'wizardlm-70b',
    # 'wizardlm-7b',
}

In [11]:
# Keep a row only when its "models" value is in `models_to_keep` AND the
# completion's own "model" field equals "gpt-4".
# NOTE(review): if "models" and completions["model"] always name the same
# model, the second condition alone decides the outcome and the
# `models_to_keep` check is redundant — confirm which selection is intended.
dataset = dataset.filter(lambda x: (
    x["models"] in models_to_keep
    and x["completions"]["model"] == "gpt-4"
))
dataset

Filter:   0%|          | 0/129148 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'models', 'completions'],
        num_rows: 8337
    })
})

In [14]:
# Give the surviving columns the names `yamlify` and `to_chatml` expect
# ("example", "feedback") and drop the per-row model label.
dataset = (
    dataset
    .rename_column("instruction", "example")
    .rename_column("completions", "feedback")
    .remove_columns(["models"])
)

In [15]:
def yamlify(row):
    """Flatten one row's ``feedback`` into response, critique and YAML ratings.

    Args:
        row: Mapping with a ``"feedback"`` dict. That dict must contain
            ``"critique"`` and ``"annotations"`` (a dict mapping each aspect
            name to a dict of fields); ``"response"`` is optional.

    Returns:
        A dict with three keys: ``response`` (``None`` when the feedback has
        no ``"response"``), ``critique``, and ``annotations`` — the
        annotations serialised with ``yaml.dump`` after the ``"Type"`` entry
        has been removed from every aspect.

    Raises:
        KeyError: If ``"feedback"``, ``"critique"`` or ``"annotations"`` is
            missing.
    """
    # Function-local import: the notebook's import cell does not import yaml,
    # so this keeps the definition self-contained.
    import yaml

    feedback = row["feedback"]

    response = feedback.get("response")
    critique = feedback["critique"]

    # Build cleaned copies instead of popping keys in place, so the caller's
    # row (and its nested dicts) is left untouched. Other feedback keys such
    # as "principle", "overall_score" or "model" are simply not copied into
    # the result, so they need no explicit removal.
    annotations = {
        aspect: {
            field: value
            for field, value in details.items()
            if field != "Type"
        }
        for aspect, details in feedback["annotations"].items()
    }

    return dict(
        response=response,
        critique=critique,
        annotations=yaml.dump(annotations),
    )

In [16]:
# Apply `yamlify` to every row using 16 worker processes, then drop the nested
# "feedback" column whose contents are now in "response", "critique" and
# "annotations".
dataset = dataset.map(yamlify, num_proc=16).remove_columns(["feedback"])
dataset

Map (num_proc=16):   0%|          | 0/8337 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['example', 'response', 'critique', 'annotations'],
        num_rows: 8337
    })
})

In [19]:
# Spot-check a single row of the flattened data.
dataset["train"][1222]

{'example': 'Give 10 examples of adjectives that describe a person.',
 'response': "1. Compassionate: A person who is caring, empathetic, and concerned for others' well-being.\n\n2. Ambitious: Someone who is driven, goal-oriented, and eager to succeed in their personal and professional pursuits.\n\n3. Charismatic: A person who is charming, captivating, and has the ability to easily attract people with their personality.\n\n4. Diligent: Someone who is hard-working, dedicated, and committed to completing tasks with high attention to detail.\n\n5. Optimistic: A person who sees the positive in situations, stays focused on solutions, and maintains a hopeful outlook on life.\n\n6. Creative: Someone who is imaginative, innovative, and enjoys exploring original ideas and expressing them through various mediums.\n\n7. Assertive: A person who confidently expresses their thoughts and desires while still remaining respectful and considerate of others.\n\n8. Adventurous: Someone who loves taking ri

In [20]:
def make_chatml(name, role, content):
    """Build one chat message as a dict with keys ``name``, ``role``, ``content``."""
    return dict(name=name, role=role, content=content)


def system(name, content):
    """Build a message with role ``"system"`` and the given ``name``."""
    return make_chatml(role="system", name=name, content=content)


def situation(content):
    """Build a system message named ``"situation"``."""
    return system(name="situation", content=content)


def thought(content):
    """Build a system message named ``"thought"``."""
    return system(name="thought", content=content)


def information(content):
    """Build a system message named ``"information"``."""
    return system(name="information", content=content)


def me(content, name=None):
    """Build a message with role ``"assistant"``; ``name`` defaults to ``None``."""
    return make_chatml(role="assistant", content=content, name=name)


def person(content, name=None):
    """Build a message with role ``"user"``; ``name`` defaults to ``None``."""
    return make_chatml(role="user", content=content, name=name)


In [21]:
def to_chatml(row):
    """Turn one flattened feedback row into a four-message ChatML list.

    Reads the "example", "response", "critique" and "annotations" fields of
    ``row`` and returns ``{"chatml": [...]}`` holding, in order: a situation
    message with the fixed task description, an information message with the
    User/AI exchange, a thought message with the YAML ratings, and an
    assistant message named "Samantha" carrying the critique.
    """
    user_prompt = row["example"]
    ai_reply = row["response"]
    feedback_text = row["critique"]
    ratings_yaml = row["annotations"]

    # Fixed task description, two lines joined by a single newline.
    task_description = "\n".join((
        "Samantha is asked to evaluate and give feedback on the following"
        " conversation between a human and an AI.",
        "The feedback is given by Samantha, directed to the AI model.",
    ))

    # The exchange under review, separated by a blank line.
    transcript = "User: {}\n\nAI: {}".format(user_prompt, ai_reply)

    messages = [
        situation(task_description),
        information(transcript),
        thought(
            "Rating of the conversation as a valid yaml object:\n\n"
            + ratings_yaml
        ),
        me(feedback_text, name="Samantha"),
    ]

    return {"chatml": messages}


In [22]:
# Add the "chatml" column produced by `to_chatml` to every row.
dataset = dataset.map(to_chatml)

Map:   0%|          | 0/8337 [00:00<?, ? examples/s]

In [23]:
# Drop the four source columns; their content is now carried by "chatml".
dataset = dataset.remove_columns(["response", "example", "annotations", "critique"])

In [24]:
# Upload the result to the Hub as a private dataset repository.
dataset.push_to_hub("diwank/ultrafeedback-chatml", private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]