In [None]:
!pip install -q datasets

In [None]:
from datasets import load_dataset
import json

# Number of rows taken from the start of the CNN/DailyMail, TriviaQA and
# HumanEval datasets when building the prompt files below.
n_prompts = 100

def save_jsonl(filename, items):
  """Write `items` to `filename` in JSON Lines format (one document per line).

  Args:
    filename: Path of the file to create; an existing file is overwritten.
    items: Iterable of JSON-serializable objects.

  Raises:
    TypeError: If an item cannot be serialized by `json.dumps`.
    OSError: If the file cannot be opened for writing.
  """
  # ensure_ascii=False emits non-ASCII characters verbatim, so the encoding is
  # pinned to UTF-8 instead of relying on the platform default (which may be a
  # legacy code page and would fail or produce non-UTF-8 output).
  with open(filename, "w", encoding="utf-8") as f:
    for item in items:
      f.write(json.dumps(item, ensure_ascii=False) + "\n")

In [None]:
# Chat prompts
# https://huggingface.co/datasets/OpenAssistant/oasst1
# https://github.com/LAION-AI/Open-Assistant/tree/main/oasst-data
#
# Load the validation split of the OpenAssistant/oasst1 dataset. The extraction
# code in this cell reads the fields message_id, parent_id, lang, role and text
# from each record.

oasst = load_dataset("OpenAssistant/oasst1", split="validation")

def extract_conversations(messages, num_conv=100):
  """Build linear chat prompts from a flat collection of message records.

  Every English root message (`parent_id` is None and `lang == "en"`) starts
  one conversation. The thread is followed downwards by always taking the
  first reply (in iteration order of `messages`) to the current message.
  Trailing assistant turns are then removed so every returned conversation
  ends with a user turn, i.e. is ready to be sent to a model for a reply.

  Args:
    messages: Iterable of mappings with the keys `message_id`, `parent_id`,
      `lang`, `role` and `text`.
    num_conv: Maximum number of conversations to return. Values <= 0 yield an
      empty list.

  Returns:
    A list of conversations. Each conversation is a non-empty list of
    `{"role": "user" | "assistant", "content": <text>}` dicts whose last
    element has role "user".
  """
  conversations = []
  if num_conv <= 0:
    return conversations

  # One pass over the data: remember the English roots and, for every parent,
  # the first message that replies to it. This replaces a full rescan of
  # `messages` for every single turn of every conversation.
  roots = []
  first_reply = {}
  for msg in messages:
    parent_id = msg["parent_id"]
    if parent_id is None:
      if msg["lang"] == "en":
        roots.append(msg)
    else:
      # setdefault keeps the earliest reply, matching a first-match lookup.
      first_reply.setdefault(parent_id, msg)

  for root in roots:
    conversation = []
    current = root

    while current is not None:
      conversation.append({
          "role": "user" if current["role"] == "prompter" else "assistant",
          "content": current["text"]})
      current = first_reply.get(current["message_id"])

    # Drop trailing assistant replies only. Blindly removing the last message
    # would cut off a final user turn (leaving a prompt that ends with an
    # assistant message) and would turn a lone root message into an empty
    # conversation.
    while conversation and conversation[-1]["role"] == "assistant":
      conversation.pop()

    if not conversation:
      continue

    conversations.append(conversation)

    # Count conversations actually produced, not roots visited.
    if len(conversations) >= num_conv:
      break

  return conversations

# Honor the notebook-wide prompt count instead of the function's own default,
# so changing n_prompts affects this dataset like the others.
conversations = extract_conversations(oasst, num_conv=n_prompts)

In [None]:
# Each conversation is exported in two request formats:
#   * chat: the structured message list, unchanged
#   * completion: a plain "User:/Assistant:" transcript that ends with an open
#     "Assistant:" cue for the model to continue
speaker_labels = {"user": "User"}  # every other role is rendered as "Assistant"

chat_payloads = [{"messages": turns} for turns in conversations]

completion_payloads = []
for turns in conversations:
  transcript_lines = []
  for turn in turns:
    label = speaker_labels.get(turn["role"], "Assistant")
    transcript_lines.append(f"{label}: {turn['content']}")

  transcript = "\n".join(transcript_lines)
  completion_payloads.append({"prompt": transcript + "\nAssistant:"})

save_jsonl("chat_oasst1_chat.jsonl", chat_payloads)
save_jsonl("chat_oasst1_completion.jsonl", completion_payloads)

In [None]:
# Summarization prompts (CNN/DailyMail 3.0.0, validation split)
# https://huggingface.co/datasets/abisee/cnn_dailymail

cnn = load_dataset("cnn_dailymail", "3.0.0", split="validation")

# Only the first n_prompts articles are exported.
articles = [row["article"] for row in cnn.select(range(n_prompts))]

# Chat form: a single user turn asking for a summary of the article.
chat_payloads = [
  {"messages": [{"role": "user", "content": f"Summarize the following article:\n\n{text}"}]}
  for text in articles
]

# Completion form: same instruction, followed by a "Summary:" cue.
completion_payloads = [
  {"prompt": f"Summarize the following article:\n\n{text}\n\nSummary:"}
  for text in articles
]

save_jsonl("summarization_cnn_chat.jsonl", chat_payloads)
save_jsonl("summarization_cnn_completion.jsonl", completion_payloads)

In [None]:
# Question-answering prompts (TriviaQA "unfiltered" config, validation split)
# https://huggingface.co/datasets/mandarjoshi/trivia_qa

trivia = load_dataset("trivia_qa", "unfiltered", split="validation")

# Only the first n_prompts questions are exported.
questions = [row["question"] for row in trivia.select(range(n_prompts))]

# Chat form: the bare question as a single user turn.
chat_payloads = [
  {"messages": [{"role": "user", "content": text}]}
  for text in questions
]

# Completion form: "Q: ...\nA:" so the model continues with the answer.
completion_payloads = [{"prompt": f"Q: {text}\nA:"} for text in questions]

save_jsonl("qa_triviaqa_chat.jsonl", chat_payloads)
save_jsonl("qa_triviaqa_completion.jsonl", completion_payloads)

In [None]:
# Code generation prompts (HumanEval, test split)
# https://huggingface.co/datasets/openai/openai_humaneval

humaneval = load_dataset("openai_humaneval", split="test")

chat_payloads = []
completion_payloads = []

# Clamp to the dataset size so that raising n_prompts beyond the number of
# available problems does not make select() fail on out-of-range indices.
n_selected = min(n_prompts, len(humaneval))

for item in humaneval.select(range(n_selected)):
  prompt_code = item["prompt"]
  instruction = f"Write a Python function based on the following signature:\n\n{prompt_code}"

  chat_payloads.append({
    "messages": [
      {"role": "user", "content": instruction}
    ]
  })

  # Same shape as the other *_completion.jsonl files: only a "prompt" key.
  # The placeholder "model" entry that used to be written here is dropped.
  completion_payloads.append({
    "prompt": instruction + "\n"
  })

save_jsonl("code_humaneval_chat.jsonl", chat_payloads)
save_jsonl("code_humaneval_completion.jsonl", completion_payloads)