In [6]:
# Seed examples for task bootstrapping.
# Maps task description -> difficulty label. Most labels are one of
# "Easy" / "Medium" / "Hard" / "Very hard"; a few carry a parenthetical
# rationale after the label. Sections are grouped under the name in each
# section comment.
tasks_with_difficulty = {
    # lewis
    "Evaluate models {M_i} on benchmarks {B_i}": "Easy",
    "Train models {M_i} on datasets {D_i} with benchmarks {B_i}": "Medium",
    "Run an ablation for hyperparameter P for model M on dataset D": "Hard",
    "Generate completions with model M on dataset D using engine E": "Medium",
    "Merge models {M_i} using linear averaging to find the best result on benchmarks {B_i}": "Hard",
    "Given datasets {D_i}, ablate the best SFT mixture for model M across benchmarks {B_i}": "Very hard",
    "Decontaminate dataset D against benchmarks {B_i}": "Hard",
    "Benchmark RL framework F for best throughput on G GPUs": "Very hard",
    "Implement post-training algorithm A from paper P in framework F. Validate it runs end-to-end": "Very hard",
    "Implement benchmark B in framework F. Validate it reproduces some published results": "Very hard",
    "Format dataset D for compatibility with framework F on task T": "Easy",

    # abubakar
    "Remove the background from this image: [image path]": "Easy",
    "Transcribe all of the audio files in this directory": "Easy",
    "Transcribe all of the audio files in this directory, choose the model that'll be cheapest and also relatively accurate": "Medium (judgment call or interaction needed to figure out what accuracy levels are acceptable)",
    # Fixed: the closing parenthesis was mistyped as "0" ("API0").
    "Remove the background music from this audio file": "Medium (needs to find Gradio Space and call its API)",
    "Change this video track to be from English to Spanish": "Medium (needs to link several models together)",
    "Translate this flyer from English to Spanish, keeping the layout and images the same": "Medium (needs to link several models together)",

    # leandro
    "What's the best model for X?": "Easy",
    "What datasets are available for X? (X={domain x task x modality})": "Easy",
    "Is there a space to do Y?": "Easy",
    "I have this script and this error - what's the issue?": "Medium",
    "This space is broken, how can i fix it?": "Medium",
    "I built a space but it is super slow. What can I do?": "Medium",
    # NOTE(review): "modal" may be a typo for "model" — confirm before changing,
    # since keys are used verbatim in the prompt and the exported dataset.
    "How can I run modal X locally?": "Medium",
    "I want to build a space with model Y to do X?": "Hard",
    "How can I serve a model with multiple LoRAs?": "Hard",

    # claude
    "What's the best model for sentiment analysis on financial text?": "Easy",
    "Are there any medical image segmentation datasets on HuggingFace for CT scans?": "Easy",
    "Which text classification models support 4-bit quantization?": "Medium",
    "Are there inference endpoints available for Whisper large-v3?": "Easy",
    "What's the license for the SA-Med2D-20M dataset?": "Easy",
    "Which vision models fit in 8GB VRAM for image segmentation?": "Medium",
    "What datasets are available for 3D medical image segmentation?": "Medium",
    "Is there a space to do text-to-speech with emotion control?": "Medium",
    "I'm getting \"CUDA out of memory\" when loading Llama-2-7b even though nvidia-smi shows I have 6GB free - what's the issue?": "Medium",
    "My Gradio space shows \"Connection errored out\" after working fine yesterday, no code changes - how can I fix it?": "Medium",
    "I built a Gradio space for Stable Diffusion but inference takes 5+ minutes on a 4090 - what can I do?": "Medium",
    "My Whisper model outputs different transcriptions after quantization to int8 - why?": "Medium",
    "Getting \"RuntimeError: CUDA error: out of memory. Tried to allocate 70.00 MiB\" but only 2.87 GiB is allocated - what's happening?": "Medium",
    "My HuggingFace space build fails with \"failed to create containerd task\" - how to fix?": "Medium",
    "DistilBERT model gives \"you should probably train your model\" warning even though it's a pretrained model from the Hub": "Easy",
    "Space was working fine but now receiving build errors - receiving this error even with a new space": "Medium",
    "Inference is correct locally but wrong on deployed space": "Medium",
    "Getting CUDA OOM despite having enough memory according to nvidia-smi": "Medium",
    "How can I run Mistral-7B-v0.1 locally with multiple LoRA adapters?": "Hard",
    "How can I serve Llama-2-7b with vLLM and dynamically load multiple LoRA adapters?": "Hard",
    "How do I batch inference requests in my Gradio space for better throughput?": "Medium",
    "Can I run Whisper large-v3 with faster-whisper for 4x speedup?": "Medium",
    "How to run Llama 2 on CPU after fine-tuning with LoRA?": "Medium",
    "Best way to handle 50+ concurrent requests in a Gradio space without OOM?": "Hard",
    "How do I add custom stopping criteria for text generation with Transformers?": "Hard",
    "Can I merge multiple LoRA adapters before inference to reduce latency?": "Hard",
    "How can I optimize my LLM inference with one base LLM and multiple LoRA adapters?": "Hard",
}


In [7]:
# Sanity check on the seed set size: repeated keys in the dict literal would
# silently collapse into a single entry, so the count should match expectations.
len(tasks_with_difficulty)

53

In [None]:
import litellm
import json
from pydantic import BaseModel
from enum import Enum


# Closed set of difficulty labels allowed on a generated task.
# Mixing in `str` makes every member a real string equal to its value
# (e.g. Difficulty.EASY == "Easy").
class Difficulty(str, Enum):
    EASY = "Easy"
    MEDIUM = "Medium"
    HARD = "Hard"
    VERY_HARD = "Very hard"


# A single generated task as returned by the LLM.
class Task(BaseModel):
    # Free-text task instruction.
    description: str
    # Difficulty label; validation restricts it to the Difficulty members.
    difficulty: Difficulty


# Top-level structured-output schema: one batch of generated tasks.
class GeneratedTasks(BaseModel):
    # Tasks produced by a single LLM call.
    tasks: list[Task]


def build_prompt(tasks_dict: dict[str, str], num_tasks: int = 10) -> str:
    """Build the user prompt asking the LLM to bootstrap new tasks from examples.

    Args:
        tasks_dict: Mapping of task description -> difficulty label. Each entry
            is rendered as one bullet line of the form ``- "<task>" [<difficulty>]``,
            in the mapping's iteration order.
        num_tasks: Number of new tasks to request. Defaults to 10, which yields
            exactly the same prompt text as the previously hard-coded version.

    Returns:
        The prompt text: the example list followed by the generation instructions.

    Raises:
        ValueError: If ``num_tasks`` is smaller than 1.
    """
    if num_tasks < 1:
        raise ValueError(f"num_tasks must be >= 1, got {num_tasks}")

    # Every bullet ends with its own newline, so the block below is followed by
    # one extra blank line before the instructions.
    task_descriptions = "".join(
        f'- "{task}" [{difficulty}]\n' for task, difficulty in tasks_dict.items()
    )

    return f"""Given the following examples of tasks (with their estimated difficulty levels in brackets):

{task_descriptions}

Generate exactly {num_tasks} new unique tasks with their difficulty levels (Easy, Medium, Hard, or Very hard).
The new tasks should be bootstrapped by analogy or creative mutation of the provided ones, but not be direct copies.
Vary the domains, instructions, and scenario details. Write crisp, concrete task phrasing. Preserve variety in both tasks and difficulties.
Do not repeat any of the input tasks verbatim. Create plausible, meaningful tasks relevant to LLM training, evaluation, dataprocessing, issue handling, tooling, etc.
"""



In [10]:
model_name = "gpt-5"

# Number of iterations to generate tasks (10 tasks per iteration)
num_iterations = 20

# Copy the seed tasks to avoid modifying the original
all_tasks = tasks_with_difficulty.copy()

# One LLM call is made per iteration. If a call or the validation of its reply
# raises part-way through, the `finally` clause still writes everything
# collected so far to disk; the exception then propagates as before.
try:
    for i in range(num_iterations):
        # The prompt is rebuilt from the full accumulated set (seeds + earlier
        # generations), so it grows with every iteration.
        prompt = build_prompt(all_tasks)

        # Query LLM using litellm with structured output
        response = litellm.completion(
            model=model_name,
            messages=[
                {
                    "role": "system",
                    "content": "You are an expert at generating diverse ML/AI task instructions using products from HuggingFace and can enumerate them with proper difficulty.",
                },
                {"role": "user", "content": prompt},
            ],
            response_format=GeneratedTasks,
        )

        # Parse the structured output
        generated = GeneratedTasks.model_validate_json(
            response.choices[0].message.content
        )

        # Add new tasks to the dictionary.
        # Only exact-string duplicates of an existing description are skipped.
        new_count = 0
        for task in generated.tasks:
            if task.description not in all_tasks:
                all_tasks[task.description] = task.difficulty.value
                new_count += 1

        print(f"Iteration {i + 1}/{num_iterations}: Added {new_count} new tasks. Total: {len(all_tasks)}")
finally:
    # Save to disk (runs on success and on failure, so partial progress is kept)
    with open("generated_tasks_with_difficulty.json", "w") as f:
        json.dump(all_tasks, f, indent=2)

print(f"\nFinal task count: {len(all_tasks)}")


Iteration 1/20: Added 10 new tasks. Total: 63
Iteration 2/20: Added 10 new tasks. Total: 73
Iteration 3/20: Added 10 new tasks. Total: 83
Iteration 4/20: Added 10 new tasks. Total: 93
Iteration 5/20: Added 10 new tasks. Total: 103
Iteration 6/20: Added 10 new tasks. Total: 113
Iteration 7/20: Added 10 new tasks. Total: 123
Iteration 8/20: Added 10 new tasks. Total: 133
Iteration 9/20: Added 10 new tasks. Total: 143
Iteration 10/20: Added 10 new tasks. Total: 153
Iteration 11/20: Added 10 new tasks. Total: 163
Iteration 12/20: Added 10 new tasks. Total: 173
Iteration 13/20: Added 10 new tasks. Total: 183
Iteration 14/20: Added 10 new tasks. Total: 193
Iteration 15/20: Added 10 new tasks. Total: 203
Iteration 16/20: Added 10 new tasks. Total: 213
Iteration 17/20: Added 10 new tasks. Total: 223
Iteration 18/20: Added 10 new tasks. Total: 233
Iteration 19/20: Added 10 new tasks. Total: 243
Iteration 20/20: Added 10 new tasks. Total: 253

Final task count: 253


In [16]:
from datasets import Dataset

# Turn the task -> difficulty mapping into two parallel columns.
# Both lists follow the dict's iteration order, so index i refers to the same task in each.
questions = [*all_tasks]
difficulties = [*all_tasks.values()]
data = dict(question=questions, difficulty=difficulties)

dataset = Dataset.from_dict(data)

# Quick look at the result: row count and the first row.
print(f"\nDataset: {len(dataset)} rows")
print(f"Sample: {dataset[0]['question']} ({dataset[0]['difficulty']})")



Dataset: 253 rows
Sample: Evaluate models {M_i} on benchmarks {B_i} (Easy)


In [17]:
# Upload the dataset to the Hub repo "akseljoonas/benchmark-tasks" as private.
# Kept as the cell's last expression so the returned value is shown as output.
dataset.push_to_hub("akseljoonas/benchmark-tasks", private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/datasets/akseljoonas/benchmark-tasks/commit/a96debee2c67ef760ecaea69296f2059f449fad6', commit_message='Upload dataset', commit_description='', oid='a96debee2c67ef760ecaea69296f2059f449fad6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/akseljoonas/benchmark-tasks', endpoint='https://huggingface.co', repo_type='dataset', repo_id='akseljoonas/benchmark-tasks'), pr_revision=None, pr_num=None)