# Distilabel Dataset Generation (B)

### Installation

Using a Python 3.11.11 Kernel

In [None]:
# Create a virtual environment with uv. `--seed` installs seed packages such as pip
# into it, and `--system-site-packages` lets it see the system-wide packages.
!uv venv --seed --system-site-packages
# Initialise a minimal uv project named "ml-distilabel"; `--bare` creates only a
# pyproject.toml, which the later `uv add` cells use to record dependencies.
!uv init --bare --name ml-distilabel

In [None]:
%%capture
# Notebook tooling: ipykernel and ipywidgets for the Jupyter kernel and widgets,
# python-dotenv for reading settings from a .env file.
# The %%capture magic on the first line hides the installer output.
!uv add ipywidgets ipykernel python-dotenv

Switch the notebook kernel to the `.venv` environment that `uv` created above, so that the packages installed by the cells in this notebook are available to it.

In [1]:
import importlib.metadata

# Sanity check: print the installed version of each notebook tooling package
# so the environment is recorded alongside the results.
for package_name in ("ipywidgets", "python-dotenv"):
    print(f"{package_name}: {importlib.metadata.version(package_name)}")

ipywidgets: 8.1.5
python-dotenv: 1.1.0


In [None]:
%%capture
# Pipeline dependencies. The extras spec is quoted so the shell cannot treat the
# square brackets in `distilabel[ollama]` as a glob pattern (zsh, for example,
# aborts with "no matches found" on the unquoted form).
# The %%capture magic on the first line hides the installer output.
!uv add "distilabel[ollama]" beautifulsoup4 pandas Pillow

In [2]:
import importlib.metadata

# Sanity check: print the installed version of each pipeline dependency
# so the environment is recorded alongside the results.
for package_name in ("distilabel", "beautifulsoup4", "pandas", "Pillow"):
    print(f"{package_name}: {importlib.metadata.version(package_name)}")

distilabel: 1.5.3
beautifulsoup4: 4.13.3
pandas: 2.2.3
Pillow: 11.1.0


In [4]:
import os

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so later
# cells can read them through `os`. The call's return value is the cell output.
load_dotenv()

True

### Data Prep
<a name="Data"></a>

In [None]:
# LLM integrations live in `distilabel.models`; the older `distilabel.llms`
# import path is deprecated and scheduled for removal.
from distilabel.models import OllamaLLM
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromDicts
from distilabel.steps.tasks import TextGeneration

In [29]:

# Configuration
# Host name of the machine running Ollama. "host.docker.internal" lets you reach
# an Ollama server on the host from inside a devcontainer.
OLLAMA_HOST_URL = "host.docker.internal"
# Model tag to generate with. Change to your desired model (e.g., "mistral").
OLLAMA_MODEL = "llama3.2:3b-instruct-fp16"

print(f"Attempting to connect to Ollama at: {OLLAMA_HOST_URL} using model: {OLLAMA_MODEL}")
try:
    ollama_llm = OllamaLLM(
        model=OLLAMA_MODEL,
        host=OLLAMA_HOST_URL,
    )
except Exception as e:
    print(f"Error creating OllamaLLM object: {e}")
    # Re-raise rather than calling exit(): inside a notebook exit() asks the
    # kernel to shut down, which discards all state and hides the traceback.
    # Raising stops "Run All" at this cell and shows the real error.
    raise
else:
    print("OllamaLLM object created successfully.")


Attempting to connect to Ollama at: host.docker.internal using model: llama3.2:3b-instruct-fp16
OllamaLLM object created successfully.


Returning anything other than `self` from a top level model validator isn't supported when validating via `__init__`.
See the `model_validator` docs (https://docs.pydantic.dev/latest/concepts/validators/#model-validators) for more details.
  ollama_llm = OllamaLLM(


In [35]:
# Seed rows for the pipeline: one dict per prompt, each holding the text under
# the "instruction" key.
prompts = [
    {"instruction": prompt_text}
    for prompt_text in (
        "Write a short sentence about a happy cat.",
        "Explain the concept of gravity in one sentence.",
        "What is the capital of France?",
    )
]

In [36]:
with Pipeline(
    name="ollama-docker-demo",
    description="Demo using host Ollama from Docker",
) as pipeline:
    # Seed step: feeds the rows from `prompts` into the pipeline.
    # Single-row alternative for experimenting:
    #   [{"instruction": "Generate a short question about Uplimit."}]
    data = LoadDataFromDicts(data=prompts)

    # First generation: its `generation` output is renamed to `instruction`.
    gen_a = TextGeneration(
        output_mappings={"generation": "instruction"},
        llm=ollama_llm,
    )
    # Second generation: its `generation` output is stored under `response`.
    gen_b = TextGeneration(
        output_mappings={"generation": "response"},
        llm=ollama_llm,
    )

    # Wire the three steps into a linear chain.
    data >> gen_a >> gen_b

Make sure the model named in `OLLAMA_MODEL` has already been pulled onto your Ollama server (for example with `ollama pull llama3.2:3b-instruct-fp16`) before running the next cell; otherwise the pipeline run will fail.

In [37]:
from urllib.parse import urlsplit

print("\nStarting pipeline run...")
try:
    # use_cache=False requests a fresh run instead of reusing earlier results.
    distiset = pipeline.run(use_cache=False)
    print("\nPipeline run completed successfully!")
    print("\nGenerated Data:")
    print(distiset)

except Exception as e:
    # Build the URL to probe from the Docker host. The scheme and port are spelled
    # out because a bare `curl localhost` targets port 80, not Ollama's 11434,
    # and would report a failure even when Ollama is healthy.
    configured_host = OLLAMA_HOST_URL if "://" in OLLAMA_HOST_URL else f"http://{OLLAMA_HOST_URL}"
    host_parts = urlsplit(configured_host)
    host_side_name = (host_parts.hostname or "localhost").replace("host.docker.internal", "localhost")
    curl_target = f"{host_parts.scheme}://{host_side_name}:{host_parts.port or 11434}"

    print(f"\nPipeline run failed: {e}")
    print("\nTroubleshooting Tips:")
    print(f" - Is Ollama running on the host? Try: curl {curl_target}")
    print(f" - Is the model '{OLLAMA_MODEL}' pulled? Try: ollama pull {OLLAMA_MODEL}")
    print(" - Is there a firewall blocking port 11434 on the host?")
    print(" - Does your Docker version support 'host.docker.internal' correctly? (Most modern versions do)")
    print(" - If on Linux, did you run the container with '--add-host=host.docker.internal:host-gateway'?")
    # Re-raise so the failure is visible and "Run All" stops here, instead of
    # continuing into later cells that need `distiset`.
    raise


Starting pipeline run...


Generating train split: 0 examples [00:00, ? examples/s]


Pipeline run completed successfully!

Generated Data:
Distiset({
    default: DatasetDict({
        train: Dataset({
            features: ['instruction', 'distilabel_metadata', 'model_name', 'response'],
            num_rows: 3
        })
    })
})


In [None]:
# Publish the generated dataset to the Hugging Face Hub as a public repository,
# with a generated dataset card and without the pipeline script.
# The access token is read from the HF_TOKEN environment variable.
distiset.push_to_hub(
    "johnmccabe/ollama_test",
    token=os.environ.get("HF_TOKEN"),
    private=False,
    generate_card=True,
    include_script=False,
    commit_message="Multiple prompts",
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/3.65k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.
--- Logging error ---
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/logging/handlers.py", line 1492, in emit
    self.enqueue(self.prepare(record))
  File "/usr/local/lib/python3.11/logging/handlers.py", line 1450, in enqueue
    self.queue.put_nowait(record)
  File "/usr/local/lib/python3.11/multiprocessing/queues.py", line 138, in put_nowait
    return self.put(obj, False)
           ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/multiprocessing/queues.py", line 88, in put
    raise ValueError(f"Queue {self!r} is closed")
ValueError: Queue <multiprocessing.queues.Queue object at 0x7f9c387bc1d0> is closed
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/workspaces/ml-distilabel/.venv/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/work