In [12]:
from pathlib import Path
from typing import Iterable
from datasets import Dataset

In [13]:
def create_dataset(
    data: Iterable[tuple[str, str]], source_lang: str, target_lang: str
):
    """Build a ``Dataset`` of translation pairs.

    Args:
        data: Iterable of ``(source, target)`` pairs.
        source_lang: Value stored in the ``source_lang`` column of every row.
        target_lang: Value stored in the ``target_lang`` column of every row.

    Returns:
        The object returned by ``Dataset.from_generator``, with the columns
        ``source_lang``, ``target_lang``, ``source_text`` and ``target_text``.
    """

    def _records():
        # The language fields are identical for every row; only the texts vary.
        languages = {"source_lang": source_lang, "target_lang": target_lang}
        for source_text, target_text in data:
            yield {
                **languages,
                "source_text": source_text,
                "target_text": target_text,
            }

    # also see Dataset.from_dict, and other from_* methods
    return Dataset.from_generator(_records)

In [14]:
def read_lines(path: Path):
    """Yield the lines of a UTF-8 text file with surrounding whitespace removed.

    Args:
        path: File to read; opened in text mode with UTF-8 encoding.

    Yields:
        Each line in file order, with leading and trailing whitespace
        (including the newline) stripped. Blank lines are yielded as ``""``.
    """
    # The context manager closes the handle once the generator is exhausted
    # or closed, instead of leaving that to garbage collection.
    with path.open("r", encoding="utf-8") as handle:
        for line in handle:
            yield line.strip()

# Load both sides of the parallel corpus fully into memory.
eng = list(read_lines(Path("data/small.eng")))
ces = list(read_lines(Path("data/small.ces")))

# The two files are paired line by line. zip() stops at the shorter input,
# so a length mismatch would silently drop lines instead of failing.
if len(eng) != len(ces):
    raise ValueError(
        f"Parallel files differ in length: {len(eng)} English vs {len(ces)} Czech lines"
    )

dataset = create_dataset(zip(eng, ces), "English", "Czech")

In [15]:
# Name shared by the local copy and the Hub repository.
name = "npfl087-demo-small"

# Expand "~" here rather than relying on the storage layer to interpret it.
local_dir = Path("~/datasets").expanduser() / name
dataset.save_to_disk(str(local_dir))

# Last expression of the cell, so the value returned by push_to_hub is displayed.
dataset.push_to_hub(f"hrabalm/{name}", private=True)

Saving the dataset (1/1 shards): 100%|████████████████████████████████████| 1000/1000 [00:00<00:00, 66785.09 examples/s]
Creating parquet from Arrow format: 100%|████████████████████████████████████████████████| 1/1 [00:00<00:00, 390.57ba/s]
Uploading the dataset shards: 100%|███████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.93it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/hrabalm/npfl087-demo-small/commit/5507b5e3b7c94cbffb052f35e44a6b6f1ccef646', commit_message='Upload dataset', commit_description='', oid='5507b5e3b7c94cbffb052f35e44a6b6f1ccef646', pr_url=None, pr_revision=None, pr_num=None)