In [1]:
# 2025/7/17
# zhangzhong
# https://huggingface.co/docs/datasets/stream

In [2]:
# Dataset streaming lets you work with a dataset without downloading it. The data is streamed as you iterate over the dataset. This is especially helpful when:

# - You don’t want to wait for an extremely large dataset to download.
# - The dataset size exceeds the amount of available disk space on your computer.
# - You want to quickly explore just a few samples of a dataset.

In [19]:
from datasets import load_dataset, Dataset, IterableDataset, interleave_datasets
import torch
from pprint import pprint
from transformers import AutoTokenizer

In [4]:
# Open the FineWeb train split in streaming mode: nothing is downloaded up
# front, examples are fetched as the dataset is iterated.
dataset: IterableDataset = load_dataset(
    path='HuggingFaceFW/fineweb',
    split='train',
    streaming=True,
)

# Pull a single example off the stream to inspect its fields.
first_example = next(iter(dataset))
print(first_example)

Resolving data files:   0%|          | 0/27468 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/27468 [00:00<?, ?it/s]

{'text': "How AP reported in all formats from tornado-stricken regionsMarch 8, 2012\nWhen the first serious bout of tornadoes of 2012 blew through middle America in the middle of the night, they touched down in places hours from any AP bureau. Our closest video journalist was Chicago-based Robert Ray, who dropped his plans to travel to Georgia for Super Tuesday, booked several flights to the cities closest to the strikes and headed for the airport. He’d decide once there which flight to take.\nHe never got on board a plane. Instead, he ended up driving toward Harrisburg, Ill., where initial reports suggested a town was destroyed. That decision turned out to be a lucky break for the AP. Twice.\nRay was among the first journalists to arrive and he confirmed those reports -- in all formats. He shot powerful video, put victims on the phone with AP Radio and played back sound to an editor who transcribed the interviews and put the material on text wires. He then walked around the devastatio

In [5]:
# Dataset streaming also lets you work with a dataset made of local files without doing any conversion.
# In this case, the data is streamed from the local files as you iterate over the dataset

# - You don’t want to wait for an extremely large local dataset to be converted to Arrow.
# - The converted files size would exceed the amount of available disk space on your computer.
# - You want to quickly explore just a few samples of a dataset.

# from datasets import load_dataset
# data_files = {'train': 'path/to/OSCAR-2201/compressed/en_meta/*.jsonl.gz'}
# dataset = load_dataset('json', data_files=data_files, split='train', streaming=True)
# print(next(iter(dataset)))

In [6]:
# Column indexing

dataset: IterableDataset = load_dataset(
    path="allenai/c4",
    name="en",
    split="train",
    streaming=True,
)

# Index the stream by column name, then read the first value of that column.
text_column = dataset["text"]
print(next(iter(text_column)))

Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

Beginners BBQ Class Taking Place in Missoula!
Do you want to get better at making delicious BBQ? You will have the opportunity, put this on your calendar now. Thursday, September 22nd join World Class BBQ Champion, Tony Balay from Lonestar Smoke Rangers. He will be teaching a beginner level class for everyone who wants to get better with their culinary skills.
He will teach you everything you need to know to compete in a KCBS BBQ competition, including techniques, recipes, timelines, meat selection and trimming, plus smoker and fire information.
The cost to be in the class is $35 per person, and for spectators it is free. Included in the cost will be either a t-shirt or apron and you will be tasting samples of each meat that is prepared.


In [7]:
# Convert from a Dataset
# If you have an existing Dataset object, you can convert it to an IterableDataset with the to_iterable_dataset() function.
# This is actually **faster** than setting the streaming=True argument in load_dataset() because the data is streamed from local files.

# faster
# dataset = load_dataset("ethz/food101")
# iterable_dataset = dataset.to_iterable_dataset()

# slower
# iterable_dataset = load_dataset("ethz/food101", streaming=True)

# NOTE: the section below ran far too slowly, so it is left commented out.
# # !!! Shard with IterableDataset
# # The to_iterable_dataset() function supports sharding when the IterableDataset is instantiated. 
# # This is useful when working with big datasets, and you’d like to shuffle the dataset or to enable fast parallel loading with a PyTorch DataLoader.
# dataset: Dataset = load_dataset(path="allenai/c4", name="en", split="train") 
# # shard the dataset
# iterable_dataset: IterableDataset = dataset.to_iterable_dataset(num_shards=64)
# # shuffles the shards order and use a shuffle buffer when you start iterating
# iterable_dataset = iterable_dataset.shuffle(buffer_size=10000) 
# # assigns 64 / 4 = 16 shards from the shuffled list of shards to each worker when you start iterating
# dataloader = torch.utils.data.DataLoader(iterable_dataset, num_workers=4)

In [8]:
# shuffle
# The buffer_size argument controls the size of the buffer to randomly sample examples from.
# Let’s say your dataset has one million examples, and you set the buffer_size to ten thousand. 
# IterableDataset.shuffle() will randomly select examples from the first ten thousand examples in the buffer. 
# Selected examples in the buffer are replaced with new examples.
# By default, the buffer size is 1,000.
# IterableDataset.shuffle() will also shuffle the order of the shards if the dataset is sharded into multiple files.

# Stream FineWeb, then wrap it in a seeded shuffle that samples from a
# 10,000-example buffer. Both names are reused by the take/skip cell below.
dataset = load_dataset(
    path='HuggingFaceFW/fineweb',
    split='train',
    streaming=True,
)
shuffled_dataset = dataset.shuffle(buffer_size=10_000, seed=42)

# Reshuffle
# reshuffle between epoch, Use IterableDataset.set_epoch()
# shuffled_dataset.set_epoch(epoch)

Resolving data files:   0%|          | 0/27468 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/27468 [00:00<?, ?it/s]

In [9]:
# take and skip
# IterableDataset.take() returns the first n examples in a dataset:
# IterableDataset.skip() omits the first n examples in a dataset and returns the remaining examples:
# 
# take and skip prevent future calls to shuffle because they lock in the order of the shards.
# You should shuffle your dataset before splitting it.
# Everything after the first 1000 examples of the already-shuffled stream.
train_dataset = shuffled_dataset.skip(1000)

# Just the first two examples of the unshuffled stream.
dataset_head = dataset.take(2)


In [10]:
# Shard
# Datasets supports sharding to divide a very large dataset into a predefined number of chunks
# num_shards: Specify the num_shards parameter in shard() to determine the number of shards to split the dataset into
# index: You’ll also need to provide the shard you want to return with the index parameter.
dataset = load_dataset(
    path="amazon_polarity",
    split="train",
    streaming=True,
)
print(dataset)

# Split into two chunks and keep chunk 0; in the recorded output below the
# reported num_shards drops from 4 to 2.
dataset = dataset.shard(index=0, num_shards=2)
print(dataset)

# ！！！
# If your dataset has dataset.num_shards==1, you should chunk it using IterableDataset.skip() and IterableDataset.take() instead.

README.md: 0.00B [00:00, ?B/s]

IterableDataset({
    features: ['label', 'title', 'content'],
    num_shards: 4
})
IterableDataset({
    features: ['label', 'title', 'content'],
    num_shards: 2
})


In [16]:
# ！！！
# Interleave
# interleave_datasets() can combine an IterableDataset with other datasets.
# The combined dataset returns alternating examples from each of the original datasets.
# Define sampling probabilities for each of the original datasets for more control over how each of them is sampled and combined. Set the probabilities argument with your desired sampling probabilities:
# stopping_strategy: first_exhausted, all_exhausted

# Open the Spanish and French C4 train splits as streams (Spanish first, to
# match the order of the probabilities below).
es_dataset, fr_dataset = (
    load_dataset(path="allenai/c4", name=language, split="train", streaming=True)
    for language in ("es", "fr")
)

# Sample from Spanish with probability 0.8 and French with 0.2, using a fixed
# seed and the "all_exhausted" stopping strategy.
multilingual_dataset = interleave_datasets(
    datasets=[es_dataset, fr_dataset],
    probabilities=[0.8, 0.2],
    seed=42,
    stopping_strategy="all_exhausted",
)

# Materialise only the first five mixed examples for display.
first_five = list(multilingual_dataset.take(5))
pprint(first_five, compact=True)

Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/2048 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/2048 [00:00<?, ?it/s]

[{'text': 'Comprar Zapatillas para niña en chancla con goma por detrás '
          'Gioseppo en rosa online\n'
          '>Zapatillas para niña en chancla con goma por detrás Gioseppo en '
          'rosa\n'
          'Zapatillas de estar en casa niña de otoño/invierno. Zapatillas para '
          'niña en chancla con goma por detrás Gioseppo en rosa modelo '
          '85612-Alice. De paño y con suela de tela.Muy cómodas debido a la '
          'sujección de la goma. Viene con una mochila de regalo.\n'
          'Numeración disponible del 24 al 30.\n'
          'Referencia: 85612-Alice\n'
          '8,25 €€ 11,00 € -25%',
  'timestamp': datetime.datetime(2019, 1, 18, 17, 11, 30),
  'url': 'https://www.calzadoslabalear.com/es/zapatillas-mujer/136-comprar-Zapatillas-para-nina-en-chancla-con-goma-por-detras-Gioseppo-en-rosa-online.html'},
 {'text': 'Chevrolet Cavalier Usados en Bogota - Carros en Venta\n'
          'Búsquedas Relacionadas: Chevrolet Cavalier , Chevrolet Cavalier '
      

In [None]:
# rename, remove, cast

In [17]:
# map
# IterableDataset.map() applies processing on-the-fly when examples are streamed.
# It allows you to apply a processing function to each example in a dataset, independently or in batches. 
# This function can even create new rows and columns.

def add_prefix(example):
    """Prepend ``'My text: '`` to the ``text`` field of one example.

    The example mapping is modified in place and the same object is returned,
    so every other field is passed through untouched.
    """
    original_text = example['text']
    example["text"] = 'My text: ' + original_text
    return example

dataset = load_dataset(
    path='allenai/c4',
    name='en',
    split='train',
    streaming=True,
)

# Register add_prefix as a per-example transform, then materialise only the
# first three transformed examples for display.
updated_dataset = dataset.map(add_prefix)
first_three = list(updated_dataset.take(3))
pprint(first_three, compact=True)

Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

[{'text': 'My text: Beginners BBQ Class Taking Place in Missoula!\n'
          'Do you want to get better at making delicious BBQ? You will have '
          'the opportunity, put this on your calendar now. Thursday, September '
          '22nd join World Class BBQ Champion, Tony Balay from Lonestar Smoke '
          'Rangers. He will be teaching a beginner level class for everyone '
          'who wants to get better with their culinary skills.\n'
          'He will teach you everything you need to know to compete in a KCBS '
          'BBQ competition, including techniques, recipes, timelines, meat '
          'selection and trimming, plus smoker and fire information.\n'
          'The cost to be in the class is $35 per person, and for spectators '
          'it is free. Included in the cost will be either a t-shirt or apron '
          'and you will be tasting samples of each meat that is prepared.',
  'timestamp': '2019-04-25 12:57:54',
  'url': 'https://klyq.com/beginners-bbq-class

In [21]:
# Tokenization
# Tokenizer used by encode() below.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# English C4 train split, opened as a stream.
dataset = load_dataset(path="allenai/c4", name="en", split="train", streaming=True)
def encode(examples):
    """Tokenize the ``text`` field of ``examples`` with the module-level tokenizer.

    Calls the tokenizer with ``truncation=True`` and ``padding='max_length'``
    and returns its output unchanged.
    """
    texts = examples['text']
    return tokenizer(texts, padding='max_length', truncation=True)
# Apply encode() in batches and drop the raw columns from the result.
raw_columns = ["text", "timestamp", "url"]
dataset = dataset.map(encode, batched=True, remove_columns=raw_columns)

# Show the first tokenized example.
first_encoded = next(iter(dataset))
pprint(first_encoded, compact=True)

Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0,

In [None]:
# Batch, not map(batched=True)
# The batch method transforms your IterableDataset into an iterable of batches
# This batching is done on-the-fly as you iterate over the dataset, preserving the memory-efficient nature of IterableDataset.
# The batch method also provides a **drop_last_batch** parameter. When set to True, it will discard the last batch if it’s smaller than the specified batch_size.

# Open the English C4 train split as a stream.
dataset = load_dataset(path="allenai/c4", name="en", split="train", streaming=True)

# Group the stream into batches of 32 examples, discarding a short final batch.
batched_dataset = dataset.batch(drop_last_batch=True, batch_size=32)

# Peek at the first batch only; nothing is printed if the stream yields none.
batch = next(iter(batched_dataset), None)
if batch is not None:
    pprint(batch, compact=True)

Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

{'text': ['Beginners BBQ Class Taking Place in Missoula!\n'
          'Do you want to get better at making delicious BBQ? You will have '
          'the opportunity, put this on your calendar now. Thursday, September '
          '22nd join World Class BBQ Champion, Tony Balay from Lonestar Smoke '
          'Rangers. He will be teaching a beginner level class for everyone '
          'who wants to get better with their culinary skills.\n'
          'He will teach you everything you need to know to compete in a KCBS '
          'BBQ competition, including techniques, recipes, timelines, meat '
          'selection and trimming, plus smoker and fire information.\n'
          'The cost to be in the class is $35 per person, and for spectators '
          'it is free. Included in the cost will be either a t-shirt or apron '
          'and you will be tasting samples of each meat that is prepared.',
          "Discussion in 'Mac OS X Lion (10.7)' started by axboi87, Jan 20, "
          '2012

In [None]:
# Important: https://huggingface.co/docs/datasets/stream#stream-in-a-training-loop
# An example of a training loop that consumes a streamed dataset.

In [None]:
# Save a dataset checkpoint and resume iteration
# If your training loop stops, you may want to restart the training from where it was
# state_dict = iterable_dataset.state_dict()
# store and load state_dict from pickle?
# iterable_dataset.load_state_dict(state_dict)

# On resume, shards that were already fully consumed are skipped,
# but within the current shard it must still read through the examples up to the checkpoint.

In [None]:
# Save and Export
# If the dataset consists of multiple shards (dataset.num_shards > 1), you can use multiple processes to upload it in parallel.
# push_to_hub(xxx, num_proc=8)

# Export
# dataset.to_csv()
# to_xxx