In [1]:
# 2025/7/17
# zhangzhong
# https://huggingface.co/docs/datasets/about_arrow

In [2]:
# Arrow
# https://arrow.apache.org/
# It is a specific data format that stores data in a columnar memory layout
# 
# Arrow’s standard format allows zero-copy reads which removes virtually all serialization overhead.
# Arrow is language-agnostic so it supports different programming languages.
# Arrow is column-oriented so it is faster at querying and processing slices or columns of data.
# Arrow allows for copy-free hand-offs to standard machine learning tools such as NumPy, Pandas, PyTorch, and TensorFlow.
# Arrow supports many, possibly nested, column types.

In [3]:
# Memory mapping
# 🤗 Datasets uses Arrow for its local caching system.
# It allows datasets to be backed by an on-disk cache, which is memory-mapped for fast lookup.
# This architecture allows for large datasets to be used on machines with relatively small device memory.
# For example, loading the full English Wikipedia dataset only takes a few MB of RAM

# import os
# import psutil
# from datasets import load_dataset

# mem_before = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)
# wiki = load_dataset("wikimedia/wikipedia", "20231101.en", split="train")
# mem_after = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)

# # This is possible because the Arrow data is actually memory-mapped from disk, and not loaded in memory
# # Memory-mapping allows access to data on disk, and leverages virtual memory capabilities for fast lookups.
# print(f"RAM memory used: {(mem_after - mem_before)} MB")

In [4]:
# import timeit
# s = """batch_size = 1000
# for batch in wiki.iter(batch_size):
#     ...
# """

# elapsed_time = timeit.timeit(stmt=s, number=1, globals=globals())
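# # wiki.dataset_size is the size of the dataset in bytes.
# # >> 30 shifts right by 30 bits, i.e. divides by 2**30, which converts bytes to GB.
# # >> 27 shifts right by 27 bits, i.e. divides by 2**27; this yields Gbit, because converting GByte to Gbit multiplies by 8.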
# print(f"Time to iterate over the {wiki.dataset_size >> 30} GB dataset: {elapsed_time:.1f} sec, "
#       f"ie. {float(wiki.dataset_size >> 27)/elapsed_time:.1f} Gb/s")

In [5]:
# The cache
# It stores previously downloaded and processed datasets so when you need to use them again, they are reloaded directly from the cache,
# even after you close the Python session and start another one.

In [6]:
# Fingerprint
# Datasets assigns a fingerprint to the cache file. 
# A fingerprint keeps track of the current state of a dataset.
# The initial fingerprint is computed using a hash from the Arrow table, or a hash of the Arrow files if the dataset is on disk.
# Subsequent fingerprints are computed by combining the fingerprint of the previous state, and a hash of the latest transform applied.

# In order for a transform to be hashable, it needs to be picklable by dill or pickle.
# https://dill.readthedocs.io/en/latest/
# https://docs.python.org/3/library/pickle.html
from datasets import Dataset, IterableDataset
# Build a small in-memory dataset with a single column "a".
dataset1 = Dataset.from_dict({"a": [0, 1, 2]})
# Derive a second dataset by applying a transform (the lambda) through map().
dataset2 = dataset1.map(lambda x: {"a": x["a"] + 1})
# Print both fingerprints side by side; in the recorded output below the two values differ.
# NOTE(review): _fingerprint is an underscore-prefixed attribute, so presumably not a stable public API.
print(dataset1._fingerprint, dataset2._fingerprint)

# If your transform is not hashable, Dataset will recompute the dataset every time.

# When caching is disabled, use Dataset.save_to_disk() to save your transformed dataset or it will be deleted once the session ends.

# The hash is computed by dumping the object using a dill pickler and hashing the dumped bytes.

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

889f7244aaee43a3 7978cbad9f40e9bb


In [7]:
# Differences between Dataset and IterableDataset
# https://huggingface.co/docs/datasets/about_mapstyle_vs_iterable

# IterableDataset is ideal for big datasets (think hundreds of GBs!) due to its lazy behavior and speed advantages
# while a Dataset is great for everything else

# A Dataset provides random access to the rows, but you must have the entire dataset stored on your disk or in memory.
#
# With an IterableDataset, you access the data using a for loop, loading it progressively as you iterate over the dataset.
# This way, only a small fraction of examples is loaded in memory, and you don’t write anything on disk.


In [8]:
# Map-style dataset: built from a plain dict of columns and converted entirely to Arrow,
# so any row can be fetched directly by its index.
my_dataset = Dataset.from_dict({"col_1": list(range(10))})
# Random access: show the first row.
print(my_dataset[0])

# Lazy
# Use a generator to create an iterable dataset
def my_generator(n):
    """Lazily yield ``n`` example dicts of the form ``{"col_1": i}`` for ``i`` in ``range(n)``."""
    yield from ({"col_1": index} for index in range(n))

# Wrap the generator function in an IterableDataset; gen_kwargs carries the n=10 argument for my_generator.
my_iterable_dataset = IterableDataset.from_generator(my_generator, gen_kwargs={"n": 10})
# Iterate and stop after the first example, so only one row is printed.
for example in my_iterable_dataset:
    print(example)
    break

{'col_1': 0}
{'col_1': 0}


In [9]:
# Loading local files entirely and progressively

# To save disk space and skip the conversion step, you can define an IterableDataset by streaming from the local files directly.
# the data is read progressively from the local files as you iterate over the dataset:

# data_files = {"train": ["path/to/data.csv"]}
# my_iterable_dataset = load_dataset("csv", data_files=data_files, split="train", streaming=True)
# for example in my_iterable_dataset:  # this reads the CSV file progressively as you iterate over the dataset
#     print(example)
#     break

In [None]:
# Eager data processing
# When you process a Dataset object using Dataset.map(), the entire dataset is processed immediately and returned.
# my_dataset = my_dataset.map(process_fn)  # process_fn is applied on all the examples of the dataset
# print(my_dataset[0])

# Lazy data processing
# On the other hand, due to the “lazy” nature of an IterableDataset,
# calling IterableDataset.map() does not apply your map function over the full dataset. 
# Instead, your map function is applied on-the-fly.

# you can chain multiple processing steps and they will all run at once when you start iterating over the dataset:
# my_iterable_dataset = my_iterable_dataset.map(process_fn_1)
# my_iterable_dataset = my_iterable_dataset.filter(filter_fn)
# my_iterable_dataset = my_iterable_dataset.map(process_fn_2)

# # process_fn_1, filter_fn and process_fn_2 are applied on-the-fly when iterating over the dataset
# for example in my_iterable_dataset:  
#     print(example)
#     break

In [None]:
# Exact shuffling
# When you shuffle a Dataset using Dataset.shuffle(), you apply an exact shuffling of the dataset. 
# It works by taking a list of indices [0, 1, 2, ... len(my_dataset) - 1] and shuffling this list.
# Then, accessing my_dataset[0] returns the row and index defined by the first element of the indices mapping that has been shuffled:

# Approximate shuffling
# IterableDataset.shuffle() uses a shuffle buffer to sample random examples iteratively from the dataset.
# It also shuffles the dataset shards if your dataset is made of multiple files or sources.

In [None]:
# Speed

# However as soon as your Dataset has an indices mapping (via Dataset.shuffle() for example), the speed can become 10x slower.
# you aren’t reading contiguous chunks of data anymore
# To restore the speed, you’d need to rewrite the entire dataset on your disk again using Dataset.flatten_indices(), which removes the indices mapping.
# my_dataset = my_dataset.flatten_indices()  # rewrite the shuffled dataset on disk as contiguous chunks of data

# IterableDataset.shuffle().
#  It only shuffles the shards order and adds a shuffle buffer to your dataset, which keeps the speed of your dataset optimal.

In [None]:
# resume

# To restart the iteration of a map-style dataset, you can simply skip the first examples:
# my_dataset = my_dataset.select(range(start_index, len(my_dataset)))

# On the other hand, iterable datasets don’t provide random access to a specific example index to resume from. 
# But you can use IterableDataset.state_dict() and IterableDataset.load_state_dict() to resume from a checkpoint instead, 
# similarly to what you can do for models and optimizers:
# 
# iterable_dataset = Dataset.from_dict({"a": range(6)}).to_iterable_dataset(num_shards=3)
# # save in the middle of training
# state_dict = iterable_dataset.state_dict()
# # and resume later
# iterable_dataset.load_state_dict(state_dict)
# 
# Under the hood, the iterable dataset keeps track of the current shard being read and the example index in the current shard
# and it stores this info in the state_dict.
# To resume from a checkpoint, the dataset skips all the shards that were previously read to restart from the current shard. 
# Then it reads the shard and skips examples until it reaches the exact example from the checkpoint.
# This can be used with the **StatefulDataLoader** from torchdata



In [None]:
# Switch from map-style to iterable-style
# iterable + shards + torch.dataloader?
# my_iterable_dataset = my_dataset.to_iterable_dataset(num_shards=1024)
# my_iterable_dataset.num_shards  # 1024

In [None]:
# Dataset Features
# https://huggingface.co/docs/datasets/about_dataset_features

# Features defines the internal structure of a dataset.
# The Features format is simple: dict[column_name, column_type].

In [None]:
# Build and Load
# https://huggingface.co/docs/datasets/about_dataset_load

In [None]:
# Batch Mapping
# It allows you to speed up processing, 
# and freely control the size of the generated dataset.

# Input size != output size
# The mapped function does not have to return an output batch of the same size as the input batch.
# However, remember that all values in the output dictionary must contain the same number of elements as the other fields in the output dictionary.
# The original columns are kept by default, so a new column with a different number of rows conflicts with them.
# To make it valid, you have to drop the original columns:
# dataset_with_duplicates = dataset.map(lambda batch: {"b": batch["a"] * 2}, remove_columns=dataset.column_names, batched=True)