In [1]:
# 2025/7/17
# zhangzhong
# https://huggingface.co/docs/datasets/use_with_pytorch

In [2]:
# By default, datasets return regular python objects: integers, floats, strings, lists, etc.
# To get PyTorch tensors instead, you can set the format of the dataset to pytorch using Dataset.with_format():

# A Dataset object is a wrapper of an Arrow table, which allows fast zero-copy reads from arrays in the dataset to PyTorch tensors.

from datasets import Dataset

data = [[1, 2], [3, 4]]
ds = Dataset.from_dict({"data": data})
print(ds[0])
ds = ds.with_format("torch")
print(ds[0])


{'data': [1, 2]}
{'data': tensor([1, 2])}


In [3]:
# To load the data as tensors on a GPU, specify the device argument:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ds = ds.with_format("torch", device=device)
print(ds[0])

{'data': tensor([1, 2], device='cuda:0')}


In [4]:
# n dim arrays
# Dataset could auto detect your shape, but that is slow
# use the Array feature type and specify the shape of your tensors:

from datasets import Dataset, Features, Array2D, ClassLabel
from pprint import pprint

data = [[[1, 2],[3, 4]],[[5, 6],[7, 8]]]
features = Features({"data": Array2D(shape=(2, 2), dtype='int32')})
ds = Dataset.from_dict({"data": data}, features=features)
ds = ds.with_format("torch", device=device)
print(ds[0])

# ClassLabel data are properly converted to tensors:
labels = [0,0,1]
features = Features({"label": ClassLabel(names=["negative", "positive"])})
ds = Dataset.from_dict({"label": labels}, features=features)
ds = ds.with_format("torch", device=device)
print(ds[:3])

# String and binary objects are unchanged, since PyTorch only supports numbers.
# There's also Image and Audio Features

{'data': tensor([[1, 2],
        [3, 4]], device='cuda:0')}
{'label': tensor([0, 0, 1], device='cuda:0')}


In [5]:
# Data loading
# Like torch.utils.data.Dataset objects, a Dataset can be passed directly to a PyTorch DataLoader:
# 我突然意识到duck type的威力
# 就是哪怕是不同的库，只要符合这个接口，就可以直接使用，不需要写任何额外的代码
# 如果必须定义一个接口，就会引入额外的麻烦

import numpy as np
from datasets import Dataset 
from torch.utils.data import DataLoader 

data = np.random.rand(16)
label = np.random.randint(0, 2, size=16)
ds = Dataset.from_dict({"data": data, "label": label}).with_format("torch")
dataloader = DataLoader(ds, batch_size=4)
for batch in dataloader:
    print(batch)

# Use multiple Workers
# You can parallelize data loading with the num_workers argument of a PyTorch DataLoader and get a higher throughput.
# Under the hood, the DataLoader starts num_workers processes.
# Reloading the dataset inside a worker doesn’t fill up your RAM, since it simply memory-maps the dataset again from your disk.
from datasets import Dataset, load_from_disk
data = np.random.rand(10_000)
Dataset.from_dict({"data": data}).save_to_disk("my_dataset")
ds = load_from_disk("my_dataset").with_format("torch")
dataloader = DataLoader(ds, batch_size=32, num_workers=4)
for batch in dataloader:
    print(batch)


{'data': tensor([0.6683, 0.8513, 0.0642, 0.0631]), 'label': tensor([0, 1, 0, 1])}
{'data': tensor([0.7173, 0.7092, 0.6365, 0.6824]), 'label': tensor([0, 0, 1, 0])}
{'data': tensor([0.7200, 0.8541, 0.5976, 0.8975]), 'label': tensor([0, 1, 1, 0])}
{'data': tensor([0.3390, 0.4353, 0.6758, 0.9314]), 'label': tensor([1, 1, 1, 0])}


Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

{'data': tensor([0.4640, 0.2318, 0.9689, 0.1065, 0.9808, 0.4691, 0.9099, 0.2488, 0.4804,
        0.4170, 0.4539, 0.1056, 0.1558, 0.0899, 0.8683, 0.3149, 0.5452, 0.7174,
        0.4344, 0.4653, 0.9596, 0.1760, 0.4911, 0.0196, 0.8305, 0.2457, 0.8844,
        0.3367, 0.4609, 0.7093, 0.2250, 0.4985])}
{'data': tensor([0.9915, 0.2486, 0.4776, 0.0503, 0.1214, 0.6860, 0.5202, 0.2168, 0.6284,
        0.6314, 0.0992, 0.7056, 0.4249, 0.4050, 0.8956, 0.5615, 0.8334, 0.9263,
        0.2271, 0.9802, 0.3248, 0.3440, 0.9772, 0.6927, 0.5328, 0.9341, 0.7672,
        0.8698, 0.9127, 0.9092, 0.9868, 0.5507])}
{'data': tensor([0.3474, 0.6945, 0.9260, 0.5981, 0.7643, 0.6684, 0.8337, 0.0556, 0.1427,
        0.3584, 0.5953, 0.9621, 0.5935, 0.4425, 0.9392, 0.7509, 0.8154, 0.6868,
        0.3627, 0.7550, 0.9914, 0.8518, 0.8302, 0.3857, 0.0612, 0.6960, 0.3597,
        0.6107, 0.5415, 0.5086, 0.7156, 0.3616])}
{'data': tensor([0.7387, 0.5939, 0.9701, 0.8795, 0.5492, 0.7316, 0.3470, 0.9872, 0.5782,
        0.7167

In [6]:
# Stream data
# Stream a dataset by loading it as an IterableDataset.
# This allows you to progressively iterate over a remote dataset without downloading it on disk and or over local data files.

data = np.random.rand(10_000)
ds = Dataset.from_dict({"data": data}).to_iterable_dataset().with_format("torch")
# Too many dataloader workers: 4 (max is dataset.num_shards=1). Stopping 3 dataloader workers.
# streaming的worker和shard是相关的？
dataloader = DataLoader(ds, batch_size=32, num_workers=4)
for batch in dataloader:
    print(batch)
    break

# 原来如此
# If the dataset is split in several shards (i.e. if the dataset consists of multiple data files),
# then you can stream in parallel using num_workers:
# TODO：这样也不行，似乎只能在制作数据集的时候划分了
ds = Dataset.from_dict({"data": data}).shard(num_shards=4, index=1).to_iterable_dataset().with_format("torch")
dataloader = DataLoader(ds, batch_size=32, num_workers=4)
for batch in dataloader:
    print(batch)
    break


Too many dataloader workers: 4 (max is dataset.num_shards=1). Stopping 3 dataloader workers.


{'data': tensor([0.5180, 0.8363, 0.9521, 0.3902, 0.1740, 0.1757, 0.1858, 0.7403, 0.6982,
        0.9888, 0.1379, 0.8032, 0.4320, 0.6836, 0.1856, 0.5960, 0.0163, 0.4490,
        0.0548, 0.1642, 0.4477, 0.3474, 0.8317, 0.7901, 0.4495, 0.8742, 0.6626,
        0.2249, 0.4131, 0.3228, 0.6172, 0.8561])}


Too many dataloader workers: 4 (max is dataset.num_shards=1). Stopping 3 dataloader workers.


{'data': tensor([0.0655, 0.5011, 0.9857, 0.8026, 0.2734, 0.4893, 0.9581, 0.7485, 0.9317,
        0.8979, 0.6691, 0.5200, 0.5485, 0.8747, 0.3217, 0.6454, 0.5434, 0.5487,
        0.0481, 0.6767, 0.7471, 0.1086, 0.6881, 0.8658, 0.1224, 0.3086, 0.6108,
        0.3963, 0.6491, 0.2963, 0.8716, 0.8581])}


In [None]:
# TODO: Checkpoint and resume
# If you need a DataLoader that you can checkpoint and resume in the middle of training, you can use the StatefulDataLoader from torchdata:

# https://github.com/pytorch/data
from torchdata.stateful_dataloader import StatefulDataLoader
from datasets import load_dataset, IterableDataset

my_iterable_dataset: IterableDataset = load_dataset(path="deepmind/code_contests", streaming=True, split="train")
dataloader = StatefulDataLoader(dataset=my_iterable_dataset, batch_size=32, num_workers=4)
# save in the middle of training
state_dict = dataloader.state_dict()
# load in the middle of training
dataloader.load_state_dict(state_dict)
# 不是，这也没有保存到disk呀，难道要结合pickle?


Resolving data files:   0%|          | 0/39 [00:00<?, ?it/s]

In [None]:
# Distributed， TODO 目前用不上
# If the dataset has a number of shards that is a factor of world_size (i.e. if dataset.num_shards % world_size == 0), 
# then the shards are evenly assigned across the nodes, which is the most optimized