In [1]:
# 2025/7/16
# zhangzhong
# https://huggingface.co/docs/datasets/process

In [2]:
# Process
# This guide will show you how to:

# - Reorder rows and split the dataset.
# - Rename and remove columns, and other common column operations.
# - Apply processing functions to each example in a dataset.
# - Concatenate datasets.
# - Apply a custom formatting transform.
# - Save and export processed datasets.

In [3]:
# !!!
# All processing methods in this guide return a **new** Dataset object
# Modification is **not** done in-place

In [4]:
from datasets import load_dataset, Dataset
from pprint import pprint

dataset: Dataset = load_dataset(path="nyu-mll/glue", name="mrpc", split="train")

In [5]:
# 1. Reorder rows and split the dataset.
# There are several functions for rearranging the structure of a dataset.
# These functions are useful for selecting only the rows you want, creating train and test splits,
#  and sharding very large datasets into smaller chunks.

In [6]:
# Sort
# Use sort() to sort column values according to their numerical values
# The provided column must be **NumPy** compatible.

# !!! indicies mapping
# 其实并没有修改原始数据，相当于间接排序
# Under the hood, this creates a list of indices that is sorted according to values of the column
# This indices mapping is then used to access the right rows in the underlying Arrow table.

print(dataset["label"][:10])

sorted_dataset: Dataset = dataset.sort(column_names="label")
print(sorted_dataset["label"][:10])
print(sorted_dataset["label"][-10:])

[1, 0, 1, 0, 1, 1, 0, 1, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [7]:
# Shuffle
# The shuffle() function randomly rearranges the column values. 
# Shuffling takes the list of indices [0:len(my_dataset)] and shuffles it to create an indices mapping

# However as soon as your Dataset has an indices mapping, the speed can become 10x slower.
# To restore the speed, you’d need to rewrite the entire dataset on your disk again using Dataset.flatten_indices(),
#  which removes the indices mapping.

# wocao! iterable也可以做shuffle？NB
# Alternatively, you can switch to an IterableDataset and leverage its fast approximate shuffling IterableDataset.shuffle():

shuffled_dataset: Dataset = sorted_dataset.shuffle(seed=42)
shuffled_dataset["label"][:10]

[1, 1, 1, 0, 1, 1, 1, 1, 1, 0]

In [8]:
# Select and filter
# Unless the list of indices to keep is contiguous, those methods also create an indices mapping under the hood.

# select() returns rows according to a list of indices:
small_dataset: Dataset = shuffled_dataset.select([0, 10, 20, 30, 40, 50])
print(len(small_dataset))

# filter() returns rows that match a specified condition:
start_with_ar: Dataset = dataset.filter(function=lambda example: example["sentence1"].startswith("Ar"))
print(len(start_with_ar))

# filter() can also filter by indices if you set with_indices=True:
even_dataset: Dataset = dataset.filter(function=lambda example, idx: idx %2==0, with_indices=True)
print(len(even_dataset))

6
6
1834


In [9]:
# Split
# The train_test_split() function creates train and test splits if your dataset doesn’t already have them. 
# test_size parameter to create a test split that is 10% of the original dataset:
# The splits are shuffled by default, but you can set shuffle=False to prevent shuffling.

print(dataset)
train_test_split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
print(train_test_split_dataset)

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})
DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3301
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 367
    })
})


In [10]:
# Shard
# 🤗 Datasets supports sharding to divide a very large dataset into a predefined number of chunks.
#  

dataset: Dataset = load_dataset("stanfordnlp/imdb", split="train")
print(dataset)

# Specify the **num_shards** parameter in shard() to determine the number of shards to split the dataset into
# rovide the shard you want to return with the **index** parameter.
# After sharding the dataset into four chunks, the first shard will only have 6250 examples:
sharded_dataset: Dataset = dataset.shard(num_shards=4, index=0)
print(sharded_dataset)

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})
Dataset({
    features: ['text', 'label'],
    num_rows: 6250
})


In [11]:
# Rename, remove, cast, flatten

#  Features associated with the original column are actually moved under the new column name, instead of just replacing the original column in-place.
# Rename
dataset: Dataset = load_dataset(path="nyu-mll/glue", name="mrpc", split="train")
print(dataset)

dataset = dataset.rename_column(original_column_name="sentence1", new_column_name="sentenceA")
dataset = dataset.rename_column(original_column_name="sentence2", new_column_name="sentenceB")
print(dataset)

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})
Dataset({
    features: ['sentenceA', 'sentenceB', 'label', 'idx'],
    num_rows: 3668
})


In [12]:
# Remove

dataset = dataset.remove_columns(column_names=["sentenceA", "sentenceB"])
print(dataset)

Dataset({
    features: ['label', 'idx'],
    num_rows: 3668
})


In [13]:
# 
# Conversely, select_columns() selects one or more columns to keep and removes the rest

dataset = load_dataset(path="nyu-mll/glue", name="mrpc", split="train")
print(dataset)

dataset = dataset.select_columns(column_names=["sentence1", "sentence2", "label"])
print(dataset)

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})
Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 3668
})


In [14]:
# Cast
# Casting only works if the original feature type and new feature type are compatible

dataset = load_dataset(path="nyu-mll/glue", name="mrpc", split="train")
pprint(dataset.features)

from datasets import ClassLabel, Value

new_features = dataset.features.copy()
new_features["label"] = ClassLabel(names=['negative', 'positive'])
new_features["idx"] = Value(dtype="int64")
dataset = dataset.cast(features=new_features)
pprint(dataset.features)

# Use the cast_column() function to change the feature type of a single column.
dataset = dataset.cast_column(column="idx", feature=Value(dtype="int32"))
pprint(dataset.features)

{'idx': Value('int32'),
 'label': ClassLabel(names=['not_equivalent', 'equivalent']),
 'sentence1': Value('string'),
 'sentence2': Value('string')}
{'idx': Value('int64'),
 'label': ClassLabel(names=['negative', 'positive']),
 'sentence1': Value('string'),
 'sentence2': Value('string')}
{'idx': Value('int32'),
 'label': ClassLabel(names=['negative', 'positive']),
 'sentence1': Value('string'),
 'sentence2': Value('string')}


In [15]:
# Flatten
# Sometimes a column can be a nested structure of several types

dataset = load_dataset("rajpurkar/squad", split="train")
pprint(dataset.features)

# The answers field contains two subfields: text and answer_start.
# Use the flatten() function to extract the subfields into their own separate columns:
flat_dataset = dataset.flatten()
pprint(flat_dataset.features)


{'answers': {'answer_start': List(Value('int32')),
             'text': List(Value('string'))},
 'context': Value('string'),
 'id': Value('string'),
 'question': Value('string'),
 'title': Value('string')}
{'answers.answer_start': List(Value('int32')),
 'answers.text': List(Value('string')),
 'context': Value('string'),
 'id': Value('string'),
 'question': Value('string'),
 'title': Value('string')}


In [16]:
# Map
# The primary purpose of map() is to speed up processing functions
# It allows you to apply a processing function to each example in a dataset, independently or in batches
# This function can even create new rows and columns.

dataset = load_dataset(path="nyu-mll/glue", name="mrpc", split="train")

# !!! 这个函数只能用来处理单个example
# 如果是batch，那么 examples["sentence1"] 就是一个list
def add_prefix(example):
    example["sentence1"] = "My sentence: " + example["sentence1"]
    return example

def add_prefix_batch(examples):
    examples["sentence1"] = ["My sentence: " + sentence for sentence in examples["sentence1"]]
    return examples

updated_dataset = dataset.map(function=add_prefix_batch, batched=True)
pprint(updated_dataset["sentence1"][:2], compact=True)

# remove a column, actually rename a column
# Datasets also has a remove_columns() function which is faster because it doesn’t copy the data of the remaining columns.
updated_dataset = dataset.map(function=lambda example: {"new_sentence": example["sentence1"]}, remove_columns=["sentence1"])
print(updated_dataset.column_names)

# You can also use map() with indices if you set with_indices=True
updated_dataset = dataset.map(function=lambda example, idx: {"sentence2": f"{idx}: {example['sentence2']}"}, with_indices=True)
pprint(updated_dataset["sentence2"][:5], compact=True)

['My sentence: Amrozi accused his brother , whom he called " the witness " , '
 'of deliberately distorting his evidence .',
 "My sentence: Yucaipa owned Dominick 's before selling the chain to Safeway "
 'in 1998 for $ 2.5 billion .']
['sentence2', 'label', 'idx', 'new_sentence']
['0: Referring to him as only " the witness " , Amrozi accused his brother of '
 'deliberately distorting his evidence .',
 "1: Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to "
 'Safeway for $ 1.8 billion in 1998 .',
 "2: On June 10 , the ship 's owners had published an advertisement on the "
 'Internet , offering the explosives for sale .',
 '3: Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A '
 '$ 4.57 .',
 '4: PG & E Corp. shares jumped $ 1.63 or 8 percent to $ 21.03 on the New York '
 'Stock Exchange on Friday .']


In [17]:
# Multiprocessing Map
# Multiprocessing significantly speeds up processing by parallelizing processes on the CPU
# Set the num_proc parameter in map() to set the number of processes to use:

updated_dataset: Dataset = dataset.map(
    function=lambda example, idx: {"sentence2": f"{idx}: " + example["sentence2"]},
    with_indices=True,
    num_proc=4,
)

# 我懂了，with rank就知识提供一个进程的编号
# 在preprocess的参数里面
# 我们可以用这个rank来做一些映射，比如映射到gpu上
# 这里的 rank 是 Hugging Face datasets.map(..., with_rank=True) 提供的 进程编号，用于在多个进程中识别当前处理的是第几个进程，从而选择不同的 GPU
#
# def gpu_computation(batch, rank): # ！！！注意这里的rank参数，这个函数就是传递给map的函数
#     # Move the model on the right GPU if it's not there already
#     device = f"cuda:{(rank or 0) % torch.cuda.device_count()}"
#     model.to(device)
# 
#     # Your big GPU call goes here, for example:
#     chats = [[
#         {"role": "system", "content": "You are a helpful assistant."},
#         {"role": "user", "content": prompt}
#     ] for prompt in batch["prompt"]]
#     texts = [tokenizer.apply_chat_template(
#         chat,
#         tokenize=False,
#         add_generation_prompt=True
#     ) for chat in chats]
#     model_inputs = tokenizer(texts, padding=True, return_tensors="pt").to(device)
#     with torch.no_grad():
#         outputs = model.generate(**model_inputs, max_new_tokens=512)
#     batch["output"] = tokenizer.batch_decode(outputs, skip_special_tokens=True)
#     return batch
#
# updated_dataset= dataset.map(function=gpu_computation, with_rank=True, num_proc=torch.cuda.device_count())


# TODO：今晚写训练代码的时候仔细研究
# https://huggingface.co/docs/datasets/process#multiprocessing
# 这里的处理好像非常关键啊，我突然意识到我们根本就不需要streaming啊，因为本地加载是用mmap做的
# 就算是超大的数据也不会占用大量的内存，这段代码应该就是我们做DDP的核心代码了！


In [None]:
# Batch processing
# The map() function supports working with batches of examples.
# Operate on batches by setting batched=True
# The default batch size is 1000, but you can adjust it with the batch_size parameter.


dataset = load_dataset(path="nyu-mll/glue", name="mrpc", split="train")

# split long examples
# 1. splits the sentence1 field into chunks of 50 characters
# 2. stack all the chunks together to create the new dataset
def chunk_examples(examples):
    chunks = []
    for sentence in examples["sentence1"]:
        chunks.extend([sentence[i : i + 50] for i in range(0, len(sentence), 50)])
    return {"chunks": chunks}


chunked_dataset = dataset.map(
    function=chunk_examples,
    batched=True,
    batch_size=1000,
    # 不加这一行就会报错，Column 4 named chunks expected length 1000 but got length 2847
    # 要和这个东西搭配，因为我们的chunk_example返回的行数比原始的数据集要多
    # 因为chunk_examples函数返回的长度应该和传入的examples，也就是batch_size一样
    remove_columns=dataset.column_names,
)
print(len(dataset))
print(len(chunked_dataset))
print(chunked_dataset["chunks"][:3])

# 那这样想象空间就很大了
# hugging face上还有一个data augmentaiton的例子
# Create a function to randomly select a word to mask in the sentence.
# def augment_data(examples):
#     outputs = []
#     for sentence in examples["sentence1"]:
#         words = sentence.split(' ')
#         K = randint(1, len(words)-1)
#         masked_sentence = " ".join(words[:K]  + [mask_token] + words[K+1:])
#         predictions = fillmask(masked_sentence)
#         augmented_sequences = [predictions[i]["sequence"] for i in range(3)]
#         outputs += [sentence] + augmented_sequences
#     return {"data": outputs}

3668
10470
['Amrozi accused his brother , whom he called " the ', 'witness " , of deliberately distorting his evidenc', 'e .']


In [None]:
# https://huggingface.co/docs/datasets/process#asynchronous-processing
# TODO: read this if needed, 简单的说就是map也支持async函数，支持并发，默认的并发是1000

In [25]:
# https://huggingface.co/docs/datasets/process#process-multiple-splits
# Many datasets have splits that can be processed simultaneously with DatasetDict.map().
from datasets import DatasetDict
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset 

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="bert-base-uncased")

dataset_dict: DatasetDict = load_dataset(path="nyu-mll/glue", name="mrpc")
encoded_dataset = dataset_dict.map(lambda examples: tokenizer(examples["sentence1"]), batched=True)
pprint(encoded_dataset["train"][0], compact=True)


{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1],
 'idx': 0,
 'input_ids': [101, 2572, 3217, 5831, 5496, 2010, 2567, 1010, 3183, 2002, 2170,
               1000, 1996, 7409, 1000, 1010, 1997, 9969, 4487, 23809, 3436,
               2010, 3350, 1012, 102],
 'label': 1,
 'sentence1': 'Amrozi accused his brother , whom he called " the witness " , '
              'of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his '
              'brother of deliberately distorting his evidence .',
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0]}


In [None]:
# https://huggingface.co/docs/datasets/process#distributed-usage
# 分布式训练的时候，需要注意
# This ensures the main process performs the mapping, while the other processes load the results, thereby avoiding duplicate work.
# ？？？
# 不可以不同的进程独立的处理数据，分别喂给大模型吗？

In [None]:
# Batch
# The batch() method allows you to group samples from the dataset into batches.
# Note that Dataset.batch() returns a new Dataset where each item is a batch of multiple samples from the original datase

dataset: Dataset = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train")
# The batch() method accepts the following parameters:
# 
# batch_size (int): The number of samples in each batch.
# drop_last_batch (bool, defaults to False): Whether to drop the last incomplete batch if the dataset size is not divisible by the batch size.
# num_proc (int, optional, defaults to None): The number of processes to use for multiprocessing. If None, no multiprocessing is used. This can significantly speed up batching for large datasets.
batched_dataset = dataset.batch(batch_size=4)
pprint(batched_dataset[0:2], compact=True)

Batching examples:   0%|          | 0/8530 [00:00<?, ? examples/s]

{'label': [[1, 1, 1, 1], [1, 1, 1, 1]],
 'text': [['the rock is destined to be the 21st century\'s new " conan " and '
           "that he's going to make a splash even greater than arnold "
           'schwarzenegger , jean-claud van damme or steven segal .',
           'the gorgeously elaborate continuation of " the lord of the rings " '
           'trilogy is so huge that a column of words cannot adequately '
           "describe co-writer/director peter jackson's expanded vision of j . "
           "r . r . tolkien's middle-earth .",
           'effective but too-tepid biopic',
           'if you sometimes like to go to the movies to have fun , wasabi is '
           'a good place to start .'],
          ["emerges as something rare , an issue movie that's so honest and "
           "keenly observed that it doesn't feel like one .",
           'the film provides some great insight into the neurotic mindset of '
           'all comics -- even those who have reached the absolute top o

In [27]:
# Concatenate
# Separate datasets can be concatenated if they share the same column types.

# from datasets import concatenate_datasets, load_dataset

# stories = load_dataset("ajibawa-2023/General-Stories-Collection", split="train")
# stories = stories.remove_columns([col for col in stories.column_names if col != "text"])  # only keep the 'text' column
# wiki = load_dataset("wikimedia/wikipedia", "20220301.en", split="train")
# wiki = wiki.remove_columns([col for col in wiki.column_names if col != "text"])  # only keep the 'text' column

# assert stories.features.type == wiki.features.type
# bert_dataset = concatenate_datasets([stories, wiki])

# # You can also concatenate two datasets horizontally by setting axis=1 as long as the datasets have the same number of rows:
# # 相当于添加新的列啊
# from datasets import Dataset
# stories_ids = Dataset.from_dict({"ids": list(range(len(stories)))})
# stories_with_ids = concatenate_datasets([stories, stories_ids], axis=1)

In [None]:
# Interleave
# You can also mix several datasets together by taking alternating examples from each one to create a new dataset. 
# This is known as interleaving, which is enabled by the interleave_datasets() function

# stopping_strategy
# The default strategy, first_exhausted, is a subsampling strategy, 
# i.e the dataset construction is stopped as soon one of the dataset runs out of samples.
#
# stopping_strategy=all_exhausted to execute an oversampling strategy
# In this case, the dataset construction is stopped as soon as every samples in every dataset has been added at least once
# In practice, it means that if a dataset is exhausted, it will return to the beginning of this dataset until the stop criterion has been reached
# ! epoch=2


from datasets import Dataset, interleave_datasets
seed = 42
probabilities = [0.3, 0.5, 0.2] # wocao,采样率！
d1 = Dataset.from_dict({"a": [0, 1, 2]})
d2 = Dataset.from_dict({"a": [10, 11, 12, 13]})
d3 = Dataset.from_dict({"a": [20, 21, 22]})
dataset = interleave_datasets([d1, d2, d3], probabilities=probabilities, seed=seed, stopping_strategy="first_exhausted")
print(dataset["a"])

# 有bug？怎么结果是一样的？
dataset = interleave_datasets([d1, d2, d3], probabilities=probabilities, seed=seed, stopping_strategy="all_exhausted")
print(dataset["a"])


Column([10, 11, 20, 12, 0])
Column([10, 11, 20, 12, 0])


In [None]:
# 确实有bug！
# TODO：可以调试一下
d1 = Dataset.from_dict({"a": [0, 1, 2]})
d2 = Dataset.from_dict({"a": [10, 11, 12, 13]})
d3 = Dataset.from_dict({"a": [20, 21, 22]})
dataset = interleave_datasets([d1, d2, d3], stopping_strategy="all_exhausted")
dataset["a"]

Column([0, 10, 20, 1, 11])

In [None]:
# # format
# # The with_format() function changes the format of a column to be compatible with some common data formats
# # Specify the output you’d like in the type parameter. such as type=torch
# # You can also choose which the columns you want to format using columns=. Formatting is applied on-the-fly.
# dataset = dataset.with_format(type="torch", columns=["a"])

# # danger! The set_format() function also changes the format of a column, except it runs in-place:
# dataset.set_format(type="torch")

# # If you need to reset the dataset to its original format, set the format to None (or use reset_format()):
# dataset.format
# dataset = dataset.with_format(None)
# dataset.format

In [None]:
# Tensor format: numpy, torch, tensorflow, jax
# When a dataset is formatted in a tensor or array format, 
# all the data are formatted as tensors or arrays (except unsupported types like strings for example for PyTorch):
ds = Dataset.from_dict({"text": ["foo", "bar"], "tokens": [[0, 1, 2], [3, 4, 5]]})
ds = ds.with_format("torch")
ds[0]
ds[:2]

{'text': ['foo', 'bar'],
 'tokens': tensor([[0, 1, 2],
         [3, 4, 5]])}

In [None]:
# tabular format: pandas, polars, arrow
# custom format

In [None]:
# Save
# Once your dataset is ready, you can save it as a Hugging Face Dataset in **Parquet** format and reuse it later with load_dataset().
# use push_to_hub
# You can use multiple processes to upload it in paralle
# dataset.push_to_hub("username/my_dataset", num_proc=8)

# Alternatively, you can save it locally in **Arrow** format on disk.
# encoded_dataset.save_to_disk("path/of/my/dataset/directory")
# reloaded_dataset = load_from_disk("path/of/my/dataset/directory")

In [None]:
# Export
# datasets supports exporting as well so you can work with your dataset in other applications
# dataset.to_csv("path/of/my/dataset.csv")
# dataset.to_json("path/of/my/dataset.json")
# dataset.to_parquet("path/of/my/dataset.parquet")
# dataset.to_sql("sqlite:///path/of/my/dataset.db", table_name="my_table")
# dataset.to_pandas()
# dataset.to_dict()
# ...