In [1]:
# 2025/7/16
# zhangzhong
# https://huggingface.co/docs/datasets/tutorial

In [2]:
# Load a dataset
# Before you take the time to download a dataset, it’s often helpful to quickly get some general information about a dataset
# A dataset’s information is stored inside DatasetInfo and can include information such as the dataset description, features, and dataset size.
# https://huggingface.co/docs/datasets/v4.0.0/en/package_reference/main_classes#datasets.DatasetInfo

from pprint import pprint

from datasets import load_dataset_builder
from datasets.builder import DatasetBuilder

# Inspect the dataset's metadata (its DatasetInfo) through the builder
# before loading any split.
ds_builder: DatasetBuilder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
pprint(ds_builder.info, compact=True)

DatasetInfo(description='',
            citation='',
            homepage='',
            license='',
            features={'label': ClassLabel(names=['neg', 'pos']),
                      'text': Value('string')},
            post_processed=None,
            supervised_keys=None,
            builder_name='parquet',
            dataset_name='rotten_tomatoes',
            config_name='default',
            version=0.0.0,
            splits={'test': SplitInfo(name='test',
                                      num_bytes=136102,
                                      num_examples=1066,
                                      shard_lengths=None,
                                      dataset_name='rotten_tomatoes'),
                    'train': SplitInfo(name='train',
                                       num_bytes=1075873,
                                       num_examples=8530,
                                       shard_lengths=None,
                                       dataset_name='ro

In [3]:
from datasets import load_dataset

# Ask for the "train" split only.
dataset = load_dataset(path="cornell-movie-review-data/rotten_tomatoes", split="train")
pprint(dataset)

Dataset({
    features: ['text', 'label'],
    num_rows: 8530
})


In [4]:
# https://huggingface.co/docs/datasets/load_hub#splits
# Splits: A split is a specific subset of a dataset like train and test. 
# List a dataset’s split names with the get_dataset_split_names() function:

from datasets import Dataset, DatasetDict, get_dataset_split_names

# Enumerate the splits the repository provides.
split_names = get_dataset_split_names(path="cornell-movie-review-data/rotten_tomatoes")
pprint(split_names)

# Passing `split` selects a single split, which comes back as a Dataset object:
# https://huggingface.co/docs/datasets/v4.0.0/en/package_reference/main_classes#datasets.Dataset
dataset: Dataset = load_dataset(path="cornell-movie-review-data/rotten_tomatoes", split="train")
pprint(dataset)

# Leaving `split` out makes 🤗 Datasets return a DatasetDict (one entry per split) instead.
dataset_dict: DatasetDict = load_dataset(path="cornell-movie-review-data/rotten_tomatoes")
pprint(dataset_dict)

# If streaming=True is set, an iterable dataset is returned instead:
# the data files are not downloaded; the data is streamed progressively while iterating on the dataset. An [IterableDataset] or [IterableDatasetDict] is returned in this case.


['train', 'validation', 'test']
Dataset({
    features: ['text', 'label'],
    num_rows: 8530
})
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})


In [5]:
# Configurations
# Some datasets contain several sub-datasets.
# These sub-datasets are known as configurations or subsets, and you **must** explicitly select one when loading the dataset.

from datasets import get_dataset_config_names

# List the configurations (subsets) this repository offers.
configs = get_dataset_config_names(path="GPUMODE/kernelbot-data")
pprint(configs)

# Then load the configuration you want by passing its name.
dataset = load_dataset("GPUMODE/kernelbot-data", name="submissions")
pprint(dataset)

# Even for a dataset builder the config name must be passed: different names
# are treated as different datasets!
# Getting the builder directly like this raises an error:
# ds_builder = load_dataset_builder(path="GPUMODE/kernelbot-data")
# pprint(ds_builder.info, compact=True)

['submissions', 'successful_submissions', 'leaderboards']
DatasetDict({
    train: Dataset({
        features: ['submission_id', 'leaderboard_id', 'user_id', 'submission_time', 'file_name', 'code', 'code_id', 'run_id', 'run_start_time', 'run_end_time', 'run_mode', 'run_score', 'run_passed', 'run_result', 'run_compilation', 'run_meta', 'run_system_info'],
        num_rows: 40095
    })
})


In [None]:
# Know your dataset
# There are two types of dataset objects, a regular Dataset and then an ✨ IterableDataset ✨
#
# https://huggingface.co/docs/datasets/v4.0.0/en/package_reference/main_classes#datasets.Dataset
# A Dataset provides fast random access to the rows, and memory-mapping so that loading even large datasets only uses a relatively small amount of device memory.
# 
# https://huggingface.co/docs/datasets/v4.0.0/en/package_reference/main_classes#datasets.IterableDataset
# But for really, really big datasets that won’t even fit on disk or in memory, an IterableDataset allows you to access and use the dataset without waiting for it to download completely!

In [8]:
# Dataset
# When you load a dataset split, you’ll get a Dataset object.

dataset: Dataset = load_dataset(path="cornell-movie-review-data/rotten_tomatoes", split="train")

# Indexing
# A Dataset is made of columns, and each column may hold a different type of data.

# Integer index 0: the first row.
print(dataset[0])
# Negative index: the last row.
print(dataset[-1])
# A column name selects the whole column (printed as a `Column([...])` object).
print(dataset["text"])
# Row index followed by column name: the "text" value of the first row.
print(dataset[0]["text"])

# Slicing
# The first three rows.
print(dataset[:3])
# Rows three up to (not including) six.
print(dataset[3:6])

{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'label': 1}
{'text': 'things really get weird , though not particularly scary : the movie is all portent and no content .', 'label': 0}
Column(['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .', 'effective but too-tepid biopic', 'if you sometimes like to go to the movies to have fun , wasabi is a good place to start .', "emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one ."])
the ro

In [14]:
# Iterable Dataset
# An IterableDataset is loaded when you set the streaming parameter to True in load_dataset():
# An IterableDataset progressively iterates over a dataset one example at a time, 
# so you don’t have to wait for the whole dataset to download before you can use it

from datasets import IterableDataset

# streaming=True makes load_dataset() hand back an IterableDataset.
iterable_dataset: IterableDataset = load_dataset("ethz/food101", split="train", streaming=True)
for example in iterable_dataset:
    # Only the first streamed example is printed, to avoid flooding the output.
    print(example)
    break

# An IterableDataset can also be built from an existing Dataset; the Datasets
# docs note this is faster than streaming mode because the data is streamed
# from local files.
dataset: Dataset = load_dataset(path="cornell-movie-review-data/rotten_tomatoes", split="train")
iterable_dataset: IterableDataset = dataset.to_iterable_dataset()

# Indexing
# !!! An IterableDataset offers no random access to examples. Iterate over it
# instead, either with next(iter(...)) or with a for loop.

# First example via next(iter(...)).
print(next(iter(iterable_dataset)))

# First example via a for loop that stops after one item.
for example in iterable_dataset:
    print(example)
    break

# Column indexing is still supported and yields an iterable over the column values.
print(next(iter(iterable_dataset["text"])))

# Subset
# IterableDataset.take() creates a new IterableDataset; here its first three
# examples are collected into a list.
print(list(iterable_dataset.take(3)))

{'image': <PIL.Image.Image image mode=RGB size=384x512 at 0x7F252A06F5C0>, 'label': 6}
{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'label': 1}
{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'label': 1}
the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .
[{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'label': 1}, {'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words ca

In [None]:
# TODO: https://huggingface.co/docs/datasets/about_mapstyle_vs_iterable

In [None]:
# Preprocess
# TODO: read the https://huggingface.co/docs/datasets/use_dataset#resample-audio-signals if needed.
# TODO: read the https://huggingface.co/docs/datasets/use_dataset#apply-data-augmentations if needed.

# Tokenize
# Models cannot process raw text, so you’ll need to convert the text into numbers
# Tokenization provides a way to do this by dividing text into individual words called tokens.
# Tokens are finally converted to numbers.

In [None]:
# 1. Load a tokenizer
# Using the **same** tokenizer as the pretrained model is important because you want to make sure the text is split in the same way.

from datasets import Dataset, load_dataset
from transformers import AutoTokenizer

# Tokenizer matching the pretrained "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Training split whose "text" column is tokenized in the following cells.
dataset: Dataset = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train")

In [None]:
# 2. Call your tokenizer on the first row of text in the dataset:

# The tokenizer returns a dictionary with three items:
# - input_ids: the numbers representing the tokens in the text.
# - token_type_ids: indicates which sequence a token belongs to if there is more than one sequence.
# - attention_mask: indicates whether a token should be masked or not.
# These values are actually the model inputs.

# One sequence: the text of the first row.
pprint(tokenizer(text=dataset[0]["text"]), compact=True)
# Two sequences at once: the texts of the first and second rows.
pprint(tokenizer(text=dataset[0]["text"], text_pair=dataset[1]["text"]), compact=True)

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1],
 'input_ids': [101, 1996, 2600, 2003, 16036, 2000, 2022, 1996, 7398, 2301, 1005,
               1055, 2047, 1000, 16608, 1000, 1998, 2008, 2002, 1005, 1055,
               2183, 2000, 2191, 1037, 17624, 2130, 3618, 2084, 7779, 29058,
               8625, 13327, 1010, 3744, 1011, 18856, 19513, 3158, 5477, 4168,
               2030, 7112, 16562, 2140, 1012, 102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0]}
{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [20]:
# 3. The fastest way to tokenize your entire dataset is to use the **map()** function
# This function speeds up tokenization by applying the tokenizer to **batches** of examples instead of individual examples
# Set the batched parameter to True:

def tokenization(examples):
    """Tokenize the "text" entry of ``examples`` with the notebook-level ``tokenizer``.

    Written to be passed to ``Dataset.map(..., batched=True)``, so
    ``examples["text"]`` is expected to hold a batch of texts rather than a
    single string.

    Args:
        examples: Mapping with a "text" key.

    Returns:
        Whatever ``tokenizer`` returns for ``examples["text"]``.

    Raises:
        KeyError: If ``examples`` has no "text" key.
    """
    texts = examples["text"]
    encoded = tokenizer(texts)
    return encoded

# batched=True hands `tokenization` whole batches of examples instead of one example per call.
dataset = dataset.map(tokenization, batched=True)

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

In [22]:
# 4. Set the format of your dataset to be compatible with your machine learning framework:
# Use the set_format() function to set the dataset format to be compatible with PyTorch:
# Applied in place on `dataset`: only the listed columns are formatted and
# returned, as "torch" values.
dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
    output_all_columns=False,
)
# Confirm the active format type.
print(dataset.format["type"])

torch


In [None]:
# https://huggingface.co/docs/datasets/create_dataset
# fast loading and processing,
# stream enormous datasets, 
# memory-mapping
# TODO: read this when needed.

In [None]:
# https://huggingface.co/docs/datasets/upload_dataset
# TODO: read this when needed.
# https://huggingface.co/docs/datasets/share (contains more material on uploading datasets)
# https://huggingface.co/docs/datasets/dataset_card
# https://huggingface.co/docs/datasets/repository_structure