In [2]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


# datasets基本使用

## 加载在线数据集

In [None]:
# Fetch the dataset by its Hub repo id. With no `split` argument the result
# holds every available split.
# NOTE(review): the variable name `datasets` matches the library's package
# name; it is kept because later cells reference it.
datasets = load_dataset('madao33/new-title-chinese')
datasets

## 加载数据集合中的某一项任务

In [None]:
# The second positional argument picks one configuration (task) of a
# multi-task dataset: here the "boolq" task of "super_glue".
boolq_dataset = load_dataset("super_glue", "boolq")
boolq_dataset

## 按照数据集划分进行加载

In [None]:
# Passing split="train" returns just that split instead of the whole collection.
dataset = load_dataset("madao33/new-title-chinese", split="train")
dataset

In [None]:
# Slice syntax inside the split string: rows 10 up to (not including) 100 of train.
dataset = load_dataset("madao33/new-title-chinese", split="train[10:100]")
dataset

In [None]:
# Percent slicing: the first 50% of the train split.
dataset = load_dataset("madao33/new-title-chinese", split="train[:50%]")
dataset

In [None]:
# A list of split strings yields one dataset per entry, so `dataset` is a
# list here: the first half and the second half of train.
dataset = load_dataset("madao33/new-title-chinese", split=["train[:50%]", "train[50%:]"])
dataset

## 查看数据集

In [None]:
# Reload every split so the inspection cells below work on the full collection.
datasets = load_dataset("madao33/new-title-chinese")
datasets

In [None]:
# Integer index: a single example from the train split.
datasets["train"][0]

In [None]:
# Slice index: the first two examples of the train split.
datasets["train"][:2]

In [None]:
# Column access by name, then the first five values of that column.
datasets["train"]["title"][:5]

In [None]:
# Names of the columns in the train split.
datasets["train"].column_names

In [None]:
# Feature schema (column names with their types) of the train split.
datasets["train"].features

## 数据集划分

In [None]:
# Take the train split of the collection and carve a 10% hold-out from it.
# The assignment must bind `dataset` (not `ataset`): otherwise the next line
# runs against whatever `dataset` an earlier cell left behind, which is a
# list of two halves and has no train_test_split method.
dataset = datasets["train"]
dataset.train_test_split(test_size=0.1)

In [None]:
# Split the boolq train data 90/10, stratifying on the "label" column.
dataset = boolq_dataset["train"]
dataset.train_test_split(test_size=0.1, stratify_by_column="label") 

## 数据选取与过滤

In [None]:
# Select: keep only the rows at the given indices (0 and 1).
datasets["train"].select([0, 1])

In [None]:
# Filter: keep only the examples whose title contains the substring "中国".
filter_dataset = datasets["train"].filter(lambda example: "中国" in example["title"])

In [None]:
# First five titles of the filtered dataset.
filter_dataset["title"][:5]

## 数据映射

In [None]:
def add_prefix(example):
    """Prepend the literal 'Prefix: ' to the example's title.

    The incoming mapping is updated in place under its "title" key and the
    same object is returned, which is the shape `map` expects from a
    per-example function.
    """
    prefixed_title = 'Prefix: ' + example["title"]
    example["title"] = prefixed_title
    return example

In [None]:
# Apply add_prefix to every example of the collection, then show the first
# ten train titles to confirm the prefix was added.
prefix_dataset = datasets.map(add_prefix)
prefix_dataset["train"][:10]["title"]

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer for the "bert-base-chinese" checkpoint by name.
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
def preprocess_function(example, tokenizer=tokenizer):
    """Tokenize an example's content as model input and its title as labels.

    Args:
        example: mapping with "content" and "title" entries (a single example,
            or a batch when used with ``batched=True``).
        tokenizer: tokenizer to apply; bound as a default argument to the
            notebook-level tokenizer at definition time.

    Returns:
        The tokenizer output for "content" (truncated to 512 tokens) with an
        extra "labels" entry holding the input ids of the tokenized "title"
        (truncated to 32 tokens).
    """
    content_encoding = tokenizer(example["content"], truncation=True, max_length=512)
    title_encoding = tokenizer(example["title"], truncation=True, max_length=32)
    # The labels are simply the encoded title.
    content_encoding["labels"] = title_encoding["input_ids"]
    return content_encoding

In [None]:
# Tokenize example by example (map's default, non-batched mode).
processed_datasets = datasets.map(preprocess_function)
processed_datasets

In [None]:
# Same mapping, spread over 4 worker processes via num_proc.
processed_datasets = datasets.map(preprocess_function, num_proc=4)
processed_datasets

In [None]:
# Batched mode: preprocess_function receives a batch of examples per call.
processed_datasets = datasets.map(preprocess_function, batched=True)
processed_datasets

In [None]:
# Batched mapping that also drops the original columns (those listed for the
# train split), leaving only what preprocess_function returns.
processed_datasets = datasets.map(preprocess_function, batched=True, remove_columns=datasets["train"].column_names)
processed_datasets

## 保存与加载

In [None]:
# Persist the processed datasets to the local directory ./processed_data.
processed_datasets.save_to_disk("./processed_data")

In [None]:
# Reload what save_to_disk wrote to ./processed_data.
# load_from_disk is imported in this cell because the notebook's earlier
# imports only bring in load_dataset; without it this cell raises NameError
# on a fresh kernel. The `from ... import` form resolves the package through
# the import system, so it still works although the name `datasets` has been
# rebound to a dataset collection by earlier cells.
from datasets import load_from_disk

processed_datasets = load_from_disk("./processed_data")
processed_datasets

# 加载本地数据集

## 直接加载文件作为数据集

In [3]:
# Load a local CSV file through the generic 'csv' builder; split='train'
# returns the dataset itself rather than a collection of splits.
dataset = load_dataset('csv', data_files='./ChnSentiCorp_htl_all.csv',split='train')

Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-3260f3d9eacc9812/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)


In [4]:
# Display the summary (features and row count) of the loaded dataset.
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 7766
})

In [27]:
from datasets import Dataset

In [None]:
# Alternative: build the dataset straight from the CSV file via the Dataset class.
dataset = Dataset.from_csv('./ChnSentiCorp_htl_all.csv')
dataset

## 加载文件夹内全部文件作为数据集

In [5]:
# data_files also accepts a list of paths; the listed files are combined
# into a single train split.
dataset = load_dataset('csv', data_files=['./all_data/ChnSentiCorp_htl_all.csv','./all_data/ChnSentiCorp_htl_all copy.csv'], split='train')
dataset

Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-dcdea1cd086b9831/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)


Dataset({
    features: ['label', 'review'],
    num_rows: 15532
})

## 通过预先加载的其他格式转换加载数据集

In [6]:
import pandas as pd

# Read the CSV into a pandas DataFrame first, then preview its first rows.
data = pd.read_csv('./ChnSentiCorp_htl_all.csv')
data.head()

Unnamed: 0,label,review
0,1,"距离川沙公路较近,但是公交指示不对,如果是""蔡陆线""的话,会非常麻烦.建议用别的路线.房间较..."
1,1,商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!
2,1,早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。
3,1,宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...
4,1,"CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风"


In [26]:
# Convert the in-memory pandas DataFrame into a Dataset.
dataset = Dataset.from_pandas(data)
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 7766
})

In [14]:
# List-format input must wrap each record in a dict ({}) so that the field
# name is explicit.
data = [{"text": "abc"}, {"text": "def"}]
# Counter-example kept for reference (bare strings carry no field name):
# data = ["abc", "def"]
Dataset.from_list(data)

Dataset({
    features: ['text'],
    num_rows: 2
})

## 通过自定义加载脚本加载数据集

In [8]:
# Load a local JSON file; field='data' names the key inside the file whose
# contents are loaded as the records.
load_dataset('json', data_files='./cmrc2018_trial.json',field='data')

Found cached dataset json (/root/.cache/huggingface/datasets/json/default-c664443259da674f/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
100%|██████████| 1/1 [00:00<00:00, 677.70it/s]


DatasetDict({
    train: Dataset({
        features: ['paragraphs', 'title', 'id'],
        num_rows: 256
    })
})

In [None]:
# Load through a local dataset loading script (./load_script.py) and take
# its train split.
dataset = load_dataset("./load_script.py", split="train")
dataset

# Dataset with DataCollator

In [9]:
from transformers import DataCollatorWithPadding

2023-10-13 15:47:07.151950: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-13 15:47:07.189653: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
# Load the hotel-review CSV, then drop the rows whose 'review' value is
# None so that only rows with a review are kept.
dataset = load_dataset('csv', data_files='./ChnSentiCorp_htl_all.csv', split='train')
dataset = dataset.filter(lambda x: x['review'] is not None)
dataset

Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-3260f3d9eacc9812/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-3260f3d9eacc9812/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-fe6f140ff6bc9b22.arrow


Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

In [11]:
from transformers import AutoTokenizer

# Load the tokenizer from a local checkpoint directory. The path is
# relative, so it depends on the notebook's working directory.
tokenizer = AutoTokenizer.from_pretrained('../../models/bert-base-chinese/')
tokenizer

BertTokenizerFast(name_or_path='../../models/bert-base-chinese/', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [14]:
def process_function(examples):
    """Tokenize the 'review' column and carry the labels along.

    Args:
        examples: mapping with 'review' and 'label' entries, as handed over
            by ``map``.

    Returns:
        The output of the notebook-level ``tokenizer`` for the reviews
        (truncated to at most 128 tokens) with the original 'label' values
        stored under 'labels'.
    """
    encoded = tokenizer(examples['review'], truncation=True, max_length=128)
    encoded['labels'] = examples['label']
    return encoded

In [15]:
# Tokenize in batches and drop the original columns, so only the tokenizer
# outputs and 'labels' remain.
tokenized_dataset = dataset.map(process_function, batched=True, remove_columns=dataset.column_names)
tokenized_dataset

                                                                  

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 7765
})

In [16]:
# Peek at the first three tokenized examples; a slice comes back column-wise
# as a dict of lists.
print(tokenized_dataset[:3])

{'input_ids': [[101, 6655, 4895, 2335, 3763, 1062, 6662, 6772, 6818, 117, 852, 3221, 1062, 769, 2900, 4850, 679, 2190, 117, 1963, 3362, 3221, 107, 5918, 7355, 5296, 107, 4638, 6413, 117, 833, 7478, 2382, 7937, 4172, 119, 2456, 6379, 4500, 1166, 4638, 6662, 5296, 119, 2791, 7313, 6772, 711, 5042, 1296, 119, 102], [101, 1555, 1218, 1920, 2414, 2791, 8024, 2791, 7313, 2523, 1920, 8024, 2414, 3300, 100, 2160, 8024, 3146, 860, 2697, 6230, 5307, 3845, 2141, 2669, 679, 7231, 106, 102], [101, 3193, 7623, 1922, 2345, 8024, 3187, 6389, 1343, 1914, 2208, 782, 8024, 6929, 6804, 738, 679, 1217, 7608, 1501, 4638, 511, 6983, 2421, 2418, 6421, 7028, 6228, 671, 678, 6821, 702, 7309, 7579, 749, 511, 2791, 7313, 3315, 6716, 2523, 1962, 511, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [17]:
# Collator that uses the tokenizer to pad the examples of a batch to a
# common length when the batch is assembled.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [18]:
from torch.utils.data import DataLoader

In [20]:
# Shuffled batches of 4 examples; the collator is applied to each batch.
dl = DataLoader(tokenized_dataset, batch_size=4, collate_fn=collator,shuffle=True)

In [23]:
# Print the input_ids tensor shape of the first 11 batches to show that the
# padded length varies from batch to batch.
num = 0  # stays 0 if the loader yields nothing
for num, batch in enumerate(dl, start=1):
    print(batch['input_ids'].size())
    if num > 10:
        break

torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 83])
torch.Size([4, 82])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 108])
torch.Size([4, 84])
torch.Size([4, 128])
