##### 本节使用 datasets 中的 stream 加载数据, hugging face 官网文档:https://huggingface.co/docs/datasets/v2.18.0/en/stream
###### 1. 注意流加载的切片数与文件数有关
###### 2. 注意流加载的shuffle与普通datasets shuffle的不同, 流加载的shuffle 是对切片打乱后,对 前buffer_size的数据随机抽取, 普通datasets shuffle 是对全局进行打乱,因此流加载shuffle没有标准shuffle充分, 但随着切片数的提升, 流加载的shuffle也会逐渐均匀
###### 3. 流加载不会自动保存 .cache 的arrow文件

In [None]:
# 导入huggingface datasets 包
# 详情见:https://huggingface.co/docs/datasets/v2.18.0/en/loading
import datasets
from datasets import Dataset
from datasets import load_dataset,Features,Value

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# datasets 清除缓存cache
# 默认的cache路径在 ~/.cache/huggingface/datasets, 手动清理所有缓存,也可以进入目录后清除指定的缓存
# jupyter 中使用 linux 命令,必须在前面加入!
!rm -rf ~/.cache/huggingface/datasets
# # 使用 cache 可以使得优化再次加载数据的速度, 但也暂用了大量的硬盘资源
# # 在下载数据集后，可以通过 `load_dataset()` 函数的 `download_mode` 参数来控制加载方式。默认情况下，🤗 Datasets 会重用已存在的数据集。但是如果您需要原始数据集而不应用任何处理函数，请按照以下示例重新下载文件：
# # download_mode = "reuse_cache_if_exists", 具体见: https://huggingface.co/docs/datasets/v2.18.0/en/cache
# 例如: my_dataset = load_dataset('text',data_files=files,num_proc=9,
#                           download_mode = "reuse_cache_if_exists")

In [3]:
# 定于 hugging face .cache 的目录
cache_dir = "/data/temp/julyedu_634415/.cache/huggingface/datasets"

# 定义文件相关的根目录
file_root = "/data/temp/julyedu_634415/testdatas/"
file_root

'/data/temp/julyedu_634415/testdatas/'

In [4]:
# # 创建数据
# line_num = 750000
# for i in range(100):
#     with open(f"{file_root}{i}.txt",'w',encoding='utf-8') as f:
#         for j in range(line_num):
#             f.write(f'这是第{i*line_num+j+1}行数据+++++++++++++++++++++++++++++++++++++++++')
#             f.write('\n')
#     print(f"finish: doc {i}")

finish: doc 0
finish: doc 1
finish: doc 2
finish: doc 3
finish: doc 4
finish: doc 5
finish: doc 6
finish: doc 7
finish: doc 8
finish: doc 9
finish: doc 10
finish: doc 11
finish: doc 12
finish: doc 13
finish: doc 14
finish: doc 15
finish: doc 16
finish: doc 17
finish: doc 18
finish: doc 19
finish: doc 20
finish: doc 21
finish: doc 22
finish: doc 23
finish: doc 24
finish: doc 25
finish: doc 26
finish: doc 27
finish: doc 28
finish: doc 29
finish: doc 30
finish: doc 31
finish: doc 32
finish: doc 33
finish: doc 34
finish: doc 35
finish: doc 36
finish: doc 37
finish: doc 38
finish: doc 39
finish: doc 40
finish: doc 41
finish: doc 42
finish: doc 43
finish: doc 44
finish: doc 45
finish: doc 46
finish: doc 47
finish: doc 48
finish: doc 49
finish: doc 50
finish: doc 51
finish: doc 52
finish: doc 53
finish: doc 54
finish: doc 55
finish: doc 56
finish: doc 57
finish: doc 58
finish: doc 59
finish: doc 60
finish: doc 61
finish: doc 62
finish: doc 63
finish: doc 64
finish: doc 65
finish: doc 66
finis

In [5]:
# 遍历要加载的数据
import glob
files = glob.glob(f"{file_root}*.txt")
files

['/data/temp/julyedu_634415/testdatas/96.txt',
 '/data/temp/julyedu_634415/testdatas/72.txt',
 '/data/temp/julyedu_634415/testdatas/71.txt',
 '/data/temp/julyedu_634415/testdatas/45.txt',
 '/data/temp/julyedu_634415/testdatas/28.txt',
 '/data/temp/julyedu_634415/testdatas/3.txt',
 '/data/temp/julyedu_634415/testdatas/57.txt',
 '/data/temp/julyedu_634415/testdatas/99.txt',
 '/data/temp/julyedu_634415/testdatas/67.txt',
 '/data/temp/julyedu_634415/testdatas/50.txt',
 '/data/temp/julyedu_634415/testdatas/33.txt',
 '/data/temp/julyedu_634415/testdatas/6.txt',
 '/data/temp/julyedu_634415/testdatas/23.txt',
 '/data/temp/julyedu_634415/testdatas/59.txt',
 '/data/temp/julyedu_634415/testdatas/46.txt',
 '/data/temp/julyedu_634415/testdatas/10.txt',
 '/data/temp/julyedu_634415/testdatas/55.txt',
 '/data/temp/julyedu_634415/testdatas/21.txt',
 '/data/temp/julyedu_634415/testdatas/54.txt',
 '/data/temp/julyedu_634415/testdatas/75.txt',
 '/data/temp/julyedu_634415/testdatas/86.txt',
 '/data/temp/ju

##### 加载自己的txt数据

In [6]:
# streaming=True 使用流加载, 
# 流加载不会在 .cache 生成 arrow 文件
# 使用stream加载时,需要对大数据的文件进行切块,简单来说就是切分成多个小文件,这样stream加载后,文件加载后切片的数量==文件的数量
# 而且后期stream shuffle时,切片比较多时,更加shuffle的均匀

# 假设每个文本文件只包含一列文本内容，我们要将其命名为"text", 元数据
dataset_features = Features({'text': Value('string')})
my_dataset = load_dataset('text',
                          data_files=
                                {"train_data":files[:80],
                                 "test_data":files[80:]},
                          cache_dir=cache_dir,
                          streaming=True,
                          features=dataset_features)
print(my_dataset)

# 进行shuffle, buffer_size 是缓冲大小,一般只有 stream 模式shuffle时才开启, 默认为1000
iter_train_data = my_dataset['train_data'].shuffle(42,buffer_size=20000)


# 使用iter的方法获取元素
iter_tmp = iter(iter_train_data)
print(next(iter(iter_tmp)))
print(next(iter(iter_tmp)),end="\n======================\n")

# 使用 for 获取元素
for i in iter_train_data:
    print(i)
    break

# 取出前10个元素
list(iter_train_data.take(10))

Resolving data files: 100%|██████████| 80/80 [00:00<00:00, 298526.98it/s]
Resolving data files: 100%|██████████| 20/20 [00:00<00:00, 169125.16it/s]


IterableDatasetDict({
    train_data: IterableDataset({
        features: ['text'],
        n_shards: 80
    })
    test_data: IterableDataset({
        features: ['text'],
        n_shards: 20
    })
})
{'text': '这是第20251786行数据+++++++++++++++++++++++++++++++++++++++++'}
{'text': '这是第20265480行数据+++++++++++++++++++++++++++++++++++++++++'}
{'text': '这是第20251786行数据+++++++++++++++++++++++++++++++++++++++++'}


[{'text': '这是第20251786行数据+++++++++++++++++++++++++++++++++++++++++'},
 {'text': '这是第20265480行数据+++++++++++++++++++++++++++++++++++++++++'},
 {'text': '这是第20263092行数据+++++++++++++++++++++++++++++++++++++++++'},
 {'text': '这是第20258778行数据+++++++++++++++++++++++++++++++++++++++++'},
 {'text': '这是第20258661行数据+++++++++++++++++++++++++++++++++++++++++'},
 {'text': '这是第20267172行数据+++++++++++++++++++++++++++++++++++++++++'},
 {'text': '这是第20251719行数据+++++++++++++++++++++++++++++++++++++++++'},
 {'text': '这是第20263948行数据+++++++++++++++++++++++++++++++++++++++++'},
 {'text': '这是第20254030行数据+++++++++++++++++++++++++++++++++++++++++'},
 {'text': '这是第20251884行数据+++++++++++++++++++++++++++++++++++++++++'}]

In [7]:
def func_replace(item):
    item["text"] = item["text"].replace("+++++++++++++++++++","==")
    return item

# map 用法
iter_train_data = iter_train_data.map(func_replace)
list(iter_train_data.take(10))


[{'text': '这是第20251786行数据====+++'},
 {'text': '这是第20265480行数据====+++'},
 {'text': '这是第20263092行数据====+++'},
 {'text': '这是第20258778行数据====+++'},
 {'text': '这是第20258661行数据====+++'},
 {'text': '这是第20267172行数据====+++'},
 {'text': '这是第20251719行数据====+++'},
 {'text': '这是第20263948行数据====+++'},
 {'text': '这是第20254030行数据====+++'},
 {'text': '这是第20251884行数据====+++'}]