In [54]:

from datasets import load_dataset
import json
import os
import gzip

# Settings: the maximum number of records written to a single output file,
# and the directory that receives the gzip-compressed JSON Lines dumps.
batch_size = 10_000
save_dir = "../data/original_dump/jap2010/"


def load(parquet_id):
    """Load one parquet shard of the japanese2010 dataset (``train`` split).

    Args:
        parquet_id: Shard identifier interpolated into the parquet file URL
            (the driver loop in this file passes zero-padded strings such
            as ``"0001"``).

    Returns:
        The object returned by ``load_dataset`` for that single data file.
    """
    # Streaming mode is intentionally not used: for an unknown reason it
    # raised an error at around row 15k.
    shard_url = f"https://huggingface.co/datasets/hatakeyama-llm-team/japanese2010/resolve/refs%2Fconvert%2Fparquet/default/train/{parquet_id}.parquet"
    return load_dataset(
        'hatakeyama-llm-team/japanese2010',
        split='train',
        data_files=shard_url,
    )


def save_jsonl_gz(data, filename):
    """Write ``data`` to ``filename`` as gzip-compressed, UTF-8 JSON Lines.

    Each item of ``data`` is serialized with ``json.dumps`` onto its own
    line; non-ASCII characters are written as-is rather than escaped.
    """
    with gzip.open(filename, mode='wt', encoding='utf-8') as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in data
        )


def _save_batch(batch, parquet_id, file_count):
    """Write one batch to its numbered ``.jsonl.gz`` file under ``save_dir``."""
    # Create the output directory on demand so the function also works when
    # called before any directory-setup code has run.
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(
        save_dir, f'jap2010_{parquet_id}_{file_count}.jsonl.gz')
    save_jsonl_gz(batch, save_path)
    print(f'Saved {save_path}')


def batch_and_save(parquet_id):
    """Split one dataset shard into batches and save each as gzip JSONL.

    The shard is loaded with ``load(parquet_id)``; the ``text`` field of each
    row is collected, and every ``batch_size`` rows are written to
    ``jap2010_{parquet_id}_{n}.jsonl.gz`` inside ``save_dir`` (``n`` counts
    up from 0). Any remaining rows are written as a final, smaller file.

    Args:
        parquet_id: Shard identifier passed to ``load`` and embedded in the
            output file names.

    Returns:
        ``False`` if the shard could not be loaded, ``True`` otherwise.
    """
    try:
        dataset = load(parquet_id)
    except Exception as e:
        # Catch Exception (not a bare except) so that Ctrl+C / SystemExit
        # still interrupt the run instead of being reported as a load failure.
        print(f"Failed to load {parquet_id}: {e!r}")
        return False

    batch = []
    file_count = 0
    # Rows are fetched by index rather than by iterating the dataset,
    # presumably so that a failure on a single row can be caught and skipped
    # without aborting the whole shard.
    for i in range(len(dataset)):
        try:
            batch.append({"text": dataset[i]['text']})
        except Exception as e:
            # Skip only the unreadable row; the flush check below must still
            # run for every successfully appended row.
            print(e)
            continue

        if len(batch) >= batch_size:
            _save_batch(batch, parquet_id, file_count)
            batch = []
            file_count += 1

    # Save the final, partially filled batch.
    if batch:
        _save_batch(batch, parquet_id, file_count)

    return True

In [56]:
# Ensure the output directory exists. exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(save_dir, exist_ok=True)

# Number of shard ids to generate (an upper bound on the shards attempted).
num_strings = 9999
# Zero-padded 4-digit shard ids: "0001" .. "9999".
ids = [f"{i:04}" for i in range(1, num_strings + 1)]

for parquet_id in ids:
    print(parquet_id)
    # batch_and_save returns False when a shard cannot be loaded; stop at
    # the first such shard.
    if not batch_and_save(parquet_id):
        break


0001
Saved ../data/original_dump/jap2010/jap2010_0001_0.jsonl.gz
0002


Downloading data: 100%|██████████| 140M/140M [00:21<00:00, 6.61MB/s]
Generating train split: 2516 examples [00:01, 2321.74 examples/s]


Saved ../data/original_dump/jap2010/jap2010_0002_0.jsonl.gz
0003


Downloading data: 100%|██████████| 132M/132M [00:17<00:00, 7.37MB/s]
Generating train split: 1911 examples [00:01, 1833.51 examples/s]


Saved ../data/original_dump/jap2010/jap2010_0003_0.jsonl.gz
0004


Downloading data: 100%|██████████| 113M/113M [00:15<00:00, 7.30MB/s]
Generating train split: 1583 examples [00:00, 1594.17 examples/s]


Saved ../data/original_dump/jap2010/jap2010_0004_0.jsonl.gz
0005


Downloading data: 100%|██████████| 137M/137M [00:18<00:00, 7.51MB/s]
Generating train split: 2250 examples [00:01, 2088.07 examples/s]


Saved ../data/original_dump/jap2010/jap2010_0005_0.jsonl.gz
0006


Downloading data: 100%|██████████| 147M/147M [00:19<00:00, 7.51MB/s]
Generating train split: 1992 examples [00:01, 1659.77 examples/s]


Saved ../data/original_dump/jap2010/jap2010_0006_0.jsonl.gz
0007


Downloading data: 100%|██████████| 88.5M/88.5M [00:12<00:00, 7.06MB/s]
Generating train split: 1191 examples [00:00, 1615.95 examples/s]


Saved ../data/original_dump/jap2010/jap2010_0007_0.jsonl.gz
0008


Downloading data: 100%|██████████| 143M/143M [00:18<00:00, 7.71MB/s]
Generating train split: 2064 examples [00:01, 1731.50 examples/s]


Saved ../data/original_dump/jap2010/jap2010_0008_0.jsonl.gz
0009


Downloading data: 100%|██████████| 143M/143M [00:20<00:00, 7.06MB/s]
Generating train split: 1987 examples [00:01, 1629.51 examples/s]


Saved ../data/original_dump/jap2010/jap2010_0009_0.jsonl.gz
0010


Downloading data: 100%|██████████| 144M/144M [00:18<00:00, 7.93MB/s]
Generating train split: 1913 examples [00:01, 1548.82 examples/s]


Saved ../data/original_dump/jap2010/jap2010_0010_0.jsonl.gz
0011


Downloading data: 100%|██████████| 143M/143M [00:23<00:00, 6.16MB/s]
Generating train split: 1736 examples [00:01, 1495.27 examples/s]


Saved ../data/original_dump/jap2010/jap2010_0011_0.jsonl.gz
0012


Downloading data: 100%|██████████| 138M/138M [00:17<00:00, 8.06MB/s]
Generating train split: 2410 examples [00:01, 2271.02 examples/s]


Saved ../data/original_dump/jap2010/jap2010_0012_0.jsonl.gz
0013


Downloading data: 100%|██████████| 137M/137M [00:17<00:00, 7.75MB/s]
Generating train split: 3417 examples [00:01, 2973.30 examples/s]


KeyboardInterrupt: 