In [1]:
from src.utils.file_io import read_files, write_files
from src.config import SYSTEM_PROMPT

from collections import defaultdict
from pathlib import Path
from tqdm import tqdm

In [2]:
# Identifiers used to build the input path, the grouping keys and the output
# version passed to write_files.
DATASETS = "InfinityInstruct"
DOMAIN = "General"
VERSION = "v0"

# Narrow the imported per-domain prompt table to this notebook's domain.
# The name is rebound in place, so guard the lookup: re-running this cell
# without re-running the import cell would otherwise index the already
# selected prompt with DOMAIN and raise TypeError.
# NOTE(review): assumes the per-domain prompt is a str — confirm in src.config.
if not isinstance(SYSTEM_PROMPT, str):
    SYSTEM_PROMPT = SYSTEM_PROMPT[DOMAIN]

# Paths are resolved relative to the kernel's current working directory.
DATA_FOLDER = Path("../../../data").resolve()
INPUT_FOLDER = DATA_FOLDER / "raw" / DOMAIN / DATASETS
OUTPUT_FOLDER = DATA_FOLDER / "interim"

In [3]:
def process(input_folder, dataset):
    """Reformat every parquet file in ``input_folder`` and write the result.

    For each row, the ``conversations`` value is converted with ``.tolist()``
    and paired with the module-level ``SYSTEM_PROMPT``. All records are
    collected in memory under ``processed_data[DOMAIN][dataset]`` and handed
    to ``write_files(OUTPUT_FOLDER, VERSION, processed_data)`` in one call
    after the last file has been read.

    Relies on the notebook-level names ``DOMAIN``, ``SYSTEM_PROMPT``,
    ``OUTPUT_FOLDER`` and ``VERSION`` being defined before it is called.

    Args:
        input_folder: Folder passed to ``read_files`` with
            ``file_type="parquet"``.
        dataset: Key under which the records are grouped inside ``DOMAIN``.

    Returns:
        None.

    Raises:
        KeyError: If a file's content has no ``conversations`` column.
    """
    processed_data = defaultdict(lambda: defaultdict(list))

    for file_name, content in read_files(input_folder, file_type="parquet"):
        print(f"Processing file: {file_name}")

        # Iterate the column directly instead of DataFrame.iterrows(): only
        # one field is needed, and iterrows() builds a full Series per row,
        # which is needlessly slow across millions of rows.
        for conversation in tqdm(
            content["conversations"],
            total=len(content),
            desc=f"Processing {file_name}",
        ):
            formatted_item = {
                "conversations": conversation.tolist(),
                "system": SYSTEM_PROMPT,
            }
            processed_data[DOMAIN][dataset].append(formatted_item)

    write_files(OUTPUT_FOLDER, VERSION, processed_data)

In [4]:
# Convert the "3M" subfolder; records are grouped under DATASETS + "-3M".
process(INPUT_FOLDER.joinpath("3M"), f"{DATASETS}-3M")

Processing file: train-00026-of-00035.parquet


Processing train-00026-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 77388.65it/s]


Processing file: train-00018-of-00035.parquet


Processing train-00018-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 78122.75it/s]


Processing file: train-00007-of-00035.parquet


Processing train-00007-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 80925.60it/s]


Processing file: train-00013-of-00035.parquet


Processing train-00013-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 77392.78it/s]


Processing file: train-00032-of-00035.parquet


Processing train-00032-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 79046.95it/s]


Processing file: train-00008-of-00035.parquet


Processing train-00008-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 78938.20it/s]


Processing file: train-00017-of-00035.parquet


Processing train-00017-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 78044.95it/s]


Processing file: train-00029-of-00035.parquet


Processing train-00029-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 72762.09it/s]


Processing file: train-00003-of-00035.parquet


Processing train-00003-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 75757.58it/s]


Processing file: train-00022-of-00035.parquet


Processing train-00022-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 83911.67it/s]


Processing file: train-00006-of-00035.parquet


Processing train-00006-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82982.14it/s]


Processing file: train-00027-of-00035.parquet


Processing train-00027-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 73773.36it/s]


Processing file: train-00019-of-00035.parquet


Processing train-00019-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 83221.61it/s]


Processing file: train-00033-of-00035.parquet


Processing train-00033-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82809.00it/s]


Processing file: train-00012-of-00035.parquet


Processing train-00012-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 72547.76it/s]


Processing file: train-00016-of-00035.parquet


Processing train-00016-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82775.63it/s]


Processing file: train-00028-of-00035.parquet


Processing train-00028-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 83717.97it/s]


Processing file: train-00009-of-00035.parquet


Processing train-00009-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 70596.83it/s]


Processing file: train-00023-of-00035.parquet


Processing train-00023-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82455.87it/s]


Processing file: train-00002-of-00035.parquet


Processing train-00002-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 83695.04it/s]


Processing file: train-00030-of-00035.parquet


Processing train-00030-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 81836.41it/s]


Processing file: train-00011-of-00035.parquet


Processing train-00011-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 83411.39it/s]


Processing file: train-00005-of-00035.parquet


Processing train-00005-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 69140.51it/s]


Processing file: train-00024-of-00035.parquet


Processing train-00024-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82688.21it/s]


Processing file: train-00020-of-00035.parquet


Processing train-00020-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82265.26it/s]


Processing file: train-00001-of-00035.parquet


Processing train-00001-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 83420.36it/s]


Processing file: train-00015-of-00035.parquet


Processing train-00015-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82451.57it/s]


Processing file: train-00034-of-00035.parquet


Processing train-00034-of-00035.parquet: 100%|██████████| 63473/63473 [00:00<00:00, 79782.65it/s]


Processing file: train-00010-of-00035.parquet


Processing train-00010-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 66383.91it/s]


Processing file: train-00031-of-00035.parquet


Processing train-00031-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82416.19it/s]


Processing file: train-00025-of-00035.parquet


Processing train-00025-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 80271.52it/s]


Processing file: train-00004-of-00035.parquet


Processing train-00004-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 83315.30it/s]


Processing file: train-00000-of-00035.parquet


Processing train-00000-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82578.03it/s]


Processing file: train-00021-of-00035.parquet


Processing train-00021-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 83400.12it/s]


Processing file: train-00014-of-00035.parquet


Processing train-00014-of-00035.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82183.45it/s]
Writing data to /Volumes/study/github/data-juicer/data/interim/v0: 100%|██████████| 3463473/3463473 [00:40<00:00, 85990.83it/s]


In [5]:
# Convert the "7M" subfolder; records are grouped under DATASETS + "-7M".
process(INPUT_FOLDER.joinpath("7M"), f"{DATASETS}-7M")

Processing file: train-00040-of-00075.parquet


Processing train-00040-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 79751.75it/s]


Processing file: train-00025-of-00075.parquet


Processing train-00025-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 73140.32it/s]


Processing file: train-00004-of-00075.parquet


Processing train-00004-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 76658.95it/s]


Processing file: train-00061-of-00075.parquet


Processing train-00061-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 73176.76it/s]


Processing file: train-00010-of-00075.parquet


Processing train-00010-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 78563.91it/s]


Processing file: train-00054-of-00075.parquet


Processing train-00054-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 81378.61it/s]


Processing file: train-00031-of-00075.parquet


Processing train-00031-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 77010.76it/s]


Processing file: train-00050-of-00075.parquet


Processing train-00050-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 76895.34it/s]


Processing file: train-00035-of-00075.parquet


Processing train-00035-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 80059.25it/s]


Processing file: train-00014-of-00075.parquet


Processing train-00014-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 75485.36it/s]


Processing file: train-00071-of-00075.parquet


Processing train-00071-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 80373.44it/s]


Processing file: train-00000-of-00075.parquet


Processing train-00000-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 81982.61it/s]


Processing file: train-00065-of-00075.parquet


Processing train-00065-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 71283.40it/s]


Processing file: train-00044-of-00075.parquet


Processing train-00044-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82010.77it/s]


Processing file: train-00021-of-00075.parquet


Processing train-00021-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82855.26it/s]


Processing file: train-00060-of-00075.parquet


Processing train-00060-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 62710.94it/s]


Processing file: train-00005-of-00075.parquet


Processing train-00005-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 76006.25it/s]


Processing file: train-00024-of-00075.parquet


Processing train-00024-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82640.52it/s]


Processing file: train-00041-of-00075.parquet


Processing train-00041-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 83258.94it/s]


Processing file: train-00030-of-00075.parquet


Processing train-00030-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 81877.61it/s]


Processing file: train-00055-of-00075.parquet


Processing train-00055-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 81971.79it/s]


Processing file: train-00074-of-00075.parquet


Processing train-00074-of-00075.parquet: 100%|██████████| 49106/49106 [00:00<00:00, 82828.73it/s]


Processing file: train-00011-of-00075.parquet


Processing train-00011-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 83146.11it/s]


Processing file: train-00070-of-00075.parquet


Processing train-00070-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 81939.68it/s]


Processing file: train-00015-of-00075.parquet


Processing train-00015-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 80850.59it/s]


Processing file: train-00034-of-00075.parquet


Processing train-00034-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 68611.90it/s]


Processing file: train-00051-of-00075.parquet


Processing train-00051-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 83244.89it/s]


Processing file: train-00020-of-00075.parquet


Processing train-00020-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82221.97it/s]


Processing file: train-00045-of-00075.parquet


Processing train-00045-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82581.02it/s]


Processing file: train-00064-of-00075.parquet


Processing train-00064-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 81979.79it/s]


Processing file: train-00001-of-00075.parquet


Processing train-00001-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 83168.45it/s]


Processing file: train-00033-of-00075.parquet


Processing train-00033-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 81246.12it/s]


Processing file: train-00056-of-00075.parquet


Processing train-00056-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 63987.50it/s]


Processing file: train-00068-of-00075.parquet


Processing train-00068-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82382.18it/s]


Processing file: train-00049-of-00075.parquet


Processing train-00049-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 80778.30it/s]


Processing file: train-00012-of-00075.parquet


Processing train-00012-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 81818.99it/s]


Processing file: train-00063-of-00075.parquet


Processing train-00063-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82293.21it/s]


Processing file: train-00006-of-00075.parquet


Processing train-00006-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82696.08it/s]


Processing file: train-00038-of-00075.parquet


Processing train-00038-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82954.52it/s]


Processing file: train-00019-of-00075.parquet


Processing train-00019-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 80722.03it/s]


Processing file: train-00027-of-00075.parquet


Processing train-00027-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 60969.96it/s]


Processing file: train-00042-of-00075.parquet


Processing train-00042-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 83050.00it/s]


Processing file: train-00023-of-00075.parquet


Processing train-00023-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 81138.14it/s]


Processing file: train-00046-of-00075.parquet


Processing train-00046-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 83503.88it/s]


Processing file: train-00067-of-00075.parquet


Processing train-00067-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 83198.49it/s]


Processing file: train-00059-of-00075.parquet


Processing train-00059-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 81381.72it/s]


Processing file: train-00002-of-00075.parquet


Processing train-00002-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82210.21it/s]


Processing file: train-00073-of-00075.parquet


Processing train-00073-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82179.25it/s]


Processing file: train-00028-of-00075.parquet


Processing train-00028-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 81753.11it/s]


Processing file: train-00016-of-00075.parquet


Processing train-00016-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 83128.13it/s]


Processing file: train-00037-of-00075.parquet


Processing train-00037-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 57634.66it/s]


Processing file: train-00009-of-00075.parquet


Processing train-00009-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 79538.61it/s]


Processing file: train-00052-of-00075.parquet


Processing train-00052-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 81486.31it/s]


Processing file: train-00013-of-00075.parquet


Processing train-00013-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82779.42it/s]


Processing file: train-00048-of-00075.parquet


Processing train-00048-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82569.04it/s]


Processing file: train-00057-of-00075.parquet


Processing train-00057-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 81005.90it/s]


Processing file: train-00069-of-00075.parquet


Processing train-00069-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82641.82it/s]


Processing file: train-00032-of-00075.parquet


Processing train-00032-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82900.53it/s]


Processing file: train-00043-of-00075.parquet


Processing train-00043-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 81739.95it/s]


Processing file: train-00018-of-00075.parquet


Processing train-00018-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82246.72it/s]


Processing file: train-00026-of-00075.parquet


Processing train-00026-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82956.77it/s]


Processing file: train-00007-of-00075.parquet


Processing train-00007-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 80699.23it/s]


Processing file: train-00039-of-00075.parquet


Processing train-00039-of-00075.parquet: 100%|██████████| 100000/100000 [00:02<00:00, 48365.35it/s]


Processing file: train-00062-of-00075.parquet


Processing train-00062-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82143.62it/s]


Processing file: train-00003-of-00075.parquet


Processing train-00003-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 78487.09it/s]


Processing file: train-00066-of-00075.parquet


Processing train-00066-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82863.71it/s]


Processing file: train-00058-of-00075.parquet


Processing train-00058-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82932.57it/s]


Processing file: train-00047-of-00075.parquet


Processing train-00047-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82168.04it/s]


Processing file: train-00022-of-00075.parquet


Processing train-00022-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 81914.51it/s]


Processing file: train-00053-of-00075.parquet


Processing train-00053-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 83181.52it/s]


Processing file: train-00036-of-00075.parquet


Processing train-00036-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82646.19it/s]


Processing file: train-00008-of-00075.parquet


Processing train-00008-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 79478.81it/s]


Processing file: train-00029-of-00075.parquet


Processing train-00029-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 82522.09it/s]


Processing file: train-00017-of-00075.parquet


Processing train-00017-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 81135.51it/s]


Processing file: train-00072-of-00075.parquet


Processing train-00072-of-00075.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 80770.94it/s]
Writing data to /Volumes/study/github/data-juicer/data/interim/v0: 100%|██████████| 7449106/7449106 [01:13<00:00, 101019.07it/s]


In [6]:
# Convert the "0625" subfolder; records are grouped under DATASETS + "-0625".
process(INPUT_FOLDER.joinpath("0625"), f"{DATASETS}-0625")

Processing file: train-00004-of-00007.parquet


Processing train-00004-of-00007.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 83624.40it/s]


Processing file: train-00000-of-00007.parquet


Processing train-00000-of-00007.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 78606.71it/s]


Processing file: train-00005-of-00007.parquet


Processing train-00005-of-00007.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 79376.16it/s]


Processing file: train-00001-of-00007.parquet


Processing train-00001-of-00007.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 70733.01it/s]


Processing file: train-00006-of-00007.parquet


Processing train-00006-of-00007.parquet: 100%|██████████| 59808/59808 [00:00<00:00, 72821.41it/s]


Processing file: train-00002-of-00007.parquet


Processing train-00002-of-00007.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 76898.47it/s]


Processing file: train-00003-of-00007.parquet


Processing train-00003-of-00007.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 79860.02it/s]
Writing data to /Volumes/study/github/data-juicer/data/interim/v0: 100%|██████████| 659808/659808 [00:11<00:00, 59204.02it/s]


In [7]:
# Convert the "Gen" subfolder; records are grouped under DATASETS + "-Gen".
process(INPUT_FOLDER.joinpath("Gen"), f"{DATASETS}-Gen")

Processing file: train-00001-of-00015.parquet


Processing train-00001-of-00015.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 79366.34it/s]


Processing file: train-00005-of-00015.parquet


Processing train-00005-of-00015.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 76513.02it/s]


Processing file: train-00011-of-00015.parquet


Processing train-00011-of-00015.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 75661.17it/s]


Processing file: train-00014-of-00015.parquet


Processing train-00014-of-00015.parquet: 100%|██████████| 56927/56927 [00:00<00:00, 71208.41it/s]


Processing file: train-00000-of-00015.parquet


Processing train-00000-of-00015.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 76900.46it/s]


Processing file: train-00004-of-00015.parquet


Processing train-00004-of-00015.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 76191.94it/s]


Processing file: train-00010-of-00015.parquet


Processing train-00010-of-00015.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 80879.06it/s]


Processing file: train-00003-of-00015.parquet


Processing train-00003-of-00015.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 75813.01it/s]


Processing file: train-00008-of-00015.parquet


Processing train-00008-of-00015.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 79483.55it/s]


Processing file: train-00013-of-00015.parquet


Processing train-00013-of-00015.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 73504.38it/s]


Processing file: train-00007-of-00015.parquet


Processing train-00007-of-00015.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 81397.83it/s]


Processing file: train-00002-of-00015.parquet


Processing train-00002-of-00015.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 73289.02it/s]


Processing file: train-00009-of-00015.parquet


Processing train-00009-of-00015.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 80732.66it/s]


Processing file: train-00012-of-00015.parquet


Processing train-00012-of-00015.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 80564.46it/s]


Processing file: train-00006-of-00015.parquet


Processing train-00006-of-00015.parquet: 100%|██████████| 100000/100000 [00:01<00:00, 70994.55it/s]
Writing data to /Volumes/study/github/data-juicer/data/interim/v0: 100%|██████████| 1456927/1456927 [00:23<00:00, 60935.27it/s]
