In [1]:
from collections import defaultdict
from pathlib import Path

from tqdm import tqdm

import src.config as src_config
from src.config import SYSTEM_PROMPT
from src.utils.conv_formatters import format_multi_turn_conv
from src.utils.file_io import read_files, write_files

In [2]:
# Which raw dataset this notebook converts, the domain it is filed under, and the
# version tag passed to the writer.
DATASET = "tulu-sft"
DOMAIN = "General"
VERSION = "v0"

# Look the prompt up on the config module instead of on the imported
# `SYSTEM_PROMPT` name. The previous `SYSTEM_PROMPT = SYSTEM_PROMPT[DOMAIN]`
# replaced the per-domain table with one of its entries, so running this cell a
# second time indexed the already-narrowed value and failed. Reading from the
# module keeps the cell safe to re-run. The name `SYSTEM_PROMPT` is kept because
# the processing cell below uses it as the fallback system prompt.
SYSTEM_PROMPT = src_config.SYSTEM_PROMPT[DOMAIN]

# Relative to the notebook's working directory; resolve() makes it absolute.
DATA_FOLDER = Path("../../../data").resolve()
INPUT_FOLDER = DATA_FOLDER / "raw" / DOMAIN / DATASET
OUTPUT_FOLDER = DATA_FOLDER / "interim"

In [3]:
# Nested accumulator: domain -> dataset -> list of formatted conversations.
# Rebuilt from scratch on every run of this cell, so re-running is idempotent.
processed_data = defaultdict(lambda: defaultdict(list))

# Number of failing rows to keep for inspection; the rest are only counted.
MAX_ERROR_SAMPLES = 5

error_cnt = 0
error_types = defaultdict(int)  # exception class name -> number of rows that raised it
error_samples = []  # (file_name, row_index, truncated repr of the exception)

for file_name, content in read_files(INPUT_FOLDER, file_type="parquet"):
    print(f"Processing file: {file_name}")

    for row_idx, row in tqdm(
        content.iterrows(), total=len(content), desc=f"Processing {file_name}"
    ):
        try:
            formatted_item = format_multi_turn_conv(row["messages"])
            # Only fall back to the domain prompt when the conversation has no
            # "system" entry of its own.
            if "system" not in formatted_item:
                formatted_item["system"] = SYSTEM_PROMPT
            processed_data[DOMAIN][DATASET].append(formatted_item)
        except Exception as exc:
            # Still best-effort (a bad row is skipped, not fatal), but record what
            # went wrong so the skip count below can be explained.
            error_cnt += 1
            error_types[type(exc).__name__] += 1
            if len(error_samples) < MAX_ERROR_SAMPLES:
                error_samples.append((file_name, row_idx, repr(exc)[:200]))

print(f"Total errors: {error_cnt}")
# Breakdown by exception type, most frequent first, then a few concrete examples.
for exc_name, exc_count in sorted(error_types.items(), key=lambda kv: -kv[1]):
    print(f"  {exc_name}: {exc_count}")
for sample_file, sample_idx, sample_exc in error_samples:
    print(f"  e.g. {sample_file} row {sample_idx}: {sample_exc}")

Processing file: train-00000-of-00006.parquet


Processing train-00000-of-00006.parquet: 100%|██████████| 156558/156558 [00:02<00:00, 62808.24it/s]


Processing file: train-00004-of-00006.parquet


Processing train-00004-of-00006.parquet: 100%|██████████| 156557/156557 [00:02<00:00, 67497.91it/s]


Processing file: train-00001-of-00006.parquet


Processing train-00001-of-00006.parquet: 100%|██████████| 156557/156557 [00:02<00:00, 63967.13it/s]


Processing file: train-00005-of-00006.parquet


Processing train-00005-of-00006.parquet: 100%|██████████| 156557/156557 [00:02<00:00, 67142.77it/s]


Processing file: train-00002-of-00006.parquet


Processing train-00002-of-00006.parquet: 100%|██████████| 156557/156557 [00:02<00:00, 67196.98it/s]


Processing file: train-00003-of-00006.parquet


Processing train-00003-of-00006.parquet: 100%|██████████| 156557/156557 [00:02<00:00, 72566.15it/s]

Total errors: 751





In [4]:
# Persist everything accumulated in `processed_data`. The recorded run wrote to
# <OUTPUT_FOLDER>/<VERSION> (".../data/interim/v0").
write_files(
    OUTPUT_FOLDER,
    VERSION,
    processed_data,
)

Writing data to /Volumes/study/github/data-juicer/data/interim/v0: 100%|██████████| 938592/938592 [00:13<00:00, 69449.91it/s] 
