### Data Re-batching
During the crawling process, discrepancies in line counts per file may occur due to calculation errors or session timeouts. This script includes a Re-batching utility that:
* **Aggregates:**      Reads all inconsistent output files from the raw input directory.
* **Standardizes:**    Re-partitions the entire dataset into files of at most a fixed line count (e.g., 500,000 lines per file); only the final file may hold fewer lines.
* **Ensures reliability:** Gives downstream processing (such as model training or indexing) consistently sized data segments.

In [65]:
import os

In [66]:
# Re-batching configuration. PREFIX_OUTPUT_FILE is derived from the selected
# label and the input folder name, so it changes whenever either of them does.
# NOTE(review): IDX_LABEL = 1 selects "tv_series", but the recorded file
# listings in this notebook show "movie_..." names -- confirm the index
# matches the data actually being re-batched.
ID_LIST_LABEL = ["movie", "tv_series", "person"]                    # supported category labels
IDX_LABEL = 1                                                       # index into ID_LIST_LABEL: 0, 1, or 2
INPUT_FOLDER = "12_31_2025"                                         # folder containing the input .jsonl files
OUTPUT_FOLDER = "legit"                                             # folder where the re-batched files are written
MAX_LINE_BATCH = 500000                                             # maximum number of lines per output file
PREFIX_OUTPUT_FILE = f"{ID_LIST_LABEL[IDX_LABEL]}_{INPUT_FOLDER}"   # output file name prefix: "<label>_<input folder>"

In [67]:
# Collect the input files in name order. Only regular ``.jsonl`` files are
# kept, so stray directory entries (sub-directories, notebook checkpoints,
# temporary files) are never opened as input by the re-batching loop.
list_file = sorted(
    name
    for name in os.listdir(INPUT_FOLDER)
    if name.endswith(".jsonl")
    and os.path.isfile(os.path.join(INPUT_FOLDER, name))
)
print(list_file)

['movie_12_31_2025_01.jsonl', 'movie_12_31_2025_02.jsonl', 'movie_12_31_2025_06.jsonl', 'movie_12_31_2025_09.jsonl', 'movie_12_31_2025_12.jsonl']


In [68]:
def count_lines(file_path):
    """Return the number of lines in the file at ``file_path``.

    The file is opened in binary mode, so no text decoding is performed.
    A final line that lacks a trailing newline is still counted.
    """
    total = 0
    with open(file_path, "rb") as handle:
        for _ in handle:
            total += 1
    return total

In [69]:
# Re-partition every non-blank line of the input files, in list_file order,
# into output files holding at most MAX_LINE_BATCH lines each.
line_count = 0      # lines written to the batch file that is currently open
idx_batch = 1       # number used in the name of the next batch file
f_out = None        # handle of the batch file being written, if any

# Create the destination up front so opening the first batch cannot fail
# because the folder is missing. An empty OUTPUT_FOLDER means the current
# directory, which needs no creation.
if OUTPUT_FOLDER:
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

try:
    for input_file_name in list_file:
        input_path = os.path.join(INPUT_FOLDER, input_file_name)

        with open(input_path, "r", encoding="utf-8") as f_in:
            for line in f_in:
                # Blank and whitespace-only lines are dropped.
                if not line.strip():
                    continue

                # The last line of an input file may lack a newline; without
                # one, the first record of the next input file would be
                # appended to it on the same output line.
                if not line.endswith("\n"):
                    line += "\n"

                # Roll over to a new batch file on the first line and
                # whenever the current batch is full.
                if f_out is None or line_count >= MAX_LINE_BATCH:
                    if f_out:
                        f_out.close()
                    output_path = os.path.join(OUTPUT_FOLDER, f"{PREFIX_OUTPUT_FILE}_{idx_batch}.jsonl")
                    f_out = open(output_path, "w", encoding="utf-8")

                    print(f"Creating batch {idx_batch}...")
                    idx_batch += 1
                    line_count = 0

                f_out.write(line)
                line_count += 1
finally:
    # Close the last batch even if reading or writing failed part-way.
    if f_out:
        f_out.close()

Creating batch 1...
Creating batch 2...
Creating batch 3...


In [70]:
# Sanity check: list the generated files and report each one's line count.
list_output_file = sorted(os.listdir(OUTPUT_FOLDER))
print(list_output_file)

for file in list_output_file:
    output_file_path = os.path.join(OUTPUT_FOLDER, file)
    n_lines = count_lines(output_file_path)
    print(f"{file}: {n_lines}")

['movie_12_31_2025_1.jsonl', 'movie_12_31_2025_2.jsonl', 'movie_12_31_2025_3.jsonl']
movie_12_31_2025_1.jsonl: 500000
movie_12_31_2025_2.jsonl: 500000
movie_12_31_2025_3.jsonl: 144617
