In [2]:
import os
import glob
import json
import re

def process_json_line(line, max_length=500):
    """
    Parse one JSONL record and turn it into a training example, or reject it.

    A record is kept only when it is marked correct, its "code" field is a
    non-empty list whose first entry is a string, and that string contains at
    least one boxed answer terminated by a dollar sign. The kept text is cut
    after the third boxed answer (or the last one when there are fewer), then
    capped at ``max_length`` characters.

    Args:
        line (str): A line from the JSONL file.
        max_length (int): Maximum number of characters kept in the "code" field.

    Returns:
        dict or None: The parsed record with "code" replaced by the trimmed
        string and a "system" prompt added, or None when the line is invalid
        JSON, is not a JSON object, or fails any of the requirements above.
    """
    try:
        data = json.loads(line)
    except json.JSONDecodeError as e:
        print(f"Skipping invalid JSON line: {e}")
        return None

    # Valid JSON that is not an object (list, string, number, null) has no
    # fields to inspect; skip it instead of crashing on `.get`.
    if not isinstance(data, dict):
        print("Skipping JSON line that is not an object")
        return None

    # 1. Only use examples where score is True.
    #    The score is normally a list whose first element is the verdict; an
    #    empty list or a missing key counts as "not correct", and a bare scalar
    #    is taken at face value rather than raising on the `[0]` lookup.
    score = data.get("score")
    if isinstance(score, (list, tuple)):
        is_correct = bool(score) and bool(score[0])
    else:
        is_correct = bool(score)
    if not is_correct:
        return None

    # 2. Cut out thoughts after the last \boxed
    code_field = data.get("code")
    if not (isinstance(code_field, list) and code_field and isinstance(code_field[0], str)):
        return None
    code_str = code_field[0]

    # Find every "\boxed{...}$"; the lazy ".*?" runs up to the first "}$", and
    # "." does not cross newlines, so each answer must sit on a single line.
    boxed_matches = list(re.finditer(r"\\boxed\{.*?\}\$", code_str))
    if not boxed_matches:
        return None

    # Keep at most the first three boxed answers: with more than three matches
    # the cut falls after the third, otherwise after the last one found.
    max_boxed_kept = 3
    cutoff = boxed_matches[:max_boxed_kept][-1]
    # `end()` is already one past the closing "$"; the extra +1 keeps one more
    # character after it. NOTE(review): presumably meant to retain trailing
    # punctuation -- confirm this is intended.
    code_str = code_str[:cutoff.end() + 1]

    # 3. Cap "code" at max_length characters and require a boxed answer.
    #    A string that was not truncated always contains one (checked above),
    #    so the substring test is only needed after truncation.
    #    NOTE(review): truncation removes the tail, i.e. the final answer kept
    #    by step 2; the record then survives only if an earlier "\boxed{" is
    #    still present, and the text may end mid-sentence.
    if len(code_str) > max_length:
        code_str = code_str[:max_length]
        if "\\boxed{" not in code_str:
            return None

    data["code"] = code_str  # Convert code list[0] to string

    # 4. Add "system" key.
    #    This is a plain string, not a format template, so the braces must not
    #    be doubled: the prompt should read "\boxed{}".
    data["system"] = "Please reason step by step, and put your final answer within \\boxed{}."

    return data

def combine_jsonl_files(input_dirs, output_file, patterns, max_thought_lengths):
    """
    Merge the accepted records of many JSONL shards into a single JSONL file.

    Every directory in ``input_dirs`` is searched with every glob pattern in
    ``patterns``; each pattern is paired positionally with a length limit from
    ``max_thought_lengths``. Matching files are read in sorted order, each line
    is passed to ``process_json_line`` with that limit, and every record it
    returns is written to ``output_file`` as one JSON line.

    Args:
        input_dirs (list[str]): Directories to search.
        output_file (str): Path of the combined file; overwritten if present.
        patterns (list[str]): File name patterns, joined onto each directory.
        max_thought_lengths (list[int]): Length limit for the matching pattern.
    """
    with open(output_file, 'w', encoding='utf-8') as sink:
        for directory in input_dirs:
            for glob_pattern, length_cap in zip(patterns, max_thought_lengths):
                # Sorted so the output order does not depend on the file system.
                # A pattern with no matches simply contributes nothing.
                matched_paths = sorted(glob.glob(os.path.join(directory, glob_pattern)))

                for shard_path in matched_paths:
                    with open(shard_path, 'r', encoding='utf-8') as source:
                        for raw_line in source:
                            record = process_json_line(raw_line, length_cap)
                            if not record:
                                # Rejected or unparsable line.
                                continue
                            sink.write(json.dumps(record) + '\n')

    print(f"All files have been combined into {output_file}")

def generate_patterns(thought_sizes, max_thought_length, batch_start=0, batch_end=7500, step=500):
    """
    Build the shard file names and their matching length limits.

    For every thought size, one file name is produced per batch window
    ``[start, start + step)`` with ``start`` running from ``batch_start`` up to
    (but excluding) ``batch_end``. Each name is paired with the limit
    ``thought * max_thought_length``.

    Args:
        thought_sizes: Thought counts to generate names for.
        max_thought_length (int): Length allowance per thought.
        batch_start (int): First batch offset.
        batch_end (int): Exclusive upper bound for batch offsets.
        step (int): Batch width; also the distance between offsets.

    Returns:
        tuple[list[str], list[int]]: File names and, index-aligned, their limits.
    """
    # Collect (file name, limit) pairs in one pass, thought size outermost.
    pairs = [
        (
            f"train_qwen25-math-cot_-1_seed0_t1.0_thoughts{thought}_data_collection_s{start}_e{end}.jsonl",
            thought * max_thought_length,
        )
        for thought in thought_sizes
        for start in range(batch_start, batch_end, step)
        for end in (start + step,)  # binds the window's upper edge
    ]

    patterns = [file_name for file_name, _ in pairs]
    max_thought_lengths = [limit for _, limit in pairs]
    return patterns, max_thought_lengths

# Thought sizes to merge; each value selects the shards named "...thoughts{N}...".
batch_sizes = [16, 32, 64, 128]
# Per-thought allowance; generate_patterns multiplies it by the thought size.
max_thought_length = 500

# Shards are looked up directly inside each of these directories.
input_dirs = [
    "llama-3-1-8b-instruct/math_eval/math",
    "llama-3-1-8b-instruct/math_eval/math/no_planner_data_collection",
]
output_dir = "llama-3-1-8b-instruct/math_eval/math_thoughts"
output_file = "llama-3-1-8b-instruct/math_eval/math_thoughts/combined_thoughts.jsonl"

patterns, max_thought_lengths = generate_patterns(batch_sizes, max_thought_length)
# The output file is opened for writing, so its directory has to exist first.
os.makedirs(output_dir, exist_ok=True)
combine_jsonl_files(input_dirs, output_file, patterns, max_thought_lengths)

All files have been combined into llama-3-1-8b-instruct/math_eval/math_thoughts/combined_thoughts.jsonl


In [3]:
# Same merge as the previous cell, restricted to the 128-thought shards.
batch_sizes = [128]
# Per-thought allowance; generate_patterns multiplies it by the thought size.
max_thought_length = 500

# Shards are looked up directly inside each of these directories.
input_dirs = [
    "llama-3-1-8b-instruct/math_eval/math",
    "llama-3-1-8b-instruct/math_eval/math/no_planner_data_collection",
]
output_dir = "llama-3-1-8b-instruct/math_eval/math_thoughts"
output_file = "llama-3-1-8b-instruct/math_eval/math_thoughts/combined_thoughts_128_only.jsonl"

patterns, max_thought_lengths = generate_patterns(batch_sizes, max_thought_length)
# The output file is opened for writing, so its directory has to exist first.
os.makedirs(output_dir, exist_ok=True)
combine_jsonl_files(input_dirs, output_file, patterns, max_thought_lengths)

All files have been combined into llama-3-1-8b-instruct/math_eval/math_thoughts/combined_thoughts_128_only.jsonl
