In [None]:
import json

In [5]:
import json

def create_new_prompts(input_file: str, output_file: str) -> None:
    """
    Build prompt/answer pairs from a GSM8K-style JSONL file and save them as JSON.

    Each non-blank line of the input is parsed as a JSON object with a
    "question" and an "answer" field. For every usable record one dict is
    produced with two keys:
      - "prompt": a string in the format "Q: {question}\\nA: "
      - "answer": the final answer extracted from the answer field (as a float)

    The final answer is taken to be the text after the last "####" marker in
    the answer string; if there is no marker, the whole (stripped) answer
    string is used. Thousands separators (",") are removed before the float
    conversion so that values such as "1,000" are kept instead of dropped.

    Records that lack a question or an answer, or whose final answer still is
    not numeric, are skipped; the number of skipped records is printed so the
    loss is visible.

    Args:
        input_file (str): Path to the dataset in JSONL format (one JSON
            object per line).
        output_file (str): Path to save the new list of prompt-answer dicts.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    new_entries = []
    skipped = 0

    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            data = json.loads(line)
            prompt_text = data.get("question")
            answer_text = data.get("answer")
            if not (prompt_text and answer_text):
                skipped += 1
                continue

            # split() returns the whole string as its only element when the
            # marker is absent, so [-1] covers both the "####" and the
            # no-marker case.
            final_answer_str = answer_text.split("####")[-1].strip()

            try:
                # Drop thousands separators: float("1,000") would raise.
                final_answer_float = float(final_answer_str.replace(",", ""))
            except ValueError:
                # Still not numeric: skip the record but keep count of it.
                skipped += 1
                continue

            new_entries.append({
                "prompt": "Q: " + prompt_text + "\nA: ",
                "answer": final_answer_float
            })

    # Save the list of prompt-answer dicts to a JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(new_entries, f, indent=2)

    if skipped:
        print(f"Skipped {skipped} entries with a missing field or a non-numeric answer")
    print(f"Saved {len(new_entries)} new prompt-answer entries to {output_file}")


In [6]:
def load_prompts(file_path):
    """
    Read a JSON file and return its deserialized contents.

    Args:
        file_path (str): Path to the JSON file to read.

    Returns:
        The object produced by json.load. For a file written by
        create_new_prompts this is a list of dicts, each with a
        "prompt" and an "answer" key.
    """
    with open(file_path, 'r') as source:
        return json.load(source)

In [7]:
# Source dataset (must already be present in the working directory) and the
# destination file for the generated prompt/answer pairs.
input_file = "Raw_Dataset.json"
output_file = "Processed_Prompts.json"

# Build the prompt/answer pairs from the source dataset and write them to disk.
create_new_prompts(input_file=input_file, output_file=output_file)

Saved 1305 new prompt-answer entries to Processed_Prompts.json


In [8]:
# Round-trip check: read the saved file back and report how many entries it holds.
loaded_prompts = load_prompts(file_path=output_file)
print(f"Loaded {len(loaded_prompts)} prompts from {output_file}")

Loaded 1305 prompts from Processed_Prompts.json
