In [8]:
import re
import os
import random
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import sys

sys.path.append(os.path.abspath(".."))
from tools.string_utils import read_text_file
from tools.json_utils import load_json, save_json
from tools.api import call_api

In [9]:
TEMPERATURE = float(os.getenv("TEMPERATURE", 0.6))
REPHRASE_GENERATOR_PART_INPUT_PATH = '../data/final_answer_generated_rephrased.json'
REPHRASE_GENERATOR_PART_PROMPT_PATH = '../prompts/rephrase_generator_part_content.txt'
REPHRASE_GENERATOR_PART_OUTPUT_PATH = '../data/final_answer_generated_rephrased_part.json'

REPHRASE_GENERATOR_MAX_GEN_TIMES = 100
REPHRASE_GENERATOR_NUM_WORKERS = 4
save_interval = 10

inputs = load_json(REPHRASE_GENERATOR_PART_INPUT_PATH)
rephrase_generator_prompt = read_text_file(REPHRASE_GENERATOR_PART_PROMPT_PATH)

Loaded 10 items from ../data/final_answer_generated_rephrased.json


In [10]:
def expand_numbers_and_ranges(numbers_and_ranges):
    expanded_numbers = []
    for item in numbers_and_ranges:
        if '-' in item:  # It's a range like 'xx1-xx2'
            start, end = map(int, item.split('-'))
            if start > end:
                start, end = end, start
            expanded_numbers.extend(range(start, end + 1))
        else:  # It's a single number
            expanded_numbers.append(int(item))
    expanded_numbers = list(sorted(list(set(expanded_numbers))))
    return expanded_numbers


def parse_transformations(text):
    # Regex pattern to match each transformation block
    pattern = re.compile(r'''
        ^\s*(\d+)\.\s*<transformed-action>(.*?)</transformed-action>\s*  # Capture the transformation type
        <transformed-explanation>(.*?)</transformed-explanation>\s*  # Capture the explanation
        <transformed-question>(.*?)</transformed-question>\s*        # Capture the result
        <transformed-answer>(.*?)</transformed-answer>               # Capture the answer
        ''', re.MULTILINE | re.DOTALL | re.VERBOSE)
    
    transformations = []
    
    # Find all matches in the text
    matches = pattern.findall(text)
    
    for match in matches:
        transformation = {
            'transformation': match[1].strip(),
            'explanation': match[2].strip(),
            'result': match[3].strip(),
            'answer': match[4].strip()
        }
        transformations.append(transformation)
    
    return transformations

def run():
    all_num, success_num = 0, 0
    tasks = []

    with ThreadPoolExecutor(max_workers=REPHRASE_GENERATOR_NUM_WORKERS) as executor:
        for i, cur_input in enumerate(inputs[:REPHRASE_GENERATOR_MAX_GEN_TIMES]):
            questions = cur_input['proposed-questions']
            objective_facts = cur_input['objective-facts']

            for proposed_question_type, proposed_question_dict in questions.items():
                if 'rephrased-questions-part' in proposed_question_dict and proposed_question_dict['rephrased-questions-part']:
                    continue
                needed_objective_fact_ids = proposed_question_dict['objective-facts']
                needed_objective_fact_ids = re.findall(r'\d+-\d+|\d+', needed_objective_fact_ids)
                needed_objective_fact_ids = expand_numbers_and_ranges(needed_objective_fact_ids)
                needed_objective_factid_2_fact = {idx: objective_facts[idx-1] for idx in needed_objective_fact_ids if idx <= len(objective_facts)}

                context = ""
                # context = "Given clues:\n"
                # for idx, clue in needed_objective_factid_2_fact.items():
                #     context += f"{idx}. {clue}\n"
                # context += "\n"
                context += f"Original Question: {proposed_question_dict['question']}\n"
                context += f"Answer: {proposed_question_dict['positive']}\n"
                context += "\n"

                cur_rephrase_generator_prompt = rephrase_generator_prompt.replace('[[CONTEXT]]', context)
                future = executor.submit(call_api, cur_rephrase_generator_prompt, TEMPERATURE)
                tasks.append((future, proposed_question_dict))

        all_num = len(tasks)
        for future_info in tqdm(as_completed([t[0] for t in tasks]), total=len(tasks), desc="Generating", dynamic_ncols=True):
            future = future_info
            idx = [t[0] for t in tasks].index(future)
            if idx == -1:
                raise ValueError("Invalid index.")
            proposed_question_dict = tasks[idx][1]
            try:
                rephrase_generator_response = future.result(timeout=10*60)
                rephrased_questions = parse_transformations(rephrase_generator_response)
                
                if rephrased_questions:
                    proposed_question_dict['rephrased-questions-part'] = rephrased_questions  # inplace update
                    success_num += 1
                    if success_num % save_interval == 0:
                        print(f'Saving {success_num}/{all_num} outputs to {REPHRASE_GENERATOR_PART_OUTPUT_PATH}.')
                        save_json(inputs,REPHRASE_GENERATOR_PART_OUTPUT_PATH)
            except Exception as e:
                print(f"Error processing question: {e}")

    if success_num or not os.path.exists(REPHRASE_GENERATOR_PART_OUTPUT_PATH):
        print(f'Saving {success_num}/{all_num} outputs to outputs to {REPHRASE_GENERATOR_PART_OUTPUT_PATH}.')
        save_json(inputs,REPHRASE_GENERATOR_PART_OUTPUT_PATH)

    return success_num, all_num

In [11]:
run()

Generating:  33%|███▎      | 10/30 [00:43<01:18,  3.93s/it]

Saving 10/30 outputs to ../data/final_answer_generated_rephrased_part.json.
Saved 10 items to ../data/final_answer_generated_rephrased_part.json


Generating:  67%|██████▋   | 20/30 [01:26<00:32,  3.29s/it]

Saving 20/30 outputs to ../data/final_answer_generated_rephrased_part.json.
Saved 10 items to ../data/final_answer_generated_rephrased_part.json


Generating: 100%|██████████| 30/30 [02:11<00:00,  4.40s/it]

Saving 30/30 outputs to ../data/final_answer_generated_rephrased_part.json.
Saved 10 items to ../data/final_answer_generated_rephrased_part.json
Saving 30/30 outputs to outputs to ../data/final_answer_generated_rephrased_part.json.
Saved 10 items to ../data/final_answer_generated_rephrased_part.json





(30, 30)