In [9]:
import re
import os
import random
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import sys

sys.path.append(os.path.abspath(".."))
from tools.string_utils import read_text_file
from tools.json_utils import load_json, save_json
from tools.api import call_api

In [10]:
# --- Configuration for the paraphrase stage ---------------------------------
# JSON file read into `inputs` below, the JSON file that run() saves the
# annotated `inputs` to, and the text file holding the prompt template.
PARAPHRASE_INPUT_PATH  = '../data/final_answer_generated_rephrased.json'
PARAPHRASE_OUTPUT_PATH = '../data/final_answer_generated_rephrased_paraphrased.json'
PARAPHRASE_PROMPT_PATH = '../prompts/paraphrase.txt'

# Thread-pool size used by run() for concurrent call_api requests.
SENTENCE_ORDER_CHANGER_NUM_WORKERS = 4
# Upper bound on how many entries of `inputs` run() processes (it slices with this).
SENTENCE_ORDER_CHANGER_MAX_GEN_TIMES = 100
# Sampling temperature forwarded to call_api; overridable through the
# TEMPERATURE environment variable, otherwise 0.6.
TEMPERATURE = float(os.getenv("TEMPERATURE", 0.6))

# run() writes the output file each time the success count is a multiple of this.
save_interval = 10

# Loaded once at cell execution. run() mutates the objects inside `inputs`
# in place, so re-run this cell to get a pristine copy.
inputs = load_json(PARAPHRASE_INPUT_PATH)
# Prompt template; run() substitutes the literal '[[CONTEXT]]' placeholder in it.
sentence_changer_prompt = read_text_file(PARAPHRASE_PROMPT_PATH)

Loaded 10 items from ../data/final_answer_generated_rephrased.json


In [11]:
def expand_numbers_and_ranges(numbers_and_ranges):
    """
    Expand a mix of single numbers and dash-separated ranges into integers.

    Args:
        numbers_and_ranges (iterable of str): Tokens such as '7' or '3-5'.
            A range is inclusive on both ends and may be given in either
            order ('5-3' is treated like '3-5').

    Returns:
        list of int: The distinct values, sorted ascending.

    Raises:
        ValueError: If a token is not an integer, or a range token does not
            split into exactly two integers.
    """
    unique_values = set()
    for token in numbers_and_ranges:
        if '-' not in token:
            # Plain number.
            unique_values.add(int(token))
            continue
        # Range token: unpacking enforces exactly two bounds.
        low, high = (int(bound) for bound in token.split('-'))
        unique_values.update(range(min(low, high), max(low, high) + 1))
    return sorted(unique_values)

def parse_transformations(text):
    """
    Parses the transformed questions and their metadata from the provided text.

    Args:
        text (str): The input text containing the transformed questions.

    Returns:
        list of dict: A list of dictionaries, each containing the question number,
                      transformed question, and the same-meaning-with-origin value.
    """
    # Regex pattern to match each transformation block
    pattern = re.compile(r'''
        \s*<question-\d+>                               # Match the question number and opening tag
        \s*<transformed-question>(.*?)</transformed-question> # Capture the transformed question
        \s*<same-meaning-with-origin>(True|False)</same-meaning-with-origin> # Capture the boolean value
        \s*</question-\d+>                                   # Match the closing tag
        ''', re.DOTALL | re.VERBOSE)

    transformations = []

    # Find all matches in the text
    for match in pattern.finditer(text):
        transformed_question = match.group(1).strip()
        same_meaning = match.group(2).strip() == 'True'
        
        # Extract the question number from the matched string
        question_tag = re.search(r'<question-(\d+)>', match.group(0))
        question_number = int(question_tag.group(1)) if question_tag else None

        transformations.append({
            'question_number': question_number,
            'transformed_question': transformed_question,
            'same_meaning_with_origin': same_meaning
        })

    return transformations

def _build_paraphrase_context(objective_facts, rephrased_questions):
    """Render the clues and question/answer pairs substituted for '[[CONTEXT]]'.

    Returns:
        str | None: The context text, or None when the group is considered
        already processed, i.e. one of its questions carries a
        'paraphrased-question' key (this stage's own output) or a
        'reordered-question' key (the marker the original check looked for).
    """
    parts = ["Given clues:\n"]
    parts.extend(f"{idx}. {clue}\n" for idx, clue in enumerate(objective_facts, start=1))
    parts.append("\n")
    parts.append("Questions and Answers: \n")
    for j, question in enumerate(rephrased_questions, start=1):
        if 'paraphrased-question' in question or 'reordered-question' in question:
            return None
        parts.append(f"<question-{j}>{question['result']}</question-{j}>\n")
        parts.append(f"<answer-{j}>{question['answer']}</answer-{j}>\n")
    parts.append("\n")
    return "".join(parts)


def _apply_paraphrases(rephrased_questions, response_text):
    """Write the parsed paraphrases from one model response onto its question group."""
    by_number = {
        parsed['question_number']: parsed
        for parsed in parse_transformations(response_text)
    }
    # Questions were numbered from 1 when the prompt was built; numbers the
    # response did not return are left untouched.
    for j, question in enumerate(rephrased_questions, start=1):
        parsed = by_number.get(j)
        if parsed is None:
            continue
        question['paraphrased-question'] = parsed['transformed_question']
        question['paraphrased-same-meaning-with-origin-llmCheck'] = parsed['same_meaning_with_origin']


def run():
    """
    Paraphrase every 'rephrased-questions' group in `inputs` and save the result.

    For each of the first SENTENCE_ORDER_CHANGER_MAX_GEN_TIMES entries of
    `inputs`, one prompt per proposed-question type is submitted to `call_api`
    on a thread pool. As responses complete, the parsed paraphrases are written
    in place onto the question dicts, and `inputs` is saved to
    PARAPHRASE_OUTPUT_PATH every `save_interval` successes and once at the end.

    A request that raises, or that returns something other than a string, is
    reported and skipped instead of aborting the run, so the returned success
    count can be lower than the task count.

    Only the 'rephrased-questions' key is processed; 'rephrased-questions-part'
    and 'rephrased-questions-hybrid' groups, if present, are ignored.

    Returns:
        tuple[int, int]: (number of tasks applied successfully, number of tasks submitted).
    """
    success_num = 0
    last_saved_num = None
    # Maps each future straight to the question group it will annotate, so
    # completed futures are resolved in O(1) instead of a list scan.
    future_to_questions = {}

    with ThreadPoolExecutor(max_workers=SENTENCE_ORDER_CHANGER_NUM_WORKERS) as executor:
        for cur_input in inputs[:SENTENCE_ORDER_CHANGER_MAX_GEN_TIMES]:
            questions = cur_input['proposed-questions']
            objective_facts = cur_input['objective-facts']

            for proposed_question_dict in questions.values():
                if 'rephrased-questions' not in proposed_question_dict:
                    continue
                rephrased_questions = proposed_question_dict['rephrased-questions']
                context = _build_paraphrase_context(objective_facts, rephrased_questions)
                if context is None:
                    # Already annotated (e.g. run() was called twice in this kernel).
                    continue
                prompt = sentence_changer_prompt.replace('[[CONTEXT]]', context)
                future = executor.submit(call_api, prompt, TEMPERATURE)
                future_to_questions[future] = rephrased_questions

        all_num = len(future_to_questions)
        for future in tqdm(as_completed(future_to_questions), total=all_num, desc="Generating", dynamic_ncols=True):
            try:
                # The future is already done here, so no timeout is needed.
                response = future.result()
            except Exception as exc:
                print(f'Paraphrase request failed and was skipped: {exc!r}')
                continue
            if not isinstance(response, str):
                print(f'Paraphrase request returned no text and was skipped: {response!r}')
                continue

            _apply_paraphrases(future_to_questions[future], response)

            success_num += 1
            if success_num % save_interval == 0:
                print(f'Saving {success_num}/{all_num} outputs to {PARAPHRASE_OUTPUT_PATH}.')
                save_json(inputs,PARAPHRASE_OUTPUT_PATH)
                last_saved_num = success_num

    # Final save: skipped when the last interval save already covers every
    # success, but always performed if the output file does not exist yet.
    has_unsaved_results = success_num and success_num != last_saved_num
    if has_unsaved_results or not os.path.exists(PARAPHRASE_OUTPUT_PATH):
        print(f'Saving {success_num}/{all_num} outputs to {PARAPHRASE_OUTPUT_PATH}.')
        save_json(inputs,PARAPHRASE_OUTPUT_PATH)

    return success_num, all_num

In [12]:
run()

Generating:  33%|███▎      | 10/30 [00:17<00:29,  1.46s/it]

Saving 10/30 outputs to ../data/final_answer_generated_rephrased_paraphrased.json.
Saved 10 items to ../data/final_answer_generated_rephrased_paraphrased.json


Generating:  67%|██████▋   | 20/30 [00:32<00:13,  1.32s/it]

Saving 20/30 outputs to ../data/final_answer_generated_rephrased_paraphrased.json.
Saved 10 items to ../data/final_answer_generated_rephrased_paraphrased.json


Generating: 100%|██████████| 30/30 [00:48<00:00,  1.62s/it]

Saving 30/30 outputs to ../data/final_answer_generated_rephrased_paraphrased.json.
Saved 10 items to ../data/final_answer_generated_rephrased_paraphrased.json
Saving 30/30 outputs to ../data/final_answer_generated_rephrased_paraphrased.json.
Saved 10 items to ../data/final_answer_generated_rephrased_paraphrased.json





(30, 30)