In [1]:
import sys
import os
import re

# Put the parent directory on sys.path; presumably that is where the project-local
# `tools` and `components` packages live — TODO confirm against the repo layout.
sys.path.append(os.path.abspath(".."))
from tools.api import call_api
from tools.json_utils import save_json, load_json
from tools.string_utils import read_text_file
# NOTE(review): a later cell in this notebook defines its own `process_input`,
# which shadows the name imported here.
from components.fact_extractor import process_input

ImportError: attempted relative import beyond top-level package

## 1. Data preprocessing

In [2]:
# Load the chunked questions from disk.
chunk_contents = load_json('data/chunked_questions.json')
# Bare expression as the last line of the cell: the loaded value is shown as the cell output.
chunk_contents

Loaded 10 items from data/chunked_questions.json


[{'id': 'doc1_question1',
  'question': '1) Can workers opt out of the savings program?',
  'origin_context': '1) Can workers opt out of the savings program? The savings program is a structured initiative designed to provide financial support during emergencies, ensuring workers maintain financial stability and security. Participation is strongly encouraged, as consistent savings contribute to long-term financial well-being.',
  'context': 'The savings program is a structured initiative designed to provide financial support during emergencies, ensuring workers maintain financial stability and security [Sen 1]. Participation is strongly encouraged, as consistent savings contribute to long-term financial well-being [Sen 2].'},
 {'id': 'doc1_question2',
  'question': '2) Are workers allowed to smoke in hostels?',
  'origin_context': '2) Are workers allowed to smoke in hostels? Smoking policies in hostels are typically strict, with many accommodations prohibiting smoking indoors to ensure 

In [None]:
# NOTE(review): the return value of this call is discarded, and since it is not the
# last expression of the cell it is not displayed either — confirm it is still needed.
read_text_file('prompts/fact_extractor.txt')
# Temperature forwarded to call_api by process_input (read as a module-level global).
temperature_fact_extractor = 0.9

def process_input(cur_input, fact_extractor_prompt, i):
    """
    Run the fact-extractor prompt for a single item and attach the parsed facts.

    Parameters:
        cur_input (dict): Item to process; its 'context' value is substituted
            into the prompt template.
        fact_extractor_prompt (str): Prompt template containing the
            '[[CONTEXT]]' placeholder.
        i: Caller-supplied index, handed back unchanged on success.

    Returns:
        tuple: (result, i) on success, where result is a new dict holding every
            entry of cur_input plus 'objective-facts' and 'sens'.
            (None, None) if any exception is raised along the way; the error
            is printed rather than propagated.
    """
    try:
        context = cur_input['context']
        filled_prompt = fact_extractor_prompt.replace('[[CONTEXT]]', context)
        api_reply = call_api(filled_prompt, temperature=temperature_fact_extractor)
        facts, sentence_refs = extract_objective_facts(api_reply)

        enriched = {**cur_input, 'objective-facts': facts, 'sens': sentence_refs}
        return enriched, i
    except Exception as e:
        # Best-effort: report the failing item and signal failure to the caller.
        print(f"An error occurred while processing input {cur_input.get('id', 'unknown id')}: {e}")
        return None, None  # or you can return an error result


def extract_objective_facts(text):
    """
    Extracts objective facts and their referenced sentence numbers.

    Each fact is expected as a <detailed-desc>...</detailed-desc> block followed
    (optionally after whitespace) by <sentences-used>[Sen ...]</sentences-used>.
    Every integer found inside the brackets is treated as a sentence number;
    duplicates are ignored and consecutive numbers are collapsed into ranges.

    Parameters:
        text (str): The input text content.

    Returns:
        tuple: A tuple containing two lists of equal length.
            - objective_facts: The stripped description of each objective fact.
            - sen_numbers: For each fact, the sentence numbers as a formatted
              string such as "1-3,5" (empty string if no digits were found).
    """
    # Regex pattern to match <detailed-desc> and <sentences-used> blocks
    pattern = r'<detailed-desc>(.*?)</detailed-desc>\s*<sentences-used>\[Sen\s*([^\]]+)\]</sentences-used>'

    objective_facts = []
    sen_numbers = []

    # re.DOTALL lets a description span multiple lines.
    for desc, sentence_refs in re.findall(pattern, text, re.DOTALL):
        objective_facts.append(desc.strip())

        # De-duplicate before sorting: a repeated number (e.g. "1, 2, 2, 4")
        # would otherwise break range detection and yield "1-2,2,4".
        numbers = sorted({int(num) for num in re.findall(r'\d+', sentence_refs)})
        sen_numbers.append(_format_sentence_ranges(numbers))

    return objective_facts, sen_numbers


def _format_sentence_ranges(numbers):
    """Collapse a strictly increasing list of ints into a string like "1-3,5,7-8"."""
    ranges = []  # list of [start, end] pairs
    for number in numbers:
        if ranges and number == ranges[-1][1] + 1:
            # Extends the current run of consecutive numbers.
            ranges[-1][1] = number
        else:
            ranges.append([number, number])
    return ','.join(
        f"{start}" if start == end else f"{start}-{end}"
        for start, end in ranges
    )


In [18]:
# Load the prompt template and the items to process.
fact_extractor_prompt = read_text_file('prompts/fact_extractor.txt')
chunked_questions = load_json('data/chunked_questions.json')

from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import json

all_num, success_num = 0, 0
failed_ids = []
with ThreadPoolExecutor(max_workers=4) as executor:
    # Remember which item each future belongs to: process_input returns
    # (None, None) on failure, so the index cannot be recovered from its result.
    future_to_index = {}
    for i, cur_input in enumerate(chunked_questions):
        # Skip items that already carry extracted facts.
        if 'objective-facts' not in cur_input:
            future = executor.submit(process_input, cur_input, fact_extractor_prompt, i)
            future_to_index[future] = i
    futures = list(future_to_index)

    all_num = len(futures)
    for future in tqdm(as_completed(futures), total=all_num, dynamic_ncols=True):
        i = future_to_index[future]
        result, _ = future.result(timeout=10*60)
        if result is not None:
            chunked_questions[i] = result
            success_num += 1
        else:
            # Item is left unchanged in chunked_questions; record it for the summary.
            failed_ids.append(chunked_questions[i].get('id', i))

# Report the outcome so silent failures are visible in the cell output.
print(f"Fact extraction succeeded for {success_num}/{all_num} items")
if failed_ids:
    print(f"Failed items: {failed_ids}")

Loaded 10 items from data/chunked_questions.json


100%|██████████| 10/10 [00:23<00:00,  2.32s/it]


In [20]:
# Write the (possibly partially updated) list to 'fact_extracted.json'.
# NOTE(review): relative path — presumably resolved against the notebook's working
# directory rather than data/; confirm this is the intended location.
save_json(chunked_questions, 'fact_extracted.json')

Saved 10 items to fact_extracted.json
