In [1]:
import sys
import os

import re
from pathlib import Path



sys.path.append(os.path.abspath(".."))
from tools.api import call_api
from tools.json_utils import save_json, load_json
from tools.string_utils import extract_text_from_pdf, read_text_file

## 1. Data preprocessing

In [None]:
# data_cleaning_prompt = read_text_file("prompts/data_cleaning.txt")
# pdf_path = "../data/sample_data.pdf"

# pdf_text = extract_text_from_pdf(pdf_path)
# cleaned_pdf_text = call_api(data_cleaning_prompt + pdf_text)

prompt = 
You are tasked with cleaning and reformatting a document while keeping all the content 100% the same. 
Follow these instructions exactly: 
1.Remove all unnecessary line breaks (\n) and extra spaces so that the text reads as continuous paragraphs. 
2.Remove all bullet points. Do not change the content, wording, or meaning in any way. The only exception is that bullet points can be replaced with proper paragraph formatting or 
inline lists if needed, but the information must remain unchanged.
3.Retain their numbering, such as 1) 2) and etc
4.Add missing punctuations.



In [26]:
def split_by_questions(text):
    """
    Split into blocks using Q markers like "1)", "2)", ..., "10)"
    """
    pattern = r"(?m)^(?:\d{1,2}\))"
    matches = list(re.finditer(pattern, text))

    chunks = []
    for i in range(len(matches)):
        start = matches[i].start()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        chunk = text[start:end].strip()
        chunks.append(chunk)

    return chunks

def process_chunk_text(chunk, counter):
    lines = []
    for line in chunk.split('\n'):
        sentences = re.split(r'(?<=[.!?])\s+', line)
        new_line = []

        for sent in sentences:
            sent = sent.strip()
            if not sent:
                continue

            # Tag each sentence
            if sent[-1] in ".!?":
                new_line.append(f"{sent[:-1]} [Sen {counter}]{sent[-1]}")
            else:
                new_line.append(f"{sent} [Sen {counter}]")

            counter += 1

        lines.append(" ".join(new_line))

    return "\n".join(lines), counter


In [None]:
#question_set = split_by_questions(cleaned_pdf_text)

#save_json(question_set, 'data/cleaned_questions.json')
# cleaned_questions = load_json('data/cleaned_questions.json')

# counter = 1
# question_no = 1
# chunk_contents = []

# for question in cleaned_questions:
#     q,count = process_chunk_text(question,counter)
#     chunk_contents.append(
#             {
#                 "id": f"doc1_question{question_no}",
#                 "origin_context": question,
#                 "context": q
#             }
#         )
#     question_no += 1

# save_json(chunk_contents, 'data/chunked_questions.json')

chunk_contents = load_json('data/chunked_questions.json')
chunk_contents

Loaded 10 items from data/chunked_questions.json


[{'id': 'doc1_question1',
  'origin_context': '1) Can workers opt out of the savings program? The savings program is a structured initiative designed to provide financial support during emergencies, ensuring workers maintain financial stability and security. Participation is strongly encouraged, as consistent savings contribute to long-term financial well-being.',
  'context': '1) Can workers opt out of the savings program [Sen 1]? The savings program is a structured initiative designed to provide financial support during emergencies, ensuring workers maintain financial stability and security [Sen 2]. Participation is strongly encouraged, as consistent savings contribute to long-term financial well-being [Sen 3].'},
 {'id': 'doc1_question2',
  'origin_context': '2) Are workers allowed to smoke in hostels? Smoking policies in hostels are typically strict, with many accommodations prohibiting smoking indoors to ensure a safe and healthy environment. Workers are advised to follow hostel r

In [None]:
read_text_file('prompts/fact_extractor.txt')


# set the temperature
# TEMPERATURE = float(os.getenv("TEMPERATURE", 0.6))

def process_input(cur_input, fact_extractor_prompt, i):
    try:
        context = cur_input['context']
        cur_fact_extractor_prompt = fact_extractor_prompt.replace('[[CONTEXT]]', context)

        api_result = call_api(cur_fact_extractor_prompt, temperature=0.6)
        print("call_api returned:", type(api_result), api_result)

        fact_extractor_response, _ = api_result
        extracted = extract_objective_facts(fact_extractor_response)
        print("extract_objective_facts returned:", type(extracted), extracted)

        objective_facts, sens = extracted

        result = {
            **cur_input,
            'objective-facts': objective_facts,
            'sens': sens
        }
        return result, i

    except Exception as e:
        print("Exception debug:", repr(e))
        raise
  # or you can return an error result


def extract_objective_facts(text):
    """
    Extracts objective facts and their referenced sentence numbers.

    Parameters:
        text (str): The input text content.

    Returns:
        tuple: A tuple containing two lists.
            - objective_facts: A list of detailed descriptions of the objective facts.
            - sen_numbers: A list of sentence numbers as a formatted string corresponding to each objective fact.
    """
    # Regex pattern to match <detailed-desc> and <sentences-used> blocks
    pattern = r'<detailed-desc>(.*?)</detailed-desc>\s*<sentences-used>\[Sen\s*([^\]]+)\]</sentences-used>'
    
    # Use re.findall to extract all matches
    matches = re.findall(pattern, text, re.DOTALL)
    
    objective_facts = []
    sen_numbers = []

    for desc, sensors in matches:
        # Append detailed description to the objective_facts list
        objective_facts.append(desc.strip())
        
        # Extract all numbers using regex
        numbers = [int(num) for num in re.findall(r'\d+', sensors)]
        # Sort numbers to ensure the ranges are correctly identified
        numbers.sort()
        
        # Process the numbers to detect ranges
        formatted_sens = []
        i = 0
        while i < len(numbers):
            start = numbers[i]
            while i < len(numbers) - 1 and numbers[i] + 1 == numbers[i + 1]:
                i += 1
            end = numbers[i]
            if start == end:
                formatted_sens.append(f"{start}")
            else:
                formatted_sens.append(f"{start}-{end}")
            i += 1
        
        # Create the formatted string
        sen_string = f"{','.join(formatted_sens)}"
        sen_numbers.append(sen_string)
    
    return objective_facts, sen_numbers


In [10]:
fact_extractor_prompt = read_text_file('prompts/fact_extractor.txt')
chunked_questions = load_json('data/chunked_questions.json')

from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import json

all_num, success_num = 0, 0
with ThreadPoolExecutor(max_workers=4) as executor:
    futures = []
    for i, cur_input in enumerate(chunked_questions):
        if 'objective-facts' not in cur_input:
            futures.append(executor.submit(process_input, cur_input, fact_extractor_prompt, i))

    all_num = len(futures)
    for future in tqdm(as_completed(futures), total=len(futures), dynamic_ncols=True):
        result, i = future.result(timeout=10*60)
        if result != None:
            chunked_questions[i] = result
            success_num += 1
            
           



Loaded 10 items from data/chunked_questions.json


 10%|█         | 1/10 [00:03<00:35,  3.98s/it]

An error occurred while processing input doc1_question2: too many values to unpack (expected 2)


 20%|██        | 2/10 [00:05<00:18,  2.31s/it]

An error occurred while processing input doc1_question1: too many values to unpack (expected 2)


 30%|███       | 3/10 [00:07<00:15,  2.17s/it]

An error occurred while processing input doc1_question4: too many values to unpack (expected 2)


 40%|████      | 4/10 [00:08<00:12,  2.02s/it]

An error occurred while processing input doc1_question3: too many values to unpack (expected 2)


 50%|█████     | 5/10 [00:12<00:13,  2.73s/it]

An error occurred while processing input doc1_question5: too many values to unpack (expected 2)


 60%|██████    | 6/10 [00:16<00:11,  3.00s/it]

An error occurred while processing input doc1_question7: too many values to unpack (expected 2)


 70%|███████   | 7/10 [00:17<00:07,  2.46s/it]

An error occurred while processing input doc1_question8: too many values to unpack (expected 2)


 80%|████████  | 8/10 [00:18<00:03,  1.88s/it]

An error occurred while processing input doc1_question6: too many values to unpack (expected 2)


 90%|█████████ | 9/10 [00:18<00:01,  1.39s/it]

An error occurred while processing input doc1_question9: too many values to unpack (expected 2)


100%|██████████| 10/10 [00:21<00:00,  2.18s/it]

An error occurred while processing input doc1_question10: too many values to unpack (expected 2)



