# Parse documents from S3

In [None]:
%run Bucket.ipynb
%run Firebase.ipynb

In [23]:
from config import settings
import os
from tqdm import tqdm
# Important: understand in detail what SmartPDFLoader does, for the presentation
from llama_index.readers.smart_pdf_loader import SmartPDFLoader

In [19]:
# S3 input bucket name, read from the project settings object.
AWS_S3_INPUT_BUCKET = settings.aws_s3_input_bucket

In [None]:
# Handle on the input bucket
bucket = Bucket(AWS_S3_INPUT_BUCKET)

# Parsing service: SmartPDFLoader is pointed at the parser API served by this host
ec2_instance_ip = "3.18.101.52"
llmsherpa_api_url = "http://" + ec2_instance_ip + "/api/parseDocument?renderFormat=all"
pdf_loader = SmartPDFLoader(llmsherpa_api_url=llmsherpa_api_url)

# Local folder that receives the files downloaded from the bucket
local_directory = './s3_files/'

# Ask the bucket for its object listing; parsed output will be collected per object key
response = bucket.list_objects()
documents = {}

print(f"Found {len(response)} objects")
document_titles = [entry['Key'] for entry in response]
print("The documents in bucket are the following:", document_titles)

# Keys that have been parsed so far
processed_document_titles = []

In [None]:
# Display the parser endpoint URL as the cell output.
llmsherpa_api_url

In [None]:

# `time.sleep` is used below but no visible cell imports `time`; import it here
# so this cell does not depend on what the %run notebooks happen to bring in.
import time

# Download every listed object, parse it with SmartPDF and keep the result
# keyed by its S3 object key.
if response:
    for obj in tqdm(document_titles):
        # Download the file from S3
        pdf_file_path = bucket.download_object(obj, local_directory, return_file_path=True)

        # Process the file with SmartPDF
        print(f"Loading {obj} with SmartPDF...")
        document = pdf_loader.load_data(pdf_file_path)

        print(f"Loaded {obj} with SmartPDF.")
        # Upload document HERE and create a dict that identifies each llama parsed document with its origin PDF
        documents[obj] = document
        processed_document_titles.append(obj)
        # Pause between files before sending the next request to the parser.
        time.sleep(2)
else:
    print("No files found in the S3 bucket.")

In [25]:
# Add doc title to metadata from documents dictionary: every parsed chunk is
# tagged with the S3 key of the PDF it came from. A plain loop is used because
# this is a pure side effect (no result list is needed).
for key, docs in documents.items():
    for doc in docs:
        doc.metadata['title'] = key

In [78]:
# Flatten the per-PDF lists of parsed chunks into one list.
parsed_documents = []
for chunk_group in documents.values():
    parsed_documents.extend(chunk_group)

In [80]:
from llama_index.core import Document


# Rebuild Document objects from parsed_docs/docs.txt, which holds one dict
# repr per line (the format produced by writing `d.__dict__` for each doc).
pending_docs = []
with open('parsed_docs/docs.txt', 'r') as file:
    for line in file:
        if not line.strip():
            # A blank line would make eval raise SyntaxError; skip it.
            continue
        # SECURITY NOTE(review): eval executes whatever expression is on the
        # line. This is only acceptable while docs.txt is a trusted, locally
        # generated file; consider a real serialisation format (e.g. JSON).
        pending_docs.append(eval(line))

pending_docs = [Document(**doc) for doc in pending_docs]

In [None]:
# Fetch the ids Firebase already holds.
firebase = Firebase()
stored_docs = firebase.get_all_document_ids()

# Keep only the pending documents whose id is not stored yet.
docs_to_store = []
for candidate in pending_docs:
    if candidate.id_ in stored_docs:
        continue
    docs_to_store.append(candidate)

In [None]:
# Send the documents selected in `docs_to_store` to Firebase.
firebase.upload_documents(docs_to_store)

In [None]:
# Ensure the output folder exists, then write one `__dict__` repr per line.
os.makedirs('parsed_docs', exist_ok=True)
with open('parsed_docs/docs.txt', 'w') as file:
    file.writelines(f"{stored.__dict__}\n" for stored in docs_to_store)

# Generate Evaluation questions

In [3]:
# Get from ChatGPT questions for given input documents
import openai
from tqdm import tqdm
import os

class QuestionGenerator:
    """Generate Spanish questions about LlamaIndex documents with OpenAI chat completions."""

    def __init__(self):
        """Read the API key from the environment and create one reusable client.

        Raises:
            KeyError: if the ``OPENAI_API_KEY`` environment variable is not set.
        """
        print("Initiating Question Generator with OpenAI")
        openai.api_key = os.environ["OPENAI_API_KEY"]
        # One client for the lifetime of the generator instead of a new one
        # (and a new HTTP connection pool) for every document.
        self._client = openai.OpenAI()

    def generate_question_from_document(self, llama_document) -> str:
        """Ask the chat model for one question about the text of *llama_document*.

        Args:
            llama_document: object exposing ``get_text()``; its text is sent
                to the model as part of the user prompt.

        Returns:
            The text of the first completion choice, or ``""`` when the API
            returns no text content.
        """
        # Step 1: Get the content from the LlamaIndex document
        document_content = llama_document.get_text()
        print("Generating a question for the following text:", document_content)

        # Step 2: Prepare the prompt to ask GPT to generate a question
        system = "You are a helpful assistant that generates questions in Spanish given some texts. The questions must: \n1. Not repeat.\n2. Be only about something in the text.\n3. If not question can be made from the text, return an empty value like ''."
        prompt = f"Read the following document and generate a relevant question about its content in Spanish:\n\n{document_content}"
        messages = [
            {"role": "system", "content": system},
            {"role": "user", "content": prompt},
        ]

        # Step 3: Use OpenAI to generate the question
        response = self._client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
        )

        # The message content is optional in the API response; normalise a
        # missing value to "" so the annotated return type holds.
        return response.choices[0].message.content or ""

    def generate_questions_from_documents(self, llama_documents) -> list:
        """Return one generated question per document, in input order (with a progress bar)."""
        return [
            self.generate_question_from_document(llama_document)
            for llama_document in tqdm(llama_documents)
        ]


In [None]:
# Instantiate the generator; its constructor reads OPENAI_API_KEY from the environment.
question_generator = QuestionGenerator()

In [8]:
# List the entries of the local s3_files directory (the folder the parsing
# loop downloads into); these names are later used as Firebase document titles.
files_detected = os.listdir('s3_files')

In [None]:
documents_to_eval = {}
firebase = Firebase()
# For every detected file, fetch its stored documents and generate questions for them
for file in files_detected:
    print("Building documents and questions for: ", file)
    entry = {}
    documents_to_eval[file] = entry
    entry['documents'] = firebase.get_all_documents(limit=50, document_title=file)
    entry['questions'] = question_generator.generate_questions_from_documents(entry['documents'])

In [None]:
# Dump each file's documents and generated questions for inspection.
for file, vals in documents_to_eval.items():
    print("File: ", file)
    for label, field in (("Docs: ", 'documents'), ("Questions: ", 'questions')):
        print(label, vals[field])

In [14]:
# Create the local 'evaluation' folder if it does not exist yet; the questions file is written there.
os.makedirs('evaluation', exist_ok=True)

In [57]:
from collections import Counter

# Get only top 5 questions from each document.
# The system prompt asks the model to return an empty value (like '') when no
# question can be made, so empty, whitespace-only or quote-only answers are
# dropped before counting; otherwise they could take one of the 5 slots.
for d in documents_to_eval.values():
    count = Counter(
        q for q in d['questions']
        if q and q.strip().strip("'\"")
    )
    top_questions = count.most_common(5)
    d['top_questions'] = [question for question, _ in top_questions]

In [None]:
import itertools


def get_values(data):
    """Return every 'top_questions' entry of the per-file records in *data* as one flat list."""
    return list(itertools.chain.from_iterable(d['top_questions'] for d in data.values()))


questions = get_values(documents_to_eval)

# De-duplicate while keeping first-seen order, so the questions file comes out
# in the same order on every run (a set gives an arbitrary order).
questions = list(dict.fromkeys(questions))

print("Number of questions: ", len(questions))

In [64]:
# Persist the questions, one per line.
with open('evaluation/eval_questions.txt', 'w') as file:
    file.writelines(f"{question}\n" for question in questions)

In [None]:
# Upload the questions file to the 'rag-outputs-pdf' bucket.
# NOTE: this rebinds `bucket`, which earlier in the notebook referred to the input bucket.
# Presumably the arguments are (local path, destination key) — confirm against Bucket.upload_object.
bucket = Bucket('rag-outputs-pdf')
bucket.upload_object('evaluation/eval_questions.txt', 'evaluation_questions.txt')