¡Hay que subir todo esto a GitHub!

# Ingesting base documents

In [2]:
# https://medium.com/@shikhararyan/how-to-create-a-vector-database-using-your-data-from-your-files-01794986ccfe
# https://github.com/nlmatics/llmsherpa#layoutpdfreader
# https://www.llamaindex.ai/blog/mastering-pdfs-extracting-sections-headings-paragraphs-and-tables-with-cutting-edge-parser-faea18870125
# https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/install-docker.html
# https://medium.com/@nanceeezhong/implementing-retrieval-augmented-generation-rag-for-pdf-file-with-llmsherpa-lancedb-and-openai-dfd1e879e915
# https://llamahub.ai/l/readers/llama-index-readers-smart-pdf-loader?from=all

# FOR CLEANING: https://docs.pinecone.io/integrations/llamaindex

# RUN PDF PARSING: docker run -p 80:5001 ghcr.io/nlmatics/nlm-ingestor:latest


### Global imports

In [1]:
from typing import Optional

### Environment variables

In [2]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# OPENAI
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# os.environ only accepts strings: assigning None raises a TypeError, so the
# key is re-exported only when it is actually configured.
if OPENAI_API_KEY is not None:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
else:
    print("WARNING: OPENAI_API_KEY is not set; cells that call OpenAI will fail.")

# AWS
AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_REGION = os.getenv("AWS_REGION")
AWS_S3_OUTPUT_BUCKET = os.getenv("AWS_S3_OUTPUT_BUCKET")
AWS_S3_INPUT_BUCKET = os.getenv("AWS_S3_INPUT_BUCKET")

# FIRESTORE
CREDENTIALS_JSON_NAME = os.getenv("CREDENTIALS_JSON_NAME")
FIRESTORE_COLLECTION = os.getenv("FIRESTORE_COLLECTION")
FIRESTORE_PROJECT = os.getenv("FIRESTORE_PROJECT")
FIRESTORE_DATABASE = os.getenv("FIRESTORE_DATABASE")

# HUGGINGFACE
HF_TOKEN = os.getenv("HF_TOKEN")

# PINECONE
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

## Parsing documents

### Bucket definition

In [3]:
import boto3
import os

class Bucket:
  """
  Thin wrapper around a boto3 S3 client bound to a single bucket.

  Credentials and region are read from the notebook-level AWS_* constants.
  """

  def __init__(self, name:str):
    """
    Create the S3 client for the given bucket.

    Args:
    name (str): Name of the S3 bucket every method operates on.
    """
    self.name = name
    self.client = boto3.client(
        's3',
        aws_access_key_id=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        region_name=AWS_REGION  # Optional
    )

  def list_objects(self):
    """
    List every object stored in the bucket.

    Returns:
    list: The 'Contents' entries (dicts with at least a 'Key') of all result
    pages; an empty list when the bucket has no objects.
    """
    print(f"Listing objects in bucket: {self.name}")
    # A single list_objects_v2 call is truncated to one page of results, so
    # walk all pages with the paginator instead of reading only the first one.
    paginator = self.client.get_paginator('list_objects_v2')
    contents = []
    for page in paginator.paginate(Bucket=self.name):
      contents.extend(page.get('Contents', []))
    return contents

  def download_object(self, key, directory_name, return_file_path=False):
    """
    Download one object into a local directory.

    Args:
    key (str): Object key inside the bucket.
    directory_name (str): Local directory where the file is stored.
    return_file_path (bool): Whether to return the local path of the file.

    Returns:
    str | None: The local file path when return_file_path is True, else None.
    """
    # Create the local directory if it doesn't exist
    os.makedirs(directory_name, exist_ok=True)
    local_file_path = os.path.join(directory_name, key)
    # Keys may contain "/" (S3 "folders"); the nested local directory has to
    # exist before the download can write the file.
    parent_directory = os.path.dirname(local_file_path)
    if parent_directory:
      os.makedirs(parent_directory, exist_ok=True)
    print(f"Downloading object from {self.name}: {key} to {local_file_path}")
    self.client.download_file(self.name, key, local_file_path)
    if return_file_path:
      return local_file_path

  def upload_object(self, local_file_path, key):
    """
    Upload a local file to the bucket under the given key.

    Args:
    local_file_path (str): Path of the file to upload.
    key (str): Destination key inside the bucket.
    """
    print(f"Uploading object from {local_file_path} to {self.name}: {key}")
    self.client.upload_file(local_file_path, self.name, key)
    print(f"Uploaded object from {local_file_path} to {self.name}: {key}")

  def get_object(self, key):
    """
    Read an object fully into memory.

    Args:
    key (str): Object key inside the bucket.

    Returns:
    The content returned by reading the response 'Body' stream.
    """
    print(f"Getting object: {key} from {self.name}")
    response = self.client.get_object(Bucket=self.name, Key=key)
    return response['Body'].read()


### Parse documents from S3

In [None]:
import boto3
import os
from tqdm import tqdm
# Importante saber a detalle que hace SmartPDFLoader para la presentación
from llama_index.readers.smart_pdf_loader import SmartPDFLoader

bucket = Bucket(AWS_S3_INPUT_BUCKET)

# Parsing service (nlm-ingestor). The host can be overridden with the
# LLMSHERPA_HOST environment variable; the default is the original instance.
ec2_instance_ip = os.getenv("LLMSHERPA_HOST", "52.15.212.245")
llmsherpa_api_url = f"http://{ec2_instance_ip}/api/parseDocument?renderFormat=all"

# Define a local directory to temporarily store downloaded files
local_directory = './s3_files/'

# List objects in the S3 bucket
response = bucket.list_objects()
# Maps each S3 key (PDF name) to the list of documents parsed from it
documents = {}

print(f"Found {len(response)} objects")
document_titles = [obj['Key'] for obj in response]
print("The documents in bucket are the following:", document_titles)

# Check if the bucket has any files
if document_titles:
    # The loader is configured only with the service URL, so one instance is
    # shared by every file instead of being rebuilt on each iteration.
    pdf_loader = SmartPDFLoader(llmsherpa_api_url=llmsherpa_api_url)

    for obj in tqdm(document_titles):
        # Download the file from S3
        pdf_file_path = bucket.download_object(obj, local_directory, return_file_path=True)

        # Process the file with SmartPDF
        print(f"Loading {obj} with SmartPDF...")
        document = pdf_loader.load_data(pdf_file_path)

        print(f"Loaded {obj} with SmartPDF.")
        # Keep the parsed documents keyed by the PDF they came from
        documents[obj] = document
else:
    print("No files found in the S3 bucket.")

# Document Storage

In [6]:
from llama_index.core import Document
import firebase_admin
from firebase_admin import credentials, firestore


class Firebase:
  """
  Class to interact with Firebase Firestore.

  Nodes are written through llama-index's FirestoreDocumentStore
  (``upload_documents``) and read back with the plain Firestore client from
  the ``<FIRESTORE_COLLECTION>_data`` collection.
  """

  def __init__(self):
    """
    Initialize the connection to Firebase

    Downloads the service-account JSON from S3, initializes the firebase_admin
    app (tolerating an already-initialized app) and creates the Firestore client.
    """
    print("Initialized connection to Firebase")

    # Initialize Firebase
    firebase_credentials_file_path = self._get_credentials()
    self.cred = credentials.Certificate(firebase_credentials_file_path)

    try:
      firebase_admin.initialize_app(self.cred)
    except ValueError:
      # Raised when the cell is re-run in the same kernel
      print("Firebase app already initialized")

    # Get Firestore client
    # NOTE(review): the client is created without a database argument while
    # upload_documents targets FIRESTORE_DATABASE - confirm both point to the
    # same database.
    self.db = firestore.client()
    self.collection_name_base = FIRESTORE_COLLECTION
    # Presumably the collection where the docstore keeps node data for the
    # namespace used in upload_documents - TODO confirm.
    self.collection_name = self.collection_name_base + "_data"
    self.project = FIRESTORE_PROJECT
    self.database = FIRESTORE_DATABASE

    print(f"Set up connection to Firebase Database {self.database} in project {self.project}")


  def _get_credentials(self) -> str:
    """
    Download the Firebase credentials JSON from the output S3 bucket.

    Returns:
    str: Local path of the downloaded credentials file.
    """
    credentials_json = CREDENTIALS_JSON_NAME
    bucket = Bucket(AWS_S3_OUTPUT_BUCKET)
    firebase_credentials_file_path = bucket.download_object(credentials_json, "credentials", return_file_path=True)
    return firebase_credentials_file_path


  def upload_documents(self, documents) -> None:
    """
    Split documents into nodes and store them in Firestore.

    Args:
    documents: llama-index documents to split with SentenceSplitter and upload.
    """
    # Imported here rather than at cell level, as in the original code
    from llama_index.storage.kvstore.firestore import FirestoreKVStore
    from llama_index.storage.docstore.firestore import FirestoreDocumentStore
    from llama_index.core.node_parser import SentenceSplitter

    nodes = SentenceSplitter().get_nodes_from_documents(documents)
    print(f"Uploading {len(nodes)} nodes to Firebase")

    # NOTE(review): self.cred is a firebase_admin Certificate - confirm that
    # FirestoreKVStore accepts this credentials type.
    kvstore = FirestoreKVStore(
      project=self.project,
      database=self.database,
      credentials=self.cred
    )

    docstore = FirestoreDocumentStore(
        firestore_kvstore=kvstore,
        namespace=self.collection_name_base
    )

    print("Adding nodes to Firebase...")
    docstore.add_documents(nodes)
    print("Nodes loaded!!")


  def get_document(self, document_id:str) -> Optional[dict]:
    """
    Fetch a single stored document by id.

    Args:
    document_id (str): Id of the Firestore document.

    Returns:
    dict | None: The document as a dictionary, or None when it does not exist.
    """
    print(f"Getting document {document_id} from collection {self.collection_name}")
    doc_ref = self.db.collection(self.collection_name).document(document_id)
    doc = doc_ref.get()
    if doc.exists:
        print(f"Document {document_id} found in collection {self.collection_name}")
        return doc.to_dict()
    else:
        print(f"Document {document_id} NOT found in collection {self.collection_name}")
        return None


  def get_all_documents(
    self,
    limit:Optional[int] = None,
    only_paragraph:bool = False,
    document_title:Optional[str] = None
    ) -> list:
    """
    Query the collection and return the results as llama-index documents.

    Args:
    limit (int): Maximum number of documents to fetch; falsy means no limit.
    only_paragraph (bool): Keep only entries whose chunk_type metadata is 'para'.
    document_title (str): Keep only entries whose title metadata equals this value.

    Returns:
    list: llama-index Document objects.
    """
    print(f"Getting all documents from collection {self.collection_name}")
    if limit:
      print(f"Limiting to {limit} documents")

    docs = self.db.collection(self.collection_name)

    if only_paragraph:
      print("Getting only paragraphs in provided documents")
      docs = docs.where('data.metadata.chunk_type', '==', 'para')

    if document_title:
      print("Getting documents from the following file: ", document_title)
      docs = docs.where('data.metadata.title', '==', document_title)

    if limit:
      docs = docs.limit(limit)

    docs = docs.stream()

    documents = [doc.to_dict() for doc in docs]
    return self._documents_to_llama_index_documents(documents)


  def _dict_to_llama_index_document(self, doc_dict):
    """
    Convert one stored dictionary into a llama-index Document.

    Args:
    doc_dict (dict): Dictionary with a 'data' entry holding 'id_', 'text' and 'metadata'.

    Returns:
    Document: The rebuilt llama-index document.

    Raises:
    ValueError: If the dictionary carries no document id.
    """
    # Extract fields from the dictionary; a missing or null 'data' entry is
    # treated as empty so the explicit ValueError below is raised.
    document_data = doc_dict.get('data') or {}
    document_id = document_data.get('id_', None)

    if document_id is None:
        raise ValueError("Document ID not found in the dictionary.")

    text_content = document_data.get('text', '')
    metadata = document_data.get('metadata', {})

    # Create a LlamaIndex Document
    document = Document(
        doc_id=document_id,
        text=text_content,
        metadata=metadata
    )

    return document

  def _documents_to_llama_index_documents(self, documents):
    """
    Convert a list of stored dictionaries into llama-index Documents.

    Args:
    documents (list): Dictionaries as returned by Firestore.

    Returns:
    list: llama-index Document objects, in the same order.
    """
    print("Converting documents to LlamaIndex documents...")
    from tqdm import tqdm
    llama_index_documents = [
        self._dict_to_llama_index_document(doc) for doc in tqdm(documents)
    ]
    print("Conversion successful!")
    return llama_index_documents


AttributeError: partially initialized module 'nltk' has no attribute 'data' (most likely due to a circular import)

## Uploading Documents

In [None]:
# Add doc title to metadata from documents dictionary
# Tag every parsed document with the name of the PDF (S3 key) it came from
for key, docs in documents.items():
  for doc in docs:
    doc.metadata.update({'title': key})

In [None]:
# Flatten the per-file lists into a single list of parsed documents
parsed_documents = [doc for file_docs in documents.values() for doc in file_docs]

In [None]:
# Connect to Firestore and upload the flattened list of parsed documents
firebase = Firebase()
firebase.upload_documents(parsed_documents)

# Evaluation Questions

## Generate and upload questions

In [None]:
# Get from ChatGPT questions for given input documents
import openai
from tqdm import tqdm

class QuestionGenerator:
  """
  Generates evaluation questions in Spanish from llama-index documents
  using the OpenAI chat completions API.
  """

  def __init__(self):
    """
    Configure the OpenAI API key from the OPENAI_API_KEY environment variable.

    Raises:
    KeyError: If OPENAI_API_KEY is not present in the environment.
    """
    print("Initiating Question Generator with OpenAI")
    openai.api_key = os.environ["OPENAI_API_KEY"]
    # Created lazily on first use and then shared by every request
    self._client = None

  def _get_client(self):
    """Return the OpenAI client, creating it on first use and reusing it afterwards."""
    if getattr(self, "_client", None) is None:
      self._client = openai.OpenAI()
    return self._client

  def generate_question_from_document(self, llama_document) -> str:
    """
    Ask the model for one question about the content of a document.

    Args:
    llama_document: Object exposing get_text() with the document content.

    Returns:
    str: The generated question; an empty string when the model returns no content.
    """
    # Step 1: Get the content from the LlamaIndex document
    document_content = llama_document.get_text()
    print("Generating a question for the following text:", document_content)

    # Step 2: Prepare the prompt to ask GPT to generate a question
    system = "You are a helpful assistant that generates questions in Spanish given some texts. The questions must: \n1. Not repeat.\n2. Be only about something in the text.\n3. If not question can be made from the text, return an empty value like ''."
    prompt = f"Read the following document and generate a relevant question about its content in Spanish:\n\n{document_content}"
    messages = [
      {"role": "system", "content": system},
      {"role": "user", "content": prompt},
    ]

    # Step 3: Use OpenAI to generate the question, reusing one client instead
    # of building a new one for every document
    response = self._get_client().chat.completions.create(
      model="gpt-4o-mini",
      messages=messages
    )

    content = response.choices[0].message.content
    # Keep the declared str contract even if the completion carries no content
    return content if content is not None else ''

  def generate_questions_from_documents(self, llama_documents) -> list:
    """
    Generate one question per document.

    Args:
    llama_documents: Iterable of llama-index documents.

    Returns:
    list: Generated questions, in the same order as the input documents.
    """
    return [
      self.generate_question_from_document(llama_document)
      for llama_document in tqdm(llama_documents)
    ]


In [None]:
# Single generator instance shared by the cells below
question_generator = QuestionGenerator()

In [None]:
# NOTE(review): `files_documents` is not defined by any notebook-level cell
# visible here (it only appears as a local inside RAG.set_input_documents);
# confirm where it comes from so this cell survives "Restart & Run All".
# Sorting the unique titles makes the order (and the later [:3] slice) reproducible.
files_detected = sorted({doc.metadata['title'] for doc in files_documents})
print(files_detected)

In [None]:
# Per file: the documents that belong to it and one generated question per document
documents_to_eval = {}
# Only the first three detected files are evaluated
files_detected = files_detected[:3]
input_files = files_documents
for file in files_detected:
  print("Building documents and questions for: ", file)
  entry = documents_to_eval[file] = {}
  # Keep the documents whose title metadata matches the current file
  entry['documents'] = [doc for doc in input_files if doc.metadata['title'] == file]
  entry['questions'] = question_generator.generate_questions_from_documents(entry['documents'])

In [None]:
# Inspect, for every file, the grouped documents and their generated questions
for file, vals in documents_to_eval.items():
  print("File: ", file)
  print("Docs: ", vals['documents'])
  print("Questions: ", vals['questions'])

In [None]:
eval_questions = []
for file, vals in documents_to_eval.items():
  # dict.fromkeys de-duplicates per file like set() did, but keeps the
  # generation order so the resulting list is reproducible. Blank questions
  # (the prompt lets the model answer '' when no question can be made) are dropped.
  unique_questions = dict.fromkeys(q for q in vals['questions'] if q and q.strip())
  eval_questions += list(unique_questions)

print(f"Generated {len(eval_questions)} questions")

In [None]:
# Local folder where the questions file is written before being uploaded
os.makedirs('evaluation', exist_ok=True)

In [None]:
# Explicit UTF-8 so Spanish characters (¿, á, ñ, ...) are written the same way
# on every platform.
with open('evaluation/questions.txt', 'w', encoding='utf-8') as file:
    for q in eval_questions:
        # The file format is one question per line, so collapse any internal
        # line breaks/whitespace runs of a generated question.
        single_line_question = " ".join(q.split())
        file.write(f"{single_line_question}\n")

In [None]:
# Upload to the same output bucket RAG.get_evaluation_questions downloads from;
# falls back to the original bucket name when the variable is not configured.
bucket = Bucket(AWS_S3_OUTPUT_BUCKET or 'rag-outputs-pdf')
bucket.upload_object('evaluation/questions.txt', 'evaluation_questions.txt')

# RAGs

## Base RAG

In [11]:
# Registry (name -> instance) of the RAGs built below, used for the evaluation comparison
rags = {}

In [5]:

class RAG:
  """
    Sets up a RAG with a Pinecone index and a OpenAI LLM model with llama-index.
    This is the BASE version.

    The index is either built from documents stored in Firebase
    (full_refresh=True) or loaded from the local persist directory
    `indexes/<name>_index`.
  """

  # Name of the Pinecone index used by set_pinecone_index.
  # NOTE(review): the subclasses visible in this notebook inherit
  # set_pinecone_index, so they all appear to share this one index - confirm
  # that mixing their vectors is intended.
  PINECONE_INDEX_NAME = "quickstart"
  # Vector dimension used to create the Pinecone index and to build the dummy
  # query vector in get_pinecone_record.
  EMBEDDING_DIMENSION = 1024

  def __str__(self):
    return self.name

  def __init__(self, full_refresh:bool = False, similarity_top_k:int = 4):
    """
    Args:
    full_refresh (bool): Whether to build the index from 0.
    similarity_top_k (int): Number of similar documents to retrieve from the index.
    """
    self.name = "BaseRag"
    self.index_name = f"indexes/{self.name}_index"

    self._setup(full_refresh, similarity_top_k)

  def _setup(
    self,
    full_refresh:bool = False,
    similarity_top_k:int = 4
    ) -> None:
    """
    Sets up the RAG step-by-step.

    Args:
    full_refresh (bool): Whether to build the index from 0.
    similarity_top_k (int): Number of similar documents to retrieve from the index.
    """

    from llama_index.core import (
      VectorStoreIndex,
      StorageContext,
      load_index_from_storage
    )
    from llama_index.vector_stores.pinecone import PineconeVectorStore
    self.log_divider = "\n" + "*"*100 + "\n"

    if not full_refresh:
      print("Not building index from 0...")
      self.set_pinecone_index()
      self.vector_store = PineconeVectorStore(pinecone_index=self.pc_index)
      # Created before the models so a missing persist directory fails fast,
      # before the embedding model is loaded.
      self.storage_context = StorageContext.from_defaults(
          persist_dir=self.index_name,
          vector_store= self.vector_store
      )
      # Queries must be embedded with the same model used to build the index,
      # so the models are configured on the load path as well.
      self.set_up_embedding_model()
      self.set_up_llm_model()
      self.retriever = VectorStoreIndex.from_vector_store(
        self.vector_store
        ).as_retriever(
          similarity_top_k=similarity_top_k
      )
      index = load_index_from_storage(self.storage_context)
    else:
      print("Building index from 0❗️")

      input_documents = self.set_input_documents()
      self.set_pinecone_index()
      self.vector_store = PineconeVectorStore(pinecone_index=self.pc_index)
      self.storage_context = StorageContext.from_defaults(
        vector_store= self.vector_store
      )
      self.set_up_embedding_model()
      self.set_up_llm_model()
      self.retriever = VectorStoreIndex.from_vector_store(
        self.vector_store
        ).as_retriever(
        similarity_top_k=similarity_top_k
      )

      print("\nFinished setting up dependencies. 🎉")

      print("\nBuilding index...")

      index = VectorStoreIndex.from_documents(
          input_documents,
          embed_model=self.embed_model,
          storage_context=self.storage_context,
          text_key="text",
          show_progress=True
        )
      index.storage_context.persist(self.index_name)

    print("\nFinished building index 🚀\n")

    self.set_up_prompt()
    # similarity_top_k is forwarded so the query engine retrieves the same
    # number of documents as self.retriever.
    self.query_engine = index.as_query_engine(
        text_qa_template=self.prompt,
        similarity_top_k=similarity_top_k
    )

    print(self.pc_index.describe_index_stats())
    self.display_prompt_dict()

  def retrieve(self, query:str, only_text:bool = False) -> list:
    """
    Retrieves documents from the vector store given a query.

    Args:
    query (str): The query to retrieve documents for.
    only_text (bool): Whether to return only the text of the documents.

    Returns:
    list: List of dictionaries with the text, score, and source of the document.
    """
    print("Retrieving documents from vector store...")
    results = self.retriever.retrieve(query)

    if only_text:
      print("Returning only text...")
      return [doc.node.text for doc in results]

    return [
        {
            "text": doc.node.text,
            "score": doc.score,
            "source": doc.metadata
        } for doc in results
      ]


  def query(self, query:str):
    """
    Queries the RAG with a given question.

    Args:
    query (str): The question to query the RAG with.

    Returns:
    The object returned by the query engine for this question (it is passed
    through untouched, not converted to a string).
    """
    return self.query_engine.query(query)


  def set_up_prompt(self) -> None:
    """
    Sets up the prompt for the RAG.
    """
    from llama_index.core import PromptTemplate
    qa_template_text = (
        "You will receive a question in Spanish.\n"
        "The question is based in the given context, since it contains the theory on which the question must be answered.\n"
        "Context information is below.\n"
        "---------------------\n"
        "{context_str}\n"
        "---------------------\n"
        "Given the context information and not prior knowledge, "
        "answer the query in Spanish.\n"
        "Query: {query_str}\n"
        "Answer: "
    )
    self.prompt = PromptTemplate(qa_template_text)


  def display_prompt_dict(self) -> None:
    """
    Displays the prompt dictionary for the RAG.
    """
    from IPython.display import Markdown, display
    prompts_dict = self.query_engine.get_prompts()
    for k, p in prompts_dict.items():
        text_md = f"**Prompt Key**: {k}<br>" f"**Text:** <br>"
        display(Markdown(text_md))
        print(p.get_template())
        display(Markdown("<br><br>"))


  def set_up_llm_model(self, chunk_size:Optional[int] = None) -> None:
    """
    Sets up the LLM model for the RAG.

    Args:
    chunk_size (int): The chunk size to use for the LLM model input.
    """
    from llama_index.llms.openai import OpenAI
    from llama_index.core import Settings

    # os.environ only accepts strings; skip the export when the key is missing
    if OPENAI_API_KEY is not None:
      os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
    open_ai_model = "gpt-4o-mini"

    print("Initializing llm model...")
    llm_model = OpenAI(
        model=open_ai_model,
        temperature=0.1,
        max_tokens=1000,
        logprobs=False,
        default_headers={}
    )

    if chunk_size:
      Settings.chunk_size = chunk_size

    Settings.llm = llm_model
    self.llm_model = llm_model
    print("Finished setting up llm model:", llm_model)
    print(self.log_divider)


  def set_input_documents(
    self,
    pdf_files_titles:Optional[list] = None,
    limit_per_file:int = 50,
    max_documents:Optional[int] = 5
    ) -> list:
    """
    Sets up the input documents for the RAG.
    These documents will be downloaded from firebase and used to build the context.
    The documents are llama-index document objects, not .pdf files or such.

    Args:
    pdf_files_titles (list): Titles of the source PDFs to fetch; defaults to the
      three course files used originally.
    limit_per_file (int): Maximum number of documents fetched per file.
    max_documents (int): Maximum number of documents returned in total;
      None returns everything that was downloaded.

    Returns:
    list: List of llama-index documents.
    """
    if pdf_files_titles is None:
      pdf_files_titles = [
        "Tipos de aprendizaje.pdf",
        "Algoritmo de Bayes Ingenuo.pdf",
        "Conceptos de evaluación.pdf"
      ]
    files_documents = []
    firebase = Firebase()

    for file in pdf_files_titles:
      print(f"\nGetting all documents for {file} from Firebase")
      # Take a sample of at most `limit_per_file` documents from each file.
      llama_index_documents = firebase.get_all_documents(
          limit=limit_per_file,
          document_title=file
        )
      print(f"Downloaded {len(llama_index_documents)} documents for {file} from Firebase")
      files_documents += llama_index_documents

    print(f"Downloaded {len(files_documents)} documents from Firebase")
    print(self.log_divider)
    # Slicing with None keeps the whole list
    return files_documents[:max_documents]

  def set_up_embedding_model(self) -> None:
    """
    Sets up the embedding model for the RAG used to encode the documents.
    """
    from llama_index.embeddings.huggingface import HuggingFaceEmbedding
    from llama_index.core import Settings
    import os

    # To force the model to use the HuggingFace model and token
    os.environ.pop("OPENAI_API_KEY", None)

    if HF_TOKEN is not None:
      os.environ["HUGGINGFACEHUB_API_TOKEN"] = HF_TOKEN

    try:
      print("Initializing embedding model...")
      embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-m3")

      Settings.embed_model = embed_model
      self.embed_model = embed_model
    finally:
      # Restore the key even when loading the model fails, otherwise every
      # later OpenAI call in the kernel would break.
      if OPENAI_API_KEY is not None:
        os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
    print("Finished setting up embedding model:", embed_model)
    print(self.log_divider)


  def set_pinecone_index(self) -> None:
    """
      Set up the context used by the RAG with Pinecone.
    """
    from pinecone import Pinecone, ServerlessSpec
    print("Initializing pinecone...")

    pc = Pinecone(api_key=PINECONE_API_KEY)

    pc_index_name = self.PINECONE_INDEX_NAME

    try:
      pc.create_index(
          name=f"{pc_index_name}",
          dimension=self.EMBEDDING_DIMENSION,
          metric="euclidean",
          spec=ServerlessSpec(cloud="aws", region="us-east-1"),
      )
    except Exception as e:
      # Best effort: presumably this is raised when the index already exists.
      # Any other creation error is only printed too and will surface below
      # when the index is opened.
      print(e)

    pc_index = pc.Index(f"{pc_index_name}")
    print("Finished setting up pinecone index:", pc_index.describe_index_stats())
    self.pc_index = pc_index
    print(self.log_divider)


  def get_pinecone_record(self, document_id:str) -> dict:
    """
    Get a record from Pinecone by document ID.

    Args:
    document_id (str): The document ID to retrieve from Pinecone.

    Returns:
    dict: The record from Pinecone.

    Raises:
    IndexError: If no record matches the document ID.
    """
    # The query goes to the Pinecone index itself: the llama-index storage
    # context has no query method. The zero vector is only a placeholder, the
    # metadata filter is what selects the record.
    response = self.pc_index.query(
      filter={"doc_id": {"$eq": document_id}},
      vector=[0] * self.EMBEDDING_DIMENSION,
      top_k=1,
      include_metadata=True
    )
    matches = response["matches"]
    if not matches:
      raise IndexError(f"No Pinecone record found for document {document_id}")
    return matches[0]


  def get_evaluation_questions(self) -> str:
    """
    Downloads the evaluation questions from S3.

    Returns:
    str: The local path to the evaluation questions file.
    """
    bucket = Bucket(AWS_S3_OUTPUT_BUCKET)
    eval_questions_path = bucket.download_object(
      "evaluation_questions.txt",
      "evaluation",
      return_file_path=True
    )
    print("Evaluation questions have been downloaded in the following path:", eval_questions_path)
    return eval_questions_path

# Previous approach, kept for reference: load the persisted index first.
# try:
#   rag = RAG()
# except:
#   print("No index persisted.")
# The base RAG is always rebuilt from scratch here.
rag = RAG(full_refresh=True)

# Smoke test: the RAG is only registered for evaluation when a query succeeds.
try: 
  rag.query("¿Qué es el aprendizaje supervisado?")
  rags[rag.name] = rag
except Exception as e:
  print("Index created incorrectly: ", e)


KeyboardInterrupt: 

### Context Index

In [None]:
import hashlib

def generate_hash(document):
    """
    Compute the SHA-256 fingerprint of a text.

    Args:
    document (str): Text to hash; it is encoded as UTF-8 first.

    Returns:
    str: Hexadecimal digest of the encoded text.
    """
    hasher = hashlib.sha256()
    hasher.update(document.encode('utf-8'))
    return hasher.hexdigest()

In [None]:
def check_document_in_pinecone(doc_hash, pinecone_index=None):
    """
    Tell whether a vector with the given id is stored in a Pinecone index.

    Args:
    doc_hash (str): Id of the vector (the document hash).
    pinecone_index: Index to look in. When omitted, the notebook-level
      `index` variable is used, as in the original version.

    Returns:
    bool: True when the fetch response contains the id.
    """
    # NOTE(review): no cell visible in this notebook defines a global `index`;
    # pass `pinecone_index` explicitly (e.g. rag.pc_index) unless one is
    # defined elsewhere.
    target_index = index if pinecone_index is None else pinecone_index
    response = target_index.fetch(ids=[doc_hash])
    # A fetch response maps the ids that were found under `vectors`; ids that
    # are not stored are simply absent. Both attribute-style and dict-style
    # responses are supported.
    if isinstance(response, dict):
        vectors = response.get('vectors') or {}
    else:
        vectors = getattr(response, 'vectors', None) or {}
    return doc_hash in vectors

In [None]:
# Summarization

## Diferentes tecnicas de retrieval de contexto

### Using Sentence Window Retrieval

In [None]:
class SentenceRetrievalRag(RAG):
  """
    Child class of RAG that sets up a Sentence Retrieval RAG with Pinecone.
    By applying a node parser, the RAG can retrieve sentences instead of
    full documents when building the context or any given query.
    Source: https://docs.llamaindex.ai/en/stable/examples/node_postprocessor/MetadataReplacementDemo/
  """

  def __init__(self, full_refresh:bool = False, similarity_top_k:int = 4):
    """
    Args:
    full_refresh (bool): Whether to build the index from 0.
    similarity_top_k (int): Number of similar documents to retrieve from the index.
    """
    self.name = "SentenceRetrievalRag"
    self.index_name = f"indexes/{self.name}_index"

    self._setup(full_refresh, similarity_top_k)


  def set_node_parser(
      self,
      window_size:int = 4,
      window_metadata_key:str = "window"
  ) -> None:
    """
    Sets up the node parser for the RAG.

    Args:
    window_size (int): The window size to use for the node parser.
    window_metadata_key (str): The metadata key to use for the window.
    """
    from llama_index.core.node_parser import SentenceWindowNodeParser
    from llama_index.core.postprocessor import MetadataReplacementPostProcessor
    from llama_index.core import Settings

    print("Initializing node parser...")
    node_parser = SentenceWindowNodeParser.from_defaults(
        window_size=window_size,
        window_metadata_key=window_metadata_key,
        original_text_metadata_key="text",
    )
    self.node_parser = node_parser
    Settings.node_parser = node_parser

    # Sentence Window postprocessors
    print("Initializing postprocessors for nodes...")
    self.post_processors = [
        # Must read the same metadata key the parser writes the window to
        MetadataReplacementPostProcessor(target_metadata_key=window_metadata_key),
    ]
    print("Finished setting up node parser:", node_parser)
    print(self.log_divider)


  def _setup(
    self,
    full_refresh:bool = False,
    similarity_top_k:int = 4
    ) -> None:
    """
    Sets up the RAG step-by-step.

    Args:
    full_refresh (bool): Whether to build the index from 0.
    similarity_top_k (int): Number of similar documents to retrieve from the index.
    """
    from llama_index.vector_stores.pinecone import PineconeVectorStore
    from llama_index.core import (
      VectorStoreIndex,
      StorageContext,
      load_index_from_storage
    )
    self.log_divider = "\n" + "*"*100 + "\n"

    if not full_refresh:
      print("Not building index from 0...")
      self.set_pinecone_index()
      self.set_node_parser()
      self.vector_store = PineconeVectorStore(pinecone_index=self.pc_index)
      # Created before the models so a missing persist directory fails fast
      self.storage_context = StorageContext.from_defaults(
          persist_dir=self.index_name,
          vector_store= self.vector_store
      )
      # Queries must be embedded with the same model used to build the index
      self.set_up_embedding_model()
      self.set_up_llm_model()
      self.retriever = VectorStoreIndex.from_vector_store(
        self.vector_store
      ).as_retriever(
        similarity_top_k=similarity_top_k
      )
      print("Loading index from storage...")
      index = load_index_from_storage(
          self.storage_context
      )
    else:
      print("Building index from 0❗️")

      input_documents = self.set_input_documents()[:5]
      self.set_pinecone_index()
      self.set_node_parser()
      self.vector_store = PineconeVectorStore(pinecone_index=self.pc_index)
      self.storage_context = StorageContext.from_defaults(vector_store= self.vector_store)
      self.set_up_embedding_model()
      self.set_up_llm_model()
      self.retriever = VectorStoreIndex.from_vector_store(
        self.vector_store
      ).as_retriever(
        similarity_top_k=similarity_top_k
      )

      print("\nFinished setting up dependencies. 🎉")

      print("\nBuilding index...")
      # The VectorStoreIndex constructor indexes nodes as given, so the
      # documents are split into sentence-window nodes explicitly; otherwise
      # the "window" metadata read by the post-processor is never created.
      nodes = self.node_parser.get_nodes_from_documents(input_documents)
      index = VectorStoreIndex(
          nodes,
          storage_context=self.storage_context,
          text_key="text",
          show_progress=True
        )
      index.storage_context.persist(self.index_name)

    print("\nFinished building index 🚀\n")

    self.set_up_prompt()
    self.query_engine = index.as_query_engine(
        text_qa_template=self.prompt,
        node_postprocessors=self.post_processors,
        similarity_top_k=similarity_top_k
    )

    print(self.pc_index.describe_index_stats())
    self.display_prompt_dict()


# Load the persisted index when available; otherwise rebuild it from scratch.
try:
  rag_2 = SentenceRetrievalRag()
except Exception as e:
  # `except Exception` (not a bare except) so interrupting the kernel is not
  # mistaken for a missing index; the real cause is printed for debugging.
  print("No index persisted.", e)
  rag_2 = SentenceRetrievalRag(full_refresh=True)
rags[rag_2.name] = rag_2


### Using automerging index

In [None]:
class AutoMergingRetrievalRag(RAG):
  """
    Child class of RAG that configures a hierarchical node parser
    (chunk sizes 2048 / 512 / 128) on top of the base Pinecone set-up.

    NOTE(review): the hierarchical parser is only stored on the instance and
    in Settings; the index is built directly from the input documents and the
    query engine is the default one, so no merging of child nodes into their
    parents seems to take place - confirm whether an auto-merging retriever
    still has to be wired in.
  """

  def __init__(self, full_refresh:bool = False, similarity_top_k:int = 4):
    """
    Args:
    full_refresh (bool): Whether to build the index from 0.
    similarity_top_k (int): Number of similar documents to retrieve from the index.
    """
    self.name = "AutomergingRetrievalRag"
    self.index_name = f"indexes/{self.name}_index"

    self._setup(full_refresh, similarity_top_k)


  def set_node_parser(self) -> None:
    """
    Sets up the hierarchical node parser for the RAG.
    """
    from llama_index.core import Settings
    from llama_index.core.node_parser import HierarchicalNodeParser

    print("Initializing node parser...")
    node_parser = HierarchicalNodeParser.from_defaults(
        chunk_sizes=[2048, 512, 128]
    )
    self.node_parser = node_parser
    Settings.node_parser = node_parser

    print("Finished setting up node parser:", node_parser)
    print(self.log_divider)


  def _setup(
    self,
    full_refresh:bool = False,
    similarity_top_k:int = 4
  ) -> None:
    """
    Sets up the RAG step-by-step.

    Args:
    full_refresh (bool): Whether to build the index from 0.
    similarity_top_k (int): Number of similar documents to retrieve from the index.
    """
    from llama_index.core import (
      VectorStoreIndex,
      StorageContext,
      load_index_from_storage,
    )
    from llama_index.vector_stores.pinecone import PineconeVectorStore

    self.log_divider = "\n" + "*"*100 + "\n"

    if not full_refresh:
      print("Not building index from 0...")
      self.set_pinecone_index()
      self.set_node_parser()
      self.vector_store = PineconeVectorStore(pinecone_index=self.pc_index)
      # Created before the models so a missing persist directory fails fast
      self.storage_context = StorageContext.from_defaults(
          persist_dir=self.index_name,
          vector_store= self.vector_store
      )
      # Queries must be embedded with the same model used to build the index
      self.set_up_embedding_model()
      self.set_up_llm_model()
      self.retriever = VectorStoreIndex.from_vector_store(
        self.vector_store
      ).as_retriever(
        similarity_top_k=similarity_top_k
      )
      print("Loading index from storage...")
      index = load_index_from_storage(
          self.storage_context
      )
    else:
      print("Building index from 0❗️")

      input_documents = self.set_input_documents()[:5]
      self.set_pinecone_index()
      self.set_node_parser()
      self.vector_store = PineconeVectorStore(pinecone_index=self.pc_index)
      self.storage_context = StorageContext.from_defaults(vector_store= self.vector_store)
      self.set_up_embedding_model()
      self.set_up_llm_model()
      self.retriever = VectorStoreIndex.from_vector_store(
        self.vector_store
      ).as_retriever(
        similarity_top_k=similarity_top_k
      )

      print("\nFinished setting up dependencies. 🎉")

      print("\nBuilding index...")
      index = VectorStoreIndex(
          input_documents,
          storage_context=self.storage_context,
          text_key="text",
          show_progress=True
        )
      index.storage_context.persist(self.index_name)

    print("\nFinished building index 🚀\n")

    self.set_up_prompt()
    self.query_engine = index.as_query_engine(
        text_qa_template=self.prompt,
        similarity_top_k=similarity_top_k
    )

    print(self.pc_index.describe_index_stats())
    self.display_prompt_dict()



# Load the persisted auto-merging index when available; otherwise rebuild it.
try:
  # Must load the auto-merging RAG: loading SentenceRetrievalRag here would
  # register a second sentence-window RAG under the "SentenceRetrievalRag" key.
  rag_3 = AutoMergingRetrievalRag()
except Exception as e:
  print("No index persisted.", e)
  rag_3 = AutoMergingRetrievalRag(full_refresh=True)

rags[rag_3.name] = rag_3


# Evaluation

In [None]:
# Every RAG registered in `rags` takes part in the evaluation
rags_for_eval = list(rags.values())
print("RAGs to evaluate: ", rags_for_eval)

Hacer llamadas a ChatGPT para que, dado un prompt X, me dé una serie de preguntas y así evitar trabajo manual usando un LLM. Hacerlo en un notebook separado.

In [38]:
# https://colab.research.google.com/github/truera/trulens/blob/main/examples/quickstart/quickstart.ipynb#scrollTo=2jymH4ksjq2Q
# https://www.trulens.org/trulens/getting_started/quickstarts/llama_index_quickstart/#see-the-power-of-context-filters

In [None]:
! pip install --upgrade llama-index
! pip install trulens-apps-llamaindex

In [23]:
import os
import openai

# Make the key loaded in the environment cell available to the openai module
openai.api_key = OPENAI_API_KEY

In [24]:
import nest_asyncio

# Presumably needed because the notebook kernel already runs an event loop
# and the evaluation code starts its own - TODO confirm.
nest_asyncio.apply()

In [None]:
eval_questions_path = rag.get_evaluation_questions()
eval_questions = []
# Explicit UTF-8 so the Spanish questions are decoded the same way on every platform
with open(eval_questions_path, 'r', encoding='utf-8') as file:
    for line in file:
        item = line.strip()
        # Blank lines are not questions; skip them instead of querying the RAGs with ''
        if item:
            eval_questions.append(item)

In [None]:
from trulens.apps.custom import instrument
from trulens.core import TruSession

# TruLens session shared by the evaluation helpers below (eval_rags reads it as a global).
session = TruSession()
# NOTE(review): judging by its name this wipes previously recorded results;
# confirm before re-running if earlier evaluations must be kept.
session.reset_database()

In [27]:
def get_trulens_app(rag, feedbacks:list):
  """
  Wrap the query engine of a RAG in a TruLlama recorder.

  Args:
    rag: RAG object; its query_engine is instrumented and its name is used
      as the app version.
    feedbacks: List of feedback functions attached to the recorder.

  Returns:
    TruLlama recorder registered under the app name "LlamaIndex_App".
  """
  from trulens.apps.llamaindex import TruLlama

  engine = rag.query_engine
  recorder_options = dict(
    app_name="LlamaIndex_App",
    app_version=rag.name,
    feedbacks=feedbacks,
  )
  return TruLlama(engine, **recorder_options)


def build_trulens_recorder(rag):
  """
  Create the TruLens recorder for a RAG with three feedback functions:
  groundedness, answer relevance and context relevance.

  Args:
    rag: RAG object whose query_engine is evaluated.

  Returns:
    trulens recorder app built by get_trulens_app.
  """
  from trulens.providers.openai import OpenAI
  from trulens.apps.llamaindex import TruLlama
  from trulens.core import Feedback
  from trulens.core import Select
  import numpy as np

  # The feedback functions are scored by this provider
  llm_judge = OpenAI(model_engine="gpt-4o-mini")
  retrieved_context = TruLlama.select_context(rag.query_engine)

  # Groundedness: evaluated on all context chunks collected into a list, and the output
  groundedness = Feedback(
      llm_judge.groundedness_measure_with_cot_reasons, name="Groundedness"
  )
  groundedness = groundedness.on(retrieved_context.collect())
  groundedness = groundedness.on_output()

  # Answer relevance: evaluated on the overall question and answer
  answer_relevance = Feedback(
      llm_judge.relevance_with_cot_reasons, name="Answer Relevance"
  )
  answer_relevance = answer_relevance.on_input_output()

  # Context relevance: evaluated on the question and each context chunk,
  # then averaged over the chunks
  context_relevance = Feedback(
      llm_judge.context_relevance_with_cot_reasons, name="Context Relevance"
  )
  context_relevance = context_relevance.on_input()
  context_relevance = context_relevance.on(retrieved_context)
  context_relevance = context_relevance.aggregate(np.mean)

  return get_trulens_app(
      rag,
      [groundedness, answer_relevance, context_relevance]
  )


def eval_rags(rags:list, questions:list):
  """
  Evaluate a list of rags for a list of questions.

  Every question is sent to every RAG while its TruLens recorder is active.
  Uses the notebook-level `session` to build the leaderboard.

  Args:
    rags: List of RAG objects.
    questions: List of questions for evaluation.

  Returns:
    The leaderboard returned by session.get_leaderboard() after all RAGs
    have been evaluated (previously it was computed and discarded).
  """
  from tqdm import tqdm

  for r in rags:
    print("Evaluating: ", r.name)
    tru_query_engine_recorder = build_trulens_recorder(r)

    # Queries issued inside the context manager are recorded for this RAG
    with tru_query_engine_recorder:
      for q in tqdm(questions):
        r.query(q)

    print("\nFinished evaluation\n")

  return session.get_leaderboard()


In [None]:
# Only the first 10 questions are used for the evaluation run
eval_rags(rags_for_eval, eval_questions[:10])

In [None]:
# Display the leaderboard with the results recorded in the session
session.get_leaderboard()

Firebase -> Documentos ya "limpios"
Pinecone -> Embeddings de vectores