In [1]:
import re
import openai
import numpy as np
from llama_index.core import Document
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.schema import TransformComponent
from llama_index.llms.openai import OpenAI
from llama_index.agent.openai import OpenAIAgent
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings,StorageContext
from llama_index.core.objects import ObjectIndex
from typing import Sequence
from llama_index.core.tools import BaseTool, FunctionTool
import pickle
import os
import yaml
import json
import utils
# Connectivity check run before any expensive work: calls the project's
# utils.test_llm / utils.test_embeddings helpers with the key returned by
# utils.get_api_key(name='openai'). What the helpers verify is defined in
# utils (not visible here) - presumably one round-trip per endpoint.
print("Testing LLM...")
utils.test_llm(utils.get_api_key(name='openai'))
print("\nTesting Embedding Model...")
# Left as the final bare expression, so its return value is shown as the cell result.
utils.test_embeddings(utils.get_api_key(name='openai'))

Testing LLM...
LLM test successful with model: gpt-4o-mini
Response: Hello! How can I assist you today?

Testing Embedding Model...
Embedding test successful!
Embedding shape: 3072


True

### Configuración de Llama Index

Modelos y embeddings

In [2]:
# Global LlamaIndex defaults: every index, query engine and chat engine built
# later in the notebook picks up this LLM and this embedding model.
Settings.llm = OpenAI(
    model="gpt-4o-mini",
    temperature=0.5,
    max_tokens=1024,
    api_key=utils.get_api_key(name='openai'),
)
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-3-large",
    dimensions=1024,
    embed_batch_size=10,
    api_key=utils.get_api_key(name='openai'),
)

### Cargar documentos del RAG

In [3]:
# Read everything under ../data/books into a list of Document objects.
book = SimpleDirectoryReader("../data/books").load_data()
# Show how many documents were loaded. A bare `len(book)` that is not the last
# expression of the cell is evaluated and discarded, so it has to be printed.
print(len(book))
# Peek at one loaded document as a sanity check on the extracted text.
print(book[10])

Doc ID: 78b07c04-4f7a-445c-bb12-49d8d17ec8b5
Text: What is Data Science? 3 •When you start with the question you
often discover that you need to collect new data or design an experi-
ment to confirm you are getting the right answer. •It is easy to
discover structure or networks in a data set. There will always be
correlations for a thousand reasons if you collect enough data.
Understanding wheth...


### Pipeline de ingesta

* Crea chunks de un máximo de 1000 tokens con 120 tokens de solape entre ellos
* Los embeddings asociados a los chunks no se calculan en este pipeline: se generan más adelante, al construir el `VectorStoreIndex`
* El resultado es la construcción de nodos
* Se limpian los nodos
* IngestionPipeline tiene la restricción de que solo trabaja con clases que heredan de `TransformComponent`
* Referencia : https://docs.llamaindex.ai/en/stable/module_guides/loading/ingestion_pipeline/transformations/#custom-transformations

In [4]:
class TextCleaner(TransformComponent):
    """
    Ingestion transformation that reduces node text to plain alphanumerics.

    For every node, runs of whitespace (newlines, tabs, repeated spaces) are
    first collapsed into a single space, and then every character other than
    ASCII letters, digits and spaces is removed.

    Whitespace is collapsed *before* stripping because deleting a line break
    outright fuses the last word of one line with the first word of the next
    (e.g. "data.\\nUnderstanding" would become "dataUnderstanding").

    Nodes are modified in place; the same list object is returned so the
    component can be chained inside an IngestionPipeline.
    """
    def __call__(self, nodes, **kwargs):
        """
        Clean the text of each node.

        Args:
            nodes (list): Nodes whose ``text`` attribute will be rewritten.
            **kwargs: Accepted for pipeline compatibility; not used.

        Returns:
            list: The same ``nodes`` list, with cleaned text.
        """
        for node in nodes:
            # Turn any whitespace run into one space so word boundaries survive.
            text = re.sub(r"\s+", " ", node.text)
            # Then drop everything that is not an ASCII letter, digit or space.
            node.text = re.sub(r"[^0-9A-Za-z ]", "", text)
        return nodes
# Ingestion: split every document into sentence-aware chunks, then clean the
# text of each resulting node with TextCleaner.
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=1000, chunk_overlap=120),
        TextCleaner(),
    ],
)
nodes = pipeline.run(documents=book)

# Cache the processed nodes on disk so they can be reused without re-running
# the pipeline. The target directory is created first: opening the file in
# 'wb' mode raises FileNotFoundError when the directory does not exist yet.
nodes_path = '../data/models/processed_nodes.pkl'
os.makedirs(os.path.dirname(nodes_path), exist_ok=True)
with open(nodes_path, 'wb') as f:
    pickle.dump(nodes, f)
len(nodes)

153

### Se crea el índice de embeddings en memoria

* Cómo se construye el índice es un mecanismo interno de la librería
* Lo importante es que luego este índice se usa para encontrar los chunks más relevantes a la consulta del usuario por similitud de coseno
* https://www.reddit.com/r/LocalLLaMA/comments/1bvo5l4/the_more_i_use_llamaindex_the_less_i_like_it/
* referencia: https://docs.llamaindex.ai/en/stable/understanding/loading/loading/

In [5]:
# Build the in-memory vector index over the cleaned nodes. The node embeddings
# are generated during construction; show_progress renders a progress bar.
vector_index = VectorStoreIndex(nodes=nodes, show_progress=True)

  from .autonotebook import tqdm as notebook_tqdm
Generating embeddings: 100%|██████████| 153/153 [00:11<00:00, 13.52it/s]


In [6]:
# Write the index's storage context to disk so it can be reloaded later.
vector_index.storage_context.persist(persist_dir='../data/models/')

## Evaluación del RAG

* Hacemos consultas directas al RAG mediante el query engine y comparamos sus respuestas con las que da el motor de chat

### Creamos las respuestas haciendo consultas con el query engine y luego con el conocimiento primario de GPT-4o-mini 

In [7]:
# Evaluation questions. Each one carries its own ordinal because the chat
# engine is asked below to start every answer with the number of the question,
# so the ordinals must be unique (the last one was previously a second "3.").
questions = [
    "1. Which are the specific stages of a data science project?",
    "2. Which are the roles in a data science team?",
    "3. Define success in data science experiments.",
    "4. Which are the software engineering principles for data science?"
]
query_responses = {}
chat_responses = {}

# Answers produced by the query engine built on top of the vector index.
query_engine = vector_index.as_query_engine()
for i, question in enumerate(questions, 1):
    query_responses[f"question_{i}"] = query_engine.query(question)

# Answers produced by the chat engine. The first turn only sets the answering
# rules for the conversation; the questions are then asked one by one.
chat_engine = vector_index.as_chat_engine(chat_mode="openai", verbose=False)
response = chat_engine.chat("Hi, please answer using your primary knowledge. Don't use alist inside of a question, use only sentences.Start every answer with the word \"Answer:\" and the number of the question.")
for i, question in enumerate(questions, 1):
    chat_responses[f"question_{i}"] = chat_engine.chat(question)

try:
    # Save the query-engine responses to a file
    utils.save_query_responses("../data/output/query_responses.txt", query_responses)
    # Print confirmation
    print("Query responses have been saved to query_responses.txt")
except Exception as e:
    # Best effort: report the failure and keep going so the chat responses
    # below still get a chance to be saved.
    print(f"Query response saving failed: {str(e)}")

try:
    # Save the chat-engine responses to a file
    utils.save_chat_responses("../data/output/chat_responses.txt", chat_responses)
    # Print confirmation
    print("Chat responses have been saved to chat_responses.txt")
except Exception as e:
    print(f"Chat response saving failed: {str(e)}")


Query responses have been saved to query_responses.txt
Chat responses have been saved to chat_responses.txt


In [8]:
# Set the OpenAI API key on the openai module, which is passed below as the client.
openai.api_key = utils.get_api_key(name="openai")

# Reload both sets of answers from the text files written by the previous cell.
query_responses = utils.load_responses_from_txt("../data/output/query_responses.txt")
chat_responses = utils.load_responses_from_txt("../data/output/chat_responses.txt")

# Ask ChatGPT to compare the query-engine answers against the chat-engine answers.
evaluations = utils.compare_responses_with_chatgpt(
    model="gpt-4",
    temperature=0.2,
    max_tokens=1024,
    client=openai,
    query_responses=query_responses,
    chat_responses=chat_responses,
)

# Make sure the output directory exists, then write the evaluations to it.
output_file = "../data/output/response_evaluation.txt"
os.makedirs(os.path.dirname(output_file), exist_ok=True)
utils.save_evaluations_to_file(output_file, evaluations)

Evaluaciones guardadas en ../data/output/response_evaluation.txt
