In [19]:
import os
from uuid import uuid4
from openai import OpenAI
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

# Single sample document; its metadata carries a "question" key only.
document_1 = Document(
    id=1,
    metadata={"question": "tweet"},
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
)

# Corpus handed to the text splitter further down.
documents = [document_1]

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Configure the text splitter with parameters better suited to FAQ content.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=len,
    chunk_overlap=200,  # increased for better context
    chunk_size=1000,  # reduced for better FAQ handling
)

# Split the documents into chunks, keeping their metadata.
all_splits = text_splitter.split_documents(documents)

# Make sure every split carries the metadata of the document it came from,
# then assign it a fresh unique ID.
#
# NOTE(review): the IDs are random, so re-running this against the same
# persisted collection will presumably add duplicates rather than update the
# existing entries -- confirm whether stable IDs are wanted.
for split in all_splits:
    # Restoring is only possible while the split still has a "question" to
    # match on, and only needed when "answer" is missing. Looking the key up
    # unconditionally would raise KeyError for splits without a "question".
    if "question" in split.metadata and "answer" not in split.metadata:
        original_doc = next(
            (
                d
                for d in documents
                if "question" in d.metadata
                and d.metadata["question"] == split.metadata["question"]
            ),
            None,  # no match: keep the split's own metadata instead of raising
        )
        if original_doc is not None:
            # Shallow copy so the split does not alias the source document's
            # dict (a later mutation would otherwise leak across documents).
            split.metadata = dict(original_doc.metadata)
    split.id = str(uuid4())  # assign a new unique ID to each split

# OpenAI API client; the key is read from the OPENAI_API_KEY environment
# variable (None when the variable is unset).
# TODO: switch to GROQ_API_KEY
client = OpenAI(
    # base_url="https://api.groq.com/openai/v1",
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# Embedding function used by the vector store below. The explicit `client`
# hand-off is commented out, so `client` above is not passed in here.
embeddings = OpenAIEmbeddings(
    # client=client,
    model="text-embedding-3-small",
)

# Chroma store bound to the "example_collection" collection.
vector_store = Chroma(
    embedding_function=embeddings,
    collection_name="example_collection",
    create_collection_if_not_exists=True,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

# Build and persist the vector database from the splits.
# NOTE(review): `from_documents` is a classmethod, so it builds a separate
# store for "faq-collection"; `vector_store` ("example_collection") receives
# none of these documents. Confirm that two collections are intended.
knowledge_db = Chroma.from_documents(
    documents=all_splits,
    embedding=embeddings,
    ids=[chunk.id for chunk in all_splits],  # pass the IDs explicitly
    persist_directory="./chroma_langchain_db",
    collection_name="faq-collection",
)