In [1]:
from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """Typed configuration for this notebook, built on pydantic-settings.

    As a ``BaseSettings`` subclass, field values are resolved by
    pydantic-settings (by default from environment variables); no
    ``env_file`` is configured here.
    """

    # extra="ignore": inputs that do not match a declared field are skipped
    # instead of being rejected.
    model_config = SettingsConfigDict(extra="ignore")

    # Required: there is no default, so a value must be supplied.
    openai_api_key: str
    # Defaults to "./data"; used as the Chroma persist_directory.
    database_path: str = "./data"


# Single module-level instance; the settings are loaded at this point.
settings = Settings()

In [2]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# Embedding model handed to the vector store. The API key is taken from the
# validated Settings object so the declared `openai_api_key` setting is the
# single source of truth, instead of relying on the client library locating
# OPENAI_API_KEY in the ambient environment on its own.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    api_key=settings.openai_api_key,
)

# Chroma collection "docs", persisted under the configured database path.
vector_store = Chroma(
    collection_name="docs",
    embedding_function=embeddings,
    persist_directory=settings.database_path,
)

In [3]:
from langchain_core.documents import Document

# Three sample documents, each with an explicit string id so the ids stored
# in the vector store are known up front.
documents = [
    Document(page_content=text, id=doc_id)
    for doc_id, text in (
        ("1", "LangChain is a framework for building applications with LLMs."),
        ("2", "RAG stands for Retrieval-Augmented Generation."),
        ("3", "FAISS is Facebook AI Similarity Search for vector indexing."),
    )
]
ids = [doc.id for doc in documents]

# Last expression of the cell: the returned value is what the notebook displays.
vector_store.add_documents(documents=documents, ids=ids)

['1', '2', '3']

In [4]:
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_openai import OpenAI

# Natural-language description of the collection's contents, passed to the
# self-query retriever as `document_contents`.
document_content_description = "Topics related to RAG"

# Completion model used by the retriever. The API key comes from the
# validated Settings object rather than being discovered implicitly from the
# environment by the client library.
llm = OpenAI(temperature=0, api_key=settings.openai_api_key)

# Keyword arguments make each positional role explicit.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vector_store,
    document_contents=document_content_description,
    metadata_field_info=[],  # the sample documents are created without metadata
    enable_limit=True,
    verbose=True,
)

# Last expression of the cell: the retrieved documents are displayed as output.
retriever.invoke("What is LangChain?")

[Document(id='1', metadata={}, page_content='LangChain is a framework for building applications with LLMs.'),
 Document(id='2', metadata={}, page_content='RAG stands for Retrieval-Augmented Generation.'),
 Document(id='3', metadata={}, page_content='FAISS is Facebook AI Similarity Search for vector indexing.')]