In [3]:
import os
import pathlib
from dotenv import load_dotenv

# Location of the .env file to load configuration from.
env_path = r'C:\GitRepo\langchain-academy\module-1\.env'

# Load the variables defined in the .env file, then echo the endpoint as a
# quick sanity check that the expected configuration was picked up.
load_dotenv(dotenv_path=env_path)
print("The AZURE_OPENAI_ENDPOINT is: " + str(os.environ.get('AZURE_OPENAI_ENDPOINT')))

# Set the LangSmith project name via the environment
# (presumably the project that traces get grouped under — confirm).
os.environ["LANGSMITH_PROJECT"] = "langchain-academy-agentic-rag"

from langchain_openai import AzureChatOpenAI

# Chat model client. The deployment name and API version come from the
# environment instead of being hard-coded in the notebook.
azure_openai_llm = AzureChatOpenAI(
    # Connection settings read from environment variables.
    azure_deployment=os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
    api_version=os.environ.get("AZURE_OPENAI_API_VERSION"),
    # Generation and transport settings.
    temperature=0,
    streaming=True,
    max_retries=2,
    # Passed explicitly as None, i.e. no value is set for these here.
    max_tokens=None,
    timeout=None,
)

from langchain_openai import AzureOpenAIEmbeddings

# Embedding client. The embedding deployment name and API version are read
# from environment variables.
azure_openai_embeddings = AzureOpenAIEmbeddings(
    api_version=os.environ.get("AZURE_OPENAI_API_VERSION"),
    azure_deployment=os.environ.get("AZURE_OPENAI_EMBEDDING_DEPLOYMENT"),
)

# Optional manual check, left disabled:
# azure_openai_embeddings.embed_query("Hello, world!")


from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Local directory where the Chroma vector store is persisted.
persist_directory = r"C:\GitRepo\langchain-academy\module-4\agentic-rag\db\chroma_db_azure_docs"

# Collection name shared by BOTH branches below. The load branch and the build
# branch must agree on it; otherwise a store built by the else-branch is not
# the collection that the load branch opens on the next run.
chroma_collection_name = "rag-chroma-azure-docs-markdown"

# Set to True to take the build branch even when a persisted store exists.
# NOTE(review): with an existing store, from_documents presumably adds to the
# existing collection rather than replacing it — confirm, and clear the
# collection first if duplicate chunks matter.
vectorstore_force_update = False

# A non-empty directory is treated as an existing persisted store.
# (os.path.isdir already implies the path exists.)
vectorstore_exists = os.path.isdir(persist_directory) and len(os.listdir(persist_directory)) > 0

if vectorstore_exists and not vectorstore_force_update:
    print(f"正在从本地加载向量存储: {persist_directory}")
    # Open the persisted store directly instead of re-embedding the documents.
    vectorstore = Chroma(
        persist_directory=persist_directory,
        embedding_function=azure_openai_embeddings,
        collection_name=chroma_collection_name,
    )
else:
    print("本地向量存储不存在，创建新的向量存储...")
    # Source pages to index.
    urls = [
        "https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models",
        "https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/content-filter",
    ]

    # Load each page (one list of documents per URL), then flatten.
    docs = [WebBaseLoader(url).load() for url in urls]
    docs_list = [item for sublist in docs for item in sublist]

    # Split the documents into overlapping chunks before embedding.
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=1000, chunk_overlap=200
    )
    doc_splits = text_splitter.split_documents(docs_list)

    # Build a new vector store in the persist directory, using the same
    # collection name that the load branch reads.
    vectorstore = Chroma.from_documents(
        documents=doc_splits,
        collection_name=chroma_collection_name,
        persist_directory=persist_directory,
        embedding=azure_openai_embeddings,
    )
    # No explicit persist() call: per the LangChainDeprecationWarning, since
    # Chroma 0.4.x manual persistence is no longer supported because docs are
    # persisted automatically.
    print(f"向量存储已保存到: {persist_directory}")

# Retriever used by the retriever tool and the queries further down.
retriever = vectorstore.as_retriever()

from langchain.tools.retriever import create_retriever_tool

# Wrap the retriever as a tool, registered under the given name and
# description.
retriever_tool = create_retriever_tool(
    retriever,
    name="retrieve_azure_openai_docs",
    description="Search and return information about Azure OpenAI models and content filtering.",
)

The AZURE_OPENAI_ENDPOINT is: https://jz-fdpo-swn.openai.azure.com/
正在从本地加载向量存储: C:\GitRepo\langchain-academy\module-4\agentic-rag\db\chroma_db_azure_docs


In [4]:
# Smoke-test the retriever. The bare call is the cell's last expression, so
# the returned documents are displayed as the cell output.
content_filter_question = "What is Azure content filtering and how does it work?"
retriever.invoke(content_filter_question)

[Document(metadata={'source': 'data\\content-filter-original-in-github.md'}, page_content="## Best practices\n\nAs part of your application design, consider the following best practices to deliver a positive experience with your application while minimizing potential harms:\n\n- Decide how you want to handle scenarios where your users send prompts containing content that is classified at a filtered category and severity level or otherwise misuse your application.\n- Check the `finish_reason` to see if a completion is filtered.\n- Check that there's no error object in the `content_filter_result` (indicating that content filters didn't run).\n- If you're using the protected material code model in annotate mode, display the citation URL when you're displaying the code in your application.\n\n## Next steps\n\n- Learn more about the [underlying models that power Azure OpenAI](../concepts/models.md).\n- Apply for modified content filters via [this form](https://ncv.microsoft.com/uEfCgnITdR).