In [1]:
import os
from dotenv import find_dotenv, load_dotenv

# Preferred location of the .env file (the original author's checkout).
env_path = r'C:\GitRepo\langchain-academy\module-1\.env'

# That absolute path only exists on one machine. When it is missing, fall back
# to python-dotenv's own search, starting from the current working directory.
if not os.path.isfile(env_path):
    env_path = find_dotenv(usecwd=True)

# Load the .env file. load_dotenv() returns False when it set nothing, in which
# case every os.getenv() below would silently be None -- say so up front.
if not load_dotenv(dotenv_path=env_path):
    print(f"Warning: no environment variables were loaded from {env_path!r}")
print(f"The AZURE_OPENAI_ENDPOINT is: {os.getenv('AZURE_OPENAI_ENDPOINT')}")

# Project name for LangSmith; set before the LangChain import below, as in the
# original cell.
os.environ["LANGSMITH_PROJECT"] = "langchain-academy-agentic-rag"

from langchain_openai import AzureChatOpenAI

# Chat model used by the rest of the notebook. Deployment name and API version
# are read from the environment loaded above.
# NOTE(review): endpoint and API key are presumably picked up from the
# environment by the client itself -- confirm.
azure_openai_llm = AzureChatOpenAI(
    azure_deployment=os.getenv('AZURE_OPENAI_DEPLOYMENT'),
    api_version=os.getenv('AZURE_OPENAI_API_VERSION'),
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    streaming=True,
)


The AZURE_OPENAI_ENDPOINT is: https://jz-fdpo-swn.openai.azure.com/


In [2]:
from langchain_openai import AzureOpenAIEmbeddings

# Both settings are read from environment variables.
embedding_deployment = os.getenv('AZURE_OPENAI_EMBEDDING_DEPLOYMENT')
embedding_api_version = os.getenv('AZURE_OPENAI_API_VERSION')

# Embedding client used later when the vector store is built.
azure_openai_embeddings = AzureOpenAIEmbeddings(
    azure_deployment=embedding_deployment,
    api_version=embedding_api_version,
)

# Optional smoke test (left disabled):
# azure_openai_embeddings.embed_query("Hello, world!")


In [5]:
from langchain_community.document_loaders import DirectoryLoader,TextLoader

# Load every Markdown file matching **/*.md under ./data as plain text.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default text encoding.
loader = DirectoryLoader(
    "data",
    glob="**/*.md",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
)
docs = loader.load()
# Bare last expression: show the loaded documents as the cell output.
docs


[Document(metadata={'source': 'data\\azure-openai-models.md'}, page_content='---\ntitle: Azure OpenAI Service models\ntitleSuffix: Azure OpenAI\ndescription: Learn about the different model capabilities that are available with Azure OpenAI.\nms.service: azure-ai-openai\nms.topic: conceptual\nms.date: 04/23/2025\nms.custom: references_regions, build-2023, build-2023-dataai, refefences_regions\nmanager: nitinme\nauthor: mrbullwinkle #ChrisHMSFT\nms.author: mbullwin #chrhoder#\nrecommendations: false\n---\n\n# Azure OpenAI Service models\n\nAzure OpenAI Service is powered by a diverse set of models with different capabilities and price points. Model availability varies by region and cloud. For Azure Government model availability, please refer to [Azure Government OpenAI Service](../azure-government.md).\n\n| Models | Description |\n|--|--|\n| [GPT-4.1 series](#gpt-41-series) | Latest model release from Azure OpenAI |\n| [computer-use-preview](#computer-use-preview) | An experimental model

In [6]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter

# `docs` (from loader.load()) is already a flat list of Document objects, so
# there is nothing to flatten. A nested comprehension here would iterate over
# each Document itself instead of over a sub-list; a shallow copy is enough.
docs_list = list(docs)

# Token-based splitter: chunks of up to 1000 tokens with a 100-token overlap.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, chunk_overlap=100
)


USER_AGENT environment variable not set, consider setting it to identify your requests.


In [7]:

# Split the loaded documents into chunks with the splitter created earlier.
doc_splits = text_splitter.split_documents(docs)
# Bare last expression: show the resulting chunks as the cell output.
doc_splits

[Document(metadata={'source': 'data\\azure-openai-models.md'}, page_content='---\ntitle: Azure OpenAI Service models\ntitleSuffix: Azure OpenAI\ndescription: Learn about the different model capabilities that are available with Azure OpenAI.\nms.service: azure-ai-openai\nms.topic: conceptual\nms.date: 04/23/2025\nms.custom: references_regions, build-2023, build-2023-dataai, refefences_regions\nmanager: nitinme\nauthor: mrbullwinkle #ChrisHMSFT\nms.author: mbullwin #chrhoder#\nrecommendations: false\n---\n\n# Azure OpenAI Service models\n\nAzure OpenAI Service is powered by a diverse set of models with different capabilities and price points. Model availability varies by region and cloud. For Azure Government model availability, please refer to [Azure Government OpenAI Service](../azure-government.md).\n\n| Models | Description |\n|--|--|\n| [GPT-4.1 series](#gpt-41-series) | Latest model release from Azure OpenAI |\n| [computer-use-preview](#computer-use-preview) | An experimental model

In [8]:

# Earlier variant without a persist_directory, kept for reference:
# vectorstore = Chroma.from_documents(
#     documents=doc_splits,
#     collection_name="rag-chroma-azdocs-test",
#     # embedding=OpenAIEmbeddings(),
#     embedding=azure_openai_embeddings,
# )

# Build the path with os.path.join instead of a literal containing "\c":
# that is an invalid escape sequence in a normal string (a warning on recent
# Python versions) and hard-codes the Windows separator.
persist_directory = os.path.join("db", "chroma_db_azure_docs")

# Embed the chunks and create a vector store persisted under persist_directory.
# NOTE(review): re-running this cell presumably adds the same chunks to the
# persisted collection again -- confirm, and guard against it if so.
vectorstore = Chroma.from_documents(
    documents=doc_splits,
    collection_name="rag-chroma-azure-docs-markdown",
    persist_directory=persist_directory,
    embedding=azure_openai_embeddings,
)
# No explicit save step is needed:
# LangChainDeprecationWarning: Since Chroma 0.4.x the manual persistence method is no longer supported as docs are automatically persisted.
# vectorstore.persist()
print(f"向量存储已保存到: {persist_directory}")

# Default retriever over the new store.
retriever = vectorstore.as_retriever()

向量存储已保存到: db\chroma_db_azure_docs


In [11]:
# Query the vector store directly so that each hit is returned with its score.
query = "What is the content filtering in Azure OpenAI?"
results_with_scores = vectorstore.similarity_search_with_score(query, k=3)

# For each hit, print the first 300 characters, the score and a separator line.
for doc, score in results_with_scores:
    print(
        f"文档内容: {doc.page_content[:300]}",
        f"相似度分数: {score}",
        "---"*20,
        sep="\n",
    )

文档内容: ## Best practices

As part of your application design, consider the following best practices to deliver a positive experience with your application while minimizing potential harms:

- Decide how you want to handle scenarios where your users send prompts containing content that is classified at a fi
相似度分数: 0.2565810978412628
------------------------------------------------------------
文档内容: ---
title: Azure OpenAI Service content filtering
titleSuffix: Azure OpenAI
description: Learn about the content filtering capabilities of Azure OpenAI in Azure AI services.
author: PatrickFarley
ms.author: pafarley
ms.service: azure-ai-openai
ms.topic: conceptual 
ms.date: 03/21/2025
ms.custom: tem
相似度分数: 0.2675418555736542
------------------------------------------------------------
文档内容: The safety system parses this structured format and applies the following behavior: 
- On the latest “user” content, the following categories of RAI Risks will be detected: 
    - Hate 
    - Sexual 
    - 

In [12]:
# Alternative: plain similarity search returning the top 5 results.
# retriever = vectorstore.as_retriever(
#     search_type="similarity",  # options: "similarity", "mmr", "similarity_score_threshold"
#     search_kwargs={"k": 5}
# )

# Alternative: maximal marginal relevance (MMR) search.
# retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 5, "fetch_k": 20})

# Active configuration: search with a similarity-score threshold.
threshold_search_kwargs = {"score_threshold": 0.5, "k": 5}
retriever = vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs=threshold_search_kwargs,
)

In [13]:
# Run the current retriever on a sample question; the returned documents are
# shown as the cell output.
retriever.invoke("The Azure OpenAI content filter will block which types of content?")

[Document(metadata={'source': 'data\\content-filter-original-in-github.md'}, page_content="---\ntitle: Azure OpenAI Service content filtering\ntitleSuffix: Azure OpenAI\ndescription: Learn about the content filtering capabilities of Azure OpenAI in Azure AI services.\nauthor: PatrickFarley\nms.author: pafarley\nms.service: azure-ai-openai\nms.topic: conceptual \nms.date: 03/21/2025\nms.custom: template-concept, devx-track-python\nmanager: nitinme\n---\n\n# Content filtering\n\n> [!IMPORTANT]\n> The content filtering system isn't applied to prompts and completions processed by the audio models such as Whisper in Azure OpenAI Service. Learn more about the [Audio models in Azure OpenAI](models.md?tabs=standard-audio#standard-deployment-regional-models-by-endpoint).\n\nAzure OpenAI Service includes a content filtering system that works alongside core models, including DALL-E image generation models. This system works by running both the prompt and completion through an ensemble of classifi