In [None]:
! pip install beautifulsoup4
! pip install chromadb

In [None]:
# -*- coding: utf-8 -*-
from langchain.agents import AgentExecutor, Tool, initialize_agent
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.tools import DuckDuckGoSearchRun
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_transformers import LongContextReorder

# -------------------- 1. Build the RAG knowledge base --------------------
# Load the Singapore climate page.
loader = WebBaseLoader(["https://www.weather.gov.sg/climate-climate-of-singapore/"])
docs = loader.load()

# Split the documents into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    # The third separator is a regular expression (split right after ". "),
    # so it is written as a raw string and the splitter is told to treat the
    # separators as patterns. Without the flag it is matched as literal text
    # and sentence-level splitting never takes place.
    separators=["\n\n", "\n", r"(?<=\. )", " "],
    is_separator_regex=True,
)
splits = text_splitter.split_documents(docs)

# Build the vector store from the chunks.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    collection_name="singapore-climate"
)

# -------------------- 2. 上下文重排序优化 --------------------
class OptimizedRetriever:
    """Wrap a retriever: over-fetch, keep the best hits, then reorder them.

    The wrapped retriever is asked for ``fetch_k`` candidates, only the first
    ``top_k`` of them are kept, and those are passed through
    ``LongContextReorder`` before being returned.

    Truncation happens BEFORE reordering on purpose: per the LangChain
    documentation the reorder step moves the most relevant documents to both
    ends of the list, so slicing its output would throw away some of the best
    hits (including the top-ranked one).

    Args:
        base_retriever: Object exposing ``get_relevant_documents(query, **kwargs)``.
            Assumed to return documents best-first -- TODO confirm for the
            retriever in use.
        fetch_k: Number of candidates requested from ``base_retriever``.
        top_k: Number of candidates kept and reordered.

    NOTE(review): this is a plain class, not a LangChain ``BaseRetriever``
    subclass. Components that validate their ``retriever`` argument (for
    example ``RetrievalQA``) may reject it -- confirm against the installed
    LangChain version.
    """

    def __init__(self, base_retriever, fetch_k=10, top_k=3):
        self.retriever = base_retriever
        self.reorder = LongContextReorder()
        self.fetch_k = fetch_k
        self.top_k = top_k

    def get_relevant_documents(self, query):
        """Return up to ``top_k`` documents for *query*, reordered for long contexts.

        Args:
            query: The search text forwarded to the wrapped retriever.

        Returns:
            A list with at most ``top_k`` documents; empty when the wrapped
            retriever finds nothing.
        """
        # Ask for a wide candidate pool. Whether the ``k`` keyword is honoured
        # depends on the wrapped retriever -- verify for the one in use.
        candidates = self.retriever.get_relevant_documents(query, k=self.fetch_k)
        # Keep the best hits first, then reorder only those.
        best = list(candidates)[: self.top_k]
        return list(self.reorder.transform_documents(best))

# Configure the candidate pool size on the retriever itself: a bare
# as_retriever() uses the store's default result count, so the wrapper would
# not reliably receive the 10 candidates it is meant to choose from.
base_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
optimized_retriever = OptimizedRetriever(base_retriever)

# -------------------- 3. Tool definitions --------------------
# Live web search tool.
search_tool = DuckDuckGoSearchRun()

# Knowledge-base question answering chain.
# NOTE(review): RetrievalQA is documented as taking a LangChain BaseRetriever;
# confirm that optimized_retriever passes that validation.
climate_knowledge = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(temperature=0),
    chain_type="stuff",
    retriever=optimized_retriever,
    return_source_documents=True
)

# Source documents returned by the most recent knowledge-base lookup. Filled
# in by ask_climate_knowledge() and printed by the example run below.
climate_sources = []


def ask_climate_knowledge(question):
    """Answer *question* from the climate knowledge base and return the text.

    The chain is built with ``return_source_documents=True`` and therefore
    has two output keys; ``Chain.run`` only supports single-output chains, so
    the chain is invoked directly and the answer text is picked out here.
    The accompanying source documents are stored in ``climate_sources``.

    Args:
        question: The question text passed to the chain under the ``query`` key.

    Returns:
        The value of the chain's ``result`` output.
    """
    outputs = climate_knowledge.invoke({"query": question})
    # Replace the contents in place so the module-level list stays the same object.
    climate_sources[:] = outputs.get("source_documents") or []
    return outputs["result"]


tools = [
    Tool(
        name="RealTime_Search",
        func=search_tool.run,
        description="实时信息检索，适用于天气、新闻等最新动态查询"
    ),
    Tool(
        name="Climate_Knowledge",
        func=ask_climate_knowledge,
        description="新加坡气候知识库，包含历史数据和气候特征分析"
    )
]

# -------------------- 4. Agent initialisation --------------------
llm = ChatOpenAI(
    model="gpt-4-1106-preview",
    temperature=0.3,
    max_tokens=512
)

agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent="zero-shot-react-description",
    verbose=True,
    handle_parsing_errors=True,
    max_iterations=4  # cap the number of decision steps to avoid endless loops
)

# -------------------- 5. Example run --------------------
if __name__ == "__main__":
    query = "今天新加坡的天气如何？与过去五年同期相比有什么显著变化？"

    response = agent.invoke({
        "input": query
    })

    print("\n" + "="*50)
    print(f"最终答案：\n{response['output']}")
    print("="*50)

    # List the knowledge-base documents cited by the last lookup, if the agent
    # used the knowledge-base tool at all.
    if climate_sources:
        print("\n引用的知识文档：")
        for doc in climate_sources:
            source = doc.metadata.get('source', 'N/A')
            print(f"- {source} (Page {doc.metadata.get('page', 'N/A')})")