In [1]:
import os
# 【硬性要求】加载环境变量 (科研好习惯：始终保持环境配置的注入)
from dotenv import load_dotenv
load_dotenv(override=True)

# 导入基于 Web 的文档加载器
from langchain_community.document_loaders import WebBaseLoader

# ==========================================
# 1. 定义目标数据源
# ==========================================
# 这里使用的是老师演示的 The Verge 关于 Meta AI 助手的文章
URL = "https://www.theverge.com/2024/4/18/24133808/meta-ai-assistant-llama-3-chatgpt-openai-rival"

# ==========================================
# 2. 实例化并执行加载 (Scraping & Parsing)
# ==========================================
print(f"正在从 {URL} 抓取并解析数据...")

try:
    # 实例化加载器
    loader = WebBaseLoader(URL)
    
    # 执行加载，返回一个 Document 对象的列表
    data = loader.load()
    
    print("\n✅ 网页加载完成！")
    
    # ==========================================
    # 3. 验证单元素特性与元数据 (科研级重点)
    # ==========================================
    # 验证老师提到的：网页加载后，列表长度为 1
    print(f"提取到的 Document 对象数量: {len(data)}") 
    
    # 取出这唯一的一个文档对象
    web_doc = data[0]
    
    print("\n=== 自动提取的科研元数据 (Metadata) ===")
    # 观察它如何智能抓取了 title, description, source 等属性
    for key, value in web_doc.metadata.items():
         print(f"- {key.capitalize()}: {value}")
         
    print("\n=== 清洗后的纯文本预览 (前 300 字符) ===")
    # .strip() 去除首尾的换行和空格
    print(web_doc.page_content.strip()[:300] + "...\n")

except Exception as e:
    print(f"❌ 抓取失败，请检查网络连接或依赖库: {e}")

USER_AGENT environment variable not set, consider setting it to identify your requests.


正在从 https://www.theverge.com/2024/4/18/24133808/meta-ai-assistant-llama-3-chatgpt-openai-rival 抓取并解析数据...

✅ 网页加载完成！
提取到的 Document 对象数量: 1

=== 自动提取的科研元数据 (Metadata) ===
- Source: https://www.theverge.com/2024/4/18/24133808/meta-ai-assistant-llama-3-chatgpt-openai-rival
- Title: Meta releases new AI assistant powered by Llama 3 model | The Verge
- Description: Meta released Llama 3 and is expanding access to the Meta AI bot. Meta CEO Mark Zuckerberg says the company has built “the most intelligent AI assistant” available for free.
- Language: en-US

=== 清洗后的纯文本预览 (前 300 字符) ===
Meta releases new AI assistant powered by Llama 3 model | The VergeSkip to main contentThe homepageThe VergeThe Verge logo.The VergeThe Verge logo.TechReviewsScienceEntertainmentAIPolicyHamburger Navigation ButtonThe homepageThe VergeThe Verge logo.Hamburger Navigation ButtonNavigation DrawerThe Ver...

