# LangChain检索器
+ 基本的检索器
+ 词法搜索构建检索器

In [None]:
import os
from dotenv import load_dotenv

# Load the environment variables defined in the .env file.
load_dotenv(override=True)  # override=True makes sure the latest .env values are loaded

In [2]:
from langchain_openai import OpenAIEmbeddings

# Embedding client for the BAAI/bge-m3 model, reached through an
# OpenAI-compatible endpoint configured via environment variables.
embeddings_model = OpenAIEmbeddings(
    model="BAAI/bge-m3",
    # Expected endpoint: https://api.siliconflow.cn/v1/embeddings
    base_url=os.environ.get("SILICONFLOW_API_BASE"),
    api_key=os.environ.get("SILICONFLOW_API_KEY"),
    # With the default (True) the client tokenizes inputs with tiktoken and
    # sends token-id arrays; a non-OpenAI backend needs the raw text instead.
    check_embedding_ctx_length=False,
)

## 基本的检索器

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

# Read the source letter. The encoding is pinned to UTF-8 so that decoding
# does not depend on the platform's default locale encoding.
loader = TextLoader("../../sources/letter.txt", encoding="utf-8")

documents = loader.load()
# Split the loaded documents using a target chunk size of 1000 characters
# and no overlap between neighbouring chunks.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

# Embed the chunks with embeddings_model and store them in a FAISS index.
vectorstore = FAISS.from_documents(texts, embeddings_model)

In [None]:
# Wrap the vector store in a retriever.
retriever = vectorstore.as_retriever()

# Query the retriever and print the documents it returns.
docs = retriever.invoke("deepseek是什么")
print(docs)

[Document(id='7468f31c-4f9d-451d-8ddd-4b62280e090b', metadata={'source': '../../sources/letter.txt'}, page_content='Marketing Initiatives and Campaigns\nOur marketing team has been actively working on developing new strategies to increase brand awareness and drive customer engagement. We would like to thank Sarah Thompson (phone: 415-555-1234) for her exceptional efforts in managing our social media platforms. Sarah has successfully increased our follower base by 20% in the past month alone. Moreover, please mark your calendars for the upcoming product launch event on July 15th. We encourage all team members to attend and support this exciting milestone for our company.'), Document(id='c4395f41-63be-4b60-99ea-dac6e81a1586', metadata={'source': '../../sources/letter.txt'}, page_content="[Generated with ChatGPT]\n\nConfidential Document - For Internal Use Only\n\nDate: July 1, 2023\n\nSubject: Updates and Discussions on Various Topics\n\nDear Team,\n\nI hope this email finds you well. In

In [None]:
## 词法搜索构建检索器
BM25也称为Okapi BM25，是信息检索系统中用来估计文档与给定搜索查询的相关性的排名函数。

In [5]:
! pip install rank_bm25

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting rank_bm25
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/2a/21/f691fb2613100a62b3fa91e9988c991e9ca5b89ea31c0d3152a3210344f9/rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [7]:
from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document

# Variant 1: build a BM25 retriever directly from raw strings.
retriever = BM25Retriever.from_texts(["foo", "bar", "world", "hello", "foo bar"])

# Variant 2: build it from Document objects instead. This rebinds
# `retriever`, so the instance created above is discarded.
retriever = BM25Retriever.from_documents(
    [
        Document(page_content=text)
        for text in ("foo", "bar", "world", "hello", "foo bar")
    ]
)

In [8]:
# Run the retrieval for the query "foo" and print the returned documents.
result = retriever.invoke("foo")
print(result)

[Document(metadata={}, page_content='foo'), Document(metadata={}, page_content='foo bar'), Document(metadata={}, page_content='hello'), Document(metadata={}, page_content='world')]


## 使用NLTK分词作为预处理函数进行增强，在词语级别的检索效果比较显著

In [9]:
import nltk

# Download NLTK's "punkt_tab" data; Punkt is a sentence-splitter model.
# NOTE(review): presumably required by word_tokenize in the next cell - confirm.
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /home/jizhe/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [11]:
from nltk.tokenize import word_tokenize
from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document

# Rebuild the BM25 retriever over the same five sample documents, this time
# using NLTK's word_tokenize as the preprocessing function and keeping only
# the top two results.
retriever = BM25Retriever.from_documents(
    [
        Document(page_content=text)
        for text in ("foo", "bar", "world", "hello", "foo bar")
    ],
    preprocess_func=word_tokenize,
    k=2,
)

# Retrieve the documents matching the query "bar".
retriever.invoke("bar")

[Document(metadata={}, page_content='bar'),
 Document(metadata={}, page_content='foo bar')]