In [7]:
# 2025/6/24
# zhangzhong
# https://python.langchain.com/docs/tutorials/retrievers/

from utils import load_env
load_env()

from langchain_core.documents import Document

# LangChain implements a Document abstraction, which is intended to represent a unit of text and associated metadata. It has three attributes:

# - page_content: a string representing the content;
# - metadata: a dict containing arbitrary metadata;
# - id: (optional) a string identifier for the document.

documents = [
    Document(
        page_content="Dogs are great companions, known for their loyalty and friendliness.",
        metadata={"source": "mammal-pets-doc"},
    ),
    Document(
        page_content="Cats are independent pets that often enjoy their own space.",
        metadata={"source": "mammal-pets-doc"},
    ),
]
documents


[Document(metadata={'source': 'mammal-pets-doc'}, page_content='Dogs are great companions, known for their loyalty and friendliness.'),
 Document(metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.')]

In [8]:
# https://python.langchain.com/docs/tutorials/retrievers/#loading-documents

from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents.base import Document

file_path = "mamba.pdf"
loader = PyPDFLoader(file_path)

# PyPDFLoader loads one Document object per PDF page
# 还真是，我们的manba论文总共有36页
docs: list[Document] = loader.load()

print(len(docs))
print(docs[0].page_content[:100], docs[0].metadata)

36
Mamba: Linear-Time Sequence Modeling with Selective State Spaces
Albert Gu∗1
and Tri Dao∗2
1
Machine {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-06-03T00:52:19+00:00', 'author': '', 'keywords': '', 'moddate': '2024-06-03T00:52:19+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'mamba.pdf', 'total_pages': 36, 'page': 0, 'page_label': '1'}


In [15]:
# https://python.langchain.com/docs/tutorials/retrievers/#splitting
# We can use text splitters for this purpose. Here we will use a simple text splitter that partitions based on characters. We will split our documents into chunks of 1000 characters with 200 characters of overlap between chunks
# We use the RecursiveCharacterTextSplitter, which will recursively split the document using common separators like new lines until each chunk is the appropriate size
# We set add_start_index=True so that the character index where each split Document starts within the initial Document is preserved as metadata attribute “start_index”.

# blabla, just give me an example!
from typing import List
from langchain_core.documents.base import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# 这个类还可以自己指定 seperator 不过默认的seperator应该就是通用的
# 可能在处理英文的时候比较合适吧，处理中文的可能需要自己写？或者他是按照段落 空格 换行来做seperator的？
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)

all_splits: List[Document] = text_splitter.split_documents(documents=docs)
len(all_splits)



206

In [None]:
# 可以看到在metadata里面多了一个 start_index 用于标注这个chunk在原文中的位置
# 这样就可以做引用了
print(all_splits[0].page_content[:100], all_splits[0].metadata)
print("----------------------------------")

print(all_splits[0].page_content)
print("----------------------------------")
print(all_splits[1].page_content)
print("----------------------------------")
print(all_splits[2].page_content)
print("----------------------------------")
# 我懂了，看输出效果，就是把原来的文本按照某些规则，比如固定的字符数，或者段落，给分开，然后两个相邻的chunk会overlap一段


In [10]:
# https://python.langchain.com/docs/tutorials/retrievers/#embeddings
# https://www.reddit.com/r/LocalLLaMA/comments/1iyun6z/using_deepseek_r1_for_rag_dos_and_donts/
# 找到一个好东西，说了用千问的embedding模型和deepseek的reason 模型 可以
# 我去看看千问有没有API可以买吧
# https://github.com/Graph-RAG/GraphRAG

In [12]:
# https://python.langchain.com/docs/integrations/text_embedding/dashscope/
# 找了半天，终于找到了这个
# https://bailian.console.aliyun.com/?tab=api#/api/?type=model&url=https%3A%2F%2Fhelp.aliyun.com%2Fdocument_detail%2F2842587.html
from langchain_community.embeddings import DashScopeEmbeddings

embeddings = DashScopeEmbeddings(
    model="text-embedding-v4"
)

text = "This is a test document."

query_result = embeddings.embed_query(text)
print(len(query_result)) # 默认是1024维的向量
print(query_result)

1024
[-0.038538604974746704, 0.03440577909350395, 0.026333853602409363, -0.046597618609666824, -0.05496659129858017, -0.01645381562411785, -0.012773016467690468, 0.044376224279403687, -0.06524699926376343, 0.12873753905296326, 0.004181257914751768, 0.0032949603628367186, -0.011681691743433475, 0.012630950659513474, -0.006935399491339922, -0.012340361252427101, 0.0030915478710085154, -0.0640588104724884, 0.004010132979601622, 0.005624518264085054, -0.04623599350452423, 0.02319548837840557, -0.0238024964928627, 0.009350519627332687, -0.01554975938051939, 0.009763802401721478, -0.0293947272002697, -0.08591112494468689, 0.019953802227973938, -0.008388346061110497, -0.0488448403775692, 0.023789580911397934, 0.007342224474996328, 0.00339020905084908, -0.04357548803091049, 0.038254473358392715, 0.016208428889513016, 0.03921018913388252, -0.016867097467184067, 0.0042619770392775536, -0.036653004586696625, 0.005660034716129303, -0.06571193784475327, 0.025662269443273544, -0.012146634981036186, 

In [14]:
# https://python.langchain.com/docs/tutorials/retrievers/#vector-stores

from langchain_core.vectorstores import InMemoryVectorStore

# create vector store by passing in the embeddings model
vector_store = InMemoryVectorStore(embeddings)

# add documents to the vector store
# qwen embedding batch size should not large than 10
# 所以这里需要分批处理
ids = []
for i in range(0, len(all_splits), 10):
    ids.extend(vector_store.add_documents(documents=all_splits[i:i+10]))


print(len(ids))
print(ids)
# 这里ids的长度和all splits的长度是一样的
# 也就是每个document都是返回一个id
# 那么这个id是干嘛的？

206
['b0121721-71a9-480e-a649-7249a6c1c6fa', '45f10c2b-6f32-4280-8bd5-b4e3154505f5', 'f177c9ea-6b26-4462-ba91-58e7c51a85ae', 'b1121ab5-87a1-46a1-888b-468fdbb2e990', 'be8e3550-7e03-4ace-9d3f-233fcc5e760b', 'cbac5b4e-782a-4861-bc24-7698e84aa450', '7faff6b0-04bd-48df-814b-6c631aecde18', 'defba535-b968-4c69-b5e0-45c0d4687df7', '909ed835-bfee-4429-8ddd-a5588f2f2a52', 'e89ef323-423e-423c-83e3-3a9287dca718', '6282ef9d-e33e-46ec-a32c-eb19cb517f7f', '8e26655f-6544-4000-a516-d37edc0977cb', '8bc666a1-76a2-4c9c-a76e-dc50dbb0e0f2', '0c7e423a-de6d-4f0c-92d5-d37bedfd73c4', 'f7f54e86-62dd-422e-8b55-5d5149ec242c', 'a39a658d-6608-4221-bfb4-caede0c70d29', '5c475ff8-5f38-4aab-a2a0-b522b8715d00', 'bc5618de-831b-4db8-8745-efe4dfb2d0fe', '922e6227-01d8-449a-8838-95047219825d', 'a3286e14-1354-42b3-a0ef-da3de069cbbc', 'bb6421b1-e18b-4541-9cb2-965803243e19', '74fbbbd3-909d-4605-9819-fc047d82debe', '7b7be6fe-8c63-44ad-bd4b-93fb2a6e4ca3', '855d54ae-70a8-49b5-a774-1c5f51349c3c', 'c279cbe9-cdc1-4f5b-a78b-49932cf8f1

In [18]:
results = vector_store.similarity_search("What is the main idea of the document?")
print(len(results))
# 这不对吧，这返回的东西和我们的问题完全没联系啊。。。
print(results[0].page_content)


# 还有async的
# results = await vector_store.asimilarity_search("When was Nike incorporated?")

# 还可以通过embedding进行搜索
# 其实我们用文本进行搜索 本质上再vector store里面还是会调用embedding模型 然后再在向量数据库里面做搜索
# embedding = embeddings.embed_query("How were Nike's margins impacted in 2023?")
# results = vector_store.similarity_search_by_vector(embedding)

4
modestly with a small increase in parameter count.
Of particular note is the dramatic improvement of the selective SSM when the state size 𝑁 is increased, with over a 1.0
perplexity improvement for a cost of only 1% additional parameters. This validates our core motivation in Sections 3.1
and 3.3.
5 Discussion
We discuss related work, limitations, and some future directions.
Related Work. Appendix A discusses how the selection mechanism relates to similar concepts. Appendix B has an
extended related work of SSMs and other related models.
16


In [19]:
# https://python.langchain.com/docs/tutorials/retrievers/#retrievers

retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 6},
)

retriever.batch(
    [
        "What is the main idea of the document?",
        "What is the mamba block?",
    ],
)

[[Document(id='c38f7979-17f9-46aa-a412-3e094f7088c0', metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-06-03T00:52:19+00:00', 'author': '', 'keywords': '', 'moddate': '2024-06-03T00:52:19+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'mamba.pdf', 'total_pages': 36, 'page': 15, 'page_label': '16', 'start_index': 2440}, page_content='modestly with a small increase in parameter count.\nOf particular note is the dramatic improvement of the selective SSM when the state size 𝑁 is increased, with over a 1.0\nperplexity improvement for a cost of only 1% additional parameters. This validates our core motivation in Sections 3.1\nand 3.3.\n5 Discussion\nWe discuss related work, limitations, and some future directions.\nRelated Work. Appendix A discusses how the selection mechanism relates to similar concepts. Appendix B ha

In [22]:
from langchain_deepseek import ChatDeepSeek
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate

model = ChatDeepSeek(
    model="deepseek-chat",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # other params...
)

In [23]:
# 后面还有东西才对，我们要把vector store 拿到的东西放到prompt里面 然后给到大模型啊
from langchain_core.messages import HumanMessage, SystemMessage

query = "你的问题，比如：What is the main idea of the document?"
results = vector_store.similarity_search(query, k=4)  # k 取你想要的数量
context = "\n\n".join([doc.page_content for doc in results])



system_prompt = "你是一个专业的文档问答助手。请根据以下 context 回答用户问题。\n\nContext:\n{context}"
user_prompt = "Question: {question}"

messages = [
    SystemMessage(system_prompt.format(context=context)),
    HumanMessage(user_prompt.format(question=query)),
]

# TODO: 我记得langchain处理这种rag有专门的chain来着，明天再看看吧
def rag_ask(query, vector_store, model, k=4):
    results = vector_store.similarity_search(query, k=k)
    context = "\n\n".join([doc.page_content for doc in results])
    system_prompt = "你是一个专业的文档问答助手。请根据以下 context 回答用户问题。\n\nContext:\n{context}"
    user_prompt = "Question: {question}"
    messages = [
        SystemMessage(system_prompt.format(context=context)),
        HumanMessage(user_prompt.format(question=query)),
    ]
    return model.invoke(messages)

In [None]:
answer = rag_ask("What is the main idea of the document?", vector_store, model)
print(answer)

content='The main idea of the document revolves around **selective state space models (SSMs)** and their advantages over traditional models like Transformers and linear time-invariant (LTI) SSMs. Key points include:\n\n1. **Selectivity in SSMs**:  \n   - The document highlights how introducing selectivity (e.g., through the parameter Δ) allows SSMs to dynamically focus on or ignore inputs, improving performance with minimal parameter overhead.  \n   - A small Δ retains the state and ignores the current input, while a large Δ resets the state and focuses on the input (similar to RNN gates).  \n\n2. **Performance Improvements**:  \n   - Increasing the state size (𝑁) in selective SSMs leads to significant perplexity improvements (over 1.0) with only a 1% increase in parameters.  \n   - Selective SSMs outperform recurrent baselines (e.g., RWKV, RetNet) on long sequences (e.g., 8k context length) due to better efficiency.  \n\n3. **Interpretation of Parameters**:  \n   - **Δ**: Governs inpu

The main idea of the document revolves around **selective state space models (SSMs)** and their advantages over traditional models like Transformers and linear time-invariant (LTI) SSMs. Key points include:\n\n1. **Selectivity in SSMs**:  \n   - The document highlights how introducing selectivity (e.g., through the parameter Δ) allows SSMs to dynamically focus on or ignore inputs, improving performance with minimal parameter overhead.  \n   - A small Δ retains the state and ignores the current input, while a large Δ resets the state and focuses on the input (similar to RNN gates).  \n\n2. **Performance Improvements**:  \n   - Increasing the state size (𝑁) in selective SSMs leads to significant perplexity improvements (over 1.0) with only a 1% increase in parameters.  \n   - Selective SSMs outperform recurrent baselines (e.g., RWKV, RetNet) on long sequences (e.g., 8k context length) due to better efficiency.  \n\n3. **Interpretation of Parameters**:  \n   - **Δ**: Governs input selection and state retention.  \n   - **𝑨, 𝑩, 𝑪**: Selectivity in Δ is sufficient for performance gains, though making 𝑨 selective could also help.  \n   - **Boundary Resetting**: Selective SSMs can reset states at sequence boundaries (e.g., document or episode boundaries), preventing information leakage.  \n\n4. **Comparison to Other Models**:  \n   - Selective SSMs address limitations of LTI SSMs (e.g., handling varying sequence contexts) and compete with Transformers, especially in long-context settings.  \n\n5. **Future Directions**:  \n   - The document suggests further exploration of selective mechanisms (e.g., in 𝑨) and applications in areas like reinforcement learning.  \n\nIn summary, the core argument is that **selective SSMs offer a computationally efficient and flexible alternative to Transformers and traditional SSMs, with strong empirical results and interpretable dynamics**.

In [None]:
answer = rag_ask("What is the main idea of the document?", vector_store, model)
print(answer)

content='The main idea of the document revolves around **selective state space models (SSMs)** and their advantages over traditional models like Transformers and linear time-invariant (LTI) SSMs. Key points include:\n\n1. **Selectivity in SSMs**:  \n   - The document highlights how introducing selectivity (e.g., through the parameter Δ) allows SSMs to dynamically focus on or ignore inputs, improving performance with minimal parameter overhead.  \n   - A small Δ retains the state and ignores the current input, while a large Δ resets the state and focuses on the input (similar to RNN gates).  \n\n2. **Performance Improvements**:  \n   - Increasing the state size (𝑁) in selective SSMs leads to significant perplexity improvements (over 1.0) with only a 1% increase in parameters.  \n   - Selective SSMs outperform recurrent baselines (e.g., RWKV, RetNet) on long sequences (e.g., 8k context length) due to better efficiency.  \n\n3. **Interpretation of Parameters**:  \n   - **Δ**: Governs inpu