In [26]:
import os
import logging
import pickle
from PyPDF2 import PdfReader
from langchain.chains.openai_functions.openapi import openapi_spec_to_openai_fn
from langchain.chains.question_answering import load_qa_chain
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI, OpenAI
from langchain_community.embeddings import DashScopeEmbeddings
from langchain_community.callbacks.manager import get_openai_callback
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from typing import List, Tuple

from sympy import pprint


# from setuptools.sandbox import save_argv


def extract_text_with_page_numbers(pdf: PdfReader) -> Tuple[str, List[int]]:
    """
    从 PDF 中提取文本并记录每行文本对应的页码
    :param pdf:
    :return:
    """
    text = ""
    page_numbers = []

    for page_number, page in enumerate(pdf.pages, start=1):
        extracted_text = page.extract_text()
        if extracted_text:
            text += extracted_text
            page_numbers.extend([page_number] * len(extracted_text.split("\n")))
        else:
            logging.warning(f"No text found on page {page_number}")
    return text, page_numbers

def process_text_with_splitter(text: str, page_numbers: List[int], save_path: str = None) -> FAISS:
    """
    处理文本并创建向量存储
    :param text: 提取的文本内容
    :param page_numbers: 每行文本对应的页码列表
    :param save_path: 可选，保存向量数据库的路径
    :return: 基于 FAISS 的向量存储对象
    """
    # 创建文本分割器，用于将长文本分割成小块
    text_splitter = RecursiveCharacterTextSplitter(separators=["\n", "\n\n", ".", " ", ""], chunk_size=512, chunk_overlap=128, length_function=len)

    # 分割文本
    chunks = text_splitter.split_text(text)
    print(f"文本被分割成 {len(chunks)} 个块")

    # openai 的嵌入模型
    # ai_embeddings = OpenAIEmbeddings
    # 阿里的嵌入模型
    embeddings = DashScopeEmbeddings(model="text-embedding-v2")

    # 从文本块创建知识库
    knowledgeBase = FAISS.from_texts(chunks, embeddings)
    print("已从文本块创建知识库")

    # 存储每个文件块对应的页码信息
    page_info = {chunk: page_numbers[i] for i, chunk in enumerate(chunks)}
    knowledgeBase.page_info = page_info

    if save_path:
        # 确保目录存在
        os.makedirs(save_path, exist_ok=True)

        # 保存 FAISS 向量数据库
        knowledgeBase.save_local(save_path)
        print(f"向量数据库已保存到： {save_path}")

        # 保存页码信息到同一目录
        with open(os.path.join(save_path, "page_info.pkl"), "wb") as f:
            pickle.dump(page_info, f)
        print(f"页码信息已保存到：{os.path.join(save_path, 'page_info.pkl')}")

    return knowledgeBase

def load_knowledge_base(load_path: str, embeddings = None) -> FAISS:
    """
    从磁盘加载向量数据库和页码信息
    :param load_path: 向量数据库的保存路径
    :param embeddings: 可选，嵌入模型，如果为 None，则创建一个新的 DashScopeEmbedding 模型
    :return: 加载的 FAISS 向量数据库对象
    """
    # 如果没有提供嵌入模型，则创建一个新的
    if embeddings is None:
        embeddings = DashScopeEmbeddings(model="text-embedding-v2")

    # 加载 FAISS 向量数据库，添加 allow_dangerous_deserialization 参数允许反序列化
    knowledge_base = FAISS.load_local(load_path, embeddings, allow_dangerous_deserialization=True)
    print(f"向量数据库已从{load_path}加载")

    # 加载页码信息
    page_info_path = os.path.join(load_path, "pae_info.pkl")
    if os.path.exists(page_info_path):
        with open(page_info_path, "rb") as f:
            page_info = pickle.load(f)
        knowledge_base.page_info = page_info
        print("页码信息已加载")
    else:
        print("未找到页码信息文件")
    return knowledge_base

pdf_reader = PdfReader("./浦发上海浦东发展银行西安分行个金客户经理考核办法.pdf")
# 提取文本和页码信息
text, page_numbers = extract_text_with_page_numbers(pdf_reader)

text
page_numbers

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 9,
 9,
 9,
 9,
 9,
 9]

In [27]:
import os
os.environ["DASHSCOPE_API_KEY"] = os.getenv("DASHSCOPE_API_KEY")
print(f"提取的文本长度：{len(text)} 个字符")

# 处理文本并创建知识库，同时保存到磁盘
save_dir = "./vector_db"
knowledge_base = process_text_with_splitter(text, page_numbers, save_path=save_dir)

# 处理文本并创建知识库
knowledge_base = process_text_with_splitter(text, page_numbers)

knowledge_base

提取的文本长度：3881 个字符
文本被分割成 10 个块
已从文本块创建知识库
向量数据库已保存到： ./vector_db
页码信息已保存到：./vector_db/page_info.pkl
文本被分割成 10 个块
已从文本块创建知识库


<langchain_community.vectorstores.faiss.FAISS at 0x147319a30>

In [29]:
# 设置查询问题
query = "客户经理被投诉了，投诉一次扣多少分"
# query = "客户经理每年评聘申报时间是怎样的"
if query:
    # 执行相似度搜索，找到与查询相关的文档
    docs = knowledge_base.similarity_search(query)

    # 初始化大模型
    llm = ChatOpenAI(model="qwen-plus", temperature=0, base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", api_key=os.getenv("DASHSCOPE_API_KEY"))

    chain = load_qa_chain(llm, chain_type="stuff")

    # 准备输入数据
    input_data = {"input_documents": docs, "question": query}

    # 使用回调函数跟踪 API 调用成本
    with get_openai_callback() as cb:
        response = chain.invoke(input=input_data)
        # pprint.pprint(response)
        print(f"查询已处理，成本：{cb}")
        print(response["output_text"])
        print("来源：")

    # 记录唯一的页码
    unique_pages = set()

    # 显示每个文档块的来源页码
    for doc in docs:
        text_content = getattr(doc, "page_content", "")
        source_page = knowledge_base.page_info.get(text_content.strip(), "未知")

        if source_page not in unique_pages:
            unique_pages.add(source_page)
            print(f"文本块页码：{source_page}")



查询已处理，成本：Tokens Used: 1357
	Prompt Tokens: 1337
	Completion Tokens: 20
Successful Requests: 1
Total Cost (USD): $0.0
根据提供的信息，客户经理如果被客户投诉，每次投诉会扣 **2分**。
来源：
文本块页码：1
