In [1]:
%env QWEN_API_KEY=替换为自己的
%env QWEN_BASE_URL=https://dashscope.aliyuncs.com/compatible-mode/v1

env: QWEN_API_KEY=替换为自己的
env: QWEN_BASE_URL=https://dashscope.aliyuncs.com/compatible-mode/v1


In [2]:
%%capture --no-stderr
!pip install -U langchain langchain_community pypdf sentence_transformers chromadb trulens_eval langchain_openai

In [2]:
import langchain, langchain_community, pypdf, sentence_transformers, chromadb, trulens_eval, langchain_openai

for module in (langchain, langchain_community, langchain_openai, pypdf, sentence_transformers, chromadb, trulens_eval):
    print(f"{module.__name__:<30}{module.__version__ if hasattr(module, '__version__') else ''}")

2024-07-27 19:51:25.406869: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-27 19:51:25.438804: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


langchain                     0.2.7
langchain_community           0.2.7
langchain_openai              
pypdf                         4.2.0
sentence_transformers         2.7.0
chromadb                      0.5.3
trulens_eval                  0.33.0


In [3]:
!pip list |grep -i langchain-openai

langchain-openai                                  0.1.7


In [4]:
import os
import pandas as pd

In [11]:
# 如果已经下载到本地，可以替换为本地路径
EMBEDDING_MODEL_PATH = 'BAAI/bge-large-zh-v1.5'
dt = '20240713'
version = 'v1'

output_dir = os.path.join(os.path.pardir, 'outputs', f'{version}_{dt}')

加载问答对

In [6]:
qa_df = pd.read_excel(os.path.join(output_dir, 'question_answer.xlsx'))

# 文档处理

In [7]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader(os.path.join(os.path.pardir, 'data', '2024全球经济金融展望报告.pdf'))
documents = loader.load()

In [8]:
from uuid import uuid4
import os
import pickle

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

def split_docs(documents, filepath, chunk_size=400, chunk_overlap=40, seperators=['\n\n\n', '\n\n'], force_split=False):
    if os.path.exists(filepath) and not force_split:
        print('found cache, restoring...')
        return pickle.load(open(filepath, 'rb'))

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=seperators
    )
    split_docs = splitter.split_documents(documents)
    for chunk in split_docs:
        chunk.metadata['uuid'] = str(uuid4())

    pickle.dump(split_docs, open(filepath, 'wb'))

    return split_docs

In [9]:
splitted_docs = split_docs(documents, os.path.join(output_dir, 'split_docs.pkl'), chunk_size=500, chunk_overlap=50)

found cache, restoring...


向量化

In [12]:
from langchain.embeddings import HuggingFaceBgeEmbeddings
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'device: {device}')

embeddings = HuggingFaceBgeEmbeddings(
    model_name=EMBEDDING_MODEL_PATH,
    model_kwargs={'device': device},
    encode_kwargs={'normalize_embeddings': True}
)

device: cuda


In [13]:
from tqdm.auto import tqdm

def get_vector_db(docs, store_path, force_rebuild=False):
    if not os.path.exists(store_path):
        force_rebuild = True

    if force_rebuild:
        vector_db = Chroma.from_documents(
            docs,
            embedding=embeddings,
            persist_directory=store_path
        )
    else:
        vector_db = Chroma(
            persist_directory=store_path,
            embedding_function=embeddings
        )
    return vector_db

In [14]:
vector_db = get_vector_db(splitted_docs, store_path=os.path.join(os.path.pardir, output_dir, 'chromadb', 'bge_large_v1.5'))

# 问答全流程

In [15]:
from langchain.llms import Ollama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

llm = Ollama(
    model='qwen2:7b-instruct',
    base_url="http://localhost:11434"
)

prompt_tmpl = """
你是一个金融分析师，擅长根据所获取的信息片段，对问题进行分析和推理。
你的任务是根据所获取的信息片段（<<<<context>>><<<</context>>>之间的内容）回答问题。
回答保持简洁，不必重复问题，不要要添加描述性解释和与答案无关的任何内容。
已知信息：
<<<<context>>>
{context}
<<<</context>>>

问题：{question}
请回答：
"""
prompt = PromptTemplate.from_template(prompt_tmpl)
retriever = vector_db.as_retriever(search_kwargs={'k': 4})

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [16]:
print(rag_chain.invoke('2023年10月美国ISM制造业PMI指数较上月有何变化？'))

2023年10月美国ISM制造业PMI指数较上个月大幅下降了2.3个百分点。


# 评估

## 准备测试集

In [17]:
prediction_df = qa_df[qa_df['dataset'] == 'test'][['uuid', 'question', 'qa_type', 'answer']]

## 初始化Feedback函数

In [18]:
from langchain_openai import ChatOpenAI

llm_chain = ChatOpenAI(
    api_key=os.environ['QWEN_API_KEY'],
    base_url=os.environ['QWEN_BASE_URL'],
    model_name='qwen2-72b-instruct'
)

或者也可以使用Ollama提供的模型

In [19]:
# from langchain.llms import Ollama

# llm_chain = Ollama(
#     model='qwen2:7b-instruct',
#     base_url="http://192.168.31.92:11434"
# )

In [20]:
llm_chain.invoke('你是谁')

AIMessage(content='我是阿里云开发的一款超大规模语言模型，我叫通义千问。', response_metadata={'token_usage': {'completion_tokens': 17, 'prompt_tokens': 10, 'total_tokens': 27}, 'model_name': 'qwen2-72b-instruct', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-90e78d95-c2ff-4560-b08a-485f33df9d5e-0')

In [21]:
from trulens_eval.app import App
from trulens_eval import Feedback

import numpy as np
from langchain_openai import OpenAI
from trulens_eval.feedback.provider import Langchain as LangchainProvider

provider = LangchainProvider(chain=llm_chain)
context = App.select_context(rag_chain)

# Define a groundedness feedback function
f_groundedness = (
    Feedback(provider.groundedness_measure_with_cot_reasons, name="Groundedness")
    .on(context.collect()) # collect context chunks into a list
    .on_output()
)

# Question/answer relevance between overall question and answer.
f_answer_relevance = (
    Feedback(provider.relevance_with_cot_reasons, name="Answer Relevance")
    .on_input_output()
)
# Context relevance between question and each context chunk.
f_context_relevance = (
    Feedback(provider.context_relevance_with_cot_reasons, name="Context Relevance")
    .on_input()
    .on(context)
    .aggregate(np.mean)
)

✅ In Groundedness, input source will be set to __record__.app.first.steps__.context.first.invoke.rets[:].page_content.collect() .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input context will be set to __record__.app.first.steps__.context.first.invoke.rets[:].page_content .


## 使用TruLens记录的Instrument chain

In [22]:
from trulens_eval import TruChain, Tru
tru = Tru()
tru.reset_database()

🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


In [23]:
tru_recorder = TruChain(
    rag_chain,
    app_id='Baseline',
    feedbacks=[f_answer_relevance, f_context_relevance, f_groundedness]
)

In [24]:
answer_dict = {}

for idx, row in tqdm(prediction_df.iterrows(), total=len(prediction_df)):
    with tru_recorder as recording:
        uuid = row['uuid']
        question = row['question']
        answer = rag_chain.invoke(question)
        answer_dict[question] = {
            'uuid': uuid,
            'ref_answer': row['answer'],
            'gen_answer': answer
        }

  0%|          | 0/100 [00:00<?, ?it/s]

  warn_deprecated(


In [25]:
prediction_df.loc[:, 'gen_answer'] = prediction_df['question'].apply(lambda q: answer_dict[q]['gen_answer'])

In [26]:
prediction_df.sample(5)

Unnamed: 0,uuid,question,qa_type,answer,gen_answer
109,1c1d5705-f666-4447-8d61-6c184710e67f,根据英国预算责任办公室的说法，英国民众生活标准何时会开始恢复？,detailed,2024年上半年以后，经济活动有望逐步复苏,预计未来英国民众以可支配收入衡量的生活标准在下一财年仍将较疫情前水平低3.5%。具体时间并未...
258,28c7d4f0-8cea-48b3-9cfa-6683446fd425,海湾六国经济增长受什么影响较大？,detailed,国际能源价格走势,海湾六国经济增长受国际能源价格走势影响较大，特别是在2022年，全球油气价格的飙升推动了其经...
22,1f406690-b478-43cd-96f8-cd77924e300e,哪些地区在2023年预计经济增速加快？,detailed,中东欧国家经济增速预计加快。,中东欧国家在2023年预计经济增速加快。
16,1f406690-b478-43cd-96f8-cd77924e300e,全球经济复苏呈现什么特点？,detailed,全球经济复苏不均衡，各国差异大。发达经济体增速放缓，新兴经济体增速基本持平但略有下降。,全球经济复苏呈现出不均衡的特点，各国之间存在较大差异。发达经济体增速明显放缓，预计2023年...
38,4bd96918-ce66-4762-a725-8a7e4b543dbe,2023年三季度美国住宅投资的情况如何？,detailed,恢复增长,根据文中内容，2023年三季度美国私人消费和政府财政支出是支撑经济增长的重要驱动力。其中，私...


## 检查结果

In [27]:
tru.get_leaderboard()

Unnamed: 0_level_0,Groundedness,Context Relevance,Answer Relevance,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Baseline,0.85756,0.595588,0.947561,3.25,0.0


In [29]:
tru.run_dashboard()

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.
Dashboard already running at path:   Network URL: http://192.168.31.92:48913



<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>