# LlamaIndex 评估


In [None]:
import chromadb
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    get_response_synthesizer,
    Settings,
)
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine

In [None]:
# Language model: a small Qwen 2.5 model accessed through the Ollama client.
# It is held in a plain variable here and is NOT installed as the global
# Settings.llm in this cell. request_timeout is 120.0 (presumably seconds).
llm = Ollama(
    model="qwen2.5:0.5b",
    request_timeout=120.0,
)

# Embedding model: registered globally so that every index built afterwards
# embeds text with the m3e model through the Ollama embedding client.
Settings.embed_model = OllamaEmbedding(
    model_name="yxl/m3e:latest",
)

In [None]:
# Persistent Chroma client whose data lives in the ./chroma_db folder
# relative to the current working directory.
db = chromadb.PersistentClient(
    path="./chroma_db",
)

# Fetch the "quickstart" collection, creating it on first use.
chroma_collection = db.get_or_create_collection(
    name="quickstart",
)

# Adapter that lets llama_index read from and write to that collection.
vector_store = ChromaVectorStore(
    chroma_collection=chroma_collection,
)

# Storage context that routes vector storage to the Chroma-backed store.
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

In [None]:
# Load every readable file found under the local data directory.
documents = SimpleDirectoryReader(
    input_dir="C:/Users/Admin/Desktop/Data/",
).load_data()

# Build the vector index: documents are split by SentenceSplitter with
# chunk_size=256 and the resulting vectors are stored via storage_context
# (the Chroma-backed vector store).
index = VectorStoreIndex.from_documents(
    documents=documents,
    transformations=[SentenceSplitter(chunk_size=256)],
    storage_context=storage_context,
)

In [None]:
# Retriever: fetch the 5 chunks most similar to the query from the index.
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=5,
)

# Response synthesizer: pass the Ollama model explicitly. At this point in the
# notebook the global Settings.llm has not been assigned, so a synthesizer
# created without `llm=` would fall back to LlamaIndex's default LLM instead
# of the local model used everywhere else in this file.
response_synthesizer = get_response_synthesizer(llm=llm)

# Query engine: retrieval followed by answer synthesis.
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

评估 RAG 应用需要用到几个评估实体，分别是：

- Question: 指用户输入的问题，RAG 应用通过问题检索到相关的文档上下文
- Context: 指检索到的文档上下文，RAG 应用检索到相关文档后会将这些上下
  文结合用户问题一起提交给 LLM，最后生成答案
- Answer: 指生成的答案，RAG 应用将问题和上下文提交给 LLM 后，LLM 会
  根据这些信息来生成答案
- Ground Truth: 指人工标注的正确答案，利用这个实体可以对生成的答案进
  行分析，从而得到评估结果；在 LlamaIndex 中，这个实体叫做 Reference Answer

其中 Question 和 Ground Truth 通过用户提供，Context 通过检索得到，Answer
是由 LLM 生成，后面我们在讲解的时候会沿用这些实体名称。在 LlamaIndex 中提供
了生成测试数据集的功能，可以帮助我们快速生成测试数据集，无需人工干预。


In [None]:
from llama_index.core.evaluation import BatchEvalRunner
from llama_index.core.evaluation import ContextRelevancyEvaluator
from llama_index.core.evaluation import AnswerRelevancyEvaluator
from llama_index.core.evaluation import FaithfulnessEvaluator
from llama_index.core.evaluation import CorrectnessEvaluator
from llama_index.core.evaluation import PairwiseComparisonEvaluator
from llama_index.core.evaluation import RelevancyEvaluator

# One LLM-judged evaluator per metric, all backed by the same `llm`.
answer_relevancy_evaluator = AnswerRelevancyEvaluator(llm=llm)
context_relevancy_evaluator = ContextRelevancyEvaluator(llm=llm)
relevant_evaluator = RelevancyEvaluator(llm=llm)
correctness_evaluator = CorrectnessEvaluator(llm=llm)
faithfulness_evaluator = FaithfulnessEvaluator(llm=llm)

# Pairwise comparison judges two candidate answers against each other and
# therefore needs a `second_response` on every call. The evaluator is still
# created so it can be invoked directly, but it is deliberately NOT registered
# with the batch runner: a batch run that only provides queries and reference
# answers has no second response to give it, and the run would fail.
pairwiseComparisonEvaluator = PairwiseComparisonEvaluator(llm=llm)

# Batch runner: applies every registered evaluator to every query/response
# pair. The dictionary keys become the metric names in the results.
runner = BatchEvalRunner(
    evaluators={
        "answer_relevancy": answer_relevancy_evaluator,
        "context_relevancy": context_relevancy_evaluator,
        "relevancy": relevant_evaluator,
        "correctness": correctness_evaluator,
        "faithfulness": faithfulness_evaluator,
    },
    workers=8,
)

In [None]:
# Cell-local import: the dataset generator is only needed from here on.
from llama_index.core.llama_dataset.generator import RagDatasetGenerator

# From here on, make the local Ollama model the global default LLM so that
# the query engine created below answers with it.
Settings.llm = llm

# Build a separate in-memory index over the documents (default SentenceSplitter
# settings) and derive the query engine that will be evaluated.
node_parser = SentenceSplitter()
nodes = node_parser.get_nodes_from_documents(documents)
vector_index = VectorStoreIndex(nodes)
engine = vector_index.as_query_engine()

# Generate the test set (question + reference answer pairs) from the documents
# with the LLM, so no hand-labelled data is required. One question per chunk
# keeps the run short for a small local model.
# NOTE(review): when executed inside an already-running Jupyter event loop this
# synchronous call may need `nest_asyncio.apply()` first — confirm.
dataset_generator = RagDatasetGenerator.from_documents(
    documents,
    llm=llm,
    num_questions_per_chunk=1,
)
rag_dataset = dataset_generator.generate_dataset_from_nodes()
examples = rag_dataset.examples
if not examples:
    raise ValueError(
        "The dataset generator produced no examples; "
        "check that the data directory contains readable documents."
    )

# Smoke-test the engine with the first generated question.
question = examples[0].query
response = engine.query(question)
answer = str(response)

In [None]:
# Split `examples` into two parallel lists: the queries to ask and the
# reference answers to compare the generated answers against.
questions = [item.query for item in examples]
ground_truths = [item.reference_answer for item in examples]

# Run every evaluator registered on `runner` against `engine`.
metrics_results = runner.evaluate_queries(
    engine,
    queries=questions,
    reference=ground_truths,
)

# Print the results grouped by metric name; `passing` is printed only when
# the evaluator actually set it.
for metrics, eval_results in metrics_results.items():
    print(f"metrics: {metrics}")
    for eval_result in eval_results:
        print(f"score: {eval_result.score}")
        print(f"feedback: {eval_result.feedback}")
        if eval_result.passing is None:
            continue
        print(f"passing: {eval_result.passing}")