In [1]:
## Experiment inputs and outputs
import pandas as pd

# Load the cleaned FinCUGE corpus; the file is JSON Lines (one record per line).
eval_df = pd.read_json("../resources/data/cleaned/FinCUGE.jsonl", lines=True)

# Keep only the evaluation split of the FINNA task.
is_finna_task = eval_df['task'] == 'FINNA'
is_eval_split = eval_df['split'] == 'eval'
# reset_index() renumbers rows from 0 and keeps the old labels in an "index" column.
eval_df = eval_df[is_finna_task & is_eval_split].reset_index()

# Spreadsheet that caches the predictions; a later cell skips inference when it exists.
pred_path = "results/新闻摘要-qwen2_5_3B.xlsx"

## 启动被测模型的vllm服务

vllm serve resources/open_models/Qwen2.5-3B-Instruct --trust-remote-code \
--served-model-name test \
--max-model-len 3072 \
--tensor-parallel-size 4 --gpu-memory-utilization 0.15 \
--dtype bfloat16 --quantization fp8 \
--port 12234


## 启动评审模型的vllm服务

vllm serve resources/open_models/Qwen2.5-3B-Instruct  --trust-remote-code \
--served-model-name judger \
--max-model-len 3072 \
--tensor-parallel-size 4 --gpu-memory-utilization 0.15 \
--dtype bfloat16 --quantization fp8 \
--port 12235

## 预测

In [None]:
from openai import OpenAI
import pandas as pd
import os

# Client for the model under test, reached through an OpenAI-compatible endpoint
# on port 12234 under the served model name "test".
test_model = OpenAI(base_url="http://localhost:12234/v1",api_key="empty")

## Warm up: send a single throwaway chat request before the batch and print the reply.
chat_completion = test_model.chat.completions.create(
    model="test",
    temperature=0.1, top_p=0.9,
    messages=[
        {
            "role": "user",
            "content": "字节跳动是什么时候成立的？",
        }
    ],
)
print(chat_completion.choices[0].message.content)

if os.path.exists(pred_path):
    # The spreadsheet acts as a cache: an existing file means inference is skipped.
    print("结果文件已经存在，跳过预测。")
else:
    # Create the output directory BEFORE the long inference loop. The results are
    # written only once, after the loop, so a missing directory would otherwise
    # raise at the very end and discard every prediction.
    output_dir = os.path.dirname(pred_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    print("预测...")
    for i, row in eval_df.iterrows():
        # The dataset instruction is sent as the system message, the sample text as the user message.
        input_msg = [
            dict(role="system", content=row['instruction']),
            dict(role="user", content=row['input']),
        ]
        chat_completion = test_model.chat.completions.create(
            model="test", temperature=0.1, top_p=1, messages=input_msg
        )
        pred = chat_completion.choices[0].message.content
        # eval_df has a 0..n-1 index (reset when it was built), so label i addresses this row.
        eval_df.loc[i,"prediction"] = pred
        print(f"{i} pred: {pred}")
    eval_df.to_excel(pred_path)

## 评估

In [2]:
# LLM评估综述 https://blog.csdn.net/m0_59164304/article/details/142148468
# Deep-Eval https://blog.csdn.net/lovechris00/article/details/143783278
import pandas as pd
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric,GEval
from deepeval.test_case import LLMTestCase
from langchain.chat_models import ChatOpenAI
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.dataset import EvaluationDataset
from deepeval.test_case import LLMTestCaseParams

class LLM(DeepEvalBaseLLM):
    """DeepEval judge model backed by a LangChain ``ChatOpenAI`` client.

    Wraps an OpenAI-compatible chat endpoint so that DeepEval metrics can use it
    as their evaluation model.
    """

    def __init__(
        self,
        base_url,
        model_name,
        openai_api_key,
    ):
        """Create the underlying chat client.

        Args:
            base_url: Base URL of the OpenAI-compatible API.
            model_name: Name of the model to request from the endpoint.
            openai_api_key: API key passed to the client.
        """
        # Remember the configured name so get_model_name() reports the real model
        # instead of a hard-coded placeholder.
        self.model_name = model_name
        self.model = ChatOpenAI(base_url=base_url,model_name=model_name,openai_api_key=openai_api_key)

    def load_model(self):
        """Return the chat client created in ``__init__``."""
        return self.model

    def generate(self, prompt: str) -> str:
        """Synchronously send ``prompt`` to the model and return the reply text."""
        chat_model = self.load_model()
        return chat_model.invoke(prompt).content

    async def a_generate(self, prompt: str) -> str:
        """Asynchronously send ``prompt`` to the model and return the reply text."""
        chat_model = self.load_model()
        res = await chat_model.ainvoke(prompt)
        return res.content

    def get_model_name(self):
        """Return the model name this wrapper was configured with."""
        return self.model_name

# Judge model: the OpenAI-compatible endpoint on port 12235, served under the name "judger".
judger = LLM(base_url="http://localhost:12235/v1",model_name='judger',openai_api_key="empty")

# G-Eval metric that scores each prediction against the reference using the steps below.
correctness_metric = GEval(
    name="Correctness",
    evaluation_steps=[
        "Check whether the facts in 'actual output' contradict any facts in 'expected output'. If there is a contradiction, the output is incorrect.",
        "Check if the 'actual output' omits any important details from the 'expected output'. If details are omitted, the output is less accurate.",
        "Evaluate the length of the 'actual output' compared to the 'expected output'. If the 'actual output' is significantly longer without adding substantial value, penalize the output.",
        "Vague language or contradicting opinions are acceptable as long as they do not introduce factual errors."
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
    model=judger
)

pred_df = pd.read_excel(pred_path)

# After the Excel round-trip, empty cells come back as NaN and numeric-looking cells
# as numbers. Coerce the text columns to str so the concatenation below and the
# test-case string fields always receive strings.
text_columns = ['instruction', 'input', 'prediction', 'output']
pred_df[text_columns] = pred_df[text_columns].fillna("").astype(str)

# Build a plain list of test cases (not a pandas Series).
# The judge input is the instruction immediately followed by the sample text (no separator).
testcases = [
    LLMTestCase(
        input=record['instruction'] + record['input'],
        actual_output=record['prediction'],
        expected_output=record['output'],
    )
    for record in pred_df.to_dict("records")
]
predset = EvaluationDataset(test_cases=testcases)

# NOTE(review): the recorded run aborted with "Evaluation LLM outputted an invalid JSON"
# part-way through; consider a stronger judge model or the error-ignoring option of the
# installed deepeval version - TODO confirm which applies.
evaluate(predset, [correctness_metric])

  self.model = ChatOpenAI(base_url=base_url,model_name=model_name,openai_api_key=openai_api_key)


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 3600 test case(s) in parallel: |          |  0% (0/3600) [Time Taken: 00:00, ?test case/s]Queue is full, likely spans will be dropped.
Evaluating 3600 test case(s) in parallel: |██▌       | 26% (932/3600) [Time Taken: 03:46,  4.12test case/s]


ValueError: Evaluation LLM outputted an invalid JSON. Please use a better evaluation model.