# 测LLM

In [None]:
## 实验输入输出
import pandas as pd
eval_df = pd.read_json("../resources/data/cleaned/Dataset-of-financial-news-sentiment-classification.jsonl", lines=True)
eval_df = eval_df[eval_df['split'] == 'test'].reset_index()
pred_path =  "results/情感提取-qwen2_5_3B-sft.xlsx"
eval_output_path = "results/情感提取-qwen2_5_3B-sft-eval.txt"

## 预测

In [None]:
from openai import OpenAI
import pandas as pd
import os

test_model = OpenAI(base_url="http://localhost:12234/v1",api_key="empty")
test_model_name = "lora" 

## warm up
chat_completion = test_model.chat.completions.create(
    model=test_model_name,
    temperature=0.1, top_p=0.9, 
    messages=[{"role": "user","content": "字节跳动是什么时候成立的？"}],
)
print(chat_completion.choices[0].message.content) 

字节跳动成立于2012年。


In [None]:
if os.path.exists(pred_path):
    print("结果文件已经存在，跳过预测。")
else:
    print("预测...")
    for i, row in eval_df.iterrows():
        input_msg = [dict(role="system",content="判断以下文本情绪属于积极还是消极。"),dict(role="user",content=row['text'])]
        chat_completion = test_model.chat.completions.create(model=test_model_name,temperature=0.1, top_p=1, messages=input_msg)
        pred = chat_completion.choices[0].message.content
        eval_df.loc[i,"prediction"] = pred
        print(f"{i} pred: {pred}")
    eval_df.to_excel(pred_path)

## 评估

In [None]:
# LLM评估综述 https://blog.csdn.net/m0_59164304/article/details/142148468
# Deep-Eval简介 https://blog.csdn.net/lovechris00/article/details/143783278
# Deep-Eval官网 https://docs.confident-ai.com/docs/getting-started
import pandas as pd
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric,GEval
from deepeval.test_case import LLMTestCase
from langchain.chat_models import ChatOpenAI
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.dataset import EvaluationDataset
from deepeval.test_case import LLMTestCaseParams

class LLM(DeepEvalBaseLLM):
    def __init__(
        self,
        base_url,
        model_name,
        openai_api_key,
    ):
        self.model = ChatOpenAI(base_url=base_url,model_name=model_name,openai_api_key=openai_api_key)

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        chat_model = self.load_model()
        return chat_model.invoke(prompt).content

    async def a_generate(self, prompt: str) -> str:
        chat_model = self.load_model()
        res = await chat_model.ainvoke(prompt)
        return res.content

    def get_model_name(self):
        return "Custom vllm Server Model"

judger = LLM(base_url="http://localhost:12235/v1",model_name='judger',openai_api_key="empty")

correctness_metric = GEval(
    name="Correctness",
    criteria="\
    Determine whether the actual output correctly reflects the sentiment (positive or negative) based on the expected output. \
    If the actual output is ambiguous or does not clearly indicate a positive or negative sentiment, it is considered incorrect. \
    Specifically, the output is correct if: (1) it is '1' or a clear positive sentiment expression and the expected output is '1', or (2) it is '0' or a clear negative sentiment expression and the expected output is '0'. \
    Any other output, including ambiguous expressions, is considered incorrect.\
    ",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
)

pred_df = pd.read_excel(pred_path)
testcases=pred_df.apply(lambda row: 
    LLMTestCase(input=row['instruction']+row['input'],actual_output=row['prediction'],expected_output=row['output']),
    axis=1
)
predset = EvaluationDataset(test_cases=testcases)
start_time = time.time()  # 开始计时
results = evaluate(predset, [correctness_metric], ignore_errors=True, write_cache=False)
end_time = time.time()  # 结束计时

In [None]:
from deepeval.evaluate import aggregate_metric_pass_rates
with open(eval_output_path, 'w', encoding='utf-8') as f:
    f.write(f"Execution time: {end_time - start_time:.2f} seconds\n")
    final_output = aggregate_metric_pass_rates(results.test_results)
    f.write(str(final_output)+'\n')
    for test_result in results.test_results:
        output = "=" * 70 + "\n"
        output += f"Test Case: {test_result.name}\n"
        output += f"is success: {test_result.success}\n"
        output += f"metrics: {test_result.metrics_data}\n\n"
        f.write(output)

# 测BERT

In [1]:
## 实验输入输出
import pandas as pd
eval_df = pd.read_json("../resources/data/Dataset-of-financial-news-sentiment-classification.jsonl", lines=True)
eval_df = eval_df[eval_df['split'] == 'test'].reset_index()
pred_path =  "results/情感提取-FinBert.xlsx"

## 预测

In [2]:
import sys
sys.path.append("..")
from utils.models import *
from transformers import AutoTokenizer, AutoModel

model_dir = "../resources/open_models/FinBert"
ckpt_dir = '../resources/ckpts/FinBert'
backbone = AutoModel.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = BaseModel.use_subclass("bert_classifier")(backbone,2)
model.load_classifier(ckpt_dir)

# warmup
seqs = ["盛运环保2月13日晚间发布公告称，截至目前，共有37.48亿元到期债务未清偿。", "真好啊"]
input_tokens = tokenizer(seqs, return_tensors="pt", padding=True, truncation=True, max_length=512)
print(model.pred(input_tokens))

if os.path.exists(pred_path):
    print("结果文件已经存在，跳过预测。")
else:
    print("预测...")
    seqs = eval_df['text'].to_list()
    input_tokens = tokenizer(seqs, return_tensors="pt", padding=True, truncation=True, max_length=512)
    eval_df['prediction'] = model.pred(input_tokens)
    eval_df.to_excel(pred_path)

  from .autonotebook import tqdm as notebook_tqdm


Classifier weights loaded from ../resources/ckpts/FinBert/classifier_weights.pth
(array([[0.8732003 , 0.1267997 ],
       [0.00231368, 0.99768627]], dtype=float32), ['NEGATIVE', 'POSITIVE'])
预测...


OutOfMemoryError: CUDA out of memory. Tried to allocate 3.52 GiB. GPU 0 has a total capacity of 23.64 GiB of which 671.69 MiB is free. Process 587335 has 2.80 GiB memory in use. Process 611891 has 12.25 GiB memory in use. Including non-PyTorch memory, this process has 7.93 GiB memory in use. Of the allocated memory 7.44 GiB is allocated by PyTorch, and 46.14 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## 评估

In [None]:
import pandas as pd
pred_df = pd.read_excel(pred_path)
match = pred_df['prediction'] == pred_df['label']
accuracy = match.mean() * 100
print(f"Acc: {accuracy:.2f}%")