In [None]:
## 数据预处理
def custom_Dataset_of_financial_news_sentiment_classification(from_dir, to_dir):
    """Merge the raw train/test CSVs into a single cleaned JSONL file.

    Source data:
    https://github.com/wwwxmu/Dataset-of-financial-news-sentiment-classification

    ``train_data.csv`` and ``test_data.csv`` are read from ``from_dir``. The
    ``正文`` column is exposed as ``text`` and the ``正负面`` column as
    ``label``; every row is tagged with the ``split`` it came from. Rows whose
    text is missing or an empty string are dropped.

    Args:
        from_dir: Directory containing ``train_data.csv`` and ``test_data.csv``.
        to_dir: Output directory; created if it does not exist.

    Returns:
        None. The result is written to
        ``<to_dir>/Dataset-of-financial-news-sentiment-classification.jsonl``
        with one JSON record per line and non-ASCII characters left unescaped.
    """
    import os
    import pandas as pd

    # Load both raw files first, in train -> test order.
    raw_by_split = {
        'train': pd.read_csv(f"{from_dir}/train_data.csv"),
        'test': pd.read_csv(f"{from_dir}/test_data.csv"),
    }

    # Normalise each split to the three output columns before stacking them.
    wanted_columns = ['split', 'text', 'label']
    normalised = []
    for split_name, raw in raw_by_split.items():
        tagged = raw.assign(split=split_name, text=raw['正文'], label=raw['正负面'])
        normalised.append(tagged[wanted_columns])
    merged = pd.concat(normalised, ignore_index=True)

    # Cleaning: keep only rows that actually carry some text.
    has_text = merged['text'].notna() & merged['text'].ne('')
    cleaned = merged.loc[has_text]

    # Persist the cleaned dataset.
    os.makedirs(to_dir, exist_ok=True)
    out_path = os.path.join(to_dir, 'Dataset-of-financial-news-sentiment-classification.jsonl')
    cleaned.to_json(out_path, orient='records', force_ascii=False, lines=True)


In [None]:
## Configs
# Directory passed to from_pretrained() for both the backbone and the tokenizer.
model_dir = "../../resources/open_models/FinBert"
# Directory the classifier weights are loaded from.
ckpt_dir = "../../resources/ckpts/FinBert"
# Cleaned JSONL dataset; only rows with split == 'test' are evaluated.
testset_path = "../../resources/data/Dataset-of-financial-news-sentiment-classification.jsonl"
# Excel file where predictions are stored; an existing file skips re-prediction.
pred_path = "test_results/情感提取-pred.xlsx"

# Number of texts tokenized and scored per inference batch.
pred_batch_size = 16


In [None]:
import pandas as pd
# Load the JSONL dataset and keep only the held-out test rows.
# reset_index() (without drop) keeps the original row position as an 'index' column.
eval_df = (
    pd.read_json(testset_path, lines=True)
    .loc[lambda frame: frame['split'] == 'test']
    .reset_index()
)

In [None]:
import sys; sys.path.append("../..")
from utils.models import *
from transformers import AutoTokenizer, AutoModel

# Load the pretrained encoder and its tokenizer from the same model directory.
backbone = AutoModel.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Look up the "bert_classifier" subclass through BaseModel, then wrap the backbone with it.
# NOTE(review): the second argument (2) is presumably the number of classes — confirm in utils.models.
classifier_cls = BaseModel.use_subclass("bert_classifier")
model = classifier_cls(backbone, 2)

# Load the classifier weights from the checkpoint directory.
model.load_classifier(ckpt_dir)

Classifier weights loaded from ../../resources/ckpts/FinBert/classifier_weights.pth


## 预测

In [None]:
import os

import numpy as np
from tqdm import tqdm

# Warm-up / smoke test: run two short sentences through the model and print the
# (probabilities, labels) pair. The tokens are moved to the model's device, the
# same way the real batches below are.
seqs = ["盛运环保2月13日晚间发布公告称，截至目前，共有37.48亿元到期债务未清偿。", "真好啊"]
input_tokens = tokenizer(
    seqs, return_tensors="pt", padding=True, truncation=True, max_length=512
).to(model.device)
print(model.pred(input_tokens))

# The prediction file doubles as a cache: if it already exists, inference is skipped.
if os.path.exists(pred_path):
    print("结果文件已经存在，跳过预测。")
else:
    print("预测...")

    # Create the output directory before inference so a missing folder cannot
    # make the final to_excel() call fail after all batches have been computed.
    pred_dir = os.path.dirname(pred_path)
    if pred_dir:
        os.makedirs(pred_dir, exist_ok=True)

    seqs = eval_df['text'].tolist()
    pred_labels = []
    probabilities = []

    for i in tqdm(range(0, len(seqs), pred_batch_size)):
        batch_seqs = seqs[i : i + pred_batch_size]

        input_tokens = tokenizer(
            batch_seqs, return_tensors="pt", padding=True, truncation=True, max_length=512
        ).to(model.device)  # keep the inputs on the same device as the model

        batch_probs, batch_labels = model.pred(input_tokens)
        pred_labels.extend(batch_labels)
        probabilities.append(batch_probs)

    # Stack the per-batch arrays into one (num_samples, num_classes) array.
    probabilities = np.vstack(probabilities)
    # One probability column per class, followed by the predicted label.
    for i in range(probabilities.shape[1]):
        eval_df[f'prob_class_{i}'] = probabilities[:, i]
    eval_df['prediction'] = pred_labels
    eval_df.to_excel(pred_path)

(array([[0.8732003 , 0.1267997 ],
       [0.00231368, 0.99768627]], dtype=float32), ['NEGATIVE', 'POSITIVE'])
预测...


100%|██████████| 150/150 [00:07<00:00, 20.38it/s]


## 评估

In [None]:
import pandas as pd
# Maps the dataset's integer labels to the label strings emitted by the model.
label_mapper = {0: "NEGATIVE", 1: "POSITIVE"}

# Read back the stored predictions.
pred_df = pd.read_excel(pred_path)

# Translate the ground-truth ids to label strings, then compare row by row.
expected_labels = pred_df['label'].apply(lambda class_id: label_mapper[class_id])
match = pred_df['prediction'] == expected_labels

# Share of matching rows, expressed as a percentage.
accuracy = 100 * match.mean()
print(f"Acc: {accuracy:.2f}%")

Acc: 91.50%
