In [2]:
import torch
print(torch.__version__)

2.4.0+cu121


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertModel, BertTokenizer, AdamW
from torch.utils.data import Dataset

class BertBinaryClassifier(nn.Module):
    def __init__(self, bert_path_or_name, num_labels=2):
        super(BertBinaryClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_path_or_name)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        
        # 冻结 BERT 的所有参数
        for param in self.bert.parameters():
            param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        # 获取 BERT 的输出
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # 使用 [CLS] token 的输出
        cls_output = outputs.last_hidden_state[:, 0, :]  # (batch_size, hidden_size)
        # 通过分类头
        logits = self.classifier(cls_output)  # (batch_size, num_labels)
        return logits

class CLSDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=1280):
        """
        初始化数据集

        Args:
            texts (list): 包含文本的列表
            labels (list): 文本对应的标签列表
            tokenizer (PreTrainedTokenizer): 分词器实例
            max_length (int): 文本最大长度，超过此长度将会截断
        """
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """
        获取数据样本，并将文本转为 BERT 可用的输入格式

        Args:
            idx (int): 索引值

        Returns:
            dict: 包含 `input_ids`, `attention_mask`, `labels` 的字典
        """
        text = self.texts[idx]
        label = self.labels[idx]

        # 对文本进行编码
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        
        # 获取编码结果并移除不必要的维度
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [5]:
model_path = "../models/FinBert"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertBinaryClassifier(model_path)

In [21]:
import pandas as pd
import os

data_name = "Dataset-of-financial-news-sentiment-classification"
if os.path.exists(f"../data/test/{data_name}.csv"):
    print("已处理过数据, 直接加载...")
    data_path = "../data"
    train_data = pd.read_csv(f"../data/train/{data_name}.csv")
    test_data = pd.read_csv(f"../data/test/{data_name}.csv") 
    df = pd.concat([train_data,test_data], ignore_index=True)
else:
    print("加载原始数据集...")
    train_data = pd.read_csv(f"../data/_raw/{data_name}/train_data.csv")
    test_data = pd.read_csv(f"../data/_raw/{data_name}/test_data.csv")
    get_text = lambda x: x['正文']
    get_label = lambda x: x['正负面']
    # 合并数据集
    df = pd.concat([
        train_data.assign(split='train', text=get_text, label=get_label),
        test_data.assign(split='test', text=get_text, label=get_label)
    ], ignore_index=True)[['split', 'text', 'label']]
    # 清洗
    df = df[df['text'].notna() & (df['text'] != '')]
    # 保存清洗后的数据
    df[df['split'] == 'train'].to_csv(f'../data/train/{data_name}.csv', index=False)
    df[df['split'] == 'test'].to_csv(f'../data/test/{data_name}.csv', index=False)

train_ds = CLSDataset(
    tokenizer=tokenizer,
    texts = df.query("split == 'train'")['text'].to_list(),
    labels = df.query("split == 'train'")['label'].to_list(),
)

已处理过数据, 直接加载...


In [31]:
print(df.loc[0,'text'])
print(tokenizer.decode(train_ds[0]['input_ids'].tolist()))

盛运环保2月13日晚间发布公告称，截至目前，共有37.48亿元到期债务未清偿。
[CLS] [UNK] [UNK] [UNK] 保 2 月 13 日 [UNK] [UNK] [UNK] [UNK] 公 [UNK] [UNK] ， [UNK] [UNK] 目 前 ， [UNK] 有 37. 48 [UNK] 元 [UNK] [UNK] [UNK] [UNK] [UNK] 清 [UNK] 。 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [None]:
optimizer = AdamW(model.classifier.parameters(), lr=2e-5)  # 只更新分类头的参数
criterion = nn.CrossEntropyLoss()

In [None]:
## train

# 前向传播和计算损失
logits = model(input_ids, attention_mask)
loss = criterion(logits, labels)

# 反向传播和优化
optimizer.zero_grad()
loss.backward()
optimizer.step()

print(f"Loss: {loss.item()}")
