In [None]:
import collections

import datasets
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import tqdm
import transformers

from sklearn.metrics import precision_score,recall_score,f1_score

import sys
sys.path.append("..")

import os
import uuid

In [None]:
# Fix the random seeds so that repeated runs are comparable.
seed = 1234

np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
# Ask cuDNN to pick deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [None]:
# Local directory of the pretrained checkpoint; the same path is used later to load the encoder.
transformer_name = '../models/finbert'

tokenizer = transformers.AutoTokenizer.from_pretrained(transformer_name)
## Usage examples:
# tokenizer("hello world!")
# tokenizer.tokenize("hello world!")
# tokenizer.encode("hello world!")
# tokenizer.convert_ids_to_tokens(tokenizer.encode("hello world"))

In [None]:
# Load two splits at once; the result is indexed as dataset[0] (train) and dataset[1] (test).
dataset = datasets.load_dataset("../data/finan_news_senti_data", split=["train", "test"])
# Peek at the first training example.
dataset[0][0]

In [None]:
# Filtering (currently disabled)
# dataset = [part_data.filter(lambda example: example["task"]=="FINFE" and ("积极" in example["output"] or "消极" in example["output"]) for part_data in dataset]

# Build the model inputs
def tokenize_and_numericalize_example(example, tokenizer):
    """Convert one raw example into token ids plus its label.

    Args:
        example: Mapping with a "正文" field (the text to tokenize) and a
            "正负面" field (passed through unchanged as the label).
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True)``
            whose result exposes ``"input_ids"``.

    Returns:
        dict with keys ``"ids"`` and ``"label"``.
    """
    encoding = tokenizer(example["正文"], truncation=True)
    return {"ids": encoding["input_ids"], "label": example['正负面']}

# Tokenize the train split (dataset[0]) and the test split (dataset[1]) example by example;
# the tokenizer is forwarded to the mapping function through fn_kwargs.
train_data = dataset[0].map(
    tokenize_and_numericalize_example, fn_kwargs={"tokenizer": tokenizer}
)
test_data = dataset[1].map(
    tokenize_and_numericalize_example, fn_kwargs={"tokenizer": tokenizer}
)

In [None]:
# Token id used to pad batches; the train/evaluate loops also derive the attention mask from it.
pad_index = tokenizer.pad_token_id

In [None]:
# Fraction of the tokenized training split that is held out for validation.
test_size = 0.25

# Pass the notebook seed explicitly so the split does not depend on the state
# of the global RNG at the moment this cell happens to run.
# NOTE: this cell rebinds `train_data`; re-running it without first re-running
# the tokenization cell splits the already-reduced training set again.
train_valid_data = train_data.train_test_split(test_size=test_size, seed=seed)
train_data = train_valid_data["train"]
valid_data = train_valid_data["test"]

In [None]:
# Expose only the "ids" and "label" columns, formatted as torch tensors, for all three splits.
train_data = train_data.with_format(type="torch", columns=["ids", "label"])
valid_data = valid_data.with_format(type="torch", columns=["ids", "label"])
test_data = test_data.with_format(type="torch", columns=["ids", "label"])

In [None]:
def get_collate_fn(pad_index):
    """Build a collate function that pads variable-length ``ids`` with ``pad_index``.

    Args:
        pad_index: Value used to fill the shorter sequences of a batch.

    Returns:
        A function mapping a list of examples (each with tensor fields ``"ids"``
        and ``"label"``) to a dict with a padded, batch-first ``"ids"`` tensor
        and a stacked ``"label"`` tensor.
    """

    def collate_fn(batch):
        padded_ids = nn.utils.rnn.pad_sequence(
            [example["ids"] for example in batch],
            padding_value=pad_index,
            batch_first=True,
        )
        labels = torch.stack([example["label"] for example in batch])
        return {"ids": padded_ids, "label": labels}

    return collate_fn

In [None]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    """Wrap ``dataset`` in a DataLoader that pads each batch with ``pad_index``.

    Args:
        dataset: Dataset whose items carry ``"ids"`` and ``"label"`` tensors.
        batch_size: Number of examples per batch.
        pad_index: Padding value handed to the collate function.
        shuffle: Whether the loader reshuffles the data; defaults to False.

    Returns:
        A ``torch.utils.data.DataLoader``.
    """
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=get_collate_fn(pad_index),
        shuffle=shuffle,
    )

In [None]:
batch_size = 8

# Only the training loader is shuffled; the validation and test loaders keep the default shuffle=False.
train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

In [None]:
class Transformer(nn.Module):
    """Classifier made of an encoder followed by a linear layer on the first token.

    Args:
        transformer: Encoder module exposing ``config.hidden_size`` and returning
            an object with ``last_hidden_state`` when called.
        output_dim: Size of the prediction vector.
        freeze: When truthy, the encoder's parameters are excluded from
            gradient computation so that only the linear layer is trained.
    """

    def __init__(self, transformer, output_dim, freeze):
        super().__init__()
        self.transformer = transformer
        self.fc = nn.Linear(transformer.config.hidden_size, output_dim)
        if freeze:
            self.transformer.requires_grad_(False)

    def forward(self, ids, attention_mask=None):
        """Return predictions of shape [batch size, output dim] for ``ids`` of shape [batch size, seq len]."""
        encoder_out = self.transformer(
            ids, output_attentions=False, attention_mask=attention_mask
        )
        # last_hidden_state is [batch size, seq len, hidden dim]; keep position 0 only.
        first_token = encoder_out.last_hidden_state[:, 0, :]
        return self.fc(torch.tanh(first_token))

In [None]:
output_dim = 2  # size of the prediction vector produced by the linear head
freeze = True  # disable gradients for the encoder parameters
transformer = transformers.AutoModel.from_pretrained(transformer_name)
model = Transformer(transformer, output_dim, freeze)

In [None]:
def count_parameters(model):
    """Return the total number of elements across parameters that require gradients."""
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total
# Report how many parameters currently have requires_grad=True.
print(f"The model has {count_parameters(model):,} trainable parameters")

In [None]:
def train(data_loader, model, criterion, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        data_loader: Iterable of batches with ``"ids"`` and ``"label"`` tensors.
        model: Module called as ``model(ids, attention_mask=...)``.
        criterion: Loss function applied to ``(prediction, label)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean batch loss, mean batch accuracy)``.

    Note:
        Reads the module-level ``pad_index`` to build the attention mask.
    """
    model.train()
    batch_losses, batch_accs = [], []
    for batch in tqdm.tqdm(data_loader, desc="training..."):
        token_ids = batch["ids"].to(device)
        targets = batch["label"].to(device)
        # Attention mask: 1.0 for real tokens, 0.0 for padding positions.
        mask = (token_ids != pad_index).float().to(device)
        logits = model(token_ids, attention_mask=mask)
        loss = criterion(logits, targets)
        batch_acc = get_accuracy(logits, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
        batch_accs.append(batch_acc.item())

    return np.mean(batch_losses), np.mean(batch_accs)

In [None]:
def evaluate(data_loader, model, criterion, device):
    """Score ``model`` on ``data_loader`` without computing gradients.

    Args:
        data_loader: Iterable of batches with ``"ids"`` and ``"label"`` tensors.
        model: Module called as ``model(ids, attention_mask=...)``.
        criterion: Loss function applied to ``(prediction, label)``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean batch loss, mean batch accuracy, weighted F1)`` where the
        F1 score is computed over all examples seen.

    Note:
        Reads the module-level ``pad_index`` to build the attention mask.
    """
    model.eval()
    batch_losses, batch_accs = [], []
    true_labels, predicted_labels = [], []
    with torch.no_grad():
        for batch in tqdm.tqdm(data_loader, desc="evaluating..."):
            token_ids = batch["ids"].to(device)
            targets = batch["label"].to(device)
            # Attention mask: 1.0 for real tokens, 0.0 for padding positions.
            mask = (token_ids != pad_index).float().to(device)
            logits = model(token_ids, attention_mask=mask)
            batch_losses.append(criterion(logits, targets).item())
            batch_accs.append(get_accuracy(logits, targets).item())
            true_labels.extend(targets.cpu().numpy())
            predicted_labels.extend(logits.argmax(dim=1).cpu().numpy())

    weighted_f1 = f1_score(true_labels, predicted_labels, average='weighted')
    return np.mean(batch_losses), np.mean(batch_accs), weighted_f1

In [None]:
def get_accuracy(prediction, label):
    """Return the fraction of rows whose arg-max class equals ``label``.

    Args:
        prediction: 2-D tensor of per-class scores, one row per example.
        label: Tensor of target class indices, one per row.

    Returns:
        A scalar tensor holding the batch accuracy.
    """
    n_rows, _n_classes = prediction.shape
    hits = (prediction.argmax(dim=-1) == label).sum()
    return hits / n_rows

In [None]:
def save_checkpoints(model,path,file_name,uid):
    """Save ``model.state_dict()`` to ``<path>/<uid>_<file_name>``.

    Args:
        model: Module whose state dict is written.
        path: Target directory; created (with parents) when it does not exist.
        file_name: Base file name of the checkpoint.
        uid: Run identifier prefixed to ``file_name``.
    """
    # torch.save does not create missing directories, so a fresh checkout
    # without the output folder would otherwise fail on the first save.
    if path:
        os.makedirs(path, exist_ok=True)
    checkpoint_path = os.path.join(path, uid + "_" + file_name)
    torch.save(model.state_dict(), checkpoint_path)

In [None]:
# Hyperparameters
lr = 2e-5
optimizer = optim.AdamW(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()   # e.g. nn.CrossEntropyLoss() or nn.MSELoss()
## Warm-up followed by cosine annealing
warmup_lr = 5e-7  # NOTE(review): not referenced anywhere else in the visible notebook
warmup_epochs = 5
n_epochs = 70
# Warm-up and total step counts are expressed in epochs, matching the once-per-epoch scheduler.step() in the training loop.
scheduler = transformers.get_cosine_schedule_with_warmup(optimizer,num_warmup_steps=warmup_epochs,num_training_steps=n_epochs)
# Per-epoch metric history and best-checkpoint bookkeeping
metrics = collections.defaultdict(list) 
best_valid_loss = float("inf") 
# Short random id that is prefixed to the checkpoint file name
uid = str(uuid.uuid4())[:5]

In [None]:
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
criterion = criterion.to(device)

In [None]:
# The scheduler is optional: when no `scheduler` name exists in the notebook
# namespace, training runs at the optimizer's own learning rate. Resolving the
# name once here keeps the NameError handling away from the training code.
try:
    lr_scheduler = scheduler
except NameError:
    lr_scheduler = None

for epoch in range(n_epochs):
    # Advance the learning-rate schedule once per epoch. The step is taken
    # before training, so epoch 0 runs at the schedule's value after one step
    # rather than at its initial value.
    if lr_scheduler is not None:
        lr_scheduler.step()
    # Read the rate from the optimizer so that it is defined with or without a
    # scheduler (previously the print below failed when the scheduler was absent).
    current_lr = optimizer.param_groups[0]["lr"]

    train_loss, train_acc = train(train_data_loader, model, criterion, optimizer, device)
    valid_loss, valid_acc, f1 = evaluate(valid_data_loader, model, criterion, device)

    metrics["train_losses"].append(train_loss)
    metrics["train_accs"].append(train_acc)
    metrics["valid_losses"].append(valid_loss)
    metrics["valid_accs"].append(valid_acc)
    metrics["f1_score"].append(f1)

    # Keep only the weights with the lowest validation loss seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        save_checkpoints(model,"../outputs","bert_sentiment.pt",uid)
    print(f"epoch: {epoch}")
    print(f"Current learning rate: {current_lr}")
    print(f"train_loss: {train_loss:.3f}, train_acc: {train_acc:.3f}")
    print(f"valid_loss: {valid_loss:.3f}, valid_acc: {valid_acc:.3f}")
    print(f"valid_f1: {f1:.3f}")

In [None]:
# Training vs. validation loss, one point per epoch.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(metrics["train_losses"], label="train loss")
ax.plot(metrics["valid_losses"], label="valid loss")
ax.set(xlabel="epoch", ylabel="loss")
# ax.set_xticks(range(n_epochs))
ax.legend()
ax.grid()

In [None]:
# Training vs. validation accuracy, one point per epoch.
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(1, 1, 1)
ax.plot(metrics["train_accs"], label="train accuracy")
ax.plot(metrics["valid_accs"], label="valid accuracy")
ax.set_xlabel("epoch")
# This figure shows accuracy; the axis was previously mislabelled "loss".
ax.set_ylabel("accuracy")
# ax.set_xticks(range(n_epochs))
ax.legend()
ax.grid()

In [None]:
# NOTE(review): the weights evaluated here are whatever `model` holds after the last
# training epoch. The training loop writes its best-validation checkpoint through
# save_checkpoints(model, "../outputs", "bert_sentiment.pt", uid), not to "transformer.pt",
# so the load below would need that path if it is re-enabled.
# model.load_state_dict(torch.load("transformer.pt"))

test_loss, test_acc, test_f1 = evaluate(test_data_loader, model, criterion, device)
print(f"test_loss: {test_loss:.3f}, test_acc: {test_acc:.3f}, test_f1: {test_f1:.3f}")

In [None]:
def predict_sentiment(text, model, tokenizer, device):
    """Classify a single text and report the confidence of the chosen class.

    Args:
        text: Raw input string.
        model: ``nn.Module`` called as ``model(ids)`` with ids of shape
            [1, seq len]; expected to return one score per class.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True)``
            whose result exposes ``"input_ids"``.
        device: Device the input tensor is moved to.

    Returns:
        Tuple ``(predicted_class, predicted_probability)``: the arg-max class
        index and its softmax probability, both as Python scalars.
    """
    # Truncate exactly as the training-time tokenization does, so over-long
    # inputs are handled the same way at inference.
    ids = tokenizer(text, truncation=True)["input_ids"]
    tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)

    # Inference must not depend on the mode the model was left in (dropout is
    # active in train mode) and needs no autograd graph. The previous mode is
    # restored afterwards so the call has no lasting side effect.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            prediction = model(tensor).squeeze(dim=0)
    finally:
        model.train(was_training)

    probability = torch.softmax(prediction, dim=-1)
    predicted_class = prediction.argmax(dim=-1).item()
    predicted_probability = probability[predicted_class].item()
    return predicted_class, predicted_probability

In [None]:
# Example headline to classify; the commented line is an alternative input.
text = "迪威迅股权质押违约 控股股东或被动减持"
# text = "银行股全线飘绿 平安银行跌4%"

# Displays (predicted class index, softmax probability of that class).
predict_sentiment(text, model, tokenizer, device)

In [None]:
# Second example headline to classify.
text = "平安银行理财子公司获批筹建：注册资本50亿 股份行第五家"

# Displays (predicted class index, softmax probability of that class).
predict_sentiment(text, model, tokenizer, device)