In [1]:
## Step1 导入相关包

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

In [3]:
## 加载数据集

In [6]:
# Read the hotel-review CSV and immediately drop rows that have no review text.
dataset = load_dataset(
    "csv",
    data_files='./datasets/ChnSentiCorp_htl_all.csv',
    split="train",
).filter(lambda row: row['review'] is not None)
dataset  # the raw file has 7766 rows; split="train" loads only the train part, and rows with a None review are filtered out

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

In [7]:
## 划分数据集

In [None]:
# Hold out 20% of the rows as the test split. A fixed seed keeps the shuffle,
# and therefore every downstream metric, reproducible across kernel restarts.
# NOTE: this rebinds `dataset` to the returned train/test container, so re-run
# the loading cell first if this cell ever needs to be executed again.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [13]:
# Display the split container to check the train/test row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6212
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 1553
    })
})

In [14]:
## 创建dataloader

In [17]:
import torch

# Load the tokenizer that belongs to the 'hfl/rbt3' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('hfl/rbt3')

def process_function(examples):
    """Tokenize a batch of reviews and attach the matching labels.

    Args:
        examples: A batch of rows; the 'review' and 'label' entries are read.

    Returns:
        The tokenizer output for the reviews (truncated to at most 128
        tokens) with the batch's labels stored under the 'labels' key.
    """
    encoded = tokenizer(examples['review'], truncation=True, max_length=128)
    # Store the targets under 'labels' so they travel with the encoded inputs.
    encoded['labels'] = examples['label']
    return encoded

# Tokenize every split in batches. The original columns are removed so that
# only the tokenizer outputs and 'labels' remain in the result.
tokenizer_datasets = dataset.map(
    process_function,
    batched=True,
    remove_columns=dataset['train'].column_names,
)
tokenizer_datasets



Map:   0%|          | 0/6212 [00:00<?, ? examples/s]

Map:   0%|          | 0/1553 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6212
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1553
    })
})

In [18]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# A single collator serves both loaders; it pads each batch with the tokenizer.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainset, validset = tokenizer_datasets['train'], tokenizer_datasets['test']
# Training batches are reshuffled every epoch.
trainloader = DataLoader(trainset, batch_size=8, shuffle=True, collate_fn=collator)
# Shuffling adds nothing to evaluation; a fixed order keeps validation repeatable.
validloader = DataLoader(validset, batch_size=8, shuffle=False, collate_fn=collator)


In [19]:
## 创建优化器和模型

In [20]:
from torch.optim import AdamW

# NOTE(review): this is an absolute, machine-specific path to a local copy of
# the checkpoint; point it at your own copy before running on another machine.
model = AutoModelForSequenceClassification.from_pretrained('D:/AI/pretrain_model/models--hfl--rbt3')
# Only move to the GPU when one exists, matching the guard used by the
# training and evaluation loops, so the notebook also runs on CPU-only hosts.
if torch.cuda.is_available():
    model = model.cuda()
optimizer = AdamW(model.parameters(), lr=2e-5)

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at D:/AI/pretrain_model/models--hfl--rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
## 训练和验证

In [25]:
## Inspect the type of a single batch produced by the dataloader
type(next(iter(trainloader)))

transformers.tokenization_utils_base.BatchEncoding

In [None]:
def evalute():
    """Measure the accuracy of the global ``model`` on ``validloader``.

    Puts the model in eval mode, runs every validation batch with autograd
    disabled, and counts predictions that equal the batch's 'labels'.

    Returns:
        The number of correct predictions divided by ``len(validset)``.
    """
    model.eval()
    correct = 0
    with torch.inference_mode():
        for features in validloader:
            if torch.cuda.is_available():
                features = {name: tensor.cuda() for name, tensor in features.items()}
            logits = model(**features).logits
            # The predicted class is the index of the largest logit.
            predicted = torch.argmax(logits, dim=-1)
            hits = predicted.long() == features['labels'].long()
            correct += hits.float().sum()
    return correct / len(validset)


def train(epoch, log_steps):
    """Fine-tune the global ``model`` on ``trainloader``, evaluating after each epoch.

    Args:
        epoch: Number of full passes over ``trainloader``.
        log_steps: The current batch loss is printed whenever the running
            step count is a multiple of this value.
    """
    global_step = 0  # counts optimizer updates across all epochs

    for ep in range(epoch):
        # evalute() switches the model to eval mode, so re-enable training
        # mode at the start of every epoch.
        model.train()

        for batch in trainloader:
            if torch.cuda.is_available():
                batch = {name: tensor.cuda() for name, tensor in batch.items()}

            # Standard update: clear old gradients, forward, backward, step.
            optimizer.zero_grad()
            output = model(**batch)
            output.loss.backward()
            optimizer.step()

            should_log = global_step % log_steps == 0
            if should_log:
                print(f"epoch:{ep}, global_step:{global_step}, current_loss:{output.loss.item()}")
            global_step += 1

        acc = evalute()
        print(f"epoch:{ep}, acc:{acc}")

In [27]:
## 模型训练

In [29]:
# Fine-tune for 3 epochs, printing the batch loss every 100 steps.
train(epoch=3, log_steps=100)

epoch:0, global_step:0, current_loss:0.6160242557525635
epoch:0, global_step:100, current_loss:0.29684317111968994
epoch:0, global_step:200, current_loss:0.6522433161735535
epoch:0, global_step:300, current_loss:0.16382309794425964
epoch:0, global_step:400, current_loss:0.06924654543399811
epoch:0, global_step:500, current_loss:0.04809744283556938
epoch:0, global_step:600, current_loss:0.8146895170211792
epoch:0, global_step:700, current_loss:0.5705182552337646
epoch:1, global_step:800, current_loss:0.08817816525697708
epoch:1, global_step:900, current_loss:0.043421510607004166
epoch:1, global_step:1000, current_loss:0.2129090130329132
epoch:1, global_step:1100, current_loss:0.1853337287902832
epoch:1, global_step:1200, current_loss:0.0816674530506134
epoch:1, global_step:1300, current_loss:0.06641850620508194
epoch:1, global_step:1400, current_loss:0.04982209950685501
epoch:1, global_step:1500, current_loss:0.34130963683128357
epoch:2, global_step:1600, current_loss:0.2430831193923950

In [30]:
## 模型预测

In [None]:
# Run the fine-tuned model on one hand-written review.
sen = "我觉得这家酒店不错，饭很好吃！"
# Class id -> display text ('差评' = negative review, '好评' = positive review).
# NOTE(review): assumes the dataset encodes 1 as positive — confirm.
id2label = {0:'差评',1:'好评'}
model.eval()
with torch.inference_mode():
    # Use the same length cap as the training preprocessing so that very long
    # inputs are truncated instead of overflowing the model.
    inputs = tokenizer(sen, return_tensors='pt', max_length=128, truncation=True)
    # Send the inputs to whichever device the model lives on rather than
    # assuming a GPU is present.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)

print(f"输入：{sen}\n模型预测结果:{id2label.get(pred.item())}")

输入：我觉得这家酒店不错，饭很好吃！
模型预测结果:好评
