In [10]:
# data
# from datasets import load_dataset
# ds = load_dataset('csv',data_files="data/ChnSentiCorp.csv")

from torch.utils.data import Dataset
from datasets import load_from_disk

class BizDataset(Dataset):
    """Map-style dataset exposing a single split of a dataset saved on disk.

    Every item is a ``(text, label)`` tuple taken from the ``"text"`` and
    ``"label"`` fields of the selected split.
    """

    # Split names accepted by the constructor.
    SPLITS = ("train", "test", "validation")

    def __init__(self, split, data_dir=r"D:\Workspace\regression\huggingface\dataset"):
        """Load the dataset stored at ``data_dir`` and keep only ``split``.

        Args:
            split: One of ``"train"``, ``"test"`` or ``"validation"``.
            data_dir: Directory handed to ``load_from_disk``; defaults to the
                location this notebook was written against.

        Raises:
            ValueError: If ``split`` is not one of the supported names.
        """
        # Validate before touching the disk so a typo fails fast instead of
        # leaving ``self.dataset`` pointing at the whole multi-split object.
        if split not in self.SPLITS:
            raise ValueError(
                f"unknown split {split!r}; expected one of {self.SPLITS}"
            )
        self.dataset = load_from_disk(data_dir)[split]

    def __len__(self):
        """Return the number of samples in the selected split."""
        return len(self.dataset)

    def __getitem__(self, item):
        """Return the ``(text, label)`` pair stored at index ``item``."""
        # Fetch the row once rather than indexing the dataset per field.
        row = self.dataset[item]
        return row["text"], row["label"]

In [11]:
# net
from transformers import BertModel
import torch

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE= torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Alternative: load by model id instead of from the local snapshot below.
# model_name="bert-base-chinese"
# Local snapshot directory of the bert-base-chinese checkpoint.
model_name=r"D:\Workspace\regression\huggingface\model\bert-base-chinese\models--bert-base-chinese\snapshots\c30a6ed22ab4564dc1e3b2ecbf6e766b0611a33f"

# Load the pretrained encoder, move it to DEVICE and show its architecture.
pretrained = BertModel.from_pretrained(model_name).to(DEVICE)
print(pretrained)

# When customizing the model, adjust the corresponding inputs/outputs and keep
# the word-embedding dimensions consistent.
print(pretrained.embeddings.word_embeddings)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

```
import torch
from transformers import BertModel, BertTokenizer

# Load the pretrained model and its tokenizer.
model = BertModel.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Input text.
text = "Hello, how are you?"

# Tokenize and encode the text as PyTorch tensors.
inputs = tokenizer(text, return_tensors="pt")

# Run the model without tracking gradients.
with torch.no_grad():
    outputs = model(**inputs)

# Take the hidden state of the first token ([CLS]) of every sequence.
cls_embedding = outputs.last_hidden_state[:, 0]

print("CLS embedding shape:", cls_embedding.shape)  # output: torch.Size([1, 768])

```

In [12]:
class Model(torch.nn.Module):
    """Two-class linear head on top of the module-level ``pretrained`` encoder.

    The encoder is called under ``torch.no_grad()`` and is not registered as a
    sub-module, so only ``self.fc`` appears in ``parameters()`` and in the
    ``state_dict``.
    """

    def __init__(self):
        super().__init__()
        # 768 is the encoder's hidden size; 2 is the number of classes.
        self.fc = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return unnormalised class scores (logits) of shape ``(batch, 2)``.

        The scores are deliberately NOT passed through ``softmax``:
        ``torch.nn.CrossEntropyLoss`` applies log-softmax itself, so feeding it
        probabilities would squash the gradients. ``argmax`` over the logits
        gives the same predicted class as over probabilities; call
        ``.softmax(dim=1)`` on the result when probabilities are needed.
        """
        # The encoder is a frozen feature extractor: no gradients flow into it.
        with torch.no_grad():
            encoded = pretrained(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )

        # Classify from the hidden state of the first ([CLS]) token.
        return self.fc(encoded.last_hidden_state[:, 0])

In [14]:
# trainer
from transformers import BertTokenizer,AdamW
from torch.utils.data import DataLoader

# Number of passes over the training set.
EPOCH=10
# Tokenizer loaded from the same checkpoint location as the encoder.
tokenizer = BertTokenizer.from_pretrained(model_name)

# Encode a batch of raw samples into model-ready tensors.
def collate_fn(data, max_length=30):
    """Tokenize a batch of ``(text, label)`` samples.

    Args:
        data: Sequence of ``(text, label)`` pairs as produced by the dataset.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids, labels)``; the
        first three come from the tokenizer (requested as PyTorch tensors) and
        ``labels`` is a 1-D int64 tensor.
    """
    texts = [sample[0] for sample in data]
    labels = [sample[1] for sample in data]

    encoded = tokenizer(
        texts,
        padding="max_length",   # pad every sequence up to max_length
        truncation=True,        # cut sequences longer than max_length
        max_length=max_length,
        return_tensors="pt",
    )

    return (
        encoded["input_ids"],
        encoded["attention_mask"],
        encoded["token_type_ids"],
        torch.tensor(labels, dtype=torch.long),
    )

# 1. Create the dataset (training split).
train_ds= BizDataset("train")
# 2. Create the data loader: shuffled batches of 8, dropping the last
#    incomplete batch, with collate_fn doing the tokenization.
train_loader = DataLoader(
    dataset=train_ds,
    batch_size=8,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_fn
)

# Training
if __name__=='__main__':
    import os

    print(DEVICE)
    model = Model().to(DEVICE)
    # torch.optim.AdamW replaces the deprecated transformers.AdamW.
    # weight_decay=0.0 keeps that class's default (torch's own default is 0.01).
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.0)
    loss_func = torch.nn.CrossEntropyLoss()

    # torch.save does not create missing directories, so make sure the
    # checkpoint folder exists before the first epoch finishes.
    os.makedirs("params", exist_ok=True)

    model.train()
    for epoch in range(EPOCH):
        for i, batch in enumerate(train_loader):
            input_ids, attention_mask, token_type_ids, labels = (
                tensor.to(DEVICE) for tensor in batch
            )

            # Forward pass.
            out = model(input_ids, attention_mask, token_type_ids)
            loss = loss_func(out, labels)

            # The three optimisation steps:
            # 1. clear old gradients, 2. back-propagate, 3. update the weights.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Report loss and batch accuracy every 5 steps.
            if i % 5 == 0:
                preds = out.argmax(dim=1)
                acc = (preds == labels).sum().item() / len(labels)
                print(epoch, i, loss.item(), acc)

        # Save the model parameters once per epoch.
        torch.save(model.state_dict(), f"params/{epoch}bert.pt")
        print(epoch, "参数保存成功。")

cpu




TypeError: 'tuple' object is not callable