In [1]:
from transformers import BertTokenizer

# Load the vocabulary and tokenizer for the 'bert-base-uncased' checkpoint.
token = BertTokenizer.from_pretrained('bert-base-uncased')

# Bare expression on the last line so the notebook displays the tokenizer's repr.
token

2022-12-12 13:56:50.253462: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-12 13:56:50.477460: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-12 13:56:50.477491: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-12-12 13:56:51.404875: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-12-1

PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [2]:
import torch
from datasets import load_dataset
import random


# Dataset definition
class Dataset(torch.utils.data.Dataset):
    """Sentence-pair dataset for a binary "do these two belong together" task.

    Rows come from the 'train' split of 'embedding-data/sentence-compression';
    each row's 'set' field is unpacked as a pair of sentences. The rows are
    partitioned into 'train' / 'test' parts with ``train_test_split``.

    Args:
        split: Key of the partition to keep, 'train' or 'test'.
        seed: Seed forwarded to ``train_test_split``. Every instance built with
            the same seed sees the same partition, which keeps
            ``Dataset('test')`` disjoint from ``Dataset('train')``. Without a
            fixed seed each instantiation would reshuffle and test rows could
            have been trained on.
    """

    def __init__(self, split, seed=0):
        dataset = load_dataset(path='embedding-data/sentence-compression',
                               split='train')

        self.dataset = dataset.train_test_split(test_size=0.0005,
                                                seed=seed)[split]

    def __len__(self):
        """Return the number of rows in the selected partition."""
        return len(self.dataset)

    def __getitem__(self, i):
        """Return ``(s1, s2, same)`` for row ``i``.

        ``same`` is 1 when both sentences come from row ``i`` and 0 when the
        second one was replaced by the second sentence of a different row.
        Both the replacement and the final order of the pair are random, so
        repeated calls with the same ``i`` can return different samples.
        """
        s1, s2 = self.dataset[i]['set']
        same = 1

        size = len(self.dataset)

        # With probability one half, swap the second sentence for an unrelated one.
        # A negative needs at least one other row to draw from.
        if size > 1 and random.random() > 0.5:
            own = i % size  # normalise a negative index
            # Draw uniformly from the size-1 rows that are not `own`; picking
            # `own` itself would label a genuine pair as 0.
            other = random.randrange(size - 1)
            if other >= own:
                other += 1
            s2 = self.dataset[other]['set'][1]
            same = 0

        # Randomise the order so position carries no information about the label.
        if random.random() > 0.5:
            s1, s2 = s2, s1

        return s1, s2, same


# Build the training partition.
dataset = Dataset('train')

# Display the number of rows and one sample; the sample is a (sentence, sentence, label) tuple.
len(dataset), dataset[0]

Using custom data configuration embedding-data--sentence-compression-d643585deb6e0073
Found cached dataset json (/root/.cache/huggingface/datasets/embedding-data___json/embedding-data--sentence-compression-d643585deb6e0073/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


(179910,
 ('The Herzliya wastewater treatment plant has been upgraded to tertiary treatment in accordance with requirements set by the interministerial permits committee for the discharge of waste to the sea and the recommendations of the Marine and Coastal Environment Division of the Ministry of Environmental Protection.',
  'Herzliya wastewater treatment plant upgraded',
  1))

In [3]:
def collate_fn(data):
    """Collate a list of ``(s1, s2, same)`` samples into one batch.

    Args:
        data: Sequence of samples; the first two items of each sample are the
            sentences, the third is the integer label.

    Returns:
        A tuple ``(encoded, labels)``: ``encoded`` is what the module-level
        tokenizer ``token`` returns for the sentence pairs (padded, truncated
        at 500 tokens, PyTorch tensors) and ``labels`` is a ``LongTensor``.
    """
    pairs = []
    labels = []
    for sample in data:
        pairs.append(sample[:2])
        labels.append(sample[2])

    # Encode every pair in a single call so padding is applied batch-wide.
    encoded = token.batch_encode_plus(
        pairs,
        truncation=True,
        padding=True,
        max_length=500,
        return_tensors='pt',
    )

    return encoded, torch.LongTensor(labels)


# Data loader over the training partition
loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=8,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_fn,
)

# Stop after the first batch: it is only needed for inspection here
for i, (data, same) in enumerate(loader):
    break

len(loader), data, same

(22488,
 {'input_ids': tensor([[  101,  2402,  2308,  2007,  7987,  3540, 14494,  2514, 11543,  2055,
           7047,  2005,  6550,  1010, 10922,   102,  7643,  2015,  2741,  2055,
           2321,  1012,  1017,  2454,  3793,  7696,  2012,  2047,  2095,  1010,
           2429,  2000,  4481,  2013,  1996,  2406,  1005,  1055,  2093,  4684,
           9224,  1010,  9339,  2011,  7643,  1011,  2653,  3679,  2484, 15775,
          11488,  1012,   102],
         [  101,  4231, 27137, 13999,  2047,  2489, 27137,  2053, 12816,  3749,
            102,  4231, 27137,  3357,  2039,  2971,  2114,  1996,  2502,  3337,
           2066,  4419,  2100, 27137,  2011, 10449,  1037,  4435,  2047, 26812,
           2489, 27137,  2053, 12816,  2734,  3749,  2007,  1996,  4712,  3642,
           4231, 10790,  1012,   102,     0,     0,     0,     0,     0,     0,
              0,     0,     0],
         [  101,  2091,  7292,  2000,  2330,  2005,  2198,  2745,  8482,   102,
           6734,  1010,  2033,  20

In [4]:
from transformers import BertModel


# Model definition
class Model(torch.nn.Module):
    """Frozen pretrained BERT encoder followed by a trainable linear classifier.

    Args:
        pretrained_name: Checkpoint passed to ``BertModel.from_pretrained``.
        num_classes: Number of output logits of the linear head.
    """

    def __init__(self, pretrained_name='bert-base-uncased', num_classes=2):
        super().__init__()
        # Load the pretrained encoder
        self.pretrained = BertModel.from_pretrained(pretrained_name)

        # The encoder is not trained, so it needs no gradients
        for param in self.pretrained.parameters():
            param.requires_grad_(False)

        # Size the head from the encoder's config instead of hard-coding 768
        self.fc = torch.nn.Linear(self.pretrained.config.hidden_size,
                                  num_classes)

    def train(self, mode=True):
        """Set the training mode of the head while keeping the encoder in eval mode.

        The encoder is frozen, so leaving it in training mode would only keep
        its dropout active. The head would then be fitted on noisy features
        but evaluated on deterministic ones.
        """
        super().train(mode)
        self.pretrained.eval()
        return self

    def forward(self, data):
        """Return logits of shape ``[b, num_classes]`` for an encoded batch.

        Args:
            data: Mapping of keyword arguments accepted by the encoder
                (the tokenizer output produced by ``collate_fn``).
        """
        with torch.no_grad():
            pred = self.pretrained(**data)

        # Hidden state of the first token of each sequence: [b, hidden_size]
        first_token_state = pred.last_hidden_state[:, 0]

        return self.fc(first_token_state)


model = Model()

# Total number of parameters, reported in units of 10,000
param_total = 0
for weight in model.parameters():
    param_total += weight.numel()
print(param_total / 10000)

# Smoke-test the forward pass on the sample batch
model(data).shape

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


10948.3778


torch.Size([8, 2])

In [5]:
# Training
def train(max_steps=1000, log_every=50):
    """Train the module-level ``model`` on ``loader`` and save it to disk.

    The model is moved to CUDA when available and trained with AdamW
    (lr=5e-4) and cross-entropy loss. When training finishes, the model is
    moved back to the CPU and saved whole with ``torch.save``.

    Args:
        max_steps: Index of the last batch to train on; the loop stops after
            processing batch ``max_steps`` (so ``max_steps + 1`` batches), or
            earlier if ``loader`` is exhausted.
        log_every: Print step index, loss and batch accuracy every this many steps.
    """
    import os

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    # Only parameters that require gradients are optimised and clipped;
    # frozen ones never receive a gradient.
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(trainable, lr=5e-4)
    criterion = torch.nn.CrossEntropyLoss()

    model.train()
    for i, (data, same) in enumerate(loader):
        same = same.to(device)
        # Build a new mapping rather than mutating the batch in place
        data = {k: v.to(device) for k, v in data.items()}

        pred = model(data)

        loss = criterion(pred, same)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(trainable, 1.0)
        optimizer.step()
        optimizer.zero_grad()

        if i % log_every == 0:
            pred = pred.argmax(dim=1)
            accuracy = (pred == same).sum().item() / len(same)
            print(i, loss.item(), accuracy)

        if i == max_steps:
            break

    # Create the target directory first so a missing folder cannot discard
    # the finished training run.
    save_path = 'models/2.训练_合并计算法.model'
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    torch.save(model.cpu(), save_path)


# Run training; each printed line is: step index, loss, batch accuracy.
train()

0 0.5400965213775635 1.0
50 0.024473240599036217 1.0
100 0.04173409566283226 1.0
150 0.012052012607455254 1.0
200 0.0048147812485694885 1.0
250 0.001603083685040474 1.0
300 0.03160906955599785 1.0
350 0.0007619516691192985 1.0
400 0.007851014845073223 1.0
450 0.012219463475048542 1.0
500 0.014638349413871765 1.0
550 0.053889673203229904 1.0
600 0.0024577314034104347 1.0
650 0.0006059815059415996 1.0
700 0.00053600431419909 1.0
750 0.0004944446263834834 1.0
800 0.001773191150277853 1.0
850 0.0018959895242005587 1.0
900 0.001421680673956871 1.0
950 0.00024391320766881108 1.0
1000 0.002352627459913492 1.0


In [6]:
# Evaluation
def test():
    """Evaluate the module-level ``model`` on the 'test' partition.

    Prints the index of every processed batch, then the overall accuracy
    (``nan`` when the partition yields no samples). Returns ``None``.
    """
    model.eval()
    # Run on whatever device the model currently lives on
    device = next(model.parameters()).device
    correct = 0
    total = 0

    # Order does not affect accuracy, so the evaluation loader is not shuffled
    loader_test = torch.utils.data.DataLoader(dataset=Dataset('test'),
                                              batch_size=16,
                                              collate_fn=collate_fn,
                                              shuffle=False,
                                              drop_last=False)

    for i, (data, label) in enumerate(loader_test):
        data = {k: v.to(device) for k, v in data.items()}
        label = label.to(device)

        with torch.no_grad():
            out = model(data)

        out = out.argmax(dim=1)

        correct += (out == label).sum().item()
        total += len(label)

        print(i)

    # Guard against an empty partition instead of raising ZeroDivisionError
    print(correct / total if total else float('nan'))


# Reload the model from the path train() saved it to, then evaluate it.
# NOTE(review): train() stores the whole module object, so torch.load relies on
# pickle here — only load files you trust, and keep the Model class defined
# before loading.
model = torch.load('models/2.训练_合并计算法.model')
test()

Using custom data configuration embedding-data--sentence-compression-d643585deb6e0073
Found cached dataset json (/root/.cache/huggingface/datasets/embedding-data___json/embedding-data--sentence-compression-d643585deb6e0073/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


0
1
2
3
4
5
0.9777777777777777
