In [1]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer below and by the encoder loaded
# later in the notebook (the previous 'bert-base-uncased' assignment was a
# dead store that was overwritten immediately and has been removed).
checkpoint = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

# Load the vocabulary and the tokenizer
token = AutoTokenizer.from_pretrained(checkpoint)

token

2022-12-12 14:54:07.788358: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-12 14:54:07.937105: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-12 14:54:07.937124: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-12-12 14:54:08.708706: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-12-1

PreTrainedTokenizerFast(name_or_path='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', vocab_size=250002, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [2]:
import torch
from datasets import load_dataset
import random


# Dataset definition
class Dataset(torch.utils.data.Dataset):
    """Sentence-pair dataset built from `embedding-data/sentence-compression`.

    Every row of the underlying data exposes a ``'set'`` field holding two
    sentences. Indexing returns ``(s1, s2, same)`` where ``same`` is 1 when
    both sentences come from the same row and 0 when the second one was
    replaced by a sentence taken from a different row.
    """

    def __init__(self, split, seed=0):
        """Load the data and keep one side of a train/test partition.

        Args:
            split: key of the partition to keep ('train' or 'test').
            seed: seed forwarded to ``train_test_split``. A fixed value makes
                every instance produce the same partition, so the 'test'
                rows are never part of the 'train' rows of another instance.
        """
        dataset = load_dataset(path='embedding-data/sentence-compression',
                               split='train')

        self.dataset = dataset.train_test_split(test_size=0.0005,
                                                seed=seed)[split]

    def __len__(self):
        """Number of rows in the selected partition."""
        return len(self.dataset)

    def __getitem__(self, i):
        """Return ``(s1, s2, same)`` for row ``i`` (randomised on each call)."""
        s1, s2 = self.dataset[i]['set']
        same = 1

        # With probability one half, replace the second sentence with an
        # unrelated one. The replacement is drawn from every row except `i`,
        # otherwise a genuine pair could end up labelled as different.
        n = len(self.dataset)
        if n > 1 and random.random() > 0.5:
            j = random.randrange(n - 1)
            if j >= i % n:
                j += 1
            s2 = self.dataset[j]['set'][1]
            same = 0

        # Randomly swap the order so the model cannot rely on position
        if random.random() > 0.5:
            s1, s2 = s2, s1

        return s1, s2, same


# Build the 'train' partition, then display its size together with its first sample.
dataset = Dataset('train')

len(dataset), dataset[0]

Using custom data configuration embedding-data--sentence-compression-d643585deb6e0073
Found cached dataset json (/root/.cache/huggingface/datasets/embedding-data___json/embedding-data--sentence-compression-d643585deb6e0073/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


(179910,
 ("Chris Rock convinced someone is planning a ``sting operation''",
  "Chris Rock is convinced someone is planning a ``sting operation'' every time a sexy girl talks to him.",
  1))

In [3]:
def collate_fn(data):
    """Collate ``(s1, s2, same)`` samples into two encoded batches and labels.

    Args:
        data: list of 3-tuples; positions 0 and 1 hold the sentences and
            position 2 holds the integer label.

    Returns:
        ``(data1, data2, same)``: the tokenizer output for the first
        sentences, the tokenizer output for the second sentences, and the
        labels as an int64 tensor.
    """
    # Split the samples column-wise
    firsts, seconds, labels = ([sample[col] for sample in data]
                               for col in range(3))

    # Encoding options shared by both sides of the pair
    encode_kwargs = dict(truncation=True,
                         padding=True,
                         max_length=500,
                         return_tensors='pt')

    # Encode
    data1 = token.batch_encode_plus(firsts, **encode_kwargs)
    data2 = token.batch_encode_plus(seconds, **encode_kwargs)

    return data1, data2, torch.tensor(labels, dtype=torch.long)


# Data loader
loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn,
    drop_last=True,
)

# Pull a single batch so its structure can be inspected below
i, (data1, data2, same) = next(enumerate(loader))

len(loader), data1, data2, same

(22488,
 {'input_ids': tensor([[     0,   5959, 135179,     70,  92702,  93905, 148639,   7432,   8394,
           15592,     36,  27751,   3129, 112141,     31,  20051,     98,  17262,
             453,     23,     70,  26349,  11698, 110680,    903,   6602,     83,
            4420,     47,   6863,  10332,   8299,  16037,      5,      2,      1,
               1,      1],
         [     0,  18226,  14202,  43542,     47,   4488,     98,   4620,    214,
          152132,      7,      2,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1],
         [     0,  68804,      7,  10336,    297,    100,     10, 189173,    538,
           92319,  67229,      2,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,     

In [4]:
from transformers import AutoModel


# Model definition
class Model(torch.nn.Module):
    """Frozen pretrained encoder followed by a small trainable classifier.

    Each sentence batch is encoded separately, mean-pooled over its
    non-padding tokens, and the two pooled vectors are concatenated and fed
    to a two-layer head that outputs 2 logits per pair.
    """

    def __init__(self):
        super().__init__()
        # Load the pretrained encoder
        self.pretrained = AutoModel.from_pretrained(checkpoint)

        # The encoder is not trained, so its parameters need no gradients
        for param in self.pretrained.parameters():
            param.requires_grad_(False)

        # Two pooled sentence vectors are concatenated, so the head input is
        # twice the encoder width (768 for a 384-wide encoder). Deriving it
        # from the config keeps the head valid for other checkpoints.
        in_features = 2 * self.pretrained.config.hidden_size
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(in_features, 768),
            torch.nn.ReLU(),
            torch.nn.Linear(768, 2),
        )

    def train(self, mode=True):
        """Switch the head's mode while keeping the frozen encoder in eval mode.

        Without this, ``model.train()`` would also enable dropout inside the
        frozen encoder and feed noisy features to the head.
        """
        super().train(mode)
        self.pretrained.eval()
        return self

    def get_feature(self, data):
        """Mean-pool the encoder's last hidden state over non-padding tokens.

        Args:
            data: mapping of encoder inputs; must contain 'attention_mask'
                of shape [b, L].

        Returns:
            Tensor of shape [b, hidden] with one pooled vector per sentence.
        """
        with torch.no_grad():
            # [b, L, hidden]
            hidden = self.pretrained(**data)['last_hidden_state']

        # [b, L] -> [b, L, 1], cast so the arithmetic stays in the float dtype
        mask = data['attention_mask'].unsqueeze(dim=2).to(hidden.dtype)

        # Zero out padding positions, then sum over the tokens
        # [b, L, hidden] * [b, L, 1] -> [b, hidden]
        summed = (hidden * mask).sum(dim=1)

        # Number of real tokens per sentence, guarded against division by zero
        # [b, L, 1] -> [b, 1]
        counts = mask.sum(dim=1).clamp(min=1e-8)

        # [b, hidden] / [b, 1] -> [b, hidden]
        return summed / counts

    def forward(self, data1, data2):
        """Return logits of shape [b, 2] for the sentence pairs."""
        feature1 = self.get_feature(data1)
        feature2 = self.get_feature(data2)

        feature = torch.cat([feature1, feature2], dim=1)

        return self.fc(feature)


# Instantiate the model and run one forward pass on the sample batch to check the output shape.
model = Model()

model(data1, data2).shape

torch.Size([8, 2])

In [5]:
# Training
def train(max_steps=2000, lr=5e-4, log_every=50,
          save_path='models/3.训练_分别计算法.model'):
    """Train the global `model` on batches from the global `loader`, then save it.

    Args:
        max_steps: index of the last batch to train on; the loop stops after
            processing batch number `max_steps` (or when the loader ends).
        lr: learning rate passed to AdamW.
        log_every: print step, loss and batch accuracy every this many steps.
        save_path: file the whole model object is written to with
            ``torch.save`` after it has been moved back to the CPU.
    """
    import os

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    # Only parameters that still require gradients are optimised and clipped;
    # frozen ones never receive a gradient.
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(trainable, lr=lr)
    criterion = torch.nn.CrossEntropyLoss()

    model.train()
    for i, (data1, data2, same) in enumerate(loader):
        # Move the labels and every encoded tensor to the training device
        same = same.to(device)
        for batch in (data1, data2):
            for k in batch.keys():
                batch[k] = batch[k].to(device)
        pred = model(data1, data2)

        loss = criterion(pred, same)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(trainable, 1.0)
        optimizer.step()
        optimizer.zero_grad()

        if i % log_every == 0:
            pred = pred.argmax(dim=1)
            accuracy = (pred == same).sum().item() / len(same)
            print(i, loss.item(), accuracy)

        if i == max_steps:
            break

    # Create the target directory first so a missing folder cannot discard
    # the finished training run at save time.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    torch.save(model.cpu(), save_path)


# Run training; progress lines print the step index, loss and batch accuracy.
train()

0 0.6659839153289795 0.875
50 0.6840968132019043 0.375
100 0.6825852394104004 0.5
150 0.5992887616157532 0.75
200 0.6148892641067505 0.875
250 0.6363856196403503 0.75
300 0.49421191215515137 0.875
350 0.48576438426971436 0.875
400 0.4708303213119507 0.75
450 0.45533663034439087 0.75
500 0.2757263481616974 0.875
550 0.21047598123550415 1.0
600 0.22561021149158478 0.875
650 0.16972705721855164 1.0
700 0.2094234973192215 0.875
750 0.11808156967163086 1.0
800 0.24072951078414917 0.875
850 0.23320215940475464 0.875
900 0.048693425953388214 1.0
950 0.0808393731713295 1.0
1000 0.06140816956758499 1.0
1050 0.025953814387321472 1.0
1100 0.05567248910665512 1.0
1150 0.1256197690963745 1.0
1200 0.13684037327766418 1.0
1250 0.02317812480032444 1.0
1300 0.18239809572696686 0.875
1350 0.12977005541324615 0.875
1400 0.011395378038287163 1.0
1450 0.09584089368581772 1.0
1500 0.4376535415649414 0.875
1550 0.1762787252664566 0.875
1600 0.15225888788700104 0.875
1650 0.1379726231098175 0.875
1700 0.34969

In [6]:
#测试
def test():
    model.eval()
    correct = 0
    total = 0

    loader_test = torch.utils.data.DataLoader(dataset=Dataset('test'),
                                              batch_size=16,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=False)

    for i, (data1, data2, label) in enumerate(loader_test):
        with torch.no_grad():
            pred = model(data1, data2)

        pred = pred.argmax(dim=1)

        correct += (pred == label).sum().item()
        total += len(label)

        print(i)

    print(correct / total)


# Reload the model object from disk and evaluate it with test().
# NOTE(review): torch.load unpickles the file's contents; only load files from a trusted source.
model = torch.load('models/3.训练_分别计算法.model')
test()

Using custom data configuration embedding-data--sentence-compression-d643585deb6e0073
Found cached dataset json (/root/.cache/huggingface/datasets/embedding-data___json/embedding-data--sentence-compression-d643585deb6e0073/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


0
1
2
3
4
5
0.9888888888888889
