In [1]:
from transformers import AutoTokenizer

# Hugging Face checkpoint name used to load the tokenizer below.
# (An earlier 'bert-base-uncased' assignment was dead code: it was overwritten
# on the very next line, so it has been removed.)
checkpoint = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

# Load the vocabulary and tokenizer that belong to the checkpoint.
token = AutoTokenizer.from_pretrained(checkpoint)

token

2022-12-12 18:23:54.296961: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-12 18:23:54.419008: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-12 18:23:54.419027: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-12-12 18:23:55.103408: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-12-1

PreTrainedTokenizerFast(name_or_path='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', vocab_size=250002, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [2]:
import torch
from datasets import load_dataset, concatenate_datasets
import random


# Dataset definition
class Dataset(torch.utils.data.Dataset):
    """One side of each sentence pair in 'embedding-data/sentence-compression'.

    Each record's 'set' field is unpacked into two strings. When ``split`` is
    'train' the first string is returned; for any other value the second one
    is returned. The underlying data is always loaded from the hub's 'train'
    split -- ``split`` only selects which side of the pair is exposed.
    """

    def __init__(self, split):
        self.split = split
        self.dataset = load_dataset(path='embedding-data/sentence-compression',
                                    split='train')

    def __len__(self):
        # One item per sentence pair.
        return len(self.dataset)

    def __getitem__(self, i):
        first, second = self.dataset[i]['set']
        return first if self.split == 'train' else second


# Build the 'train' view and preview its size together with its first item.
dataset = Dataset(split='train')

(len(dataset), dataset[0])

Using custom data configuration embedding-data--sentence-compression-d643585deb6e0073
Found cached dataset json (/root/.cache/huggingface/datasets/embedding-data___json/embedding-data--sentence-compression-d643585deb6e0073/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


(180000,
 "The USHL completed an expansion draft on Monday as 10 players who were on the rosters of USHL teams during the 2009-10 season were selected by the League's two newest entries, the Muskegon Lumberjacks and Dubuque Fighting Saints.")

In [3]:
def collate_fn(data):
    """Tokenize one batch of raw sentences with the module-level tokenizer.

    ``data`` is the list of items the DataLoader collected for the batch.
    The call enables truncation and padding, caps the length at 500 and asks
    for PyTorch tensors ('pt'); the tokenizer's result is returned as-is.
    """
    encode_options = dict(truncation=True,
                          padding=True,
                          max_length=500,
                          return_tensors='pt')
    return token.batch_encode_plus(data, **encode_options)


# Data loader: iteration order is fixed (no shuffling) and the final partial
# batch is kept, so batch order follows dataset order.
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=8,
                                     collate_fn=collate_fn,
                                     shuffle=False,
                                     drop_last=False)

# Fetch only the first batch for inspection.
data = next(iter(loader))

len(loader), data

(22500,
 {'input_ids': tensor([[     0,    581,   7082,  48599, 140528,    142,  14700,     66,   6889,
          144888,     98,  68318,    237,    209,  92865,   2750,   3542,     98,
              70,  52470,   1314,    111,   7082,  48599,  87199,  20271,     70,
            1877,   9193,  34003,   3542, 133291,    390,     70,  19175,     25,
               7,   6626,   3525,    525, 112820,     90,      4,     70, 128705,
            3081,     19,   3350,  26278, 135758,      7,    136,    786,    978,
             944, 106313,    214,  12190,      7,      5,      2],
         [     0,  49953,  19175,  44978,  12126,  63871,     56,  35519,    503,
            1746,   1221,    186, 142146,     99,   2907,      5, 208580,  29693,
           11737,  31150,      5,      2,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
         

In [4]:
from transformers import AutoModel


# Model definition
class Model(torch.nn.Module):
    """Frozen pretrained encoder plus a trainable two-class head for sentence pairs.

    ``get_feature`` turns a tokenized batch into one mean-pooled vector per
    sentence; ``forward`` concatenates the vectors of two batches and feeds
    them through ``fc`` to obtain two logits per pair.
    """

    def __init__(self):
        super().__init__()
        # Load the pretrained encoder.
        self.pretrained = AutoModel.from_pretrained(checkpoint)

        # The encoder is not trained, so it needs no gradients.
        for param in self.pretrained.parameters():
            param.requires_grad_(False)

        # The head consumes two concatenated sentence vectors, so its width is
        # twice the encoder's hidden size (2 * 384 = 768 for the MiniLM
        # checkpoint). Deriving it keeps the head valid for other encoders.
        pair_width = 2 * self.pretrained.config.hidden_size
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(pair_width, pair_width),
            torch.nn.ReLU(),
            torch.nn.Linear(pair_width, 2),
        )

    def get_feature(self, data):
        """Return the masked mean of the encoder's last hidden state.

        Args:
            data: mapping that is unpacked into the encoder call and that
                contains an 'attention_mask' entry of shape [b, L].

        Returns:
            Tensor of shape [b, H]: the average of the hidden states over the
            positions where the attention mask is non-zero. Rows whose mask is
            entirely zero come out as zeros rather than NaN/inf.
        """
        with torch.no_grad():
            # [b, L, H]
            hidden = self.pretrained(**data)['last_hidden_state']

        # [b, L] -> [b, L, 1], cast to the hidden-state dtype so that every
        # following operation is plain floating-point arithmetic.
        mask = data['attention_mask'].unsqueeze(dim=2).to(hidden.dtype)

        # Zero the pad positions, then sum over tokens: [b, L, H] -> [b, H].
        # Out-of-place on purpose: the encoder output is left untouched.
        summed = (hidden * mask).sum(dim=1)

        # Number of real tokens per sentence: [b, L, 1] -> [b, 1].
        # Clamping a float count at 1 guards against division by zero without
        # relying on how an integer tensor is clamped with a fractional bound.
        lengths = mask.sum(dim=1).clamp(min=1)

        # [b, H] / [b, 1] -> [b, H]
        return summed / lengths

    def forward(self, data1, data2):
        """Score sentence pairs: returns [b, 2] logits for (data1[i], data2[i])."""
        feature1 = self.get_feature(data1)
        feature2 = self.get_feature(data2)

        # [b, H] + [b, H] -> [b, 2H]
        feature = torch.cat([feature1, feature2], dim=1)

        return self.fc(feature)


# Load the fully pickled model object (the Model class must already be defined
# in this session for unpickling to succeed).
# NOTE(review): torch.load unpickles arbitrary Python objects -- only load
# files from a trusted source. Recent PyTorch releases may additionally need
# weights_only=False for full-model pickles -- confirm against the installed
# version.
# map_location='cpu' lets a file saved from a GPU session load on a CPU-only
# machine; build_features() moves the model to the GPU itself when one exists.
model = torch.load('models/3.训练_分别计算法.model', map_location='cpu')

model.eval()

Model(
  (pretrained): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(250037, 384, padding_idx=0)
      (position_embeddings): Embedding(512, 384)
      (token_type_embeddings): Embedding(2, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)

In [5]:
# Build the knowledge matrix
def build_features(num_batches=151):
    """Embed the first ``num_batches`` batches of ``loader`` and save them.

    The module-level ``model`` is moved to the GPU when one is available,
    every batch is passed through ``model.get_feature`` and the stacked
    result is written to 'models/4.应用_分别计算法.pt' as a CPU tensor.

    Args:
        num_batches: how many batches to encode. The default of 151 matches
            the previous hard-coded behaviour (stop after batch index 150).

    Raises:
        ValueError: if no batch was encoded (``num_batches`` < 1 or an empty
            loader), since there would be nothing to save.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    features = []
    try:
        # zip() stops as soon as range() is exhausted, so no batch beyond the
        # requested number is fetched or tokenized.
        for i, data in zip(range(num_batches), loader):
            batch = {k: data[k].to(device) for k in data.keys()}

            # Move each result off the GPU right away so the embeddings do not
            # accumulate in device memory.
            features.append(model.get_feature(batch).cpu())

            if i % 50 == 0:
                print(i)
    finally:
        # Always hand the model back on the CPU, even if encoding failed.
        model.cpu()

    if not features:
        raise ValueError('build_features encoded no batches')

    features = torch.cat(features)

    torch.save(features, 'models/4.应用_分别计算法.pt')


# Run the extraction; it prints progress and writes the feature tensor file
# that the test cell loads afterwards.
build_features()

0
50
100
150


In [6]:
# Evaluation
def test(max_samples=150 * 8 + 1):
    """Retrieval accuracy of the pair classifier against the stored features.

    Each item of ``Dataset('test')`` is embedded, paired with every row of
    the module-level ``features`` matrix and scored by ``model.fc``. A sample
    counts as correct when the best-scoring row index equals the sample's own
    index. Progress is printed every 50 samples and the accuracy at the end.

    Args:
        max_samples: number of samples to evaluate. The default of 1201
            matches the previous hard-coded behaviour (stop after index
            150 * 8). Samples whose index is not a row of ``features`` can
            never be counted as correct.

    Raises:
        ValueError: if no sample was evaluated.
    """
    loader_test = torch.utils.data.DataLoader(dataset=Dataset('test'),
                                              batch_size=1,
                                              collate_fn=collate_fn,
                                              shuffle=False,
                                              drop_last=False)

    correct = 0
    total = 0
    # Evaluation only: without no_grad the trainable head would record an
    # autograd graph for every sample.
    with torch.no_grad():
        for i, data in zip(range(max_samples), loader_test):
            # [1, H]
            query = model.get_feature(data)

            # [1, H] -> [N, H]; expand() broadcasts without copying the row.
            query = query.expand(features.shape[0], -1)

            # [N, H] + [N, H] -> [N, 2H]
            pairs = torch.cat([features, query], dim=1)

            # Probability of class 1 for every candidate row: [N]
            score = model.fc(pairs).softmax(dim=1)[:, 1]

            if score.argmax().item() == i:
                correct += 1
            total += 1

            if i % 50 == 0:
                print(i)

    if total == 0:
        raise ValueError('test evaluated no samples')

    print(correct / total)


# Load the feature tensor saved by build_features(); test() reads this
# module-level `features` name as its candidate matrix, so it must be bound
# before test() is called.
features = torch.load('models/4.应用_分别计算法.pt')

test()

Using custom data configuration embedding-data--sentence-compression-d643585deb6e0073
Found cached dataset json (/root/.cache/huggingface/datasets/embedding-data___json/embedding-data--sentence-compression-d643585deb6e0073/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
0.9117402164862615
