In [1]:
from transformers import AutoTokenizer

# Checkpoint shared by the tokenizer and the encoder loaded later on.
# (An earlier `checkpoint = 'bert-base-uncased'` assignment was dead code: it
# was overwritten on the very next line, so it has been dropped.)
checkpoint = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

# Load the vocabulary and tokenizer that belong to the checkpoint.
token = AutoTokenizer.from_pretrained(checkpoint)

token

2022-12-12 11:46:09.706774: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-12 11:46:09.897089: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-12 11:46:09.897124: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-12-12 11:46:10.907891: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-12-1

PreTrainedTokenizerFast(name_or_path='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', vocab_size=250002, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [2]:
import torch
from datasets import load_dataset


# Dataset definition
class Dataset(torch.utils.data.Dataset):
    """Serve one side of each sentence pair from `embedding-data/sentence-compression`.

    Each record's 'set' field is unpacked into exactly two items. When
    `split` equals 'train' the first item is returned; for any other value
    the second item is returned.
    """

    def __init__(self, split):
        # Which half of every pair this instance hands out.
        self.split = split

        # The hub dataset's 'train' split is loaded regardless of `split`;
        # `split` only selects the side of the pair in __getitem__.
        self.dataset = load_dataset(split='train',
                                    path='embedding-data/sentence-compression')

    def __len__(self):
        """Number of sentence pairs in the underlying dataset."""
        return len(self.dataset)

    def __getitem__(self, i):
        """Return the first sentence of pair `i` for 'train', else the second."""
        first, second = self.dataset[i]['set']
        return first if self.split == 'train' else second


# Build the 'train' view (first sentence of each pair), then display its size
# together with the first item.
dataset = Dataset('train')

len(dataset), dataset[0]

Using custom data configuration embedding-data--sentence-compression-d643585deb6e0073
Found cached dataset json (/root/.cache/huggingface/datasets/embedding-data___json/embedding-data--sentence-compression-d643585deb6e0073/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


(180000,
 "The USHL completed an expansion draft on Monday as 10 players who were on the rosters of USHL teams during the 2009-10 season were selected by the League's two newest entries, the Muskegon Lumberjacks and Dubuque Fighting Saints.")

In [3]:
def collate_fn(data):
    """Tokenize one batch of raw items into padded PyTorch tensors.

    Args:
        data: the list of items the DataLoader collected for this batch
            (the `Dataset` in this notebook yields one sentence per item).

    Returns:
        The tokenizer's batch encoding with `return_tensors='pt'`, padded to
        the longest sequence in the batch and truncated to 500 tokens.
    """
    # Encode. Calling the tokenizer object directly is the supported entry
    # point; `batch_encode_plus` is the deprecated spelling of the same call.
    return token(data,
                 truncation=True,
                 padding=True,
                 max_length=500,
                 return_tensors='pt')


# Data loader. Shuffling stays off: the retrieval test later compares a query's
# position with the row index of the stored features, so order must be stable.
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=8,
                                     collate_fn=collate_fn,
                                     shuffle=False,
                                     drop_last=False)

# Fetch only the first batch for inspection. `next(iter(...))` states that
# intent directly and, unlike a for/break loop, leaves no stray loop counter
# in the notebook namespace.
data = next(iter(loader))

len(loader), data

(22500,
 {'input_ids': tensor([[     0,    581,   7082,  48599, 140528,    142,  14700,     66,   6889,
          144888,     98,  68318,    237,    209,  92865,   2750,   3542,     98,
              70,  52470,   1314,    111,   7082,  48599,  87199,  20271,     70,
            1877,   9193,  34003,   3542, 133291,    390,     70,  19175,     25,
               7,   6626,   3525,    525, 112820,     90,      4,     70, 128705,
            3081,     19,   3350,  26278, 135758,      7,    136,    786,    978,
             944, 106313,    214,  12190,      7,      5,      2],
         [     0,  49953,  19175,  44978,  12126,  63871,     56,  35519,    503,
            1746,   1221,    186, 142146,     99,   2907,      5, 208580,  29693,
           11737,  31150,      5,      2,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
         

In [4]:
from transformers import AutoModel

pretrained = AutoModel.from_pretrained(checkpoint)

# Inference only: switch off gradient tracking for every parameter at once.
pretrained.requires_grad_(False)

pretrained.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(250037, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

In [5]:
def get_feature(data, model=None):
    """Mean-pool the encoder's last hidden state over non-padding tokens.

    Args:
        data: mapping of keyword inputs for the encoder. It must contain
            'attention_mask' of shape [b, L]; positions with value 0 are
            excluded from the average.
        model: callable invoked as `model(**data)` that returns a mapping
            with 'last_hidden_state' of shape [b, L, H]. Defaults to the
            notebook-level `pretrained` encoder.

    Returns:
        Tensor of shape [b, H] holding one averaged vector per row. A row
        whose mask is entirely zero yields a zero vector.
    """
    if model is None:
        model = pretrained

    with torch.no_grad():
        # [b, L, H]
        hidden = model(**data)['last_hidden_state']

    # [b, L] -> [b, L, 1]. Casting to the hidden dtype up front keeps the
    # masking and the division independent of integer/float promotion rules.
    mask = data['attention_mask'].unsqueeze(dim=2).to(hidden.dtype)

    # Zero the padded positions, then add up the token vectors.
    # [b, L, H] * [b, L, 1] -> [b, L, H] -> [b, H]
    # Done out of place so the model's own output tensor is not modified.
    summed = (hidden * mask).sum(dim=1)

    # Number of real tokens per row: [b, L, 1] -> [b, 1].
    # Counts are whole numbers, so clamping at 1 only affects all-padding
    # rows, whose sum is already zero; it guards against division by zero.
    counts = mask.sum(dim=1).clamp(min=1)

    # [b, H] / [b, 1] -> [b, H]
    return summed / counts


# Sanity check on the sample batch: display the shape of the pooled features.
get_feature(data).shape

torch.Size([8, 384])

In [6]:
# Build the knowledge matrix
def build_features(max_batches=1001, save_path='models/1.应用_cosin方法.pt'):
    """Encode batches from `loader` and save the stacked features to disk.

    Uses the notebook-level `loader`, `pretrained` and `get_feature`. The
    model is moved to CUDA when available for the duration of the loop and
    is always moved back to the CPU afterwards, even if encoding fails.

    Args:
        max_batches: number of batches to encode from the start of `loader`.
            The default of 1001 matches the original cut-off after batch
            index 1000. Pass None to encode every batch.
        save_path: file the concatenated feature tensor is written to.
            Missing parent directories are created.

    Returns:
        None. The result is the tensor saved at `save_path`.
    """
    import os
    from itertools import islice

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Module.to() moves the parameters in place, so no `global` rebinding is needed.
    pretrained.to(device)

    features = []
    try:
        # islice stops after `max_batches` without pulling an extra batch.
        for i, data in enumerate(islice(loader, max_batches)):
            data = {k: v.to(device) for k, v in data.items()}

            # Move each result to the CPU right away so accelerator memory
            # does not grow with the number of batches.
            features.append(get_feature(data).cpu())

            if i % 50 == 0:
                print(i)
    finally:
        pretrained.cpu()

    features = torch.cat(features)

    # torch.save does not create directories; make sure the target exists.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    torch.save(features, save_path)


# Encode the knowledge base and write the feature matrix to disk.
build_features()

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000


In [7]:
def test_cosin():
    """Print the cosine similarity of two fixed vectors, computed several ways."""
    import math

    vec_a = torch.FloatTensor([3, 3])
    vec_b = torch.FloatTensor([9, 0])

    print(vec_a, vec_b)

    # Cosine similarity from the built-in function
    builtin = torch.nn.functional.cosine_similarity(vec_a, vec_b, dim=0)
    print(builtin.item())

    # A full circle is 2*pi; the angle used here is 1/8 of a circle, i.e. pi/4
    print(math.cos(math.pi / 4))

    # Two equivalent ways of getting the L2 norm
    manual_norm = vec_a.pow(2).sum().sqrt()
    print(manual_norm, vec_a.norm(2))

    # Cosine from the definition: dot product divided by both norms
    dot = vec_a.matmul(vec_b)
    print(dot / vec_a.norm(2) / vec_b.norm(2))


# Run the two-vector cosine demonstration defined in this cell.
test_cosin()

tensor([3., 3.]) tensor([9., 0.])
0.7071068286895752
0.7071067811865476
tensor(4.2426) tensor(4.2426)
tensor(0.7071)


In [8]:
def test_cosin():
    """Print cosine scores of one query against a small bank, three ways.

    A random bank `a` of shape [5, 12] and a random query `b` of shape
    [1, 12] are scored with an explicit loop, a single matrix expression,
    and the built-in `cosine_similarity`.
    """
    # Two matrices: a is the knowledge base, b is the new question
    a = torch.randn(5, 12)
    b = torch.randn(1, 12)

    # Cosine computed row by row in a loop
    for i in range(a.shape[0]):
        # a[i] is 1-D, so the dot product needs no transpose. `.T` on a
        # tensor that is not 2-D is deprecated and emits a warning.
        cos = b[0].matmul(a[i]) / b[0].norm(p=2) / a[i].norm(p=2)
        print(i, cos.item())

    # Matrix form: [1, 12] @ [12, 5] -> [1, 5], divided by both sets of norms
    cosin = b.matmul(a.T) / b.norm(p=2, dim=1, keepdim=True) / a.norm(
        p=2, dim=1, keepdim=True).T
    print(cosin)

    # Built-in function, one row at a time
    for i in range(a.shape[0]):
        cos = torch.nn.functional.cosine_similarity(b[0], a[i], dim=0)
        print(i, cos.item())

    # Built-in function, with the query broadcast against every row
    print(torch.nn.functional.cosine_similarity(b, a, dim=1))


# Run the query-versus-bank cosine comparison defined in this cell.
test_cosin()

0 -0.18390977382659912
1 -0.6175824403762817
2 -0.047917064279317856
3 -0.2850889265537262
4 -0.11865001916885376
tensor([[-0.1839, -0.6176, -0.0479, -0.2851, -0.1187]])
0 -0.18390975892543793
1 -0.6175823211669922
2 -0.04791706055402756
3 -0.2850889265537262
4 -0.11865001171827316
tensor([-0.1839, -0.6176, -0.0479, -0.2851, -0.1187])


  cos = b[0].matmul(a[i].T) / b[0].norm(p=2) / a[i].norm(p=2)


In [9]:
# Test
def test(max_queries=1000 * 8 + 1, log_every=50 * 8):
    """Measure how often query `i` retrieves row `i` of the knowledge matrix.

    Each item of `Dataset('test')` is encoded on its own with `get_feature`
    and scored against the notebook-level `features` tensor by cosine
    similarity. A query counts as correct when the highest-scoring row index
    equals the query's own index. Running accuracy is printed every
    `log_every` queries and the final accuracy once at the end.

    Args:
        max_queries: upper bound on the number of queries evaluated. The
            default matches the original cut-off after index 1000 * 8.
        log_every: print the running accuracy whenever the query index is a
            multiple of this value.

    Returns:
        None. Results are printed.
    """
    from itertools import islice

    loader_test = torch.utils.data.DataLoader(dataset=Dataset('test'),
                                              batch_size=1,
                                              collate_fn=collate_fn,
                                              shuffle=False,
                                              drop_last=False)

    # A query whose index is not a row of `features` can never be matched,
    # so evaluating it would only lower the reported accuracy.
    n_queries = min(max_queries, features.shape[0])

    correct = 0
    total = 0
    for i, data in enumerate(islice(loader_test, n_queries)):
        feature = get_feature(data)

        # [1, H] against [N, H] -> [N] similarity scores
        score = torch.nn.functional.cosine_similarity(feature, features, dim=1)

        if i == score.argmax().item():
            correct += 1
        total += 1

        if i % log_every == 0:
            print(i, correct / total)

    # Avoid ZeroDivisionError when nothing was evaluated.
    if total == 0:
        print('no queries were evaluated')
        return

    print(correct / total)


# Load the saved knowledge matrix into the module-level name `features`, which
# test() reads as a global, then run the retrieval test.
features = torch.load('models/1.应用_cosin方法.pt')

test()

Using custom data configuration embedding-data--sentence-compression-d643585deb6e0073
Found cached dataset json (/root/.cache/huggingface/datasets/embedding-data___json/embedding-data--sentence-compression-d643585deb6e0073/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


0 1.0
400 0.9925187032418953
800 0.9875156054931336
1200 0.9850124895920067
1600 0.9837601499063086
2000 0.9830084957521239
2400 0.9820907955018742
2800 0.982506247768654
3200 0.9831302717900656
3600 0.9836156623160234
4000 0.9840039990002499
4400 0.984776187230175
4800 0.9852114142886899
5200 0.9857719669294367
5600 0.9858953758257454
6000 0.9856690551574737
6400 0.9851585689735979
6800 0.984561093956771
7200 0.9845854742396889
7600 0.9847388501512959
8000 0.9843769528808899
0.9843769528808899
