In [14]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import warnings

warnings.filterwarnings('ignore')

In [29]:
class NeuralNetwork():
    """Sentence-similarity helper backed by the ``bert-base-chinese`` checkpoint.

    A sentence is embedded by averaging the model's last-layer hidden states
    over every position of the input (including ``[CLS]`` and ``[SEP]``); two
    sentences are then compared with cosine similarity.
    """

    def __init__(self):
        """Load the ``bert-base-chinese`` tokenizer and model via ``from_pretrained``."""
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
        self.model = BertModel.from_pretrained('bert-base-chinese')

    def sentence_to_embedding(self, sentence):
        """Return a mean-pooled embedding for ``sentence``.

        Args:
            sentence: Text to embed.

        Returns:
            A tensor whose first dimension is the batch dimension of size 1,
            obtained by averaging ``last_hidden_state`` over ``dim=1``.
        """
        # Split the sentence into word-piece tokens.
        tokens = self.tokenizer.tokenize(sentence)

        # The model only has position embeddings for a fixed number of
        # positions; longer inputs would fail inside the embedding lookup.
        # Keep room for the two special tokens and drop any overflow.
        max_tokens = self.model.config.max_position_embeddings - 2
        tokens = ['[CLS]'] + tokens[:max_tokens] + ['[SEP]']

        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        # Build the ids on the model's device so the forward pass also works
        # after the model has been moved off the CPU; unsqueeze adds the
        # batch dimension.
        input_ids = torch.tensor(input_ids, device=self.model.device).unsqueeze(0)

        # Inference only: no gradients are needed for the embedding.
        with torch.no_grad():
            outputs = self.model(input_ids)
            # Average over all token positions to get one vector per sentence.
            embedding = outputs.last_hidden_state.mean(dim=1)
        return embedding

    def compute_similarity(self, embedding1, embedding2):
        """Return the cosine similarity between two sentence embeddings.

        Args:
            embedding1: 2-D tensor as produced by ``sentence_to_embedding``.
            embedding2: 2-D tensor as produced by ``sentence_to_embedding``.

        Returns:
            The ``[0][0]`` entry of the pairwise cosine-similarity matrix,
            i.e. the similarity of the first row of each input.
        """
        # scikit-learn works on NumPy arrays, so detach and move to host memory.
        embedding1_np = embedding1.detach().cpu().numpy()
        embedding2_np = embedding2.detach().cpu().numpy()
        similarity = cosine_similarity(embedding1_np, embedding2_np)[0][0]
        return similarity

    def fit(self, sentence1, sentence2):
        """Embed both sentences and return their cosine similarity as text.

        Despite the name, nothing is trained here: the method only embeds the
        two sentences and compares them.

        Args:
            sentence1: First sentence.
            sentence2: Second sentence.

        Returns:
            The similarity formatted as a string with four decimal places.
        """
        s1 = self.sentence_to_embedding(sentence1)
        s2 = self.sentence_to_embedding(sentence2)
        similarity = self.compute_similarity(s1, s2)
        return "%.4f" % similarity

# Two near-duplicate product descriptions: the first spells quantities out in
# Chinese words, the second writes them with digits and percent signs and uses
# different punctuation. They serve as a sanity check for the similarity score.
sentence1 = "葡萄富含百分之九十五的原花青素，和维E协同抗氧化清除自由基，还添加了油茶籽油三效合一抗衰老，百分之三十五的叶绿素九块九毛钱。"
sentence2 = "葡萄籽富含95%的原花青素。和维E协同抗氧化清除自由基。还添加了油茶籽油3效合1抗老。35%的叶绿素9块9毛钱。"

# Constructing the helper loads the tokenizer and model via from_pretrained.
net = NeuralNetwork()
# fit() performs no training; it returns the cosine similarity of the two
# sentence embeddings as a string formatted to four decimal places.
data = net.fit(sentence1, sentence2)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [30]:
# Display the formatted similarity string returned by net.fit(...).
data

'0.9704'