In [None]:
# Install (or upgrade, via -U) the sentence-transformers package that the cells below import.
%pip install -U sentence-transformers

## Quickstart

In [None]:
from sentence_transformers import SentenceTransformer
# Load the pretrained sentence-embedding model by name.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Example sentences to embed.
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]

# Encode the whole list with a single model.encode() call.
sentence_embeddings = model.encode(sentences)

# Show each sentence next to its embedding, followed by an empty line.
for text, vector in zip(sentences, sentence_embeddings):
    print("Sentence:", text)
    print("Embedding:", vector)
    print("")

## Comparing Sentence Similarities

The sentences (texts) are mapped such that sentences with similar meanings are close in vector space. One common method to measure the similarity in vector space is to use cosine similarity. For two sentences, this can be done like this:

句子（文本）被映射，使得具有相似含义的句子在向量空间中接近。测量向量空间相似度的一种常见方法是使用余弦相似度。


### Model: distiluse-base-multilingual-cased-v2 (alternatives: all-MiniLM-L6-v2, BAAI/bge-large-zh)

In [2]:
from sentence_transformers import SentenceTransformer, util


# Choose ONE embedding model. The alternatives stay commented out so the same
# sentence pairs can be re-scored with a different model.
#embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_model = SentenceTransformer('distiluse-base-multilingual-cased-v2')
#embedding_model = SentenceTransformer('BAAI/bge-large-zh')  # open-source BGE model from the Beijing Academy of Artificial Intelligence (BAAI)

# Sentence pairs to compare; each entry is [first sentence, second sentence].
# The literals must not carry trailing tab characters (copy/paste residue):
# a stray tab becomes part of the model input and also breaks the
# tab-separated columns printed below.
wordpairs = [
    ['齐默曼回答说：“你在这里做什么？”', '马丁说：“你为什么跟踪我？”'],
    ['那只棕色的大狗在高高的草地上跳来跳去。', '那个穿着蓝色衬衫的女孩正在穿过一个科学中心。'],
    ['世界上的每一个人都达到了墨西哥目前的繁荣水平。', '世界上的每一个人都处在目前西北非洲的“繁荣”水平。'],
    ['巴勒斯坦和约旦协调和平谈判立场', '哈马斯高级否认加沙，巴权力机构协调和谈'],
    ['许多人参加了自行车比赛，包括一名骑三轮自行车的人。', '一个坐着三轮椅的人。'],
    ['海啸警报：南澳无需恐慌', '亚齐地震后的海啸警报'],
    ['抓着金属门的鸟。', '一只五颜六色的鸟依附在铁丝网上。'],
    ['锡箔帽为狗脸，你穿多大的尺寸？', '锡箔帽给萨格，你穿多大尺寸的？'],
    ['阿肯色州最高法院否决行刑法', '阿肯色州法官废除死刑'],
    ['一只松鼠在转圈。', '一只松鼠绕着圈跑。'],
    ['一列玩具火车撞上了一辆玩具汽车。', '一辆玩具汽车撞上了一列玩具火车。'],
    ['一个人走下楼梯。', '一个男人走下楼梯。'],
]

# Compute the cosine similarity of the two sentences in each pair and print
# one tab-separated row per pair: sentence 1, sentence 2, similarity.
dispFormat = "{:30}\t{:40}\t{:.4f}"
for wordpair in wordpairs:
    # Encode both sentences of the pair in one call.
    embeddings = embedding_model.encode(wordpair)
    # For two single vectors util.cos_sim yields a one-element tensor;
    # .item() extracts the plain float so the row shows e.g. 0.9646
    # instead of the tensor repr "tensor([[0.9646]])".
    similarity = util.cos_sim(embeddings[0], embeddings[1]).item()
    print(dispFormat.format(wordpair[0], wordpair[1], similarity))


  from .autonotebook import tqdm as notebook_tqdm
modules.json: 100%|██████████| 341/341 [00:00<00:00, 828kB/s]
config_sentence_transformers.json: 100%|██████████| 122/122 [00:00<00:00, 503kB/s]
README.md: 100%|██████████| 2.69k/2.69k [00:00<00:00, 21.0MB/s]
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 156kB/s]
config.json: 100%|██████████| 610/610 [00:00<00:00, 2.44MB/s]
pytorch_model.bin:  70%|███████   | 377M/539M [16:06:14<04:17, 628kB/s]

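Cosine-similarity scores recorded for a few Chinese word pairs with each of the three models (the value labelled `distance` below is a cosine similarity, so a higher value means the two words are more similar):

all-MiniLM-L6-v2: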
    老师vs教师distance = tensor([[1.0000]])
    老师vs泰国distance = tensor([[0.4360]])
    教师vs泰国distance = tensor([[0.4360]])
    商品vs货物distance = tensor([[1.0000]])
    商品vs跑步distance = tensor([[1.0000]])
    货物vs跑步distance = tensor([[1.0000]])

distiluse-base-multilingual-cased-v2:
    老师vs教师distance = tensor([[0.9646]])
    老师vs泰国distance = tensor([[0.3492]])
    教师vs泰国distance = tensor([[0.3385]])
    商品vs货物distance = tensor([[0.9299]])
    商品vs跑步distance = tensor([[0.4246]])
    货物vs跑步distance = tensor([[0.4699]])

BAAI/bge-large-zh: 
    老师vs教师distance = tensor([[0.9429]])
    老师vs泰国distance = tensor([[0.7814]])
    教师vs泰国distance = tensor([[0.7715]])
    商品vs货物distance = tensor([[0.9424]])
    商品vs跑步distance = tensor([[0.7904]])
    货物vs跑步distance = tensor([[0.7937]])

If you have a longer list of sentences, you can encode them all at once, compute the cosine similarity between every pair, and rank the pairs, as in the following code example:

In [41]:
from sentence_transformers import SentenceTransformer, util 
# Choose ONE embedding model; the alternatives stay commented out for comparison.
# NOTE: this rebinds `model` and `sentences`, which the Quickstart cell also defines.
#model = SentenceTransformer('all-MiniLM-L6-v2')
model = SentenceTransformer('distiluse-base-multilingual-cased-v2')
#model = SentenceTransformer('BAAI/bge-large-zh')  # open-source BGE model from the Beijing Academy of Artificial Intelligence (BAAI)

# Flat list of sentences to compare against each other.
# The literals must not carry trailing tab characters (copy/paste residue):
# a stray tab becomes part of the model input and of the printed row.
sentences = [
    '齐默曼回答说：“你在这里做什么？”', '马丁说：“你为什么跟踪我？”',
    '那只棕色的大狗在高高的草地上跳来跳去。', '那个穿着蓝色衬衫的女孩正在穿过一个科学中心。',
    '世界上的每一个人都达到了墨西哥目前的繁荣水平。', '世界上的每一个人都处在目前西北非洲的“繁荣”水平。',
    '巴勒斯坦和约旦协调和平谈判立场', '哈马斯高级否认加沙，巴权力机构协调和谈',
    '许多人参加了自行车比赛，包括一名骑三轮自行车的人。', '一个坐着三轮椅的人。',
    '海啸警报：南澳无需恐慌', '亚齐地震后的海啸警报',
    '抓着金属门的鸟。', '一只五颜六色的鸟依附在铁丝网上。',
    '锡箔帽为狗脸，你穿多大的尺寸？', '锡箔帽给萨格，你穿多大尺寸的？',
    '阿肯色州最高法院否决行刑法', '阿肯色州法官废除死刑',
    '一只松鼠在转圈。', '一只松鼠绕着圈跑。',
    '一列玩具火车撞上了一辆玩具汽车。', '一辆玩具汽车撞上了一列玩具火车。',
    '一个人走下楼梯。', '一个男人走下楼梯。',
]

# Encode all sentences in a single call.
embeddings = model.encode(sentences)

# Cosine similarity between every pair of sentences (square matrix, kept as a tensor).
cos_sim = util.cos_sim(embeddings, embeddings)

# Convert the matrix to nested Python floats once, so the pair list and the
# sort below work on plain numbers instead of indexing and comparing
# individual tensor elements one at a time.
cos_sim_rows = cos_sim.tolist()
sentence_count = len(cos_sim_rows)

# Collect each unordered pair (i < j) exactly once as [score, i, j];
# the diagonal (a sentence compared with itself) is skipped.
all_sentence_combinations = [
    [cos_sim_rows[i][j], i, j]
    for i in range(sentence_count - 1)
    for j in range(i + 1, sentence_count)
]

# Sort by cosine similarity, highest first.
all_sentence_combinations = sorted(all_sentence_combinations, key=lambda x: x[0], reverse=True)

print("Top-5 most similar pairs:")
for score, i, j in all_sentence_combinations[0:5]:
    # `score` is the similarity already stored with the pair; no need to
    # look it up in the matrix again.
    print("{:<40} \t {} \t {:.4f}".format(sentences[i], sentences[j], score))
   

Top-5 most similar pairs:
一列玩具火车撞上了一辆玩具汽车。                         	 一辆玩具汽车撞上了一列玩具火车。 	 0.9967
一个人走下楼梯。                                 	 一个男人走下楼梯。 	 0.9327
一只松鼠在转圈。                                 	 一只松鼠绕着圈跑。 	 0.8788
阿肯色州最高法院否决行刑法                            	 阿肯色州法官废除死刑 	 0.8278
锡箔帽为狗脸，你穿多大的尺寸？                          	 锡箔帽给萨格，你穿多大尺寸的？ 	 0.8100


tensor([[ 1.0000e+00,  3.7163e-01,  3.3804e-02,  1.6190e-02,  2.2602e-02,
          1.0032e-01,  2.7288e-02,  7.1185e-02,  4.0785e-02,  4.8927e-02,
          8.2131e-04, -5.6788e-03,  5.2471e-02,  6.6998e-03,  1.0468e-01,
          1.7129e-01, -1.4580e-02,  5.9282e-03,  4.7486e-02,  8.0303e-02,
         -4.4263e-02, -5.0131e-02,  5.5499e-02,  9.8568e-02],
        [ 3.7163e-01,  1.0000e+00,  2.1997e-02,  2.7764e-02, -1.4818e-03,
          4.1009e-02,  1.0506e-02, -2.5539e-02,  4.0452e-02, -1.0735e-02,
          1.2877e-02,  8.8788e-04,  5.7352e-02, -2.7651e-02,  5.3040e-02,
          6.8118e-02,  1.4428e-02,  1.5167e-02,  1.7814e-02,  6.6489e-02,
          7.3695e-02,  7.5180e-02,  1.0353e-01,  1.3578e-01],
        [ 3.3804e-02,  2.1997e-02,  1.0000e+00,  1.0319e-01,  1.0936e-01,
          9.3686e-03, -8.2103e-02,  7.8682e-02,  5.9156e-02,  5.7821e-03,
          3.4748e-02,  3.4636e-02,  8.0183e-02,  1.7397e-01,  2.8483e-01,
          1.0664e-01,  9.1437e-02,  1.3178e-02,  1.4878e-01,  