In [4]:
#################################################################################
# SBERT embedding test example
# Source: https://towardsdatascience.com/multilingual-text-similarity-matching-using-embedding-f79037459bf2
#
# Optimizations applied
# 1. normalize_embeddings=True : encode() returns unit-length embeddings
# 2. Dot-product scoring : util.semantic_search with score_function=util.dot_score
#################################################################################

from sentence_transformers import SentenceTransformer, util
import torch

# Load the 'bongsoo/sentencebert_v1.2' sentence-embedding model on the CPU,
# using ./1 as the cache folder for the model files.
model = SentenceTransformer(
    'bongsoo/sentencebert_v1.2',
    device='cpu',
    cache_folder='./1',
)


Downloading:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.71k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/118 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/622M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/299 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.44M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/528 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

In [5]:
# Candidate sentences that every query is ranked against.
corpus = [
    "I am a boy",
    "What are you doing?",
    "Can you help me?",
    "A man is riding a horse.",
    "A woman is playing violin.",
    "A monkey is chasing after a goat",
    "The quick brown fox jumps over the lazy dog",
]

# Search queries, one per language.
queries = [
    "I am in need of assistance",  # English
    "我是男孩子",  # Chinese: "I am a boy"
    "Qué estás haciendo",  # Spanish: "What are you doing"
]

In [7]:
# Embed the corpus once. normalize_embeddings=True asks encode() for
# unit-length vectors, so the dot product used below equals cosine similarity.
corpus_embedding = model.encode(corpus, convert_to_tensor=True, normalize_embeddings=True)

# Print at most 5 matches per query (fewer when the corpus is smaller).
top_k = min(5, len(corpus))

if queries:
    # Encode every query in one batched call instead of one encode() per
    # loop iteration.
    query_embeddings = model.encode(queries, convert_to_tensor=True, normalize_embeddings=True)

    # semantic_search returns one ranked hit list per query. Passing top_k
    # makes the search return exactly the number of hits that get printed,
    # rather than relying on its default limit and slicing afterwards.
    all_hits = util.semantic_search(
        query_embeddings,
        corpus_embedding,
        top_k=top_k,
        score_function=util.dot_score,
    )
else:
    # No queries: nothing to encode or search, and nothing is printed.
    all_hits = []

for query, hits in zip(queries, all_hits):
    print("\r\nQuery:", query)
    print("---------------------------")
    for hit in hits:
        # Each hit is a dict holding the matched corpus index and its score.
        print(f"{round(hit['score'], 3)} | {corpus[hit['corpus_id']]}")


Query: I am in need of assistance
---------------------------
0.737 | Can you help me?
0.454 | I am a boy
0.211 | What are you doing?
0.088 | A monkey is chasing after a goat
0.072 | A woman is playing violin.

Query: 我是男孩子
---------------------------
0.338 | I am a boy
0.094 | A man is riding a horse.
0.02 | A monkey is chasing after a goat
-0.039 | The quick brown fox jumps over the lazy dog
-0.046 | What are you doing?

Query: Qué estás haciendo
---------------------------
0.199 | A man is riding a horse.
0.155 | A woman is playing violin.
0.115 | A monkey is chasing after a goat
0.101 | The quick brown fox jumps over the lazy dog
0.076 | Can you help me?
