## Sentence Transformer
* https://huggingface.co/jhgan/ko-sroberta-multitask
* https://github.com/jhgan00/ko-sentence-transformers

In [1]:
import torch
from sentence_transformers import SentenceTransformer

device = torch.device('mps') if torch.cuda.is_available() else torch.device('cpu')
sbert_model_name = "jhgan/ko-sroberta-multitask"
model = SentenceTransformer(sbert_model_name).to(device=device)

sentences = ["안녕하세요?", "한국어 문장 임베딩을 위한 버트 모델입니다."]
embedding = model.encode(sentences, convert_to_numpy=True)

embedding

array([[-0.37510464, -0.7733839 ,  0.5927711 , ...,  0.57923526,
         0.32683483, -0.6508965 ],
       [-0.09361704, -0.18191524, -0.19230816, ..., -0.03165802,
         0.30412534, -0.2679362 ]], dtype=float32)

## 유사 문장 찾기

In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('jhgan/ko-sroberta-multitask')
movie_names = pd.read_csv('./data/movie.csv', names=['name'], sep='\t')['name'].tolist()
embeddings = model.encode(movie_names, convert_to_tensor=True)

scores = util.pytorch_cos_sim(model.encode("나쁜 녀석들", convert_to_tensor=True), embeddings)[0]
scores

tensor([0.1600, 0.1835, 0.2892, 0.0627, 0.3318, 0.1335, 0.1501, 0.3141, 0.2155,
        0.0364, 0.1900, 0.3222, 0.3558, 0.3428, 0.1942, 0.1193, 0.2569, 0.1801,
        0.2897, 0.1749, 0.0476, 0.0895, 0.3356, 0.2260, 0.1947, 0.1042, 0.1944,
        0.0411, 0.2324, 0.2764, 0.1599, 0.1944, 0.2797, 0.1521, 0.2806, 0.2186,
        0.3783, 0.2336, 0.1732, 0.1611, 0.2024, 0.1522, 0.2134, 0.1954, 0.0433,
        0.2006, 0.1733, 0.2128, 0.1466, 0.2357, 0.2471, 0.2059, 0.1842, 0.0597,
        0.2681, 0.2321, 0.2316, 0.0739, 0.4056, 0.2771, 0.2856, 0.4977, 0.0873,
        0.1877, 0.1699, 0.1657, 0.2298, 0.1022, 0.3278, 0.1901, 0.2196, 0.1000,
        0.1798, 0.0884, 0.2957, 0.2371, 0.1625, 0.2254, 0.2530, 0.2859, 0.2297,
        0.2970, 0.2924, 0.1152, 0.2428, 0.2435, 0.1649, 0.1744, 0.0617, 0.1861,
        0.1183, 0.2258, 0.1883, 0.2336, 0.1073, 0.0534, 0.1816, 0.3284, 0.2890,
        0.3274, 0.3278, 0.1417])

In [4]:
import torch

result = torch.topk(scores, k=5)
result

torch.return_types.topk(
values=tensor([0.4977, 0.4056, 0.3783, 0.3558, 0.3428]),
indices=tensor([61, 58, 36, 12, 13]))

In [10]:
data = [[movie_names[idx], score] for score, idx in zip(result[0], result[1])]
pd.DataFrame(data, columns=["name", "score"])

Unnamed: 0,name,score
0,바스터즈: 거친 녀석들,tensor(0.4977)
1,폭력의 역사,tensor(0.4056)
2,엉클 분미,tensor(0.3783)
3,칠드런 오브 맨,tensor(0.3558)
4,액트 오브 킬링,tensor(0.3428)
