<a href="https://colab.research.google.com/github/kmykprn/word-similarity/blob/main/SentenceBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

参考：
https://qiita.com/sonoisa/items/1df94d0a98cd4f209051


In [1]:
! pip install datasets evaluate transformers[sentencepiece,torch]
! pip install fugashi unidic-lite ipadic

Collecting fugashi
  Using cached fugashi-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (600 kB)
Collecting unidic-lite
  Using cached unidic_lite-1.0.8-py3-none-any.whl
Collecting ipadic
  Using cached ipadic-1.0.0-py3-none-any.whl
Installing collected packages: unidic-lite, ipadic, fugashi
Successfully installed fugashi-1.3.2 ipadic-1.0.0 unidic-lite-1.0.8


# HuggingFaceの基本的な使い方例

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Inputs for the demo: the checkpoint to load and a sample sentence to split.
modelName = "tohoku-nlp/bert-base-japanese-v3"
text = "これはテストテキストです"

# Load the tokenizer that belongs to the checkpoint, then show the tokens it
# produces for the sample sentence.
tokenizer = AutoTokenizer.from_pretrained(modelName)
token = tokenizer.tokenize(text)
print(token)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/231k [00:00<?, ?B/s]

['これ', 'は', 'テスト', 'テキスト', 'です']


# 類義語かどうかを比較

In [3]:
from transformers import BertJapaneseTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


class SentenceBertJapanese:
    """Sentence encoder that mean-pools the token embeddings of a BERT model.

    The tokenizer and the model are both loaded from ``model_name_or_path``.
    The model is switched to eval mode and moved to ``device``.
    """

    def __init__(self, model_name_or_path, device=None):
        """Load tokenizer and model.

        Args:
            model_name_or_path: Identifier or path passed to ``from_pretrained``.
            device: Anything accepted by ``torch.device``. When ``None``,
                ``"cuda"`` is used if available, otherwise ``"cpu"``.
        """
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(model_name_or_path)
        self.model = BertModel.from_pretrained(model_name_or_path)
        self.model.eval()

        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        # Use the normalised torch.device so model and inputs always agree.
        self.model.to(self.device)

    def _mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings, ignoring positions masked out as padding.

        Args:
            model_output: Model output whose first element holds the token embeddings.
            attention_mask: Mask with 1 for real tokens and 0 for padding; it is
                expanded to the shape of the token embeddings.

        Returns:
            The masked sum over dimension 1 divided by the number of unmasked
            positions (clamped to 1e-9 so an all-zero mask yields zeros
            instead of a division by zero).
        """
        token_embeddings = model_output[0]
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * mask, 1)
        counts = torch.clamp(mask.sum(1), min=1e-9)
        return summed / counts

    @torch.no_grad()
    def encode(self, sentences, batch_size=8):
        """Encode sentences into one embedding row per sentence.

        Args:
            sentences: An iterable of strings. A single ``str`` is treated as
                one sentence rather than being sliced into characters.
            batch_size: Number of sentences sent through the model at once.

        Returns:
            A CPU ``torch.Tensor`` with one row per sentence (call ``.numpy()``
            on it for a NumPy array). For empty input an empty tensor of shape
            ``(0, hidden_size)`` is returned.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        if isinstance(sentences, str):
            sentences = [sentences]
        sentences = list(sentences)

        if not sentences:
            # torch.stack/cat cannot build a tensor from an empty list, so
            # return a correctly shaped empty result instead of crashing.
            return torch.empty((0, self.model.config.hidden_size))

        batch_embeddings = []
        for start in range(0, len(sentences), batch_size):
            batch = sentences[start:start + batch_size]

            # Calling the tokenizer directly is the supported batch-encoding
            # entry point (batch_encode_plus is the legacy spelling).
            encoded_input = self.tokenizer(
                batch, padding="longest", truncation=True, return_tensors="pt"
            ).to(self.device)
            model_output = self.model(**encoded_input)
            pooled = self._mean_pooling(model_output, encoded_input["attention_mask"])

            # Move results off the accelerator so they do not accumulate there.
            batch_embeddings.append(pooled.to("cpu"))

        return torch.cat(batch_embeddings, dim=0)


# Load the sentence-embedding model.
MODEL_NAME = "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
model = SentenceBertJapanese(MODEL_NAME)

# Candidate sentences that the query is compared against.
sentences = ["お辞儀をしている男性会社員", "笑い袋", "テクニカルエバンジェリスト（女性）", "戦うAI", "笑う男性（5段階）",
             "お金を見つめてニヤけている男性", "「ありがとう」と言っている人", "定年（女性）", "テクニカルエバンジェリスト（男性）",
             "スタンディングオベーション", '暴走中のAI'
             ]
sentence_vectors = model.encode(sentences)

# Embed the query sentence.
query = ['暴走したAI']
query_embedding = model.encode(query).numpy()

# Compare query_embedding with every candidate sentence.
# Both inputs are converted to NumPy explicitly instead of relying on
# scikit-learn's implicit conversion of a torch tensor.
similarities = cosine_similarity(query_embedding, sentence_vectors.numpy()).flatten()
max_id = np.argmax(similarities)
print(max_id)
print(sentences[max_id])
print(similarities)

config.json:   0%|          | 0.00/667 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

10
暴走中のAI
[0.05882588 0.32120496 0.04490201 0.41021872 0.04170368 0.07286242
 0.15742645 0.06666032 0.05021613 0.36408943 0.8155005 ]
