In [1]:
!pip install transformers[ja,sentencepiece,torch] > /dev/null

In [2]:
!pip install fugashi unidic_lite > /dev/null

In [3]:
from transformers import pipeline

## 文書分類

In [4]:
# Classifier used by the sentiment examples. Only the checkpoint name is
# passed, so choosing the task is left to pipeline() itself.
# NOTE(review): presumably resolves to text classification — the recorded
# outputs show positive/negative labels; confirm against the model card.
text_classification_pipeline = pipeline(model="llm-book/bert-base-japanese-v3-marc_ja")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/231k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cpu


In [5]:
def nega_posi_judge(text: str) -> None:
    """Print whether ``text`` reads as negative or positive.

    The text is passed to the module-level ``text_classification_pipeline``
    and the first prediction it returns is reported: its label, together
    with its score rendered as a percentage with one decimal place.
    (Presumably the first prediction is the top-scoring label; that depends
    on how the pipeline is configured.)

    Args:
        text: The text to classify.

    Returns:
        None. The verdict is written to standard output.
    """
    top_prediction = text_classification_pipeline(text)[0]
    confidence_pct = top_prediction["score"] * 100
    label = top_prediction["label"]
    print(f"このテキストは{confidence_pct:.1f}%の確率で{label}な文章です")

In [6]:
# Example sentence intended to read as positive.
posi_text = "世界には言葉が分からなくても感動する音楽がある。"

# Prints the predicted label and its score as a percentage.
nega_posi_judge(posi_text)

このテキストは99.9%の確率でpositiveな文章です


In [7]:
# Example sentence intended to read as negative.
nega_text = "世界には言葉が出ないほどひどい音楽がある。"

# Prints the predicted label and its score as a percentage.
nega_posi_judge(nega_text)

このテキストは98.6%の確率でnegativeな文章です


## 自然言語推論

In [8]:
# Build the pipeline from the JNLI checkpoint; no task is passed, so resolving
# it is left to pipeline().
nli_pipeline = pipeline(model="llm-book/bert-base-japanese-v3-jnli")
# First sentence of each pair (passed under the "text" key); the following
# NLI cells reuse it.
text = "2人の男性がジェット機を見ています。"
# Second sentence, intended to be entailed by `text`.
entailment_text = "ジェット機を見ている2人がいます。"

# Predict the logical relation between text and entailment_text
print(nli_pipeline({"text": text, "text_pair": entailment_text}))

config.json:   0%|          | 0.00/907 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/606 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/231k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cpu


{'label': 'entailment', 'score': 0.9978838562965393}


In [9]:
# Second sentence intended to contradict `text`.
contradiction_text = "2人の男性が飛んでいます"

# Predict the logical relation between text and contradiction_text
print(nli_pipeline({"text": text, "text_pair": contradiction_text}))

{'label': 'contradiction', 'score': 0.9987305998802185}


In [10]:
# Second sentence intended to be neither entailed by nor contradictory to `text`.
neutral_text = "2人の男性が、白い飛行機を眺めています。"

# Predict the logical relation between text and neutral_text
print(nli_pipeline({"text": text, "text_pair": neutral_text}))

{'label': 'neutral', 'score': 0.9974888563156128}


## 意味的類似度計算

In [11]:
# Similarity scorer built from the JSTS checkpoint.
# function_to_apply="none" asks the pipeline not to post-process the model
# output, so the reported score is the raw value rather than a normalized
# probability (the recorded scores below are not confined to [0, 1]).
text_sim_pipeline = pipeline(
    function_to_apply="none",
    model="llm-book/bert-base-japanese-v3-jsts",
)

config.json:   0%|          | 0.00/796 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/606 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/231k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cpu


In [22]:
# Sentence pair for the similarity examples.
# Note: this rebinds `text`, which the NLI cells earlier bound to a different
# sentence; every later cell sees this new value.
text = "川べりでサーフボードを持った人たちがいます"
sim_text = "サーファーたちが川べりに立っています"

In [13]:
# Compute the similarity between text and sim_text
result = text_sim_pipeline({"text": text, "text_pair": sim_text})
# The pipeline result is indexed by "score" to get the similarity value.
print(result["score"])

3.5703558921813965


In [14]:
# Sentence intended to be unrelated to `text`.
dissim_text = "トイレの壁に黒いタオルがかけられています"

# Compute the similarity between text and dissim_text
result = text_sim_pipeline({"text": text, "text_pair": dissim_text})
print(result["score"])

0.041621800512075424


In [15]:
from torch.nn.functional import cosine_similarity

# Sentence encoder for the embedding-based similarity examples. The task is
# given explicitly as feature-extraction so the pipeline hands back model
# features instead of classification labels.
sim_enc_pipeline = pipeline(
    task="feature-extraction",
    model="llm-book/bert-base-japanese-v3-unsup-simcse-jawiki",
)

config.json:   0%|          | 0.00/634 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/231k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cpu


In [26]:
# Get the vectors for text and sim_text
# return_tensors=True requests a tensor result; [0][0] then takes the first
# entry along its first two axes.
# NOTE(review): presumably that is batch item 0, first token ([CLS]) — confirm
# this matches the pooling the checkpoint was trained with.
text_emb = sim_enc_pipeline(text, return_tensors=True)[0][0]
sim_emb = sim_enc_pipeline(sim_text, return_tensors=True)[0][0]

In [29]:
# Compute the similarity between text and sim_text
# dim=0 reduces over the first axis of the two embeddings
# (presumably 1-D vectors after the [0][0] indexing — TODO confirm).
sim_pair_score = cosine_similarity(text_emb, sim_emb,dim=0)
# .item() converts the resulting tensor to a plain Python number for printing.
print(sim_pair_score.item())

0.8568590879440308


In [36]:
# Get the vector for dissim_text
# [0][0] takes the first entry along the first two axes of the returned tensor
# (presumably batch item 0, first token — TODO confirm).
dissim_emb = sim_enc_pipeline(dissim_text, return_tensors=True)[0][0]

In [39]:
# Compute the similarity between text and dissim_text
# dim=0 reduces over the first axis of the two embeddings
# (presumably 1-D vectors after the [0][0] indexing — TODO confirm).
dissim_pair_score = cosine_similarity(text_emb, dissim_emb, dim=0)
# .item() converts the resulting tensor to a plain Python number for printing.
print(dissim_pair_score.item())

0.458870530128479
