<a href="https://colab.research.google.com/github/haru1489248/nlp-100-nock/blob/main/ch10/section_96.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 96. プロンプトによる感情分析
事前学習済み言語モデルで感情分析を行いたい。テキストを含むプロンプトを事前学習済み言語モデルに与え、（ファインチューニングは行わずに）テキストのポジネガを予測するという戦略で、SST-2の開発データにおける正解率を測定せよ。



In [1]:
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably needed because the checkpoint loaded later
# (meta-llama/...) requires an authenticated download - confirm.
from huggingface_hub import notebook_login
notebook_login()

In [2]:
!pip install -U transformers

Collecting transformers
  Downloading transformers-5.1.0-py3-none-any.whl.metadata (31 kB)
Downloading transformers-5.1.0-py3-none-any.whl (10.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB[0m [31m82.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 5.0.0
    Uninstalling transformers-5.0.0:
      Successfully uninstalled transformers-5.0.0
Successfully installed transformers-5.1.0


In [3]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from tqdm import tqdm
from google.colab import drive
# Mount Google Drive under /content/drive; the SST-2 dev file read later in
# this notebook lives below /content/drive/MyDrive.
drive.mount("/content/drive")

Mounted at /content/drive


In [24]:
# --- Run configuration -------------------------------------------------------
# Location of the SST-2 development split on the mounted Drive.
src_path = "/content/drive/MyDrive/SST-2/dev.tsv"

# Instruction-tuned checkpoint that is prompted (not fine-tuned) for the task.
model_id = "meta-llama/Llama-3.2-1B-Instruct"

# The expected answer is a single word, so a small generation budget suffices.
max_new_tokens = 10

# Batch the prompts on GPU; fall back to one prompt at a time on CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
  batch_size = 32
else:
  device = torch.device("cpu")
  batch_size = 1

### device_mapとは？
モデルの重みをどのデバイス（GPU / CPU）に配置するかを指定する引数。`device_map="auto"` を指定すると、利用可能なデバイスに応じて配置先（cuda か cpu か）を自動で決めてくれる。



In [None]:
# Tokenizer first: batched generation needs a padding token, so reuse the
# end-of-sequence token whenever the tokenizer does not define one.
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token_id is None:
  tokenizer.pad_token_id = tokenizer.eos_token_id

# Causal LM weights; device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

In [6]:
# Read the tab-separated SST-2 dev split (read_table uses "\t" as separator).
dev_df = pd.read_table(src_path)

In [34]:
# Decoding settings shared by every batch.
generation_config = GenerationConfig(
    # Deterministic decoding (no sampling) with a short output budget.
    do_sample=False,
    max_new_tokens=max_new_tokens,
    # Special token ids are taken from the tokenizer configured earlier.
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

In [8]:
# Pull the input texts and their gold labels out of the frame as plain lists.
sentences, labels = (dev_df[column].to_list() for column in ("sentence", "label"))

In [30]:
# Sanity check: the two lists and the frame must all have the same length.
print(
    f"len sentences: {len(sentences)}",
    f"len labels: {len(labels)}",
    f"len dev df: {len(dev_df)}",
    sep="\n",
)

len sentences: 872
len labels: 872
len dev df: 872


In [None]:
# Zero-shot sentiment classification over the SST-2 dev set.
# Each sentence is wrapped in a chat prompt, the model generates a short
# answer, and the answer is mapped to a label:
#   1 = "positive", 0 = "negative", -1 = neither word found (counted as wrong).

# The system prompt is the same for every sentence, so it is built once.
# NOTE(review): the example sentence says "full of fan" (presumably "fun").
# The text is left untouched because editing the prompt changes the measured
# accuracy - fix it deliberately and re-run if desired.
system_prompt = """
            You are a classification model for the sentiment analyzer.
            Answer with exactly one word: positive or negative.
            Do not output anything else.
            For example, the positive sentence 'The movie was full of fan.' is inputted, you should return positive.
            """

correct = 0
# tqdm reads the number of batches from len(range(...)), so no explicit
# (and previously fractional) total is needed.
for i in tqdm(range(0, len(sentences), batch_size)):
  batch_sentences = sentences[i:i+batch_size]
  batch_labels = labels[i:i+batch_size]  # kept as plain ints; no tensor needed

  # Render one chat-formatted prompt string per sentence.
  batch_chat_template = [
      tokenizer.apply_chat_template(
          [
              {"role": "system", "content": system_prompt},
              {"role": "user", "content": s},
          ],
          tokenize=False,
          add_generation_prompt=True,
      )
      for s in batch_sentences
  ]

  # The rendered chat template already contains the special tokens the model
  # expects, so they must not be added a second time while tokenizing.
  # Left padding keeps the prompt end aligned across the batch, which is what
  # makes a single `input_len` offset valid for every row below.
  tokenized_batch = tokenizer(
      batch_chat_template,
      padding=True,
      padding_side="left",
      add_special_tokens=False,
      return_tensors="pt"
  )
  batch_input = tokenized_batch["input_ids"].to(device)
  batch_attention_mask = tokenized_batch["attention_mask"].to(device)
  input_len = batch_input.shape[1]  # padded prompt length shared by all rows

  with torch.no_grad():
    outputs = model.generate(
        input_ids=batch_input,
        attention_mask=batch_attention_mask,
        generation_config=generation_config,
    )

  for output, true_label in zip(outputs, batch_labels):
    # Decode only the newly generated tokens; lower-case the text so that
    # answers such as "Positive" are recognised as well.
    response = tokenizer.decode(output[input_len:], skip_special_tokens=True).lower()
    if "positive" in response:
      pred_label = 1
    elif "negative" in response:
      pred_label = 0
    else:
      pred_label = -1

    if pred_label == true_label:
      correct += 1

# Report once, after every batch has been scored; guard the empty-dataset case.
accuracy = correct / len(sentences) if sentences else 0.0

print(f"Accuracy: {accuracy:.4f} ({correct} / {len(sentences)})")