# 第9章: 事前学習済み言語モデル（BERT型）

本章では、BERT型の事前学習済みモデルを利用して、マスク単語の予測や文ベクトルの計算、評判分析器（ポジネガ分類器）の構築に取り組む。

## 80. トークン化

"The movie was full of incomprehensibilities."という文をトークンに分解し、トークン列を表示せよ。

In [2]:
from transformers import logging, BertTokenizer

# 警告を抑制（重要なエラーだけ表示）
logging.set_verbosity_error()

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
text = "The movie was full of incomprehensibilities."
tokens = tokenizer.tokenize(text)

print(tokens)

['the', 'movie', 'was', 'full', 'of', 'inc', '##omp', '##re', '##hen', '##si', '##bilities', '.']


## 81. マスクの予測

"The movie was full of [MASK]."の"[MASK]"を埋めるのに最も適切なトークンを求めよ。

In [4]:
from transformers import pipeline
from pprint import pprint

# パイプライン作成と予測
unmasker = pipeline("fill-mask", model="bert-base-uncased")
results = unmasker("The movie was full of [MASK].")
pprint(results[0])


{'score': 0.10711909830570221,
 'sequence': 'the movie was full of fun.',
 'token': 4569,
 'token_str': 'fun'}


## 82. マスクのtop-k予測

"The movie was full of [MASK]."の"[MASK]"に埋めるのに適切なトークン上位10個と、その確率（尤度）を求めよ。

In [5]:
from transformers import pipeline

# pipelineを作成（マスク補完用）
unmasker = pipeline("fill-mask", model="bert-base-uncased")

# 入力文（[MASK]は必ず大文字で）
text = "The movie was full of [MASK]."

# top_k=10で上位10個の予測を取得
results = unmasker(text, top_k=10)

# 結果の表示
for i, result in enumerate(results, 1):
    token = result["token_str"]
    score = result["score"]
    print(f"{i}. {token:<15} (probability: {score:.4f})")


1. fun             (probability: 0.1071)
2. surprises       (probability: 0.0663)
3. drama           (probability: 0.0447)
4. stars           (probability: 0.0272)
5. laughs          (probability: 0.0254)
6. action          (probability: 0.0195)
7. excitement      (probability: 0.0190)
8. people          (probability: 0.0183)
9. tension         (probability: 0.0150)
10. music           (probability: 0.0146)


## 83. CLSトークンによる文ベクトル

以下の文の全ての組み合わせに対して、最終層の[CLS]トークンの埋め込みベクトルを用いてコサイン類似度を求めよ。

- "The movie was full of fun."
- "The movie was full of excitement."
- "The movie was full of crap."
- "The movie was full of rubbish."


In [6]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import itertools

# デバイス設定（GPUがあるなら使う）
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# モデルとトークナイザのロード
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.to(device)
model.eval()

# 対象の文
sentences = [
    "The movie was full of fun.",
    "The movie was full of excitement.",
    "The movie was full of crap.",
    "The movie was full of rubbish."
]

# 各文に対する [CLS] トークンの最終層埋め込みを取得
cls_embeddings = []

with torch.no_grad():
    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True).to(device)
        outputs = model(**inputs)
        cls_embed = outputs.last_hidden_state[:, 0, :]  # [CLS] トークンは位置 0
        cls_embeddings.append(cls_embed.cpu())

# 埋め込みを1つのテンソルにまとめて numpy 配列に変換
cls_embeddings = torch.cat(cls_embeddings, dim=0).numpy()

# コサイン類似度の計算
similarities = cosine_similarity(cls_embeddings)

# 出力：すべてのペアのコサイン類似度
pairs = list(itertools.combinations(range(len(sentences)), 2))
for i, j in pairs:
    print(f"Similarity between:\n  \"{sentences[i]}\"\n  \"{sentences[j]}\"\n  => {similarities[i][j]:.4f}\n")


Similarity between:
  "The movie was full of fun."
  "The movie was full of excitement."
  => 0.9881

Similarity between:
  "The movie was full of fun."
  "The movie was full of crap."
  => 0.9558

Similarity between:
  "The movie was full of fun."
  "The movie was full of rubbish."
  => 0.9475

Similarity between:
  "The movie was full of excitement."
  "The movie was full of crap."
  => 0.9541

Similarity between:
  "The movie was full of excitement."
  "The movie was full of rubbish."
  => 0.9487

Similarity between:
  "The movie was full of crap."
  "The movie was full of rubbish."
  => 0.9807



## 84. 平均による文ベクトル

以下の文の全ての組み合わせに対して、最終層の埋め込みベクトルの平均を用いてコサイン類似度を求めよ。

- "The movie was full of fun."
- "The movie was full of excitement."
- "The movie was full of crap."
- "The movie was full of rubbish."

In [5]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import itertools

# デバイス設定
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# モデルとトークナイザのロード
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.to(device)
model.eval()

# 対象文
sentences = [
    "The movie was full of fun.",
    "The movie was full of excitement.",
    "The movie was full of crap.",
    "The movie was full of rubbish."
]

# 各文に対して平均埋め込みを計算
mean_embeddings = []

with torch.no_grad():
    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True).to(device)
        outputs = model(**inputs)

        # last_hidden_state: (1, seq_len, hidden_dim)
        token_embeddings = outputs.last_hidden_state.squeeze(0)  # (seq_len, hidden_dim)
        attention_mask = inputs["attention_mask"].squeeze(0)     # (seq_len)

        # attention_mask を使って、PAD トークンを除外して平均
        mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size())
        sum_embeddings = torch.sum(token_embeddings * mask_expanded, dim=0)
        sum_mask = torch.clamp(mask_expanded.sum(dim=0), min=1e-9)
        mean_embedding = sum_embeddings / sum_mask

        mean_embeddings.append(mean_embedding.cpu())

# numpy に変換してコサイン類似度を計算
mean_embeddings = torch.stack(mean_embeddings).numpy()
similarities = cosine_similarity(mean_embeddings)

# 出力
pairs = list(itertools.combinations(range(len(sentences)), 2))
for i, j in pairs:
    print(f"Similarity between:\n  \"{sentences[i]}\"\n  \"{sentences[j]}\"\n  => {similarities[i][j]:.4f}\n")


Similarity between:
  "The movie was full of fun."
  "The movie was full of excitement."
  => 0.9568

Similarity between:
  "The movie was full of fun."
  "The movie was full of crap."
  => 0.8490

Similarity between:
  "The movie was full of fun."
  "The movie was full of rubbish."
  => 0.8169

Similarity between:
  "The movie was full of excitement."
  "The movie was full of crap."
  => 0.8352

Similarity between:
  "The movie was full of excitement."
  "The movie was full of rubbish."
  => 0.7938

Similarity between:
  "The movie was full of crap."
  "The movie was full of rubbish."
  => 0.9226



## 85. データセットの準備

[General Language Understanding Evaluation (GLUE)](https://gluebenchmark.com/) ベンチマークで配布されている[Stanford Sentiment Treebank (SST)](https://dl.fbaipublicfiles.com/glue/data/SST-2.zip) から訓練セット（train.tsv）と開発セット（dev.tsv）のテキストと極性ラベルと読み込み、さらに全てのテキストはトークン列に変換せよ。

In [2]:
!wget https://dl.fbaipublicfiles.com/glue/data/SST-2.zip
!unzip SST-2.zip

--2025-05-23 04:43:12--  https://dl.fbaipublicfiles.com/glue/data/SST-2.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 108.157.254.124, 108.157.254.15, 108.157.254.121, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|108.157.254.124|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7439277 (7.1M) [application/zip]
Saving to: ‘SST-2.zip’


2025-05-23 04:43:14 (5.53 MB/s) - ‘SST-2.zip’ saved [7439277/7439277]

Archive:  SST-2.zip
   creating: SST-2/
  inflating: SST-2/dev.tsv           
   creating: SST-2/original/
  inflating: SST-2/original/README.txt  
  inflating: SST-2/original/SOStr.txt  
  inflating: SST-2/original/STree.txt  
  inflating: SST-2/original/datasetSentences.txt  
  inflating: SST-2/original/datasetSplit.txt  
  inflating: SST-2/original/dictionary.txt  
  inflating: SST-2/original/original_rt_snippets.txt  
  inflating: SST-2/original/sentiment_labels.txt  
  inflating: SST-2/test.tsv          
  inflating: 

In [6]:
import pandas as pd

train_data = pd.read_csv('SST-2/train.tsv', sep='\t')
dev_data = pd.read_csv('SST-2/dev.tsv', sep='\t')

train_data1 = []
for _,j in train_data.iterrows():
  tokens = tokenizer.tokenize(j["sentence"])
  data = {"sentence":tokens,"label":j["label"]}
  train_data1.append(data)

dev_data1 = []
for _,j in dev_data.iterrows():
  tokens = tokenizer.tokenize(j["sentence"])
  data = {"sentence":tokens,"label":j["label"]}
  dev_data1.append(data)

## 86. ミニバッチの作成

85で読み込んだ訓練データの一部（例えば冒頭の4事例）に対して、パディングなどの処理を行い、トークン列の長さを揃えてミニバッチを構成せよ。

In [11]:
# 86の解答：冒頭4事例に対してパディングを行い、ミニバッチを構成する

from transformers import BertTokenizer
import torch
from torch.nn.utils.rnn import pad_sequence

# トークナイザーの準備（BERTを例とする）
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 冒頭の4事例だけを対象にする
train_data_subset = train_data1[:4]

input_ids = []
attention_masks = []
labels = []

for i in train_data_subset:
    # トークナイズしてID化、特殊トークン追加、attention mask生成
    encoded = tokenizer.encode_plus(
        " ".join(i['sentence']),
        add_special_tokens=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    input_ids.append(encoded['input_ids'].squeeze(0))
    attention_masks.append(encoded['attention_mask'].squeeze(0))
    labels.append(int(i['label']))

# パディングしてバッチ化（最大長に揃える）
input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
attention_masks_padded = pad_sequence(attention_masks, batch_first=True, padding_value=0)
labels_tensor = torch.tensor(labels)

# 結果を表示
print("Input IDs (padded):")
print(input_ids_padded)
print("\nAttention Masks:")
print(attention_masks_padded)
print("\nLabels:")
print(labels_tensor)


Input IDs (padded):
tensor([[  101,  5342,  2047,  3595,  1001,  1001, 15956,  2013,  1996, 18643,
          3197,   102,     0,     0,     0,     0,     0],
        [  101,  3397,  2053, 15966,  1010,  2069,  4450,  1001,  1001,  3968,
         18201,  1001,  1001,  1055,   102,     0,     0],
        [  101,  2008,  7459,  2049,  3494,  1998, 10639,  1001,  1001,  1055,
          2242,  2738,  3376,  2055,  2529,  3267,   102],
        [  101,  3464, 12580,  8510,  2000,  3961,  1996,  2168,  2802,   102,
             0,     0,     0,     0,     0,     0,     0]])

Attention Masks:
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])

Labels:
tensor([0, 0, 1, 0])


## 87. ファインチューニング

訓練セットを用い、事前学習済みモデルを極性分析タスク向けにファインチューニングせよ。検証セット上でファインチューニングされたモデルの正解率を計測せよ。

In [9]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import accuracy_score
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 軽量モデルとトークナイザー
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

class SST2Dataset(Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        self.encodings = tokenizer(
            df["sentence"].tolist(),
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )
        self.labels = torch.tensor(df["label"].tolist())

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()} | {"labels": self.labels[idx]}

train_dataset = SST2Dataset(train_data, tokenizer)
dev_dataset = SST2Dataset(dev_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=0)
dev_loader = DataLoader(dev_dataset, batch_size=16, num_workers=0)

optimizer = AdamW(model.parameters(), lr=2e-5)

epochs = 1  # エポック数10に変更
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} Average Loss: {avg_loss:.4f}")

    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for batch in dev_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            targets.extend(batch["labels"].cpu().numpy())

    acc = accuracy_score(targets, preds)
    print(f"Epoch {epoch+1} Validation Accuracy: {acc:.4f}")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 4210/4210 [11:51<00:00,  5.91it/s]


Epoch 1 Average Loss: 0.2146
Epoch 1 Validation Accuracy: 0.8979


## 88. 極性分析

問題87でファインチューニングされたモデルを用いて、以下の文の極性を予測せよ。

- "The movie was full of incomprehensibilities."
- "The movie was full of fun."
- "The movie was full of excitement."
- "The movie was full of crap."
- "The movie was full of rubbish."


In [10]:
# 推論対象の文
sentences = [
    "The movie was full of incomprehensibilities.",
    "The movie was full of fun.",
    "The movie was full of excitement.",
    "The movie was full of crap.",
    "The movie was full of rubbish."
]

# モデルを評価モードに
model.eval()

# 各文に対して推論
for sentence in sentences:
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class_id = logits.argmax(dim=1).item()

    label_str = "positive" if predicted_class_id == 1 else "negative"
    print(f"\"{sentence}\" => {label_str}")


"The movie was full of incomprehensibilities." => negative
"The movie was full of fun." => positive
"The movie was full of excitement." => positive
"The movie was full of crap." => negative
"The movie was full of rubbish." => negative


## 89. アーキテクチャの変更

問題87とは異なるアーキテクチャ（例えば[CLS]トークンを用いるか、各トークンの最大値プーリングを用いるなど）の分類モデルを設計し、事前学習済みモデルを極性分析タスク向けにファインチューニングせよ。検証セット上でファインチューニングされたモデルの正解率を計測せよ。

In [11]:
from torch import nn

# DistilBERT 本体 + 自作の出力層に置き換え（CLSトークン明示使用）
from transformers import DistilBertModel

class DistilBertClassifier(nn.Module):
    def __init__(self, model_name="distilbert-base-uncased", num_labels=2):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0]  # CLSトークン
        logits = self.classifier(cls_output)
        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)
        return {'loss': loss, 'logits': logits}


In [13]:
# モデル初期化
model = DistilBertClassifier().to(device)

# 残りは以前のコードと同様：
optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 1

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs['loss']
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()

    print(f"Epoch {epoch+1} Average Loss: {total_loss / len(train_loader):.4f}")

    # 評価
    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for batch in dev_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs["logits"]
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            targets.extend(batch["labels"].cpu().numpy())

    acc = accuracy_score(targets, preds)
    print(f"Epoch {epoch+1} Validation Accuracy: {acc:.4f}")


Epoch 1: 100%|██████████| 4210/4210 [11:50<00:00,  5.93it/s]


Epoch 1 Average Loss: 0.2109
Epoch 1 Validation Accuracy: 0.9002
