In [None]:
# Load the pretrained model
from transformers import AutoModel
import torch

model_ckpt = "distilbert-base-uncased"
# Run on the GPU when torch reports one is available, otherwise on the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
model = AutoModel.from_pretrained(model_ckpt)
model = model.to(device)

In [None]:
# Turn a sample sentence into tensors
from transformers import AutoTokenizer

## Tokenizer loaded from the same checkpoint name as the model
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

text = "this is a test"
inputs = tokenizer(text, return_tensors="pt")
# The printed size reads as [batch_size, n_tokens]
print(f"Input tensor shape: {inputs['input_ids'].size()}")

In [None]:
# Feed the encoded sentence through the model and show the output
## Move every input tensor onto the same device as the model
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
with torch.no_grad():  # inference only, so skip gradient tracking
  outputs = model(**inputs)  # the dict is unpacked into keyword arguments
print(outputs)

In [None]:
# Size of the last hidden state
outputs.last_hidden_state.shape

In [None]:
# Size of the hidden state at position 0 (the [CLS] token)
outputs.last_hidden_state[:, 0].shape

In [None]:
# Hidden-state extraction function
def extract_hidden_states(batch):
  """Run the model on a batch and return the position-0 hidden vectors.

  Only the entries of ``batch`` whose key appears in
  ``tokenizer.model_input_names`` are passed to the model, after being moved
  to ``device``. The forward pass runs with gradient tracking disabled.

  Returns:
    A dict with the single key ``"hidden_states"`` holding
    ``last_hidden_state[:, 0]`` (the [CLS] token vector, per the original
    note) copied to the CPU as a NumPy array.
  """
  model_inputs = {}
  for name, tensor in batch.items():
    if name in tokenizer.model_input_names:
      model_inputs[name] = tensor.to(device)
  with torch.no_grad():
    model_output = model(**model_inputs)
  cls_vectors = model_output.last_hidden_state[:, 0]
  return {"hidden_states": cls_vectors.cpu().numpy()}

In [None]:
# Prepare the model inputs
## Emotion dataset
from datasets import load_dataset
emotions = load_dataset("dair-ai/emotion")

## Tokenization
def tokenize(batch):
  """Tokenize the "text" entries of a batch with padding and truncation enabled."""
  texts = batch["text"]
  return tokenizer(texts, padding=True, truncation=True)

emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

## Switch the output format
# Expose the listed columns as torch tensors so they can be inspected and fed to the model.
emotions_encoded.set_format(
  type="torch",
  columns=["input_ids", "attention_mask", "label"],
)

In [None]:
# Hidden states for every example
# Processed in batches of 1000 (the library default, spelled out here); this step takes a long time.
emotions_hidden = emotions_encoded.map(
  extract_hidden_states,
  batched=True,
  batch_size=1000,
)
emotions_hidden["train"].column_names

In [None]:
# Build the feature matrices and label vectors
import numpy as np

# Each split yields (features, labels) in that order.
X_train, y_train = (
  np.array(emotions_hidden["train"][column])
  for column in ("hidden_states", "label")
)
X_valid, y_valid = (
  np.array(emotions_hidden["validation"][column])
  for column in ("hidden_states", "label")
)
X_train.shape, X_valid.shape

In [None]:
# Project the hidden states to 2-D with UMAP
from umap import UMAP
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

## UMAP's optimisation is stochastic: without a seed the embedding (and every
## plot built from it) changes on each run. A fixed seed keeps Restart & Run All
## reproducible. NOTE(review): umap-learn may run single-threaded when seeded.
UMAP_SEED = 42

## Scale every feature to [0, 1]
X_scaled = MinMaxScaler().fit_transform(X_train)
## Initialise and fit UMAP
mapper = UMAP(n_components=2, metric="cosine", random_state=UMAP_SEED).fit(X_scaled)

## DataFrame holding the 2-D embedding plus the training labels
df_emb = pd.DataFrame(mapper.embedding_, columns=["X", "Y"]).assign(label=y_train)
df_emb.head()

In [None]:
# Density plot of the embedding, one panel per category
import matplotlib.pyplot as plt
fig, axes = plt.subplots(2, 3, figsize=(7,5))
axes = axes.flatten()
cmaps = ["Greys", "Blues", "Oranges", "Reds", "Purples", "Greens"]
labels = emotions["train"].features["label"].names

## Plot
for i, (ax, label, cmap) in enumerate(zip(axes, labels, cmaps)):
  # Keep only the points whose integer label equals this panel's index
  df_emb_sub = df_emb.query(f"label == {i}")
  ax.hexbin(
    df_emb_sub["X"],
    df_emb_sub["Y"],
    cmap=cmap,
    gridsize=20,
    linewidth=(0,),
  )
  ax.set_title(label)
  # Hide the tick marks on both axes
  ax.set_xticks([])
  ax.set_yticks([])

## Render
plt.tight_layout()
plt.show()

In [None]:
# Train a logistic-regression classifier on the hidden states
from sklearn.linear_model import LogisticRegression

## max_iter is raised above the default (100) so that the solver converges
lr_clf = LogisticRegression(max_iter=3000).fit(X_train, y_train)
lr_clf.score(X_valid, y_valid) # accuracy noted by the author: 0.6335

In [None]:
# Measure a baseline
from sklearn.dummy import DummyClassifier

## Always predicts the most frequent class
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
dummy_clf.score(X_valid, y_valid) # accuracy noted by the author: 0.352

In [None]:
# Draw the confusion matrix
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

def plot_confusion_matrix(y_preds, y_true, labels):
  """Show a confusion matrix normalized over the true labels.

  Args:
    y_preds: Predicted labels.
    y_true: Ground-truth labels.
    labels: Display names used for the matrix ticks.
  """
  normalized_cm = confusion_matrix(y_true, y_preds, normalize="true")
  fig, ax = plt.subplots(figsize=(6, 6))
  ConfusionMatrixDisplay(
    confusion_matrix=normalized_cm,
    display_labels=labels,
  ).plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
  ax.set_title("Normalized confusion matrix")
  plt.show()

# Predict on the validation features and plot against the true labels
y_preds = lr_clf.predict(X_valid)
plot_confusion_matrix(y_preds=y_preds, y_true=y_valid, labels=labels)