In [None]:
# 学習可能な事前学習済みモデルロード
import torch
from transformers import AutoModelForSequenceClassification, DistilBertForSequenceClassification

# Checkpoint to fine-tune and the device the model will live on
model_ckpt = "distilbert-base-uncased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Number of emotion classes for the classification head
num_labels = 6
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt, num_labels=num_labels
)
model = model.to(device)

In [None]:
# Tokenizer loaded from the same checkpoint name as the model (model_ckpt)
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# Input format
## Emotion dataset (DatasetDict; the "train" and "validation" splits are used below)
from datasets import load_dataset
emotions = load_dataset("dair-ai/emotion")

## Tokenization
def tokenize(batch):
  """Tokenize the ``text`` field of ``batch`` with padding and truncation enabled."""
  texts = batch["text"]
  encoded = tokenizer(texts, padding=True, truncation=True)
  return encoded
# Apply tokenize to every split in batched mode.
# NOTE(review): batch_size=None presumably hands each split to tokenize as a
# single batch so padding length is uniform per split — confirm in the datasets docs.
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

## Change the output format
emotions_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"]) # lets you see what each column's tensor looks like

In [None]:
# 性能評価関数
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
  """Return accuracy and weighted F1 for a prediction object.

  Reference labels are read from ``pred.label_ids``; the predicted class is the
  argmax over the last axis of ``pred.predictions``.
  """
  y_true = pred.label_ids
  y_hat = pred.predictions.argmax(-1)
  accuracy = accuracy_score(y_true, y_hat)
  weighted_f1 = f1_score(y_true, y_hat, average="weighted")
  return {"accuracy": accuracy, "f1": weighted_f1}

In [None]:
# Log in to Hugging Face (token with write permission)
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# 学習パラメータ
from transformers import Trainer, TrainingArguments

batch_size = 64
# Log once per epoch's worth of optimizer steps. Derived from the data instead of
# hard-coded: 16000 training rows // 64 == 250, the value used previously.
# max(1, ...) keeps the setting valid if the split is smaller than one batch.
logging_steps = max(1, len(emotions_encoded["train"]) // batch_size)
# Also used as the output directory of the training run
model_name = f"{model_ckpt}-finetuned-emotion"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  eval_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=True,
                                  log_level="error",
                                  report_to='none')

In [None]:
# Trainer で学習
from transformers import Trainer

# Recent transformers releases accept the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword. Pick whichever keyword the installed
# Trainer exposes so this cell runs on both older and newer versions.
import inspect
_tokenizer_kwarg = ("processing_class"
                    if "processing_class" in inspect.signature(Trainer.__init__).parameters
                    else "tokenizer")

trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=emotions_encoded["train"],
                  eval_dataset=emotions_encoded["validation"],
                  **{_tokenizer_kwarg: tokenizer})
trainer.train()

In [None]:
# 混同行列のプロット
## Predictions on the validation split
preds_output = trainer.predict(emotions_encoded["validation"])
print(preds_output.metrics)  # metrics including accuracy and f1

## Class names and ground-truth labels
import numpy as np
labels = emotions["train"].features["label"].names
y_valid = np.array(emotions_encoded["validation"]["label"])

## 混同行列プロット関数
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt
def plot_confusion_matrix(y_preds, y_true, labels):
  """Plot the confusion matrix of ``y_true`` vs ``y_preds``, normalized over the true labels.

  ``labels`` provides the tick labels shown on the display.
  """
  normalized_cm = confusion_matrix(y_true, y_preds, normalize="true")
  cm_display = ConfusionMatrixDisplay(confusion_matrix=normalized_cm,
                                      display_labels=labels)
  _fig, ax = plt.subplots(figsize=(6, 6))
  cm_display.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
  ax.set_title("Normalized confusion matrix")
  plt.show()

## Confusion matrix for the fine-tuned model
y_preds = preds_output.predictions.argmax(axis=1)
plot_confusion_matrix(y_preds, y_valid, labels)

In [None]:
# 予測ラベルと損失の取得
from torch.nn.functional import cross_entropy

def forward_pass_with_label(batch):
  """Run the model on one batch and return per-example loss and predicted label.

  Args:
    batch: mapping of column name to tensor; must contain ``"label"`` plus the
      columns named in ``tokenizer.model_input_names``.

  Returns:
    dict with ``"loss"`` (unreduced cross-entropy, one value per example) and
    ``"predicted_label"`` (argmax over the logits), both as NumPy arrays on CPU.
  """
  ## Move only the tensors the model accepts onto the device
  inputs = {k: v.to(device) for k, v in batch.items()
            if k in tokenizer.model_input_names}
  targets = batch["label"].to(device)

  # Put the model in eval mode explicitly so training-only layers such as dropout
  # are disabled; otherwise the result depends on whichever cell last touched it.
  model.eval()
  with torch.no_grad():
    logits = model(**inputs).logits
    pred_label = torch.argmax(logits, dim=-1)
    loss = cross_entropy(logits, targets, reduction="none")
  ## Return the results on the CPU
  return {"loss": loss.cpu().numpy(),
          "predicted_label": pred_label.cpu().numpy()}

In [None]:
# Build the DataFrame
# Run forward_pass_with_label over the validation split in batches of 16 and store
# the result back; the function returns "loss" and "predicted_label" per batch.
emotions_encoded["validation"]  = emotions_encoded["validation"].map(forward_pass_with_label, batched=True, batch_size=16)

## Express labels as strings
def label_int2str(row):
  """Convert an integer class id to its string name via the train split's ``label`` feature."""
  label_feature = emotions["train"].features["label"]
  return label_feature.int2str(row)

## Convert to a DataFrame
emotions_encoded.set_format("pandas")
cols = ["text", "label", "predicted_label", "loss"]
# Take an explicit copy: assigning columns onto a column-subset slice triggers
# pandas' SettingWithCopyWarning and leaves it ambiguous which frame is written.
df_test = emotions_encoded["validation"][:][cols].copy()
df_test["label"] = df_test["label"].apply(label_int2str)  # ground-truth label
df_test["predicted_label"] = df_test["predicted_label"].apply(label_int2str)  # predicted label

In [None]:
# Largest loss first (top 10 rows)
df_test.sort_values("loss", ascending=False).head(10)

In [None]:
# Smallest loss first (top 10 rows)
df_test.sort_values("loss", ascending=True).head(10)

In [None]:
# Upload the model
trainer.push_to_hub(commit_message="Training completed.")

In [None]:
# アップしたモデルの利用
from transformers import pipeline

model_id = "kirapika2/distilbert-base-uncased-finetuned-emotion"
classifier = pipeline("text-classification", model=model_id)

## Test
custom_tweet = "I saw a movie today and it was really good." # a cheerful-sounding post
# `return_all_scores=True` is deprecated; `top_k=None` is its replacement and
# returns a score for every class.
preds = classifier(custom_tweet, top_k=None)
# Accept both the flat list of dicts and the legacy nested list-of-lists shape.
class_scores = preds[0] if isinstance(preds[0], list) else preds

## Plot the prediction
import pandas as pd
preds_df = pd.DataFrame(class_scores)
# With top_k the entries come back sorted by score, so restore class-id order
# before pairing the scores positionally with `labels`.
preds_df["class_id"] = preds_df["label"].map(classifier.model.config.label2id)
preds_df = preds_df.sort_values("class_id")
plt.bar(labels, 100*preds_df["score"].to_numpy(), color='C0')
plt.title(f'"{custom_tweet}"')
plt.ylabel("Class probability (%)")
plt.show()

In [None]:
# TensorFlow を使用するためバージョンを変更(新たなセッションで実行し、最初の3セルを次に実行)
## バージョン確認
import transformers
import tensorflow as tf
print(f"transformers version: {transformers.__version__}")
print(f"tensorflow version: {tf.__version__}")

## trainsformers==4.49.0 に (http://reddit.com/r/cs50/comments/1mr1ef6/help_with_tensorflow_and_huggingface_transformers/)
!pip uninstall -y transformers
!pip install transformers==4.49.0

In [None]:
# Keras を使ったファインチューニング
from transformers import TFAutoModelForSequenceClassification

## Load the model
tf_model = (TFAutoModelForSequenceClassification
            .from_pretrained(model_ckpt, num_labels=num_labels))

## Convert the datasets to TensorFlow datasets
tokenizer_columns = tokenizer.model_input_names
batch_size = 64
# Only the training split is shuffled; validation keeps its order.
tf_train_dataset = emotions_encoded["train"].to_tf_dataset(
    columns=tokenizer_columns, label_cols=["label"], shuffle=True,
    batch_size=batch_size
)
tf_validation_dataset = emotions_encoded["validation"].to_tf_dataset(
    columns=tokenizer_columns, label_cols=["label"], shuffle=False,
    batch_size=batch_size
)

## Training
import tensorflow as tf
tf_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    # from_logits=True: the loss is given the model's raw logits
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    # Keras documents `metrics` as a list; a bare metric object is not accepted
    # by every Keras version.
    metrics=[tf.metrics.SparseCategoricalAccuracy()]
)
tf_model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=2)