In [1]:
# 必要なライブラリのインポート

import numpy as np  # 数値計算用ライブラリ
import pandas as pd  # データ操作用ライブラリ
import torch  # PyTorchのインポート
from sklearn.model_selection import train_test_split  # データ分割のための関数
from torch.utils.data import (
    DataLoader,
    Dataset,
)  # データローダーとデータセットのインポート
from tqdm import tqdm  # プログレスバーの表示
from transformers import (  # Transformersライブラリからのモデルとトークナイザのインポート
    AdamW,
    BertForSequenceClassification,
    BertModel,
    BertTokenizer,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# データの読み込み
df = pd.read_csv("training_data_without_nan.tsv", sep="\t")

# ラベルとテキストの抽出
df["label"] = (
    df["ラベル"].astype("category").cat.codes
)  # ラベルをカテゴリから数値に変換
labels = df["ラベル"].astype("category").cat.categories.tolist()  # カテゴリリストを取得
df["text"] = df["文章"]  # テキストデータを抽出
df["satisfaction"] = df["満足度"]  # 満足度データを抽出

# トレーニングデータとテストデータに分割
train_df, test_df = train_test_split(df, test_size=0.2)  # データを8:2に分割

In [3]:
# データセットクラスの定義
class TextDataset(Dataset):
    def __init__(
        self, df: pd.DataFrame, tokenizer: BertTokenizer, max_length: int = 128
    ):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        return len(self.df)

    def __getitem__(self, idx: int) -> dict:
        text = self.df.iloc[idx]["text"]
        label = self.df.iloc[idx]["label"]
        satisfaction = self.df.iloc[idx]["satisfaction"]

        # トークナイズ
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt",
        )

        # 辞書形式でデータを返す
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": torch.tensor(label, dtype=torch.long),
            "satisfaction": torch.tensor(satisfaction, dtype=torch.float),
        }


# パディングを処理するcollate_fnの定義
def collate_fn(batch: list) -> dict:
    max_length = max([item["input_ids"].size(0) for item in batch])

    input_ids = torch.stack(
        [
            torch.cat(
                [item["input_ids"], torch.zeros(max_length - item["input_ids"].size(0))]
            ).long()
            for item in batch
        ]
    )
    attention_mask = torch.stack(
        [
            torch.cat(
                [
                    item["attention_mask"],
                    torch.zeros(max_length - item["attention_mask"].size(0)),
                ]
            ).long()
            for item in batch
        ]
    )
    labels = torch.stack([item["label"] for item in batch])
    satisfaction = torch.stack([item["satisfaction"] for item in batch])

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "label": labels,
        "satisfaction": satisfaction,
    }

In [4]:
# モデルの設定
MODEL_NAME = "cl-tohoku/bert-base-japanese-v3"
# MODEL_NAME = "cl-tohoku/bert-large-japanese-v2"
batch_size = 16

# トークナイザーとモデルの読み込み
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(labels)
)

# デバイスの設定
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# データローダーの作成
train_dataset = TextDataset(train_df, tokenizer)
test_dataset = TextDataset(test_df, tokenizer)
train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)

# オプティマイザと損失関数の定義
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertJapaneseTokenizer'. 
The class this function is called from is 'BertTokenizer'.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-v3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# モデルの訓練
def train_epoch(
    model, data_loader, criterion, optimizer, device
) -> tuple[float, float]:
    model.train()  # モデルを訓練モードに設定
    total_loss = 0
    correct_predictions = 0

    for batch in tqdm(data_loader, desc="Training"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)  # 損失の計算
        _, preds = torch.max(outputs.logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        total_loss += loss.item()

        optimizer.zero_grad()  # 勾配の初期化
        loss.backward()  # 逆伝播
        optimizer.step()  # オプティマイザのステップ

    return correct_predictions.double() / len(data_loader.dataset), total_loss / len(
        data_loader
    )


# モデルの評価関数
def eval_model(model, data_loader, criterion, device) -> tuple[float, float]:
    model.eval()  # モデルを評価モードに設定
    total_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)  # 損失の計算
            _, preds = torch.max(outputs.logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            total_loss += loss.item()

    return correct_predictions.double() / len(data_loader.dataset), total_loss / len(
        data_loader
    )

In [6]:
# 学習前後の精度を比較
# 学習前の精度
pre_train_accuracy, _ = eval_model(model, test_loader, criterion, device)
print(f"Accuracy before training: {pre_train_accuracy}")

Evaluating:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluating: 100%|██████████| 25/25 [00:01<00:00, 18.40it/s]

Accuracy before training: 0.12658227848101267





In [7]:
# 訓練と評価の実行
num_epochs = 5
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_accuracy, train_loss = train_epoch(
        model, train_loader, criterion, optimizer, device
    )
    print(f"Train loss: {train_loss}, Train accuracy: {train_accuracy}")

Epoch 1/5


Training: 100%|██████████| 99/99 [00:15<00:00,  6.46it/s]


Train loss: 2.200716941043584, Train accuracy: 0.3688212927756654
Epoch 2/5


Training: 100%|██████████| 99/99 [00:15<00:00,  6.50it/s]


Train loss: 1.3217129502633604, Train accuracy: 0.6501901140684411
Epoch 3/5


Training: 100%|██████████| 99/99 [00:15<00:00,  6.55it/s]


Train loss: 0.9314218341100096, Train accuracy: 0.7477820025348542
Epoch 4/5


Training: 100%|██████████| 99/99 [00:15<00:00,  6.43it/s]


Train loss: 0.7121679388513469, Train accuracy: 0.806083650190114
Epoch 5/5


Training: 100%|██████████| 99/99 [00:15<00:00,  6.44it/s]

Train loss: 0.5485909630854925, Train accuracy: 0.8510773130544993





In [8]:
# 学習後の精度評価
post_train_accuracy, _ = eval_model(model, test_loader, criterion, device)
print(f"Accuracy after training: {post_train_accuracy}")

Evaluating: 100%|██████████| 25/25 [00:01<00:00, 20.98it/s]

Accuracy after training: 0.6430379746835443





In [9]:
# 学習済みモデルの保存
# torch.save(model.state_dict(), "model_bert_large_japanese_v2.pth")
# torch.save(model.state_dict(), "model_bert_base_japanese_v3.pth")

In [10]:
# 予測関数
def predict(
    text: str,
    model: BertForSequenceClassification,
    tokenizer: BertTokenizer,
    device: torch.device,
    top_k: int = 1,
) -> dict:
    """
    テキストのカテゴリと満足度を予測する関数

    Parameters
    ----------
    text : str
        予測対象のテキスト
    model : BertForSequenceClassification
        予測に使用する事前学習済みモデル
    tokenizer : BertTokenizer
        トークナイザー
    device : torch.device
        使用するデバイス (CPUまたはGPU)
    top_k : int, optional
        上位何カテゴリを表示するか (デフォルトは1)

    Returns
    -------
    dict
        予測結果 (カテゴリと満足度)
    """
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding="max_length",
        return_attention_mask=True,
        return_tensors="pt",
    )

    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    model.eval()
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits  # 出力ロジットを取得
    satisfaction = torch.tanh(
        logits[:, -1]
    ).squeeze()  # tanh関数を使用して-1から1の範囲に収める
    # satisfaction = logits[:, -1]

    probs = torch.nn.functional.softmax(logits, dim=1)
    top_probs, top_classes = torch.topk(probs, top_k, dim=1)

    predictions = []
    for i in range(top_k):
        predictions.append(
            {
                "category": labels[top_classes[0][i]],
                "confidence": top_probs[0][i].item(),
            }
        )

    return {
        "predictions": predictions,
        "satisfaction": satisfaction.item(),
    }

In [15]:
# 例
example_texts = [
    "行きたいところにすぐに行ける",
    "子供が喜ぶ施設が多い",
    "食事が美味しい",
    "自然が多い",
    "かなり発展した街なのに映画館がない。ライブハウスがない。",
]
for example_text in example_texts:
    prediction = predict(example_text, model, tokenizer, device)
    print(f"Text: {example_text}")
    print(f"Prediction Category: {prediction['predictions']}")
    # print(f"Satisfaction: {prediction['satisfaction']}")
    print()

Text: 行きたいところにすぐに行ける
Prediction Category: [{'category': '移動・交通', 'confidence': 0.9709092378616333}]

Text: 子供が喜ぶ施設が多い
Prediction Category: [{'category': '医療・福祉', 'confidence': 0.5797423124313354}]

Text: 食事が美味しい
Prediction Category: [{'category': '買物・飲食', 'confidence': 0.3605135977268219}]

Text: 自然が多い
Prediction Category: [{'category': '自然景観', 'confidence': 0.6363059878349304}]

Text: かなり発展した街なのに映画館がない。ライブハウスがない。
Prediction Category: [{'category': '遊び・娯楽', 'confidence': 0.9026836156845093}]

