<a href="https://colab.research.google.com/github/kumakou/BERT_restaurants_review/blob/main/BERT_restaurants_review.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the required libraries

!pip install transformers fugashi ipadic pytorch-lightning

In [None]:
import random
import glob
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader
from transformers import BertJapaneseTokenizer, BertForSequenceClassification, AutoTokenizer
import pytorch_lightning as pl

import pandas as pd

# Pretrained Japanese BERT model; this name is passed to `from_pretrained`
MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'

In [None]:
# Mount Google Drive so files under /content/drive/ become readable

from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Load the review CSV from the mounted Drive into a DataFrame

df = pd.read_csv("/content/drive/MyDrive/BERT/random_review1000.csv")

In [None]:
# Tokenizer matching the pretrained model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Sequence-classification model with a 2-label head
bert_sc = BertForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=2
)
# Move the model to the GPU only when one is present, so this cell does not
# raise on a CPU-only runtime.
if torch.cuda.is_available():
    bert_sc = bert_sc.cuda()

In [None]:
# Build the list of encoded samples that the DataLoaders consume
from torch.utils.data import Dataset, DataLoader

reviews = df["review"].values
labels = df["label"].values

max_length = 128
dataset_for_loader = []

# Walk reviews and labels in lockstep instead of indexing labels by position.
for text, label in zip(reviews, labels):
    # Pad/truncate every review to exactly `max_length` tokens so that all
    # samples can be stacked into one batch tensor.
    encoding = tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True
    )
    encoding['labels'] = label
    # Convert every field to a tensor. (The former per-sample debug print of
    # the full encoding was dropped: it emitted one padded encoding per review.)
    dataset_for_loader.append(
        {k: torch.tensor(v) for k, v in encoding.items()}
    )



In [None]:
# Split the samples 60% / 20% / remainder into train / validation / test
random.shuffle(dataset_for_loader)  # shuffle in place before slicing
n = len(dataset_for_loader)
n_train, n_val = int(0.6*n), int(0.2*n)
dataset_train, dataset_val, dataset_test = (
    dataset_for_loader[:n_train],                 # training data
    dataset_for_loader[n_train:n_train + n_val],  # validation data
    dataset_for_loader[n_train + n_val:],         # test data
)



In [None]:
# Build the DataLoaders

batch_size = 32  # training mini-batch size
shuffle = True  # reshuffle the training data every epoch

# The training loader now actually uses the two settings above instead of
# repeating the same values as literals.
dataloader_train = DataLoader(
    dataset_train, batch_size=batch_size, shuffle=shuffle
)
# Validation and test use a fixed batch size of 256 and no shuffling.
dataloader_val = DataLoader(dataset_val, batch_size=256)
dataloader_test = DataLoader(dataset_test, batch_size=256)



In [None]:
# Dataset wrapper that can turn token ids back into text
class YourDataset(torch.utils.data.Dataset):
    """Wrap a list of encoded samples and expose their decoded text.

    Args:
        data: Sequence of dict samples, each containing an ``'input_ids'``
            entry. The sequence is stored by reference, not copied.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        """Return the number of wrapped samples."""
        return len(self.data)

    def get_text_from_input_ids(self, input_ids):
        """Decode token ids to a string, dropping CLS/SEP/PAD tokens.

        Uses the module-level ``tokenizer``.
        """
        # Build the set of special tokens once rather than a fresh list for
        # every token that is checked.
        special_tokens = {
            tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token
        }
        tokens = [
            token
            for token in tokenizer.convert_ids_to_tokens(input_ids)
            if token not in special_tokens
        ]
        return tokenizer.convert_tokens_to_string(tokens)

    def __getitem__(self, idx):
        """Return a copy of sample ``idx`` with an added ``'text'`` entry."""
        # Copy before adding 'text': the stored dicts are shared with whoever
        # passed `data` in, and writing into them would leak a string field
        # into every other consumer of the same samples.
        sample = dict(self.data[idx])
        sample['text'] = self.get_text_from_input_ids(sample['input_ids'])
        return sample

# Dataset used to recover the text of the test samples
dataset = YourDataset(dataset_test)

In [None]:
# Lightning module used for fine-tuning
from sklearn.metrics import precision_recall_fscore_support

class BertForSequenceClassification_pl(pl.LightningModule):
    """Fine-tune a BERT sequence classifier with PyTorch Lightning.

    Args:
        model_name: Name of the Transformers model to load.
        num_labels: Number of labels for the classification head. The test
            metrics use ``average='binary'``, so testing expects two classes.
        lr: Learning rate passed to Adam.
        dataset: Object providing ``get_text_from_input_ids``; only used in
            ``test_step`` to print readable text. May be ``None``, in which
            case the raw token ids are printed instead.
    """

    def __init__(self, model_name, num_labels, lr, dataset=None):
        super().__init__()

        # Save model_name, num_labels and lr so they are reachable as
        # self.hparams.<name> and stored with checkpoints. The dataset object
        # is excluded: it is not a hyperparameter and would otherwise be
        # serialized into every checkpoint.
        self.save_hyperparameters(ignore=['dataset'])
        self.dataset = dataset

        # Load BERT with a classification head
        self.bert_sc = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )

        # Per-epoch buffers, cleared in the matching on_*_epoch_end hook.
        self.validation_step_outputs = []
        self.training_step_outputs = []
        self.test_step_outputs = []
        # Gold / predicted labels of every test sample seen in this epoch.
        self.test_labels = []
        self.test_predictions = []

        print(self.bert_sc)

    def training_step(self, batch, batch_idx):
        """Return the loss for one training mini-batch (batch_idx unused)."""
        output = self.bert_sc(**batch)
        loss = output.loss
        self.log('train_loss', loss)  # logged under the name 'train_loss'
        # Keep only the detached loss: storing the whole model output would
        # hold on to each batch's logits and graph references until epoch end.
        self.training_step_outputs.append(loss.detach())
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the loss for one validation mini-batch."""
        output = self.bert_sc(**batch)
        val_loss = output.loss
        self.log('val_loss', val_loss)  # logged under the name 'val_loss'
        self.validation_step_outputs.append(val_loss.detach())

    def _decode(self, input_ids):
        # Turn one sequence of token ids into text for the per-sample report.
        if self.dataset is None:
            return str(input_ids.tolist())
        return self.dataset.get_text_from_input_ids(input_ids)

    @staticmethod
    def _print_examples(title, rows):
        # Print a header followed by one line per (text, gold, predicted) row.
        print(title)
        for text, gold, predicted in rows:
            print(
                f"テストデータ: {text}, "
                f"正解ラベル: {gold}, "
                f"予測ラベル: {predicted}\n"
            )

    def test_step(self, batch, batch_idx):
        """Evaluate one test mini-batch, print and log its metrics."""
        # Separate the labels from the model inputs without mutating `batch`,
        # so the forward pass returns logits only.
        labels = batch['labels']
        inputs = {k: v for k, v in batch.items() if k != 'labels'}
        input_ids = inputs['input_ids']

        output = self.bert_sc(**inputs)
        labels_predicted = output.logits.argmax(-1)

        gold = labels.cpu().tolist()
        predicted = labels_predicted.cpu().tolist()

        num_correct = sum(g == p for g, p in zip(gold, predicted))
        accuracy = num_correct / len(gold)

        # Precision, recall and F1 for this batch. zero_division=0 keeps the
        # value sklearn would use anyway but silences the warning when a
        # batch has no positive predictions or labels.
        precision, recall, f1, _ = precision_recall_fscore_support(
            gold, predicted, average='binary', zero_division=0
        )

        self.log('precision', precision)
        self.log('recall', recall)
        self.log('f1', f1)

        # One line with separators (the fields used to run into each other).
        print(
            f"precision: {precision}, recall: {recall}, "
            f"f1: {f1}, accuracy: {accuracy}"
        )

        # Partition the batch into correctly / incorrectly classified samples
        # in a single pass, decoding each sample once.
        correct_rows, wrong_rows = [], []
        for ids, gold_label, predicted_label in zip(input_ids, gold, predicted):
            row = (self._decode(ids), gold_label, predicted_label)
            if gold_label == predicted_label:
                correct_rows.append(row)
            else:
                wrong_rows.append(row)

        self._print_examples("正解データ", correct_rows)
        self._print_examples("間違いデータ", wrong_rows)

        self.test_step_outputs.append({
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        })
        print(self.test_step_outputs)

        # Remember every label so epoch-level metrics can be computed over
        # the whole test set rather than averaged per batch.
        self.test_labels.extend(gold)
        self.test_predictions.extend(predicted)

    def on_train_epoch_end(self):
        """Log the mean training loss of the epoch and reset the buffer."""
        if self.training_step_outputs:
            avg_train_loss = torch.stack(self.training_step_outputs).mean()
            self.log('avg_train_loss', avg_train_loss)
        self.training_step_outputs.clear()  # free memory

    def on_validation_epoch_end(self):
        """Log the mean validation loss of the epoch and reset the buffer."""
        if self.validation_step_outputs:
            avg_val_loss = torch.stack(self.validation_step_outputs).mean()
            self.log('avg_val_loss', avg_val_loss)
        self.validation_step_outputs.clear()  # free memory

    def on_test_epoch_end(self):
        """Log test-set accuracy, precision, recall and F1, then reset."""
        if self.test_labels:
            # Computed on the pooled predictions: precision/recall/F1 are not
            # averages of per-batch values, and an unweighted batch mean
            # over-weights a smaller final batch.
            num_correct = sum(
                g == p
                for g, p in zip(self.test_labels, self.test_predictions)
            )
            precision, recall, f1, _ = precision_recall_fscore_support(
                self.test_labels, self.test_predictions,
                average='binary', zero_division=0
            )

            self.log('avg_accuracy', num_correct / len(self.test_labels))
            self.log('avg_precision', float(precision))
            self.log('avg_recall', float(recall))
            self.log('avg_f1', float(f1))

        self.test_step_outputs.clear()
        self.test_labels.clear()
        self.test_predictions.clear()

    def configure_optimizers(self):
        """Return the optimizer used for training (Adam with hparams.lr)."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

In [None]:
# 6-15
# Checkpoint callback: write weights only, under model/, keeping the single
# checkpoint with the lowest 'val_loss'.
checkpoint = pl.callbacks.ModelCheckpoint(
    dirpath='model/', monitor='val_loss', mode='min',
    save_top_k=1, save_weights_only=True,
)

# Trainer: 10 epochs at most, with the checkpoint callback attached
trainer = pl.Trainer(callbacks=[checkpoint], max_epochs=10)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# 6-16
# Instantiate the PyTorch Lightning model
model = BertForSequenceClassification_pl(
    model_name=MODEL_NAME, dataset=dataset, lr=1e-5, num_labels=2
)

# Fine-tune on the training loader, validating on the validation loader
trainer.fit(model, dataloader_train, dataloader_val)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type                          | Params
----------------------------------------------------------
0 | bert_sc | BertForSequenceClassification | 110 M 
----------------------------------------------------------
110 M     Trainable params
0         Non-trainable params
110 M     Total params
442.476   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.


In [None]:
# 6-17
# Report the best checkpoint recorded by the ModelCheckpoint callback
best_model_path = checkpoint.best_model_path  # file of the best model
print('ベストモデルのファイル: ', best_model_path)
print('ベストモデルの検証データに対する損失: ', checkpoint.best_model_score)

ベストモデルのファイル:  /content/model/epoch=1-step=380.ckpt
ベストモデルの検証データに対する損失:  tensor(0.0278, device='cuda:0')


In [None]:
# 6-18
# Load the TensorBoard notebook extension and point it at the current directory
%load_ext tensorboard
%tensorboard --logdir ./

<IPython.core.display.Javascript object>

In [None]:
# 6-19
# Run the test loop on the test dataloader.
# NOTE(review): neither a model nor ckpt_path is passed, so which weights are
# evaluated is left to Lightning's defaults — confirm this is the intended
# (best) checkpoint.
test = trainer.test(dataloaders=dataloader_test)

In [None]:
# 6-20
# Restore the PyTorch Lightning model from the best checkpoint
model = BertForSequenceClassification_pl.load_from_checkpoint(
    checkpoint_path=best_model_path
)

# Save the inner Transformers model to ./model_transformers
model.bert_sc.save_pretrained('./model_transformers')

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# 6-21
# Reload the fine-tuned classifier from the Transformers-format directory
bert_sc = BertForSequenceClassification.from_pretrained('./model_transformers')