7章と同じくHugging Faceが提供しているライブラリを用います。詳しくはそちらをご覧ください。

## 設定
ここでは、7章と同じディレクトリに新たに必要となるデータを加えています。(データを再び作るのが煩雑であったためです。)

In [None]:
# Directory setup: mount Google Drive and change into the project directory.
from google.colab import drive
drive.mount('/content/drive/')

# Placeholder path; every data/weights path below is built from root_path.
root_path = '/content/drive/My Drive/your/dir/path/7_nlp_sentiment_transformer'
%cd $root_path

In [None]:
# Install libraries
# The gensim library is not used
!pip install datasets
# !pip install transformers  # Install if needed; it comes preinstalled on Colab

## 前処理等
以下の順番で実装していきます。7章のほうで詳細を述べているので、必要に応じてそちらをご覧ください。

- 前処理

- トークン化

- DataLoaderの作成



In [None]:
# 前処理、トークン化、DataLoaderの作成
#7章から一部変更しています

from datasets import load_dataset, DatasetDict
from transformers import BertTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader
import re

# Combined text-cleaning and tokenization function
def preprocess_and_tokenize(examples):
    """Clean a batch of raw texts and tokenize them.

    Written for ``Dataset.map(..., batched=True)``: ``examples`` is a mapping
    holding a ``'text'`` list and a ``'label'`` list.

    Cleaning steps, applied to every text in order:
      1. remove literal ``<br />`` line-break tags,
      2. replace every character other than ``.``, ``,``, ASCII letters,
         digits and whitespace with a space,
      3. put spaces around ``.`` and ``,``.

    Args:
        examples: batch mapping with ``'text'`` and ``'label'`` entries.

    Returns:
        The output of the module-level ``tokenizer`` (padded/truncated to
        256 tokens) with a ``'labels'`` entry added and ``'token_type_ids'``
        removed when present.
    """
    # Hoisted so the pattern object is looked up once per batch.
    disallowed_chars = re.compile(r'[^.,a-zA-Z0-9\s]')

    cleaned_texts = []
    for text in examples['text']:
        # The previous `re.sub('', '', t)` had an empty pattern and removed
        # nothing, so "<br />" survived and later turned into a stray "br" token.
        text = text.replace('<br />', '')
        text = disallowed_chars.sub(' ', text)
        text = text.replace('.', ' . ').replace(',', ' , ')
        cleaned_texts.append(text)
    examples['text'] = cleaned_texts

    # Tokenize to a fixed length of 256 tokens
    tokenized_inputs = tokenizer(cleaned_texts, padding='max_length', truncation=True, max_length=256)

    # Add the 'labels' key
    tokenized_inputs['labels'] = examples['label']

    # Drop 'token_type_ids' (not needed downstream)
    tokenized_inputs.pop('token_type_ids', None)
    return tokenized_inputs

# Load the BertTokenizer that matches the pretrained model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Paths of the dataset files
tr_path = root_path + '/data/IMDb_train.tsv'
ts_path = root_path + '/data/IMDb_test.tsv'

# Load the tab-separated files; column names are supplied explicitly and
# split='train' is requested for both files
tr_val_ds = load_dataset('csv', data_files=tr_path, delimiter='\t', split='train', column_names=['text', 'label'])
ts_ds = load_dataset('csv', data_files=ts_path, delimiter='\t', split='train', column_names=['text', 'label'])

# Apply cleaning and tokenization
tr_val_ds = tr_val_ds.map(preprocess_and_tokenize, batched=True)
ts_ds = ts_ds.map(preprocess_and_tokenize, batched=True)

# Remove the columns that are no longer needed
tr_val_ds = tr_val_ds.remove_columns(['text', 'label'])
ts_ds = ts_ds.remove_columns(['text', 'label'])

# Build a DatasetDict and split it (test_size=0.2; no seed is passed here)
dataset_dict = DatasetDict({
    'train': tr_val_ds
})
train_test_split = dataset_dict['train'].train_test_split(test_size=0.2)

# Retrieve the two parts: 'train' is used for training, 'test' for validation
tr_ds = train_test_split['train']
val_ds = train_test_split['test']

# Create a DataCollatorWithPadding instance
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# DataLoaders with batch size 32; only the training loader shuffles
tr_dl = DataLoader(tr_ds, shuffle=True, batch_size=32, collate_fn=data_collator)
val_dl = DataLoader(val_ds, batch_size=32, collate_fn=data_collator)
ts_dl = DataLoader(ts_ds, batch_size=32, collate_fn=data_collator)


In [None]:
# Sanity check: print the contents of the first test batch only
for batch in ts_dl:
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels']
    print("Input IDs:", input_ids)
    print("Attention Mask:", attention_mask)
    print("Labels:", labels)
    break  # one batch is enough

## BERTの実装
ここは書籍とほとんど変わりませんが、細かな変更をしています。そのうち重要なものを挙げます。

- attrdictパッケージの代わりにboxを用いています。boxはColabにプレインストールされています。

- attention_show_flgの仕様を変更しています。フラグがNoneのときにうまく動作しなかった点を改善しています。

In [6]:
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


In [None]:
import json

# Read the BERT configuration file into a plain dict.
config_file = root_path + '/weights/bert_config.json'
# Context manager: the file handle is closed as soon as parsing finishes
# (the previous bare open() was never closed).
with open(config_file, 'r') as json_file:
    config = json.load(json_file)

config

In [8]:
from box import Box

# Wrap the dict in a Box; the model classes below read settings as
# attributes (e.g. config.hidden_size).
config = Box(config)

In [9]:
class BertLayerNorm(nn.Module):
  """Layer normalization over the last dimension with a learnable scale and shift.

  Args:
    hidden_size: length of the last dimension of the inputs.
    eps: constant added to the variance before taking the square root.
  """

  def __init__(self, hidden_size, eps=1e-12):
    super().__init__()
    # Registration order (gamma, then beta) is kept: weights are loaded positionally.
    self.gamma = nn.Parameter(torch.ones(hidden_size))
    self.beta  = nn.Parameter(torch.zeros(hidden_size))
    self.variance_epsilon = eps

  def forward(self, x):
    """Normalize ``x`` along its last dimension, then scale by gamma and shift by beta."""
    mean = x.mean(-1, keepdim=True)
    centered = x - mean
    variance = centered.pow(2).mean(-1, keepdim=True)
    normalized = centered / torch.sqrt(variance + self.variance_epsilon)
    return self.gamma * normalized + self.beta


In [10]:
class BertEmbeddings(nn.Module):
  """Sum of word, position and token-type embeddings, followed by LayerNorm and dropout.

  Args:
    config: object exposing vocab_size, hidden_size, max_position_embeddings,
      type_vocab_size and hidden_dropout_prob.
  """

  def __init__(self, config):
    super().__init__()
    # Creation order is kept: weights are loaded positionally.
    self.words_embeddings      = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
    self.position_embeddings   = nn.Embedding(config.max_position_embeddings, config.hidden_size)
    self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
    self.LayerNorm             = BertLayerNorm(config.hidden_size, eps=1e-12)
    self.dropout               = nn.Dropout(config.hidden_dropout_prob)

  def forward(self, input_ids, token_type_ids=None):
    """Embed ``input_ids``; ``token_type_ids`` defaults to all zeros when omitted."""
    if token_type_ids is None:
      token_type_ids = torch.zeros_like(input_ids)

    # Positions 0..seq_len-1, repeated for every row of input_ids.
    positions = torch.arange(input_ids.size(1), dtype=torch.long, device=input_ids.device)
    positions = positions.unsqueeze(0).expand_as(input_ids)

    summed = (self.words_embeddings(input_ids)
              + self.position_embeddings(positions)
              + self.token_type_embeddings(token_type_ids))

    return self.dropout(self.LayerNorm(summed))


In [11]:
class BertLayer(nn.Module):
  """One encoder layer: self-attention block, intermediate projection, output block.

  Args:
    config: configuration object forwarded to BertAttention, BertIntermediate
      and BertOutput.
  """

  def __init__(self, config):
    super().__init__()
    self.attention    = BertAttention(config)
    self.intermediate = BertIntermediate(config)
    self.output       = BertOutput(config)

  def forward(self, hidden_states, attention_mask=None, attention_show_flg=False):
    """Apply the layer to ``hidden_states``.

    Args:
      hidden_states: input features of the layer.
      attention_mask: mask forwarded unchanged to the attention block.
      attention_show_flg: when equal to True, the attention probabilities are
        returned as well; False and None both return the layer output only.

    Returns:
      ``layer_output`` or, when the flag is True, ``(layer_output, attention_probs)``.
    """
    # `== True` (not plain truthiness) matches the check made inside the
    # attention modules, so both sides agree on how many values are returned.
    show_attention = attention_show_flg == True

    attention_result = self.attention(hidden_states, attention_mask, attention_show_flg)
    if show_attention:
      attention_output, attention_probs = attention_result
    else:
      attention_output = attention_result

    intermediate_output = self.intermediate(attention_output)

    # The residual added inside self.output must be attention_output in BOTH
    # paths. The flag-off path previously passed hidden_states (the layer
    # input), which does not match the computation the pretrained weights
    # were trained with.
    layer_output = self.output(intermediate_output, attention_output)

    if show_attention:
      return layer_output, attention_probs
    return layer_output

class BertAttention(nn.Module):
  """Self-attention followed by its output block (BertSelfAttention + BertSelfOutput).

  Args:
    config: configuration object forwarded to both sub-modules.
  """

  def __init__(self, config):
    super().__init__()
    self.selfattn = BertSelfAttention(config)
    self.output   = BertSelfOutput(config)

  def forward(self, input_tensor, attention_mask, attention_show_flg=False):
    """Return the attention block output, plus the attention probabilities when the flag is True."""
    show_attention = attention_show_flg == True
    attended = self.selfattn(input_tensor, attention_mask, attention_show_flg)

    if show_attention:
      self_output, attention_probs = attended
    else:
      self_output = attended

    # input_tensor is handed to the output block as its second argument.
    attention_output = self.output(self_output, input_tensor)

    if show_attention:
      return attention_output, attention_probs
    return attention_output

class BertSelfAttention(nn.Module):
  """Multi-head scaled dot-product self-attention.

  Args:
    config: object exposing hidden_size, num_attention_heads and
      attention_probs_dropout_prob.
  """

  def __init__(self, config):
    super().__init__()
    heads = config.num_attention_heads
    self.num_attention_heads = heads
    self.attention_head_size = int(config.hidden_size / heads)
    self.all_head_size       = heads * self.attention_head_size

    # Linear projections (creation order kept: weights are loaded positionally)
    self.query = nn.Linear(config.hidden_size, self.all_head_size)
    self.key   = nn.Linear(config.hidden_size, self.all_head_size)
    self.value = nn.Linear(config.hidden_size, self.all_head_size)

    self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

  def transpose_for_scores(self, x):
    """Split the last dimension into heads and move the head axis to position 1.

    (batch, seq_len, all_head_size) -> (batch, num_heads, seq_len, head_size)
    """
    leading_dims = x.size()[:-1]
    x = x.view(*leading_dims, self.num_attention_heads, self.attention_head_size)

    return x.permute(0, 2, 1, 3)

  def forward(self, hidden_states, attention_mask, attention_show_flg=False):
    """Compute the attention context for ``hidden_states``.

    ``attention_mask`` is added to the scaled scores before the softmax.
    Returns the context tensor, plus the (post-dropout) attention
    probabilities when ``attention_show_flg`` is True.
    """
    queries = self.transpose_for_scores(self.query(hidden_states))
    keys    = self.transpose_for_scores(self.key(hidden_states))
    values  = self.transpose_for_scores(self.value(hidden_states))

    # Scaled dot-product scores plus the additive mask
    scores = torch.matmul(queries, keys.transpose(-1, -2))
    scores = scores / math.sqrt(self.attention_head_size)
    scores = scores + attention_mask

    attention_probs = self.dropout(torch.softmax(scores, dim=-1))

    context_layer = torch.matmul(attention_probs, values)

    # Merge the heads back into a single feature dimension
    context_layer = context_layer.permute(0, 2, 1, 3).contiguous()   # make memory contiguous for view()
    merged_shape = context_layer.size()[:-2] + (self.all_head_size, )
    context_layer = context_layer.view(*merged_shape)

    if attention_show_flg == True:
      return context_layer, attention_probs
    return context_layer

class BertSelfOutput(nn.Module):
  """Output block of the attention sub-layer: dense projection, dropout, residual add, LayerNorm.

  Args:
    config: object exposing hidden_size and hidden_dropout_prob.
  """

  def __init__(self, config):
    super().__init__()
    self.dense     = nn.Linear(config.hidden_size, config.hidden_size)
    self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
    # This dropout acts on hidden states, not on attention probabilities, so
    # it takes hidden_dropout_prob (as BertOutput and BertEmbeddings do).
    # attention_probs_dropout_prob was used here before, which gives a
    # different rate whenever the two config values differ.
    self.dropout   = nn.Dropout(config.hidden_dropout_prob)

  def forward(self, hidden_states, input_tensor):
    """Project ``hidden_states``, apply dropout, add ``input_tensor`` and normalize.

    Args:
      hidden_states: output of the self-attention module.
      input_tensor: tensor added back as the residual before LayerNorm.
    """
    hidden_states = self.dense(hidden_states)
    hidden_states = self.dropout(hidden_states)
    hidden_states = self.LayerNorm(hidden_states + input_tensor)

    return hidden_states

def gelu(x):
  """Element-wise GELU activation of tensor ``x`` in its exact, erf-based form."""
  half_x = x * 0.5
  return half_x * (1.0 + torch.erf(x / math.sqrt(2.0)))

class BertIntermediate(nn.Module):
  """Expansion half of the feed-forward sub-layer: dense projection followed by GELU.

  Args:
    config: object exposing hidden_size and intermediate_size.
  """

  def __init__(self, config):
    super().__init__()
    self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
    self.intermediate_act_fn = gelu

  def forward(self, hidden_states):
    """Project ``hidden_states`` to the intermediate size and apply the activation."""
    return self.intermediate_act_fn(self.dense(hidden_states))

class BertOutput(nn.Module):
  """Contraction half of the feed-forward sub-layer: dense, dropout, residual add, LayerNorm.

  Args:
    config: object exposing intermediate_size, hidden_size and hidden_dropout_prob.
  """

  def __init__(self, config):
    super().__init__()
    self.dense     = nn.Linear(config.intermediate_size, config.hidden_size)
    self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
    self.dropout   = nn.Dropout(config.hidden_dropout_prob)

  def forward(self, hidden_states, input_tensor):
    """Project ``hidden_states`` back to the hidden size, add ``input_tensor`` and normalize."""
    projected = self.dropout(self.dense(hidden_states))
    return self.LayerNorm(projected + input_tensor)



In [12]:
class BertEncoder(nn.Module):
  """Stack of ``config.num_hidden_layers`` BertLayer modules applied in sequence.

  Args:
    config: object exposing num_hidden_layers; also forwarded to every BertLayer.
  """

  def __init__(self, config):
    super().__init__()
    self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])

  def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, attention_show_flg=False):
    """Run ``hidden_states`` through every layer.

    Returns:
      A list with the output of every layer, or a one-element list holding
      only the last output when ``output_all_encoded_layers`` is false.
      When ``attention_show_flg`` is True the attention probabilities of the
      last layer are returned as a second value.
    """
    show_attention = attention_show_flg == True
    collected = []

    for block in self.layer:
      result = block(hidden_states, attention_mask, attention_show_flg)
      if show_attention:
        hidden_states, attention_probs = result
      else:
        hidden_states = result

      if output_all_encoded_layers:
        collected.append(hidden_states)

    # Only the final hidden states were requested.
    if not output_all_encoded_layers:
      collected.append(hidden_states)

    if show_attention:
      return collected, attention_probs
    return collected



In [13]:
class BertPooler(nn.Module):
  """Dense layer plus tanh applied to the features at sequence position 0.

  Args:
    config: object exposing hidden_size.
  """

  def __init__(self, config):
    super().__init__()
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)
    self.activation = nn.Tanh()

  def forward(self, hidden_states):
    """Return ``tanh(dense(hidden_states[:, 0]))``."""
    first_token = hidden_states[:, 0]
    return self.activation(self.dense(first_token))


In [None]:
# Sanity check of the embeddings, encoder and pooler modules
# Input token ids
input_ids = torch.LongTensor([[31, 51, 12, 23, 99], [15, 5, 1, 0, 0]])
print("Tensor size of input ids: ", input_ids.shape)

# Attention mask (0 marks the positions to be masked out below)
attention_mask = torch.LongTensor([[1, 1, 1, 1, 1], [1, 1, 1, 0, 0]])
print("Tensor size of input mask: ", attention_mask.shape)

# Sentence (segment) ids
token_type_ids = torch.LongTensor([[0, 0, 1, 1, 1], [0, 1, 1, 1, 1]])
print("Tensor size of input docs.ID: ", token_type_ids.shape)

# Modules
embeddings = BertEmbeddings(config)
encoder    = BertEncoder(config)
pooler     = BertPooler(config)

# Reshape the mask -> [batch_size, 1, 1, seq_length]
# and turn it into an additive mask: 0.0 where the mask is 1, -10000.0 where it is 0
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
extended_attention_mask = extended_attention_mask.to(dtype=torch.float32)
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
print("Tensor size of extended mask: ", extended_attention_mask.shape)

# forward
out1 = embeddings(input_ids, token_type_ids)
print("Tensor size of output of BertEmbeddings: ", out1.shape)

# out2 is a list with one entry per layer (output_all_encoded_layers defaults to True)
out2 = encoder(out1, extended_attention_mask)
print("Tensor size of final layer output of BertEncoder: ", out2[0].shape)

out3 = pooler(out2[-1])
print("Tensor size of output of BertPooler: ", out3.shape)

In [15]:
class BertModel(nn.Module):
  """Full BERT body: embeddings, encoder stack and pooler.

  Args:
    config: configuration object forwarded to the three sub-modules.
  """

  def __init__(self, config):
    super().__init__()
    # Creation order is kept: weights are loaded positionally.
    self.embeddings = BertEmbeddings(config)
    self.encoder    = BertEncoder(config)
    self.pooler     = BertPooler(config)

  def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True, attention_show_flg=False):
    """Encode ``input_ids``.

    Args:
      input_ids: token id tensor.
      token_type_ids: segment ids; all zeros when omitted.
      attention_mask: 1 for positions to attend to, 0 for positions to mask;
        all ones when omitted.
      output_all_encoded_layers: return the list of all layer outputs (True)
        or only the last layer's tensor (False).
      attention_show_flg: when True, also return the attention probabilities.

    Returns:
      ``(encoded_layers, pooled_output)`` or, when the flag is True,
      ``(encoded_layers, pooled_output, attention_probs)``.
    """
    if attention_mask is None:
      attention_mask = torch.ones_like(input_ids)
    if token_type_ids is None:
      token_type_ids = torch.zeros_like(input_ids)

    # Additive mask with two singleton axes inserted: 0.0 where the mask is 1,
    # -10000.0 where it is 0.
    additive_mask = attention_mask.unsqueeze(1).unsqueeze(2).to(dtype=torch.float32)
    additive_mask = (1.0 - additive_mask) * -10000.0

    show_attention = attention_show_flg == True
    encoder_result = self.encoder(self.embeddings(input_ids, token_type_ids),
                                  additive_mask,
                                  output_all_encoded_layers,
                                  attention_show_flg)
    if show_attention:
      encoded_layers, attention_probs = encoder_result
    else:
      encoded_layers = encoder_result

    pooled_output = self.pooler(encoded_layers[-1])

    # Unwrap the one-element list when only the last layer was requested.
    if not output_all_encoded_layers:
      encoded_layers = encoded_layers[-1]

    if show_attention:
      return encoded_layers, pooled_output, attention_probs
    return encoded_layers, pooled_output




In [None]:
# Sanity check of BertModel with a dummy batch of two sequences
input_ids = torch.LongTensor([[31, 51, 12, 23, 99], [15, 5, 1, 0, 0]])
attention_mask = torch.LongTensor([[1, 1, 1, 1, 1], [1, 1, 1, 0, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1, 1, 1], [0, 1, 1, 1, 1]])

net = BertModel(config)

# With output_all_encoded_layers=False, encoded_layers is the last layer's
# tensor; with attention_show_flg=True a third value is returned
encoded_layers, pooled_output, attention_probs = net(input_ids,
                                                     token_type_ids,
                                                     attention_mask,
                                                     output_all_encoded_layers=False,
                                                     attention_show_flg=True)

print("encoded_layersのテンソルサイズ: ", encoded_layers.shape)
print("pooled_outputのテンソルサイズ: ", pooled_output.shape)
print("attention_probsのテンソルサイズ: ", attention_probs.shape)

## BERTを用いたベクトル表現の比較（bank:銀行とbank:土手）
ここもほとんど変更点はありません。一部、細かな修正をしています。

In [None]:
# Load the pretrained weights
weights_path = root_path + "/weights/pytorch_model.bin"
loaded_state_dict = torch.load(weights_path)

# Print every key stored in the loaded state dict
for s in loaded_state_dict:
  print(s)

In [None]:
# Fresh model instance that will receive the pretrained weights
net = BertModel(config)
net.eval()

# Parameter names of the current network, in registration order
params_names = []

for name, param in net.named_parameters():
  print(name)
  params_names.append(name)

In [None]:
# Rename the parameters: the loaded tensors are assigned to this model's
# parameter names purely by position (i-th loaded entry -> i-th name in params_names)
new_state_dict = net.state_dict().copy()

for index, (key_name, value) in enumerate(loaded_state_dict.items()):
  name = params_names[index]
  new_state_dict[name] = value
  print(str(key_name)+"→"+str(name))

  # Stop once every parameter of this model has been assigned;
  # any remaining entries of the loaded dict are ignored
  if index+1 >= len(params_names):
    break

net.load_state_dict(new_state_dict)

In [None]:
# Inspect the vocabulary
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Print the vocabulary entry at list position 100
vocab_list = list(tokenizer.vocab.keys())
print(vocab_list[100])

In [None]:
# How the meaning of a word changes with its context
# NOTE(review): "transferrd" in text_2 is kept exactly as written; editing the
# text changes the token sequence and the positions computed from it.
text_1 = "[CLS] I accessed the bank account. [SEP]"
text_2 = "[CLS] He transferrd the deposit money into the bank account. [SEP]"
text_3 = "[CLS] We play soccer at the bank of the river. [SEP]"

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Split each sentence into tokens
tokenized_text_1 = tokenizer.tokenize(text_1)
tokenized_text_2 = tokenizer.tokenize(text_2)
tokenized_text_3 = tokenizer.tokenize(text_3)

print(tokenized_text_1)
print(tokenized_text_2)
print(tokenized_text_3)

In [None]:
# Convert the tokens to vocabulary ids
indexed_text_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
indexed_text_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
indexed_text_3 = tokenizer.convert_tokens_to_ids(tokenized_text_3)

# Position of the first "bank" token in each sentence
bank_posi_1 = np.where(np.array(tokenized_text_1) == "bank")[0][0]
bank_posi_2 = np.where(np.array(tokenized_text_2) == "bank")[0][0]
bank_posi_3 = np.where(np.array(tokenized_text_3) == "bank")[0][0]

# Wrap each id list in an outer list so every tensor has a leading batch axis of size 1
tokens_tensor_1 = torch.tensor([indexed_text_1])
tokens_tensor_2 = torch.tensor([indexed_text_2])
tokens_tensor_3 = torch.tensor([indexed_text_3])

# Vocabulary id of the word "bank"
bank_word_id = tokenizer.convert_tokens_to_ids(["bank"])[0]
print(tokens_tensor_1)
print(tokens_tensor_2)
print(tokens_tensor_3)

In [23]:
# Run the three sentences through BERT
# Masks and segment ids are derived from the token tensors, so their lengths
# always match the tokenization (all positions attended, all segment ids 0).
attention_mask_1 = torch.ones_like(tokens_tensor_1)
token_type_ids_1 = torch.zeros_like(tokens_tensor_1)
attention_mask_2 = torch.ones_like(tokens_tensor_2)
token_type_ids_2 = torch.zeros_like(tokens_tensor_2)
attention_mask_3 = torch.ones_like(tokens_tensor_3)
token_type_ids_3 = torch.zeros_like(tokens_tensor_3)

# Keep using the `net` that received the pretrained weights via
# load_state_dict. Creating a new BertModel(config) here would replace it with
# a randomly initialized network and make the comparison below (and any model
# built from `net` afterwards) meaningless.
net.eval()

# Inference only; with output_all_encoded_layers=True each result is the list
# of per-layer outputs
with torch.no_grad():
    encoded_layers_1, _ = net(tokens_tensor_1, token_type_ids_1, attention_mask_1, output_all_encoded_layers=True, attention_show_flg=False)
    encoded_layers_2, _ = net(tokens_tensor_2, token_type_ids_2, attention_mask_2, output_all_encoded_layers=True, attention_show_flg=False)
    encoded_layers_3, _ = net(tokens_tensor_3, token_type_ids_3, attention_mask_3, output_all_encoded_layers=True, attention_show_flg=False)


In [24]:
# Initial word vector of "bank"
# It is read from the embeddings module by the vocabulary id of "bank", so it is the same for all three sentences
bank_vector_0 = net.embeddings.words_embeddings.weight[bank_word_id]

# Feature vector of "bank" in sentence 1, output of the 1st BertLayer
bank_vector_1_1 = encoded_layers_1[0][0, bank_posi_1]

# Feature vector of "bank" in sentence 1, output of the final (12th) BertLayer
bank_vector_1_12 = encoded_layers_1[11][0, bank_posi_1]

# Same for sentences 2 and 3
bank_vector_2_1 = encoded_layers_2[0][0, bank_posi_2]
bank_vector_2_12 = encoded_layers_2[11][0, bank_posi_2]
bank_vector_3_1 = encoded_layers_3[0][0, bank_posi_3]
bank_vector_3_12 = encoded_layers_3[11][0, bank_posi_3]

In [None]:
# Cosine similarity between the initial "bank" vector and its layer-1 / layer-12 features in sentence 1
print("bankの初期ベクトル と 文章1の1段目のbankの類似度：",
      F.cosine_similarity(bank_vector_0, bank_vector_1_1, dim=0))
print("bankの初期ベクトル と 文章1の12段目のbankの類似度：",
      F.cosine_similarity(bank_vector_0, bank_vector_1_12, dim=0))

# Layer-1 features of "bank": sentence 1 vs. sentence 2, and sentence 1 vs. sentence 3
print("文章1の1層目のbank と 文章2の1段目のbankの類似度：",
      F.cosine_similarity(bank_vector_1_1, bank_vector_2_1, dim=0))
print("文章1の1層目のbank と 文章3の1段目のbankの類似度：",
      F.cosine_similarity(bank_vector_1_1, bank_vector_3_1, dim=0))

# Layer-12 features of "bank": sentence 1 vs. sentence 2, and sentence 1 vs. sentence 3
print("文章1の12層目のbank と 文章2の12段目のbankの類似度：",
      F.cosine_similarity(bank_vector_1_12, bank_vector_2_12, dim=0))
print("文章1の12層目のbank と 文章3の12段目のbankの類似度：",
      F.cosine_similarity(bank_vector_1_12, bank_vector_3_12, dim=0))

## BERTによる感情分析
上と同じくattention_show_flgの仕様を変更しています。また、一部細かな修正をしています。

In [26]:
class BertForIMDb(nn.Module):
  """BERT body with a 2-class linear head, used for IMDb sentiment classification.

  Args:
    net: the BERT body. It is called as ``net(input_ids, token_type_ids,
      attention_mask, output_all_encoded_layers, attention_show_flg)`` and must
      return ``(encoded_layers, pooled_output)``, plus ``attention_probs`` as a
      third value when the flag is True.
  """

  def __init__(self, net):
    super().__init__()
    self.bert = net

    # Classification head applied to the features at sequence position 0
    self.cls = nn.Linear(in_features=768, out_features=2)

    # Weight initialization
    nn.init.normal_(self.cls.weight, std=0.02)
    # The previous `nn.init.normal_(self.cls.bias, 0)` only set mean=0 and left
    # std at its default of 1.0, so the bias started as N(0, 1) noise, far
    # larger than the 0.02-std weights. Start the bias at zero instead.
    nn.init.zeros_(self.cls.bias)

  def forward(self,
              input_ids,
              token_type_ids=None,
              attention_mask=None,
              output_all_encoded_layers=False,
              attention_show_flg=False
              ):
    """Return class logits for ``input_ids``.

    ``output_all_encoded_layers`` is expected to stay False here: the code
    indexes ``encoded_layers`` as a single (batch, seq, hidden) tensor.

    Returns:
      ``out`` (logits with 2 columns) or ``(out, attention_probs)`` when
      ``attention_show_flg`` is True.
    """
    # `== True` matches the check made inside the BERT modules, so None and
    # False both take the two-value path.
    show_attention = attention_show_flg == True

    bert_result = self.bert(input_ids,
                            token_type_ids,
                            attention_mask,
                            output_all_encoded_layers,
                            attention_show_flg
                            )
    if show_attention:
      encoded_layers, pooled_output, attention_probs = bert_result
    else:
      encoded_layers, pooled_output = bert_result

    # Features at sequence position 0, flattened to (batch, head input size)
    vec_0 = encoded_layers[:, 0, :]
    vec_0 = vec_0.view(-1, self.cls.in_features)
    out   = self.cls(vec_0)

    if show_attention:
      return out, attention_probs
    return out



In [27]:
# Training and validation DataLoaders, keyed by the phase names used in train_model
dl_dict = {'train': tr_dl, 'val': val_dl}

In [None]:
# Put the classification head on top of the BERT body held in `net`
model = BertForIMDb(net)
model.train()
print("Network configuration complete!!")

In [29]:
# To keep the computation time down, only the final BertLayer is trained
# Disable gradient computation for every parameter
for param in model.parameters():
  param.requires_grad = False

# Re-enable gradient computation for the last BertLayer
for name, param in model.bert.encoder.layer[-1].named_parameters():
  param.requires_grad = True

# Re-enable gradient computation for the classification head
for name, param in model.cls.named_parameters():
  param.requires_grad = True


In [30]:
# Optimizer and loss function (following the paper)
# Only the two parameter groups left trainable above are handed to Adam
optimizer = torch.optim.Adam([
    {'params': model.bert.encoder.layer[-1].parameters(), 'lr': 5e-5},
    {'params': model.cls.parameters(), 'lr': 5e-5}
], betas=(0.9, 0.999))

criterion = nn.CrossEntropyLoss()

In [35]:
import time
from tqdm import tqdm

def train_model(model, dl_dict, criterion, optimizer, num_epochs):
  """Train ``model`` and report loss/accuracy for the 'train' and 'val' phase of every epoch.

  Args:
    model: network called as ``model(input_ids, token_type_ids=None,
      attention_mask=None, output_all_encoded_layers=False,
      attention_show_flg=None)``; its output is passed to ``criterion`` and
      arg-maxed along dimension 1.
    dl_dict: mapping with 'train' and 'val' DataLoaders whose batches contain
      'input_ids' and 'labels'.
    criterion: loss called as ``criterion(outputs, labels)``; assumed to
      return the mean over the batch (as nn.CrossEntropyLoss does by default).
    optimizer: optimizer stepped once per training batch.
    num_epochs: number of passes over both phases.

  Returns:
    The same ``model`` object, trained in place.
  """
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  compute_device = "GPU" if device.type == "cuda" else "CPU"
  print(f"This is {compute_device} trainer !!")
  print("-"*20, "Start", "-"*20)

  model.to(device)

  # network acceleration
  torch.backends.cudnn.benchmark = True

  for epoch in range(num_epochs):
    for phase in ['train', 'val']:
      is_train = phase == 'train'
      model.train(is_train)   # train(False) is the same as eval()

      epoch_loss     = 0.0
      epoch_corrects = 0
      n_seen         = 0

      # Start time of the current 10-iteration window
      t_iter_start = time.time()

      for iteration, batch in enumerate(tqdm(dl_dict[phase])):
        inputs = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        # Real size of this batch: the last batch can be smaller than the
        # loader's batch size, so a hard-coded 32 would skew loss and accuracy.
        n_samples = labels.size(0)

        if is_train:
          optimizer.zero_grad()

        with torch.set_grad_enabled(is_train):
          # NOTE(review): attention_mask=None makes the model attend to every
          # position, padding included, although the batches carry an
          # 'attention_mask' entry. Consider passing it here and at evaluation time.
          outputs = model(inputs,
                          token_type_ids=None,
                          attention_mask=None,
                          output_all_encoded_layers=False,
                          attention_show_flg=None
                          )

          loss = criterion(outputs, labels)
          _, preds = torch.max(outputs, 1)

          if is_train:
            loss.backward()
            optimizer.step()

        batch_corrects = int(torch.sum(preds == labels))

        if is_train and iteration % 10 == 0:
          duration = time.time() - t_iter_start
          acc = batch_corrects / n_samples
          print("Iteration {} || Loss: {:.4f} || 10 iter: {:.4f} sec. || Accuracy rate in this iteration: {}".format(
              iteration,
              loss.item(),
              duration,
              acc
          ))
          t_iter_start = time.time()

        # loss is a batch mean, so weight it by the real batch size
        epoch_loss += loss.item() * n_samples
        epoch_corrects += batch_corrects
        n_seen += n_samples

      # Loss and accuracy of the epoch, averaged over the samples actually
      # processed; max(..., 1) keeps an empty loader from dividing by zero
      denominator = max(n_seen, 1)
      epoch_loss = epoch_loss / denominator
      epoch_acc  = epoch_corrects / denominator

      print('Epoch {}/{} | {:^5} | Loss: {:.4f} ACC: {:.4f}'.format(
          epoch + 1,
          num_epochs,
          phase,
          epoch_loss,
          epoch_acc
      ))

  return model



In [None]:
%%time
# The training results shown in the book could not be reproduced here
num_epochs = 2
model_trained = train_model(model, dl_dict, criterion, optimizer, num_epochs=num_epochs)

In [37]:
# Save the weights (state dict only) of the trained model
save_path = root_path + '/weights/bert_fine_tuning_IMDb.pth'
torch.save(model_trained.state_dict(), save_path)


In [None]:
# Evaluate on the test data
# The device is chosen once; it does not change between batches
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_trained.eval()
model_trained.to(device)

# Plain integer counter, so the accuracy below also works if the loader yields no batch
epoch_corrects = 0

for batch in tqdm(ts_dl):
  inputs = batch['input_ids'].to(device)
  labels = batch['labels'].to(device)

  # Forward pass only: no gradients are needed for evaluation
  with torch.no_grad():

    # Feed the model
    outputs = model_trained(
        inputs,
        token_type_ids=None,
        attention_mask=None,
        output_all_encoded_layers=False,
        attention_show_flg=None
    )

    # Only the predictions are needed; the loss was computed but never used
    _, preds = torch.max(outputs, 1)
    epoch_corrects += torch.sum(preds == labels).item()

epoch_acc = epoch_corrects / len(ts_dl.dataset)
print('テストデータ{}個での正解率: {:.4f}'.format(len(ts_dl.dataset), epoch_acc))

## Attentionの可視化
データの取り出し方を変更しています。また、細かな修正をしています。

In [55]:
# Take the first test mini-batch and run it with attention output enabled
batch = next(iter(ts_dl))
inputs = batch['input_ids'].to(device)
labels = batch['labels'].to(device)

# Inference only: without no_grad() the outputs and attention probabilities
# would stay attached to an autograd graph that is never used
with torch.no_grad():
  outputs, attention_probs = model_trained(
      inputs,
      token_type_ids=None,
      attention_mask=None,
      output_all_encoded_layers=False,
      attention_show_flg=True
  )

# Predicted class index for every sample of the batch
_, preds = torch.max(outputs, 1)


In [81]:
# HTMLを作成する関数を実装

def highlight(word, attn):
    """Return an HTML span for ``word`` whose background gets a deeper red as ``attn`` grows.

    ``attn`` = 0 gives white (#FFFFFF) and ``attn`` = 1 gives pure red (#FF0000).
    """
    # Green and blue share one level; red stays at 255.
    fade = int(255*(1 - attn))
    html_color = f'#{255:02X}{fade:02X}{fade:02X}'
    return f'<span style="background-color: {html_color}"> {word}</span>'


def mk_html(index, ts_ds, preds, normalized_weights):
    """Build an HTML visualization of the attention paid from position 0 to every token.

    Relies on the module-level ``tokenizer`` and ``highlight``.

    Args:
        index: position of the sample, used both for ``ts_ds`` and for the
            first axis of ``preds`` / ``normalized_weights``.
        ts_ds: dataset whose items provide 'input_ids' and 'labels'.
        preds: predicted class per sample (0 is rendered as "Negative").
        normalized_weights: attention probabilities indexed as
            ``[sample, head, query_position, key_position]``. It is only
            read; the per-head normalization is done on copies.

    Returns:
        HTML string with one highlighted sentence per attention head, then
        one for the sum over heads.
    """
    # Fetch the sample, its tokens, its label and its prediction
    example  = ts_ds[index]
    sentence = tokenizer.convert_ids_to_tokens(example['input_ids'])
    label    = example['labels']
    pred     = preds[index]

    # Turn label and prediction into display strings
    label_str = "Negative" if label == 0 else "Positive"
    pred_str  = "Negative" if pred == 0 else "Positive"

    # Collect the pieces and join once at the end
    parts = ['正解ラベル：{}<br>推論ラベル：{}<br><br>'.format(label_str, pred_str)]

    # Attention from query position 0 (the [CLS] token) for every head: (num_heads, seq_len)
    cls_attention = normalized_weights[index, :, 0, :].detach()
    num_heads = cls_attention.shape[0]

    # Scale each head by its own maximum. Done out of place, so the caller's
    # tensor is no longer modified as a side effect.
    per_head = [cls_attention[i] / cls_attention[i].max() for i in range(num_heads)]

    for i, attens in enumerate(per_head):
        parts.append('[BERTのAttentionを可視化_' + str(i+1) + ']<br>')
        for word, attn in zip(sentence, attens):
            # [SEP] marks the end of the sentence
            if word == '[SEP]':
                parts.append('<br><br>')
                break
            # `word` already is the vocabulary token, so it is used directly
            # (the former token -> id -> token round trip returned the same token)
            parts.append(highlight(word, attn))
    parts.append("<br><br>")

    # Sum of the per-head (already max-scaled) attentions, scaled by its maximum
    all_attens = torch.zeros_like(per_head[0])
    for attens in per_head:
        all_attens = all_attens + attens
    all_attens = all_attens / all_attens.max()

    parts.append('[BERTのAttentionを可視化_ALL]<br>')
    for word, attn in zip(sentence, all_attens):
        # [SEP] marks the end of the sentence
        if word == "[SEP]":
            break
        parts.append(highlight(word, attn))
    parts.append("<br><br>")

    return ''.join(parts)

In [82]:
from IPython.display import HTML

# Render the attention visualization for the sample at position `index` of the
# mini-batch taken above
index = 1
html_output = mk_html(index, ts_ds, preds, attention_probs)
HTML(html_output)

以上