In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import torchtext

from transformers import BertTokenizer, BertForSequenceClassification , BertModel , AutoTokenizer, BertForSequenceClassification


import math
import time

In [5]:
class BertClassifier(nn.Module):
    def __init__(self):
        super(BertClassifier, self).__init__()

        # 日本語学習済モデルをロードする
        # output_attentions=Trueで順伝播のときにattention weightを受け取れるようにする
        # output_hidden_state=Trueで12層のBertLayerの隠れ層を取得する
        self.bert = model = BertForSequenceClassification.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking', return_dict=True)

        # headにポジネガ予測を追加
        # 入力はBERTの出力特徴量の次元、出力はポジ・ネガの2つ
        self.cls = nn.Linear(in_features=768, out_features=2)

        # 重み初期化処理
        nn.init.normal_(self.cls.weight, std=0.02)
        nn.init.normal_(self.cls.bias, 0)

    # clsトークンのベクトルを取得する用の関数を用意
    def _get_cls_vec(self, vec):
        return vec[:,0,:].view(-1, 768)

    def forward(self, input_ids):

        # 順伝播の出力結果は辞書形式なので、必要な値のkeyを指定して取得する
        output = self.bert(input_ids)
        #attentions = output['attentions']
        hidden_states = output['hidden_states']

        # 最終４層の隠れ層からそれぞれclsトークンのベクトルを取得する
        vec1 = self._get_cls_vec(hidden_states[-1])
        vec2 = self._get_cls_vec(hidden_states[-2])
        vec3 = self._get_cls_vec(hidden_states[-3])
        vec4 = self._get_cls_vec(hidden_states[-4])

        # 4つのclsトークンを結合して１つのベクトルにする。
        vec = torch.cat([vec1, vec2, vec3, vec4], dim=1)

        # 全結合層でクラス分類用に次元を変換
        out = self.linear(vec)
        

        return F.log_softmax(out, dim=1)#, attentions


# モデル構築
classifier = BertClassifier()
classifier = BertForSequenceClassification.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking', return_dict=True)
# 訓練モードに設定
classifier.train()

print('ネットワーク設定完了')


Downloading:   0%|          | 0.00/479 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/424M [00:00<?, ?B/s]

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

ネットワーク設定完了


In [55]:
from transformers import BertModel
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')

# BERTはサブワードを含めて最大512単語まで扱える
MAX_LENGTH = 512
def bert_tokenizer(text):
    return tokenizer.encode(text, max_length=MAX_LENGTH, truncation=True, return_tensors='pt')[0]

#フィールドの設定
TEXT = torchtext.legacy.data.Field(sequential=True, tokenize=bert_tokenizer, use_vocab=False, lower=False,
                            include_lengths=True, batch_first=True, fix_length=MAX_LENGTH, pad_token=0)
LABEL = torchtext.legacy.data.Field(sequential=False, use_vocab=False)

train_data, test_data = torchtext.legacy.data.TabularDataset.splits(
    path='./', train='train.tsv', test='test.tsv', format='tsv', fields=[('Text', TEXT),('Label', LABEL)])

# BERTではミニバッチサイズは16か32が推奨される
BATCH_SIZE = 8
train_iter, test_iter = torchtext.legacy.data.Iterator.splits((train_data, test_data), batch_sizes=(BATCH_SIZE, BATCH_SIZE), repeat=False, sort=False)

dataloaders_dict = {"train": train_iter, "val": test_iter}





In [54]:

# 動作確認 検証データのデータセットで確認
batch = next(iter(test_iter))
print(batch.Text)
print(batch.Label)
print(len(batch.Text[0][0]))
len(batch.Text[0][1])

TypeError: ignored

In [69]:
# ミニバッチの1文目を確認してみる
text_minibatch_1 = (batch.Text[0][1]).numpy()

# IDを単語に戻す
text = tokenizer.convert_ids_to_tokens(text_minibatch_1)

print(text)
a = tokenizer.encode_plus(text, return_tensors='pt', padding=True, truncation=True)
#len(a['attention_mask']),len(a['input_ids']),type(a['attention_mask'])
print(a)
len(batch.Text[0][1])

['[CLS]', '当', 'セグメント', 'は', '、', '比較的', '長期', 'に', '渡り', '営業', 'を', '継続', 'し', 'て', 'いる', '店舗', 'が', '多数', 'を', '占め', 'て', 'おり', '、', '店舗', '設備', 'の', '老朽', '化', '及び', '市場', 'における', '陳', '##腐', '化', 'が', '進行', 'し', 'て', 'いる', 'と', '考え', 'て', 'おり', 'ます', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'

512

In [46]:
# 訓練パートの定義
def train(model):
    model.train() # 訓練モードで実行
    train_loss = 0
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for batch in train_iter:# train_dataloaderはword_id, mask, labelを出力する点に注意
        attention_mask = [1] * len(batch.Text[0][0])
        print(attention_mask)
        input_ids = batch.Text[0].to(device)
        labels = batch.Label[0].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, token_type_ids = None,attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        #log_str = "loss値は「%s」" % str(loss.item())
        #print(log_str)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        train_loss += loss.item()
    return train_loss

# テストパートの定義
def validation(model):
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  model.eval()# 検証モードに切り替え
  val_loss = 0
  with torch.no_grad(): # 訓練じゃないので勾配を計算しない
    for batch in test_iter:
      b_input_ids = batch[0].to(device)
      b_input_mask = batch[1].to(device)
      b_labels = batch[2].to(device)
      with torch.no_grad():
        (loss, logits) = model(b_input_ids, 
                              token_type_ids=None, 
                              attention_mask=b_input_mask,
                              labels=b_labels)
        val_loss += loss.item()
    return val_loss

In [47]:
# 学習の実行
del classifier
torch.cuda.empty_cache()

classifier = BertForSequenceClassification.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking', return_dict=True)

from transformers import AdamW
optimizer = AdamW(classifier.parameters(), lr=1e-5) # AdamWオプティマイザ

max_epoch = 20
train_loss_ = []
test_loss_ = []

#エポック数の設定
for epoch in range(max_epoch):
    train_ = train(classifier)
    train_loss_.append(train_)

    print(train_)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

AttributeError: ignored

In [2]:
train = pd.read_csv('train.tsv',sep = '\t',header = None)
test = pd.read_csv('test.tsv',sep = '\t',header = None)
train

Unnamed: 0,0,1,2
0,当社グループを取り巻く環境は、実質賃金が伸び悩むなか、消費者の皆様の生活防衛意識の高まりや節...,0,
1,春から夏にかけましては個人消費の低迷などにより、きのこの価格は厳しい状況で推移いたしました,0,
2,台湾の現地法人「台灣北斗生技股份有限公司」におきましては、ブランドの構築、企画提案などに力を...,0,
3,化成品事業におきましては、引き続き厳しい販売環境にありましたが、中核である包装資材部門におき...,0,
4,以上の結果、化成品事業の売上高は92億45百万円（同1.7％減）となりました,0,
...,...,...,...
1965,不動産業において、不動産賃貸では、当社グループが所有する賃貸用不動産は、入居率、稼働率ともに...,1,
1966,"その結果、売上高は590百万円（同3.4％増）となり、その他の事業を含めた営業収益は3,22...",1,
1967,また、貸倒引当金戻入額112百万円などの特別利益126百万円を計上したため、親会社株主に帰属...,1,
1968,当連結会計年度のスポーツ施設提供業の営業収益は450百万円（前連結会計年度比1.7％増）、セ...,1,


In [3]:
text = list(train[0])
label = list(train[1])

In [4]:
tokenizer = BertTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')

encoding = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertJapaneseTokenizer'. 
The class this function is called from is 'BertTokenizer'.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [5]:
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler
from sklearn.model_selection import train_test_split

#tensor型に変換
input_ids = torch.tensor(input_ids)
attention_mask = torch.tensor(attention_mask)
label = torch.tensor(label)

# データセットクラスの作成
# なんでTensorDataset使ってデータを分割しているかって？
# 分割しなくても良いけど、その場合、メモリが16G程度だとすぐにRAMがいっぱいになてしまうんだ
dataset = TensorDataset(input_ids, attention_mask, label)


#トレーニングデータ、評価データを9:1の割合で分割する
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# データセットを訓練用と検証用に分ける
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
print(train_dataset[0])
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100,
#                                                    stratify=y)

print('訓練データ数：{}'.format(train_size))
print('検証データ数:　{} '.format(val_size))

# データローダーの作成
batch_size = 1

# 訓練データローダー
train_dataloader = DataLoader(
            train_dataset,  
            sampler = RandomSampler(train_dataset), # ランダムにデータを取得してバッチ化
            batch_size = batch_size
        )

# 検証データローダー
validation_dataloader = DataLoader(
            val_dataset, 
            sampler = SequentialSampler(val_dataset), # 順番にデータを取得してバッチ化
            batch_size = batch_size
        )

(tensor([    2, 17035, 28444,   986,  1146,   848,   364,   362, 28450,     6,
        17035,  2444,    72,    85,    77,   144,     5,   615,   208,    11,
          848,   364,  5160, 15076, 28462, 28536, 28446, 14066,     6,   136,
         1151,   276,    14, 27044,   429,     1,   827,    53,     1,   174,
          453,  1107,   701,   815,   143,    76,     1,   615,     1,     6,
           19,   284,   264,  1528,   107,    14, 27044, 28508,  1779,     1,
          827,     1,  1625,   429,   758,     1,   174,   453,   701,  1832,
          143,    48,     1,   615,     1,    13,  1265,    42,  6785, 28454,
        11214, 28447,     3,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,

  
  import sys


In [6]:
model = BertForSequenceClassification.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking', return_dict=True)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

In [7]:
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr=1e-5) # AdamWオプティマイザ

In [8]:
# 訓練パートの定義
def train(model,train_dataloader):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.train() # 訓練モードで実行
    model.to(device)
    train_loss = 0


    for batch in train_dataloader:# train_dataloaderはword_id, mask, labelを出力する点に注意
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        optimizer.zero_grad()
        outputs = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)
        #outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        #log_str = "loss値は「%s」" % str(loss.item())
        #print(log_str)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        train_loss += loss.item()
    return train_loss

# テストパートの定義
def validation(model):
    model.eval()# 検証モードに切り替え
    val_loss = 0
    with torch.no_grad(): # 訓練じゃないので勾配を計算しない
        for batch in validation_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            with torch.no_grad():        
                (loss, logits) = model(b_input_ids, 
                                    token_type_ids=None, 
                                    attention_mask=b_input_mask,
                                    labels=b_labels)
            val_loss += loss.item()
    return val_loss


In [None]:
# 学習の実行
max_epoch = 20
train_loss_ = []
test_loss_ = []

#エポック数の設定
for epoch in range(max_epoch):
    train_ = train(model,train_dataloader)
    train_loss_.append(train_)

    print(train_)

1188.7784571497468
669.6249003758057


NameError: ignored