In [9]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

from transformers import AutoModel, AutoTokenizer

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, precision_recall_fscore_support

from tqdm import tqdm
import glob, pickle

MODEL_NAME = "cl-tohoku/bert-base-japanese-whole-word-masking"
DISTIL_MODEL_NAME = "bandainamco-mirai/distilbert-base-japanese"
print(torch.cuda.is_available())

True


In [5]:
# タスク用Datasetクラスを定義
class LivedoorDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = { key: torch.tensor(val[idx]) for key, val in self.encodings.items() }
        item["labels"] = torch.tensor(self.labels[idx]) # item["label"]でなくitem["labels"]が正しい！
        return item
    
    def __len__(self):
        return len(self.labels)     

In [6]:
# 保存済みDatasetをpklからロード
with open("../../DataSet/livedoor_news_corpus/dataloader/ds_train.pkl", "rb") as f:
    ds_train = pickle.load(f)
with open("../../DataSet/livedoor_news_corpus/dataloader/ds_valid.pkl", "rb") as f:
    ds_valid = pickle.load(f)
with open("../../DataSet/livedoor_news_corpus/dataloader/ds_test.pkl", "rb") as f:
    ds_test = pickle.load(f)

In [13]:
# DataLoaderクラスを作成
batch_size_train = 16
batch_size_val = 64

bt_train = DataLoader(ds_train, batch_size=batch_size_train)
bt_val = DataLoader(ds_valid, batch_size=batch_size_val)
dataloader_dict = {"train": bt_train, "val": bt_val}

## DistilBERTモデル

In [7]:
"""
DistilBERT版
"""
class DistilBertClassifier(nn.Module):
    def __init__(self, pretrained_model):
        super(DistilBertClassifier, self).__init__()
        
        self.distil_bert = pretrained_model
        self.classifier = nn.Linear(in_features=768, out_features=9) #9カテゴリのクラス分類
        
        # 重み初期化
        nn.init.normal_(self.classifier.weight, std=.02)
        nn.init.normal_(self.classifier.bias, 0)
        
    def forward(self, input_ids):
        output = self.distil_bert(input_ids)
                
        # CLSトークンの埋め込みベクトル取得
        vec = output.last_hidden_state[:,0,:]
        output_classifier = self.classifier(vec)
        
        return output_classifier

In [10]:
distil_model = AutoModel.from_pretrained(DISTIL_MODEL_NAME)
distil_classifier = DistilBertClassifier(distil_model)

Some weights of the model checkpoint at bandainamco-mirai/distilbert-base-japanese were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# 学習パラメータの設定
# まずは全部OFF
for param in distil_classifier.parameters():
    param.requires_grad = False

# DistilBERTの最後の層だけ更新ON
# BERT-baseは .encoder.layer[-1]でしたが、
# DistilBERTの場合は、上で構造を確認したように以下のように .transfomer.layer[-1]となります。
for param in distil_classifier.distil_bert.transformer.layer[-1].parameters():
    param.requires_grad = True
# クラス分類のところもON
for param in distil_classifier.classifier.parameters():
    param.requires_grad = True


In [12]:
# 事前学習済の箇所は学習率小さめ、最後の全結合層は大きめにする。
# こちらも忘れずにDistilBERT用に変更
optimizer = optim.Adam([
    {'params': distil_classifier.distil_bert.transformer.layer[-1].parameters(), 'lr': 5e-5},
    {'params': distil_classifier.classifier.parameters(), 'lr': 1e-4}
])

# 損失関数の設定
loss_function = nn.CrossEntropyLoss()

## モデルの定義ここまで　ここから学習ステップの定義
  
  

In [20]:
def train_model(net, dataloader_dict, loss_function, optimizer, num_epochs):
    # GPU利用可能ならそうする
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    net.to(device) # モデルをGPU or CPUに送る

    torch.backends.cudnn.benchmark = True # 高速化するらしい？

    # 以下、epochループ
    for epoch in tqdm(range(num_epochs)):
        # trainとvalそれぞれのphaseを実施
        for phase in ["train", "val"]:
            batch_size = dataloader_dict[phase].batch_size
            if phase == "train":
                net.train()
            else:
                net.eval()
            
            epoch_loss = .0
            epoch_corrects = 0
            iteration = 1
            
            # 各ミニバッチでの処理
            for batch in (dataloader_dict[phase]):
                inputs = batch["input_ids"].to(device)
                labels = batch["labels"].to(device)
                
                optimizer.zero_grad()
                
                # feed foward処理
                with torch.set_grad_enabled(phase=="train"):
                    outputs = net(inputs)
                    loss = loss_function(outputs, labels) # CrossEntropyLoss内部でsoftmaxまで実施する
                    _, preds = torch.max(outputs, 1)
                    
                    if phase == "train": # trainではback prop
                        loss.backward()
                        optimizer.step()
                        
                        if (iteration % 50 == 0):
                            acc = (torch.sum(preds == labels.data)).double() / batch_size
                            print(f"It:{iteration:3d}|Loss: {loss.item():.4f}|accuracy:{acc:.4f}")
                            
                iteration += 1
                epoch_loss += loss.item() * batch_size
                epoch_corrects += torch.sum(preds == labels.data)
        
        # epoch毎に評価結果を出力
        epoch_loss = epoch_loss / len(dataloader_dict[phase].dataset)
        epoch_acc = epoch_corrects.double() / len(dataloader_dict[phase].dataset)
        print(f"Epoch {epoch+1} / {num_epochs} | {phase} | Loss: {epoch_loss:.4f} | Acc: {epoch_acc}")
    
    return net #訓練後のモデルを出力    

In [21]:
# ファインチューニング
distil_classifier_trained = train_model(distil_classifier, dataloader_dict, loss_function, optimizer, num_epochs=3)

  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

It: 50|Loss: 1.2516|accuracy:0.6250
It:100|Loss: 1.5270|accuracy:0.4375
It:150|Loss: 1.2051|accuracy:0.5625
It:200|Loss: 0.9010|accuracy:0.6875
It:250|Loss: 0.8216|accuracy:0.8750
It:300|Loss: 0.8774|accuracy:0.7500
It:350|Loss: 0.9895|accuracy:0.6875


 33%|███████████████████████████▋                                                       | 1/3 [01:51<03:42, 111.33s/it]

Epoch 1 / 3 | val | Loss: 0.8600 | Acc: 0.7584803256445047
It: 50|Loss: 0.7894|accuracy:0.7500
It:100|Loss: 1.1637|accuracy:0.5000
It:150|Loss: 0.7549|accuracy:0.6875
It:200|Loss: 0.6367|accuracy:0.8750
It:250|Loss: 0.5078|accuracy:0.9375
It:300|Loss: 0.6684|accuracy:0.8750
It:350|Loss: 0.7617|accuracy:0.6250


 67%|███████████████████████████████████████████████████████▎                           | 2/3 [03:45<01:52, 112.71s/it]

Epoch 2 / 3 | val | Loss: 0.6313 | Acc: 0.8195386702849389
It: 50|Loss: 0.6367|accuracy:0.8750
It:100|Loss: 0.7394|accuracy:0.7500
It:150|Loss: 0.4501|accuracy:0.9375
It:200|Loss: 0.4950|accuracy:0.8125
It:250|Loss: 0.5746|accuracy:0.8750
It:300|Loss: 0.5423|accuracy:0.8750
It:350|Loss: 0.5275|accuracy:0.7500


100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [05:38<00:00, 112.79s/it]

Epoch 3 / 3 | val | Loss: 0.5417 | Acc: 0.8222523744911804





In [24]:
# pkl保存
with open("./distil_model/distil_model_fine-tuned.pkl", "wb") as f:
    pickle.dump(distil_classifier_trained, f)

## 使ってみる

In [25]:
with open("./distil_model/distil_model_fine-tuned.pkl", "rb") as f:
    trained_distil_model = pickle.load(f)

In [28]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
softmax = nn.Softmax(dim=1)

In [59]:
# テストデータ
sentence = "iPhoneをカメラ代わりに使っているという人は意外に多いのではないだろうか。そんな人はいっそのことiPhoneをカメラにしてしまうケースを付けてみてはどうだろう？GIZMON iCAは、シャッターボタンで撮影することもでき実際のカメラのようにiPhoneを利用することができる。さらにビューファインダーや、自分撮り用のミラーもついている。カメラ機能が上がっているiPhoneをよりカメラらしく利用。ケースとしてもかわいいため、男女問わず利用することができる。価格は3980円。気になる人はチェックしてみよう。GIZMON iCA    ■関連記事・ネットはバカ発見器？　SNSの普及で拡大する実名制に要注意【話題】・スマホを自分で除菌するのが新常識に！？　手軽な小型紫外線滅菌器「UVサニタイザー」がオススメ【売れ筋チェック】・YES！　高須度チェック！　予想の斜め上を行く高須クリニックのスペシャルサイトに困惑【話題】・iPhoneを本格的なカメラに変えるかわいすぎるケースが登場！【売れ筋】・楽しいことにもいつかは終わりが・・・Google「Picnik」終了のお知らせが気になる【話題】"
label = 2


In [66]:
# テキストをBERTモデルへの入力にエンコード
encoding = tokenizer(sentence, return_tensors="pt")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
encoding = { k:v.to(device) for k,v in encoding.items() }

with torch.no_grad():
    # 推論
    output = trained_distil_model(encoding["input_ids"])
    # Softmaxで分類スコア（output.logits）から確率へ
    prob = softmax(output)
    # データをCPUに戻す
    probs = prob.cpu().numpy()
    # 予測ラベルとその確率
    predicted_cls = probs.argmax(-1)[0]
    predicted_prob = probs[0][predicted_cls]

# 予測結果と正解
print(f"True Label: {label}")
print(f"Prediction: {predicted_cls}, Prob: {predicted_prob*100:.4}%")

True Label: 2
Prediction: 2, Prob: 83.69%
