In [3]:
# ====================
# 89. Transfer learning from a pretrained language model
# ====================
# Install the libraries (torch is pinned to 1.6.0; transformers is left unpinned)
! pip install --quiet torch==1.6.0
! pip install --quiet transformers

# ライブラリの読み込み
import os
import pandas as pd
import time
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split


# データセットのダウンロード
if os.path.isfile("/content/NewsAggregatorDataset.zip") == False:
    ! wget https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
    ! unzip NewsAggregatorDataset.zip
    # 読込時のエラー回避のためダブルクォーテーションをシングルクォーテーションに置換
    ! sed -e 's/"/'\''/g' ./newsCorpora.csv > ./newsCorpora_re.csv
df = pd.read_csv('/content/newsCorpora.csv', sep='\t', names=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])
df1 = df.loc[df['PUBLISHER'].isin(['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']), ['TITLE', 'CATEGORY']]

# データの分割 stratifyを設定することで訓練データとテストデータの指定した中身の割合を同じにすることができる
train, temp = train_test_split(df1, test_size=0.2, shuffle=True, random_state=0, stratify=df1['CATEGORY'])
test, valid = train_test_split(temp, test_size=0.5, shuffle=True, random_state=0, stratify=temp['CATEGORY'])

# データの保存
! mkdir -p /content/data/
train.to_csv('/content/data/train.txt', sep="\t", index=False)
test.to_csv('/content/data/test.txt', sep="\t", index=False)
valid.to_csv('/content/data/valid.txt', sep="\t", index=False)

--2022-03-10 12:16:48--  https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 29224203 (28M) [application/x-httpd-php]
Saving to: ‘NewsAggregatorDataset.zip’


2022-03-10 12:16:48 (61.0 MB/s) - ‘NewsAggregatorDataset.zip’ saved [29224203/29224203]

Archive:  NewsAggregatorDataset.zip
  inflating: 2pageSessions.csv       
   creating: __MACOSX/
  inflating: __MACOSX/._2pageSessions.csv  
  inflating: newsCorpora.csv         
  inflating: __MACOSX/._newsCorpora.csv  
  inflating: readme.txt              
  inflating: __MACOSX/._readme.txt   


In [4]:
# BERT-based classifier
class BERTClass(torch.nn.Module):
    """Pretrained ``bert-base-uncased`` encoder with a dropout + linear head.

    Args:
        drop_rate: Dropout probability applied before the linear layer.
        otuput_size: Number of units produced by the linear layer. The
            spelling is kept because it is part of the constructor signature.
    """

    def __init__(self, drop_rate, otuput_size):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = torch.nn.Dropout(drop_rate)
        # 768 input features, chosen to match the width of the BERT output.
        self.fc = torch.nn.Linear(768, otuput_size)

    def forward(self, ids, mask):
        """Return the linear-layer output for a batch of token ids.

        Args:
            ids: Token-id tensor passed to BERT as its first argument.
            mask: Tensor passed to BERT as ``attention_mask``.
        """
        # With return_dict=False BERT returns a tuple; only its second element
        # (the pooled output, per the transformers documentation) is used.
        _sequence_output, pooled = self.bert(ids, attention_mask=mask, return_dict=False)
        return self.fc(self.drop(pooled))

In [5]:
# Dataset construction
class CreateDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        x: Texts, either a pandas Series or any positionally indexable
            sequence (e.g. a list).
        y: Label vectors, indexed positionally and aligned with ``x``.
        tokenizer: Object exposing ``encode_plus`` (a ``BertTokenizer`` in this
            notebook).
        max_len: Length every encoded text is padded / truncated to.
    """

    def __init__(self, x, y, tokenizer, max_len):
        self.x = x
        self.y = y
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of samples, taken from the label container."""
        return len(self.y)

    def __getitem__(self, index):
        """Return a dict with ``ids``, ``mask`` (long) and ``labels`` (float)."""
        # DataLoader hands out positions 0..len-1 and self.y is indexed by
        # position, so the text must be fetched by position as well. Plain
        # `series[index]` is a *label* lookup and fails (or silently pairs the
        # wrong text with a label) when the Series has a non-default index.
        text = self.x.iloc[index] if hasattr(self.x, 'iloc') else self.x[index]
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        # Explicit dtypes instead of the legacy LongTensor / Tensor constructors.
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'labels': torch.tensor(self.y[index], dtype=torch.float)
        }

In [6]:
# Loss and accuracy computation
def calculate_loss_accuracy(model, dataloader, criterion=None, device=None):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    The model is put into evaluation mode for the duration of the pass (so
    layers such as dropout are inactive) and its previous mode is restored
    afterwards.

    Args:
        model: Callable taking ``(ids, mask)`` and returning per-class scores.
        dataloader: Sized iterable of batches; each batch is a dict with the
            keys ``'ids'``, ``'mask'`` and ``'labels'``. The labels are reduced
            with argmax over the last dimension to obtain the true class.
        criterion: Optional loss function called as ``criterion(outputs,
            labels)``. When ``None`` the returned loss is ``0.0``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(loss, accuracy)``: the sum of per-batch losses divided by the
        number of batches, and the fraction of correctly classified samples.

    Raises:
        ZeroDivisionError: If ``dataloader`` has no batches or no samples.
    """
    # Remember the caller's mode; objects without a `training` flag (plain
    # callables) are left untouched.
    was_training = getattr(model, 'training', False)
    if was_training:
        model.eval()

    loss = 0.0
    total = 0
    correct = 0
    try:
        with torch.no_grad():
            for data in dataloader:
                ids = data['ids'].to(device)
                mask = data['mask'].to(device)
                labels = data['labels'].to(device)
                outputs = model(ids, mask)
                if criterion is not None:
                    loss += criterion(outputs, labels).item()
                # Compare predicted and true class indices as tensors instead
                # of looping over them in Python.
                pred = torch.argmax(outputs, dim=-1)
                gold = torch.argmax(labels, dim=-1)
                total += gold.numel()
                correct += (pred == gold).sum().item()
    finally:
        if was_training:
            model.train()

    return loss / len(dataloader), correct / total

In [7]:
def train_model(dataset_train, dataset_valid, batch_size, model, criterion, optimizer, num_epochs, device=None):
    """Train ``model`` and print loss / accuracy for both splits after each epoch.

    Args:
        dataset_train: Dataset used for the optimisation steps (shuffled).
        dataset_valid: Dataset evaluated after every epoch.
        batch_size: Mini-batch size for the training loader.
        model: Module called as ``model(ids, mask)``.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per mini-batch.
        num_epochs: Number of passes over ``dataset_train``.
        device: Device the model and batch tensors are moved to.
    """
    model.to(device)

    train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
    # The validation set is served as a single batch holding every sample.
    valid_loader = DataLoader(dataset_valid, batch_size=len(dataset_valid), shuffle=False)

    for epoch in range(num_epochs):
        start = time.time()

        # --- optimisation pass ---
        model.train()
        for batch in train_loader:
            ids, mask, labels = (batch[key].to(device) for key in ('ids', 'mask', 'labels'))
            optimizer.zero_grad()
            batch_loss = criterion(model(ids, mask), labels)
            batch_loss.backward()
            optimizer.step()

        # --- evaluation pass on both splits ---
        model.eval()
        loss_train, acc_train = calculate_loss_accuracy(model, train_loader, criterion, device)
        loss_valid, acc_valid = calculate_loss_accuracy(model, valid_loader, criterion, device)

        # The reported time covers training and evaluation of this epoch.
        elapsed_time = time.time() - start

        print(f'epoch: {epoch + 1}, '
              f'loss_train: {loss_train:.4f}, accuracy_train: {acc_train:.4f}, '
              f'loss_valid: {loss_valid:.4f}, accuracy_valid: {acc_valid:.4f}, '
              f'{elapsed_time:.4f}sec')

In [8]:
# Reload the splits written by the data-preparation cell.
train = pd.read_table('/content/data/train.txt')
valid = pd.read_table('/content/data/valid.txt')
test = pd.read_table('/content/data/test.txt')

# One-hot encode CATEGORY. The column order fixes which class index argmax
# yields (b=0, e=1, t=2, m=3). The cast to float32 keeps the labels numeric
# regardless of the dummy dtype pandas produces (uint8 or bool, depending on
# the pandas version).
LABEL_COLUMNS = ['CATEGORY_b', 'CATEGORY_e', 'CATEGORY_t', 'CATEGORY_m']
y_train, y_valid, y_test = (
    pd.get_dummies(split, columns=['CATEGORY'])[LABEL_COLUMNS].values.astype('float32')
    for split in (train, valid, test)
)

max_len = 30
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset_train = CreateDataset(train['TITLE'], y_train, tokenizer, max_len)
dataset_valid = CreateDataset(valid['TITLE'], y_valid, tokenizer, max_len)
dataset_test = CreateDataset(test['TITLE'], y_test, tokenizer, max_len)

# Hyper-parameters
DROP_RATE = 0.4
OUTPUT_SIZE = 4
BATCH_SIZE = 16
NUM_EPOCHS = 4
LEARNING_RATE = 2e-5
EVAL_BATCH_SIZE = 256  # batch size for the final accuracy pass below
SEED = 0
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed the RNG before the classifier head is initialised and before the
# training loader starts shuffling, so repeated runs start from the same state.
torch.manual_seed(SEED)

model = BERTClass(DROP_RATE, OUTPUT_SIZE)

# The targets are one-hot vectors, so the loss is computed per output unit.
criterion = torch.nn.BCEWithLogitsLoss()

optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

train_model(dataset_train, dataset_valid, BATCH_SIZE, model, criterion, optimizer, NUM_EPOCHS, DEVICE)

# Final accuracy on the training and test splits. The data is evaluated in
# bounded batches instead of one batch holding the entire split: accuracy is a
# count over samples, so the result is the same, but memory use no longer grows
# with the size of the dataset.
dataloader_train = DataLoader(dataset_train, batch_size=EVAL_BATCH_SIZE, shuffle=False)
dataloader_test = DataLoader(dataset_test, batch_size=EVAL_BATCH_SIZE, shuffle=False)

# No criterion is passed, so the returned loss is 0.0 and is discarded.
_, acc_train = calculate_loss_accuracy(model, dataloader_train, device=DEVICE)
_, acc_test = calculate_loss_accuracy(model, dataloader_test, device=DEVICE)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


epoch: 1, loss_train: 0.0756, accuracy_train: 0.9546, loss_valid: 0.1120, accuracy_valid: 0.9213, 72.7782sec
epoch: 2, loss_train: 0.0410, accuracy_train: 0.9765, loss_valid: 0.1090, accuracy_valid: 0.9205, 72.1284sec
epoch: 3, loss_train: 0.0229, accuracy_train: 0.9892, loss_valid: 0.1032, accuracy_valid: 0.9333, 71.3996sec
epoch: 4, loss_train: 0.0173, accuracy_train: 0.9918, loss_valid: 0.1196, accuracy_valid: 0.9273, 71.4207sec
0.9918478260869565
0.9242878560719641


In [10]:
# Report the final accuracies: acc_train for the training split and acc_test
# for the test split. The printed labels are Japanese for
# "accuracy (training data)" and "accuracy (evaluation data)".
print(f'正解率（訓練データ）：{acc_train}')
print(f'正解率（評価データ）：{acc_test}')

正解率（訓練データ）：0.9918478260869565
正解率（評価データ）：0.9242878560719641
