In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorch_transformers import BertTokenizer, BertForSequenceClassification

# Checkpoint name shared by the tokenizer and the classification model.
# Both are loaded from this single constant so the two cannot drift apart
# if the checkpoint is ever swapped.
PRETRAINED_NAME = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
pretrained_model = BertForSequenceClassification.from_pretrained(PRETRAINED_NAME)

In [2]:
def make_dataset(fname):
    """Read a tab-separated news file into parallel title and label lists.

    Each line is expected to look like ``category<TAB>title``. The category
    codes ``b``, ``t``, ``e`` and ``m`` map to the class ids 0, 1, 2 and 3
    respectively. Fields after the second one are ignored.

    Lines that are blank, that have no second tab-separated field, or whose
    category is not one of the four known codes are skipped as a whole, so
    ``titles[i]`` always belongs to ``labels[i]``.

    Args:
        fname: Path of the text file to read.

    Returns:
        A ``(titles, labels)`` tuple: a list of title strings and a list of
        integer class ids, both of the same length.
    """
    category_to_label = {'b': 0, 't': 1, 'e': 2, 'm': 3}
    titles = []
    labels = []

    with open(fname) as file:
        for line in file:
            fields = line.rstrip('\n').split('\t')
            if len(fields) < 2:
                # Blank or malformed line: there is no title field to read.
                continue

            # The category is the whole first field, not merely the first
            # character of the line.
            category, title = fields[0], fields[1]

            label = category_to_label.get(category)
            if label is None:
                # Unknown category: drop the title too, otherwise every
                # following title would be paired with the wrong label.
                continue

            titles.append(title)
            labels.append(label)

    return titles, labels

In [3]:
def get_loss_and_accyracy(model, titles, labels):
    """Compute the mean loss and the accuracy of ``model`` on a labelled set.

    Each title is encoded with the module-level ``tokenizer`` and passed
    through ``model`` one example at a time; the loss is computed with the
    module-level ``criterion``. Both globals must be defined before this
    function is called.

    The whole pass runs under ``torch.no_grad()`` with the model switched to
    eval mode, so no autograd graph is built or kept alive by the running
    sums. The model's previous train/eval mode is restored before returning.

    Args:
        model: Module called with one encoded title (a tensor with a leading
            batch dimension of 1); the first element of its output holds the
            class scores for that title.
        titles: Iterable of title strings.
        labels: Sequence of integer class ids, parallel to ``titles``.

    Returns:
        A ``(mean_loss, accuracy)`` tuple, both divided by ``len(labels)``.
        ``mean_loss`` is a 0-d tensor without autograd history and
        ``accuracy`` is a float.

    Raises:
        ZeroDivisionError: If ``labels`` is empty.
    """
    running_loss = 0
    correct_count = 0

    # Remember the caller's mode so evaluation does not silently change it.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for title, label in zip(titles, labels):
                # unsqueeze(0) adds the batch dimension for a single example.
                input_id = torch.tensor(
                    tokenizer.encode(title, add_special_tokens=True)
                ).unsqueeze(0)
                target = torch.tensor([label])

                scores = model(input_id)[0]
                running_loss += criterion(scores, target)

                # Index of the highest score along the class dimension.
                predicted = torch.max(scores, 1)[1]
                correct_count += (predicted == target).sum().item()
    finally:
        model.train(was_training)

    return running_loss / len(labels), correct_count / len(labels)

In [22]:
import torch.optim as optim

# Tunable settings for this experiment.
NUM_CLASSES = 4        # size of the replacement classification layer
LEARNING_RATE = 1e-3   # lr: the value that gave the best accuracy at epoch 10 during validation
NUM_EPOCHS = 10

model = pretrained_model
# Freeze every pretrained weight; only the freshly created classification
# layer below is trainable.
for param in model.parameters():
    param.requires_grad = False
classifier_input = model.classifier.in_features
model.classifier = nn.Linear(classifier_input, NUM_CLASSES)
criterion = nn.CrossEntropyLoss()
# Only the new classifier has trainable parameters, so only those are handed
# to the optimizer.
optimizer = optim.SGD(model.classifier.parameters(), lr=LEARNING_RATE)
# NOTE(review): model.train() is never called in this cell, so the model stays
# in whatever mode from_pretrained returned it in -- confirm that is intended.

titles, labels = make_dataset('../chapter_6/train.txt')
titles_test, labels_test = make_dataset('../chapter_6/test.txt')

# Tokenization does not change between epochs, so encode each training title
# once up front instead of once per epoch. unsqueeze(0) adds the batch
# dimension for a single example.
train_examples = [
    (
        torch.tensor(tokenizer.encode(title, add_special_tokens=True)).unsqueeze(0),
        torch.tensor([label]),
    )
    for title, label in zip(titles, labels)
]

for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    correct_count = 0

    for input_id, label in train_examples:
        optimizer.zero_grad()
        outputs = model(input_id)
        loss = criterion(outputs[0], label)
        loss.backward()
        optimizer.step()
        # .item() stores a plain float, so the running sum does not keep the
        # autograd history of every training example alive.
        running_loss += loss.item()
        # Index of the highest score along the class dimension.
        pre_label = torch.max(outputs[0], 1)[1]
        if pre_label == label:
            correct_count += 1
    print('[{0}] 訓練データ上の損失: {1}\t訓練データ上の正解率: {2}'.format(epoch + 1, running_loss / len(labels), correct_count / len(labels)))
print('')

print('RESULT')
loss_test, accuracy_test = get_loss_and_accyracy(model, titles_test, labels_test)
print('評価データ上の損失: {0}\t評価データ上の正解率: {1}'.format(loss_test, accuracy_test))

[1] 訓練データ上の損失: 1.1112159490585327	訓練データ上の正解率: 0.5405734632683659
[2] 訓練データ上の損失: 1.0670416355133057	訓練データ上の正解率: 0.5763680659670165
[3] 訓練データ上の損失: 1.049731969833374	訓練データ上の正解率: 0.5868628185907047
[4] 訓練データ上の損失: 1.0391135215759277	訓練データ上の正解率: 0.5909857571214393
[5] 訓練データ上の損失: 1.0314867496490479	訓練データ上の正解率: 0.594171664167916
[6] 訓練データ上の損失: 1.025572419166565	訓練データ上の正解率: 0.597263868065967
[7] 訓練データ上の損失: 1.0207574367523193	訓練データ上の正解率: 0.5988568215892054
[8] 訓練データ上の損失: 1.0167187452316284	訓練データ上の正解率: 0.6013868065967016
[9] 訓練データ上の損失: 1.0132509469985962	訓練データ上の正解率: 0.6037293853073463
[10] 訓練データ上の損失: 1.0102144479751587	訓練データ上の正解率: 0.6052286356821589

RESULT
評価データ上の損失: 0.8935148119926453	評価データ上の正解率: 0.5727136431784108
