In [1]:
import os
import numpy as np
import math
import matplotlib.pyplot as plt
import json
import pandas as pd
from IPython.display import display
from tqdm import tqdm, tqdm_notebook, trange
import sentencepiece as spm
import wget
import import_ipynb
import BERT

import torch
import torch.nn as nn
import torch.nn.functional as F

importing Jupyter notebook from BERT.ipynb


In [2]:
# Load the SentencePiece model used to convert pieces into vocabulary ids.
# NOTE(review): this is an absolute, machine-specific path — consider making it
# relative to a configurable data directory so the notebook runs elsewhere.
vocab_file = "/home/studio/바탕화면/web-crawler/kowiki/vocab_32000/kowiki.model"
vocab = spm.SentencePieceProcessor()
# Last expression of the cell, so the value returned by `load` is displayed as output.
vocab.load(vocab_file)

True

In [3]:
class BinaryClassification(nn.Module):
    """BERT encoder followed by a bias-free linear classification head.

    `config` is passed to `BERT.BERT` unchanged. This class itself reads
    `config.d_model` (input width of the head) and `config.n_output`
    (number of logits produced by the head).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        # Encoder; exposed as `self.bert` so a checkpoint can be loaded into it.
        self.bert = BERT.BERT(config)
        # Classifier head: d_model -> n_output, without a bias term.
        self.projection_cls = nn.Linear(
            in_features=config.d_model,
            out_features=config.n_output,
            bias=False,
        )

    def forward(self, inputs, segments):
        """Run the encoder and classify each sequence.

        Returns `(logits_cls, attn_probs)`. Per the original shape notes the
        logits are (bs, n_output) and `attn_probs` is a list of
        (bs, n_head, n_enc_seq, n_enc_seq) tensors — the latter is produced by
        the encoder and returned untouched.
        """
        # The encoder returns three values; the first (per-token outputs) is not
        # needed here, the second is the per-sequence vector fed to the head.
        _, sequence_vector, attn_probs = self.bert(inputs, segments)
        return self.projection_cls(sequence_vector), attn_probs

In [4]:
class DataSet(torch.utils.data.Dataset):
    """In-memory classification dataset read from a JSON-lines file.

    Every non-blank line of `infile` is parsed as a JSON object that must carry
    a "label" entry and a "doc" entry. "doc" is iterated element by element and
    each element is mapped to an id with `vocab.piece_to_id`; the resulting ids
    are wrapped in the ids of "[CLS]" and "[SEP]". All segment ids are 0.

    Args:
        vocab: object exposing `piece_to_id(piece)` (a SentencePiece processor
            in this notebook).
        infile: path of the JSON-lines file to read.

    Raises:
        OSError: if `infile` cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks "label" or "doc".
    """

    def __init__(self, vocab, infile):
        self.vocab = vocab
        self.labels = []
        self.sentences = []
        self.segments = []

        # Read as UTF-8 explicitly: the corpus is non-ASCII text and the platform
        # default encoding is not guaranteed to be UTF-8.
        with open(infile, "r", encoding="utf-8") as f:
            # First pass only counts lines so the progress bar has a total.
            line_cnt = sum(1 for _ in f)

        # The special-token ids do not change per line; look them up once.
        cls_id = vocab.piece_to_id("[CLS]")
        sep_id = vocab.piece_to_id("[SEP]")

        with open(infile, "r", encoding="utf-8") as f:
            for line in tqdm(f, total=line_cnt, desc="Loading Dataset", unit=" lines"):
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue
                data = json.loads(line)
                sentence = [cls_id] + [vocab.piece_to_id(p) for p in data["doc"]] + [sep_id]
                self.labels.append(data["label"])
                self.sentences.append(sentence)
                # Single-sentence input: every position belongs to segment 0.
                self.segments.append([0] * len(sentence))

    def __len__(self):
        """Return the number of examples, checking the parallel lists agree."""
        assert len(self.labels) == len(self.sentences)
        assert len(self.labels) == len(self.segments)
        return len(self.labels)

    def __getitem__(self, item):
        """Return `(label, token_ids, segment_ids)` for example `item` as tensors."""
        return (torch.tensor(self.labels[item]),
                torch.tensor(self.sentences[item]),
                torch.tensor(self.segments[item]))

In [5]:
def data_collate_fn(inputs):
    """Merge a list of `(label, token_ids, segment_ids)` samples into one batch.

    Token and segment sequences are right-padded with 0 to the longest sequence
    in the batch; labels are stacked along a new leading dimension.

    Returns:
        list: `[labels, token_ids, segment_ids]`, each a tensor whose first
        dimension is the batch.
    """
    pad = torch.nn.utils.rnn.pad_sequence

    # Transpose "list of samples" into "one tuple per field".
    label_seq, token_seq, segment_seq = zip(*inputs)

    padded_tokens = pad(token_seq, batch_first=True, padding_value=0)
    padded_segments = pad(segment_seq, batch_first=True, padding_value=0)

    return [torch.stack(label_seq, dim=0), padded_tokens, padded_segments]

In [6]:
# Mini-batch size shared by both loaders.
batch_size = 16

# Training split: shuffled, batches padded by data_collate_fn.
train_dataset = DataSet(vocab, "ratings_train.json")
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collate_fn,
)

# Evaluation split: kept in file order.
test_dataset = DataSet(vocab, "ratings_test.json")
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collate_fn,
)

Loading Dataset: 100%|███████████| 149995/149995 [00:02<00:00, 54587.82 lines/s]
Loading Dataset: 100%|█████████████| 49997/49997 [00:00<00:00, 51444.83 lines/s]


In [7]:
def eval_epoch(config, model, data_loader):
    """Measure classification accuracy of `model` over `data_loader`.

    The model is switched to eval mode (and left there) and the whole pass runs
    with gradient tracking disabled. The prediction for each example is the
    index of the largest logit along dimension 1.

    Args:
        config: object whose `device` attribute is the device batches are moved to.
        model: callable as `model(inputs, segments)`; element 0 of its result
            must be the class logits.
        data_loader: sized iterable yielding `(labels, inputs, segments)` tensors.

    Returns:
        Fraction of correctly classified examples, or 0 if the loader yields nothing.
    """
    model.eval()

    n_total = 0
    n_correct = 0
    accuracy = 0
    with tqdm(total=len(data_loader), desc=f"Valid") as pbar:
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            for value in data_loader:
                labels, inputs, segments = map(lambda v: v.to(config.device), value)

                outputs = model(inputs, segments)
                logits_cls = outputs[0]
                _, indices = logits_cls.max(1)

                # Keep running counters instead of re-summing every past
                # prediction on each batch.
                match = torch.eq(indices, labels)
                n_correct += match.sum().item()
                n_total += match.numel()
                accuracy = n_correct / n_total if 0 < n_total else 0

                pbar.update(1)
                pbar.set_postfix_str(f"Acc: {accuracy:.3f}")
    return accuracy

In [8]:
def train_epoch(config, epoch, model, criterion_cls, optimizer, train_loader):
    """Run one optimisation pass over `train_loader` and return the mean batch loss.

    Args:
        config: object whose `device` attribute is the device batches are moved to.
        epoch: epoch index, used only in the progress-bar label.
        model: callable as `model(inputs, segments)`; element 0 of its result
            must be the class logits.
        criterion_cls: loss callable invoked as `criterion_cls(logits, labels)`.
        optimizer: optimizer whose `zero_grad()` / `step()` are called per batch.
        train_loader: sized iterable yielding `(labels, inputs, segments)` tensors.

    Returns:
        float: arithmetic mean of the per-batch loss values, or NaN if the
        loader yields no batches.
    """
    model.train()

    n_step = 0
    loss_sum = 0.0
    with tqdm(total=len(train_loader), desc=f"Train({epoch})") as pbar:
        for value in train_loader:
            labels, inputs, segments = map(lambda v: v.to(config.device), value)

            optimizer.zero_grad()
            outputs = model(inputs, segments)
            logits_cls = outputs[0]

            loss = criterion_cls(logits_cls, labels)
            loss.backward()
            optimizer.step()

            # Running sum keeps the displayed mean O(1) per step instead of
            # re-averaging the full loss history every batch.
            loss_val = loss.item()
            loss_sum += loss_val
            n_step += 1

            pbar.update(1)
            pbar.set_postfix_str(f"Loss: {loss_val:.3f} ({loss_sum / n_step:.3f})")
    return loss_sum / n_step if n_step else float("nan")

In [9]:
class Config(dict):
    """Dictionary whose items can also be read and written as attributes.

    `cfg.name` reads `cfg["name"]` and `cfg.name = value` stores
    `cfg["name"] = value`.
    """

    __setattr__ = dict.__setitem__

    def __getattr__(self, name):
        """Return `self[name]`; raise AttributeError when the key is absent.

        Python only calls this after normal attribute lookup has failed. The
        attribute protocol (`hasattr`, `getattr` with a default) expects
        AttributeError here, so the KeyError from the dict lookup is translated.
        """
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    @classmethod
    def load(cls, file):
        """Build a config from the JSON object stored in the file at path `file`.

        Returns an instance of `cls`, so subclasses load as themselves.
        """
        with open(file, "r", encoding="utf-8") as f:
            return cls(json.load(f))

In [10]:
# Model hyperparameters consumed by BERT.BERT. The inline notes are translated from
# the original comments; the authoritative meaning of each key is defined in BERT.ipynb.
# NOTE(review): n_head * d_head = 384 while d_model = 512 — confirm this is intended
# and matches the pretrained checkpoint.
config = Config(
    n_enc_vocab=len(vocab),    # vocabulary size
    n_enc_seq=512,             # maximum input length
    n_seg_type=2,              # number of segment embedding types
    n_layer=6,                 # number of layers
    d_model=512,               # hidden size
    i_pad=0,                   # padding value
    d_ff=1024,                 # dimension fed into the feed-forward layer
    n_head=6,                  # number of attention heads
    d_head=64,                 # attention dimension
    dropout=0.1,               # dropout
    layer_norm_epsilon=1e-12,  # normalization epsilon
)
print(config)

{'n_enc_vocab': 32007, 'n_enc_seq': 512, 'n_seg_type': 2, 'n_layer': 6, 'd_model': 512, 'i_pad': 0, 'd_ff': 1024, 'n_head': 6, 'd_head': 64, 'dropout': 0.1, 'layer_norm_epsilon': 1e-12}


In [11]:
# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
config["device"] = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Output width of the classification head.
config["n_output"] = 2
print(config)

# Optimisation settings read by train().
learning_rate, n_epoch = 5e-5, 10

{'n_enc_vocab': 32007, 'n_enc_seq': 512, 'n_seg_type': 2, 'n_layer': 6, 'd_model': 512, 'i_pad': 0, 'd_ff': 1024, 'n_head': 6, 'd_head': 64, 'dropout': 0.1, 'layer_norm_epsilon': 1e-12, 'device': device(type='cuda'), 'n_output': 2}


In [12]:
def train(model):
    """Fine-tune `model` for `n_epoch` epochs and report the best-scoring epoch.

    Relies on notebook-level globals: `config` (for the device),
    `learning_rate`, `n_epoch`, `train_loader` and `test_loader`. After every
    training epoch the model is scored with `eval_epoch` on `test_loader`.

    Args:
        model: module to train; it is moved to `config.device` in place.

    Returns:
        tuple: `(losses, scores)` — the value returned by `train_epoch` and by
        `eval_epoch` for each epoch, in order.
    """
    model.to(config.device)

    criterion_cls = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    best_epoch, best_loss, best_score = 0, 0, 0
    losses, scores = [], []
    for epoch in range(n_epoch):
        loss = train_epoch(config, epoch, model, criterion_cls, optimizer, train_loader)
        score = eval_epoch(config, model, test_loader)

        losses.append(loss)
        scores.append(score)

        # Strict comparison: on ties the earliest epoch stays "best".
        if best_score < score:
            best_epoch, best_loss, best_score = epoch, loss, score
    print(f">>>> epoch={best_epoch}, loss={best_loss:.5f}, score={best_score:.5f}")
    return losses, scores

In [13]:
# Build the classifier and initialise its encoder from the saved pretraining checkpoint.
model = BinaryClassification(config)
save_pretrain = "save_bert_pretrain.pth"
# Last expression of the cell, so whatever `load` returns is displayed as the cell output.
model.bert.load(save_pretrain)
# Fine-tuning is started in the next cell.

(17, 4.929350027394117)

In [None]:
# Fine-tune the classifier; keeps the per-epoch training losses and evaluation scores.
losses, scores = train(model)

Train(0): 100%|████████| 9375/9375 [03:02<00:00, 51.45it/s, Loss: 0.557 (0.443)]
Valid: 100%|████████████████████| 3125/3125 [02:17<00:00, 22.77it/s, Acc: 0.824]
Train(1): 100%|████████| 9375/9375 [03:02<00:00, 51.48it/s, Loss: 0.479 (0.347)]
Valid: 100%|████████████████████| 3125/3125 [02:16<00:00, 22.94it/s, Acc: 0.836]
Train(2): 100%|████████| 9375/9375 [03:01<00:00, 51.55it/s, Loss: 0.231 (0.298)]
Valid: 100%|████████████████████| 3125/3125 [02:16<00:00, 22.84it/s, Acc: 0.849]
Train(3): 100%|████████| 9375/9375 [03:01<00:00, 51.62it/s, Loss: 0.079 (0.255)]
Valid: 100%|████████████████████| 3125/3125 [02:16<00:00, 22.92it/s, Acc: 0.848]
Train(4): 100%|████████| 9375/9375 [03:03<00:00, 51.01it/s, Loss: 0.084 (0.215)]
Valid:  74%|██████████████▊     | 2311/3125 [01:19<00:50, 16.27it/s, Acc: 0.850]