In [84]:
import yaml

with open("./config/config.yaml") as file:
    config = yaml.safe_load(file)

In [52]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import  BertTokenizer

class BERTDataset(Dataset):
    def __init__(self, tokenizer, corpus, labels, maxlen=300):
        super(BERTDataset, self).__init__()
        self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.inputs = [self.tokenize(text) for text in corpus]
        self.labels = [np.array(label) for label in labels]

    def tokenize(self, data):
        data = self.tokenizer(data, max_length=self.maxlen, padding="max_length", truncation=True,)
        return np.array(data['input_ids']), np.array(data['token_type_ids']), np.array(data['attention_mask'])

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.inputs[idx], self.labels[idx]

In [55]:
import torch
import torch.nn as nn
from transformers import BertModel


class BERTClassifier(nn.Module):
    def __init__(self, bert, hidden_size, num_classes, dr_rate=0.5):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dropout = nn.Dropout(p=dr_rate)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, token_type_ids, attention_mask):
        _, pooler = self.bert(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        output = self.dropout(pooler)
        output = self.classifier(output)
        return output

In [56]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained("beomi/kcbert-base")
bert = BertModel.from_pretrained("beomi/kcbert-base", return_dict=False)

max_len = 512
batch_size = 4
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
learning_rate = 5e-5

df_token = pd.read_csv(config['path']['train'])
corpus = [t for t in df_token['text']]
label = to_categorical(LabelEncoder().fit_transform(df_token['label']))

train_dataset = BERTDataset(tokenizer, corpus, label, maxlen=128)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=4, shuffle=True)

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [58]:
from transformers.optimization import get_cosine_schedule_with_warmup

model = BERTClassifier(bert, hidden_size=768, num_classes=8)
model.to(device)

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [75]:
losses = []
for epoch in range(num_epochs):
    model.train()
    for batch_id, (inputs, label) in enumerate(tqdm(train_dataloader, ncols=0)):
        input_ids = inputs[0].long().to(device)
        token_type_ids = inputs[1].long().to(device)
        attention_mask = inputs[2].long().to(device)
        label = label.to(device)

        optimizer.zero_grad()
        out = model(input_ids, token_type_ids, attention_mask)
        loss = loss_fn(out, label)
        loss.backward()
        losses.append(loss.item())

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()
    print(sum(losses)/len(losses))
    state = {'Epoch': epoch,
             'State_dict': model.state_dict(),
             'Optimizer': optimizer.state_dict()}

100% 549/549 [01:09<00:00,  7.87it/s]


0.06414136195514827


100% 549/549 [01:10<00:00,  7.84it/s]


0.075216880026873


100% 549/549 [01:07<00:00,  8.18it/s]


0.12975959540356968


100% 549/549 [01:10<00:00,  7.84it/s]


0.16630310353939887


100% 549/549 [01:04<00:00,  8.50it/s]

0.17875592456867345





In [85]:
df_test = pd.read_csv(config['path']['test'])
test_corpus = [t for t in df_test['text']]
test_label = to_categorical(LabelEncoder().fit_transform(df_test['label']))

test_dataset = BERTDataset(tokenizer, test_corpus, test_label, maxlen=128)
test_dataloader = DataLoader(test_dataset, batch_size=1, num_workers=4, shuffle=False)

In [87]:
model.eval()
fail = []
with torch.no_grad():
    for idx, (inputs, label) in enumerate(tqdm(test_dataloader, ncols=0)):
        input_ids = inputs[0].long().to(device)
        token_type_ids = inputs[1].long().to(device)
        attention_mask = inputs[2].long().to(device)

        out = model(input_ids, token_type_ids, attention_mask)
        pred = np.argmax(out.cpu().detach().numpy(), axis=1).item()
        label = np.argmax(label, axis=1).item()
        if pred != label: fail.append(idx)

100% 942/942 [00:19<00:00, 48.03it/s]


In [88]:
print(len(fail)/len(test_dataset))

0.1602972399150743


In [89]:
df_test.iloc[fail]

Unnamed: 0,text,label
4,정부 가일 부터 코로나 로 피해 를 입은소 상공 인 과 고용 취약 계층 에 버팀목 ...,society
19,한국 쇼트트랙 간판 선수 심석희 를 성폭행 한 혐의 로 기소 된 조 재범 전 쇼트트...,society
20,여자 쇼트트랙 국가대표 심석희 를 상습 성폭행 한 혐의 로 기소 된 조 재범 전 국...,society
36,화성시 가 봄철 미세먼지 계절 관리제 기간 을 맞아이 달말까지 농촌 지역 폐기물 불...,politics
49,앙겔라 메르켈 독일 총리 가 지난 일 현 지 시간 백신 생산 업체 및 주 총리 들 ...,economy
...,...,...
923,웹툰 을 서비스 하 는 카카오페이지 와 음원 및 드라마 제작 까지 아우르는 카카오 ...,entertain
933,오늘 군 에서는 코로나 신규 확 진자 가명 발생 해 누적 확 진자 가명 으로 늘어났...,international
936,행정구역 합치는 특별 지자체 적 극지 원 사견 전 제 경기도 는 남북 으로 나눠야 ...,society
937,일 본 수도권 일부 지역 에 신종 코로나바이러스 감염증 코로나 긴급 사태 가 발령 ...,society
