In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np

from transformers import BertModel
from kobert_tokenizer import KoBERTTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the GPU when torch can see one, otherwise on the CPU.
device_type = 'cpu' if not torch.cuda.is_available() else 'cuda'
device = torch.device(device_type)
print(device)

# Presumably the column names of the sentence / label fields in the dataset;
# they are not read by any cell visible in this notebook -- TODO confirm.
sent_key, label_key = 'document', 'label'

# Sub-category names for each top-level topic. The position of a name in its
# list is the class index used when a prediction is mapped back to a name.
digital = [
    "internet", "science", "game", "it",
    "device", "mobile", "software", "others",
]
society = [
    "affair", "people", "education", "media", "woman",
    "welfare", "others", "labor", "environment",
]
economic = [
    "finance", "industry", "employ", "others", "autos",
    "stock", "estate", "consumer", "world",
]
culture = [
    "health", "life", "art", "book", "leisure", "others",
    "weather", "fashion", "home", "food", "religion",
]

# Topic name -> list of its sub-categories (insertion order matters below).
labels = dict(digital=digital, society=society, economic=economic, culture=culture)

# Despite the name, this maps index -> topic name (0: digital ... 3: culture).
label2idx = dict(enumerate(labels))


cpu


In [3]:
class BERTSentenceTransform:
    """Convert one raw sentence into BERT-style input arrays.

    The sentence is tokenized, truncated so that it fits together with the
    "[CLS]" and "[SEP]" markers, converted to ids and (optionally) padded on
    the right up to ``max_seq_length``.

    Parameters
    ----------
    tokenizer
        Object providing ``tokenize(str) -> list[str]`` and
        ``convert_tokens_to_ids(list[str]) -> list[int]``.
    max_seq_length : int
        Total length budget including the two marker tokens; must be >= 2.
    pad : bool
        When true, outputs are right-padded to ``max_seq_length``.
    pad_token_id : int
        Id written into the padded positions of ``input_ids``. Defaults to 1,
        the value that was previously hard-coded.

    Raises
    ------
    ValueError
        If ``max_seq_length`` leaves no room for "[CLS]" and "[SEP]".
    """

    def __init__(self, tokenizer, max_seq_length, pad=True, pad_token_id=1):
        if max_seq_length < 2:
            # With less than 2 slots the truncation slice bound goes negative
            # and the result silently has the wrong length.
            raise ValueError(
                f"max_seq_length must be >= 2 to hold [CLS] and [SEP], got {max_seq_length}"
            )
        self._tokenizer = tokenizer
        self._max_seq_length = max_seq_length
        self._pad = pad
        self._pad_token_id = pad_token_id

    def __call__(self, sent):
        """Return ``(input_ids, valid_length, segment_ids)`` as int32 numpy arrays.

        ``valid_length`` is a 0-d array holding the number of non-padding
        positions (markers included). ``segment_ids`` is all zeros because
        only a single sentence is encoded.
        """
        # Reserve two positions for the [CLS] / [SEP] markers.
        body_budget = self._max_seq_length - 2
        sent_tokens = self._tokenizer.tokenize(sent)[:body_budget]

        tokens = ["[CLS]", *sent_tokens, "[SEP]"]
        input_ids = list(self._tokenizer.convert_tokens_to_ids(tokens))
        segment_ids = [0] * len(tokens)
        valid_length = len(input_ids)

        if self._pad:
            padding_length = self._max_seq_length - valid_length
            input_ids.extend([self._pad_token_id] * padding_length)
            segment_ids.extend([0] * padding_length)

        return np.array(input_ids, dtype='int32'), np.array(valid_length, dtype='int32'),\
            np.array(segment_ids, dtype='int32')
    
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder's pooled output.

    Parameters
    ----------
    bert : nn.Module
        Encoder called as ``bert(input_ids=..., token_type_ids=...,
        attention_mask=..., return_dict=False)``; it must return a 2-tuple
        whose second element is the pooled representation.
    hidden_size : int
        Width of the pooled representation fed to the linear layer.
    num_classes : int
        Number of output logits.
    dr_rate : float or None
        Dropout probability applied to the pooled output; dropout is skipped
        when this is ``None`` or 0.
    params
        Unused; kept so existing call sites keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 4,
                 dr_rate = None,
                 params = None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p = dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask shaped like ``token_ids``: 1 for the first
        ``valid_length[i]`` positions of row ``i``, 0 elsewhere."""
        attention_mask = torch.zeros_like(token_ids)
        for row, length in enumerate(valid_length):
            # int() accepts Python ints, 0-d numpy arrays and 0-d tensors alike.
            attention_mask[row, :int(length)] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return logits of shape ``(batch, num_classes)``."""
        # zeros_like in gen_attention_mask already places the mask on the
        # same device as token_ids, and it is already float.
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids,
                              token_type_ids = segment_ids.long(),
                              attention_mask = attention_mask,
                              return_dict = False)

        # Previously `out` was only bound inside the dropout branch, so the
        # default dr_rate=None raised UnboundLocalError here.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [4]:
# Hyperparameters. Only `max_len` is read in this cell (as the transform's
# sequence length); the remaining entries are not used by any visible cell.
config = dict(
    max_len=100,
    batch_size=64,
    warmup_ratio=0.1,
    num_epochs=3,
    max_grad_norm=1,
    log_interval=200,
    learning_rate=5e-5,
)

# Select the topic whose sub-categories are classified (index 3 -> "culture").
topic_name = label2idx[3]
topic = labels[topic_name]
label_num = len(topic)

tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel = BertModel.from_pretrained("skt/kobert-base-v1")
bertmodel.to(device)

# Build the classifier once, sized to the selected topic. (A second, 4-class
# instance used to be constructed first and was immediately overwritten.)
# NOTE(review): no fine-tuned weights are loaded in the visible cells, so the
# linear head appears to be randomly initialised -- confirm a checkpoint is
# restored before trusting the predictions.
model = BERTClassifier(bertmodel, num_classes=label_num, dr_rate = 0.5).to(device)
transform = BERTSentenceTransform(tokenizer, max_seq_length=config["max_len"], pad=True)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [5]:
def predict(predict_sentence):
    """Classify one sentence and print per-category probabilities.

    Uses the notebook-level ``transform``, ``model``, ``device``, ``topic``
    and ``label_num``. Prints one ``name: xx.xx%`` line per category followed
    by a summary line naming the highest-scoring category. Returns ``None``.
    """
    token_ids, valid_length, segment_ids = transform(predict_sentence)

    # Give both inputs an explicit batch dimension of 1. Converting the numpy
    # arrays directly avoids the slow list-of-ndarrays path (and its
    # UserWarning) of torch.tensor([array]); segment_ids used to be passed
    # un-batched and only worked through broadcasting.
    token_ids = torch.as_tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)
    segment_ids = torch.as_tensor(segment_ids, dtype=torch.long).unsqueeze(0).to(device)

    model.eval()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        out = model(token_ids, [int(valid_length)], segment_ids)

    prob = F.softmax(out, dim=1)[0]
    for i in range(label_num):
        print(f"{topic[i]}: {prob[i] * 100:.2f}%")

    # Single-sentence batch, so only row 0 of the logits exists.
    predicted_topic = topic[int(out[0].argmax())]

    print(">> 입력하신 기사는 " + predicted_topic + " 기사입니다.")

In [50]:
# NOTE(review): `news_art` is not defined in any cell visible in this notebook,
# so this call fails on a fresh kernel unless it is assigned beforehand.
predict(news_art)


health: 6.89%
life: 9.93%
art: 11.54%
book: 6.36%
leisure: 10.40%
others: 7.74%
weather: 5.81%
fashion: 12.63%
home: 11.73%
food: 8.56%
religion: 8.41%
>> 입력하신 기사는 fashion 기사입니다.


In [6]:
# Interactive loop: classify each entered article; entering 0 ends the loop.

while True:
    try:
        sentence = input("분류하고 싶은 기사를 입력해주세요 (종료하려면 0 입력) : ").strip()
    except EOFError:
        # stdin was closed (e.g. non-interactive execution): stop cleanly.
        break
    if sentence == "0" :
        break
    if not sentence:
        # Nothing to classify; ask again instead of scoring an empty string.
        continue
    predict(sentence)
    print("\n")

  token_ids = torch.tensor([token_ids]).to(device)


health: 8.59%
life: 10.99%
art: 7.99%
book: 10.93%
leisure: 7.89%
others: 9.20%
weather: 4.50%
fashion: 8.05%
home: 18.66%
food: 5.49%
religion: 7.69%
>> 입력하신 기사는 home 기사입니다.




In [16]:
# Scratch check: torch.cat joins two 1-D tensors end to end along dim 0.
a = torch.tensor([1, 2, 3, 4])
b = torch.tensor([1, 2, 3, 4])
torch.cat([a, b], dim=0)



tensor([1, 2, 3, 4, 1, 2, 3, 4])

In [17]:
from transformers import BertForSequenceClassification

# Hub id of the checkpoint to load.
model_path = "uine/1kobert-article-economic-classifier"

# NOTE(review): this rebinds `model`, which `predict` above calls with
# (token_ids, valid_length, segment_ids); running `predict` after this cell
# will therefore hit a different model type.
# ignore_mismatched_sizes=True lets a 9-label head be created even when the
# checkpoint's head has a different shape; such a head is newly initialised.
model = BertForSequenceClassification.from_pretrained(
    model_path,
    num_labels=9,
    ignore_mismatched_sizes=True,
)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at uine/1kobert-article-economic-classifier and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([9]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([9, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Display the module tree of the loaded model (its repr is the cell output).
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [32]:
# Assigning `num_embeddings` only changes what the repr prints; the weight
# matrix keeps its original row count. resize_token_embeddings rebuilds the
# embedding with 8002 rows and updates model.config.vocab_size to match.
# NOTE(review): resizing keeps the first 8002 rows of the old matrix --
# confirm these correspond to the intended vocabulary.
model.resize_token_embeddings(8002)

# Set after resizing because the embedding module may have been replaced.
model.bert.embeddings.word_embeddings.padding_idx = 1
model.config.pad_token_id = 1

model.dropout.p = 0.5
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [25]:
# Leftover introspection: lists every attribute name of `model`, which
# produces a very long output. Consider removing before sharing.
dir(model)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_assisted_decoding',
 '_auto_class',
 '_autoset_attn_implementation',
 '_backward_compatibility_gradient_checkpointing',
 '_backward_hooks',
 '_backward_pre_hooks',
 '_beam_sample',
 '_beam_search',
 '_buffers',
 '_call_impl',
 '_check_and_enable_flash_attn_2',
 '_check_and_enable_sdpa',
 '_compiled_call_impl',
 '_constrained_beam_search',
 '_contrastive_search',
 '_convert_head_mask_to_5d',
 '_copy_lm_head_original_to_resized',
 '_create_repo',
 '_dispatch_accelerate_model',
 '_expand_inputs_for_generation',
 