In [1]:
# SKT Brain github 주소는 다음과 같습니다. https://github.com/SKTBrain/KoBERT

# !pip install mxnet
# !pip install gluonnlp pandas tqdm
# !pip install sentencepiece
# !pip install transformers==3 # 최신 버전으로 설치하면 "Input: must be Tensor, not str" 라는 에러 발생
# !pip install torch

# !pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

# kobert 
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

# transformers
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

import pandas as pd

In [2]:
# GPU 사용 시
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Bert모델, Voca 불러오기
bertmodel, vocab = get_pytorch_kobert_model()

# cuda device 확인
torch.cuda. empty_cache()

#GPU 디바이스의 갯수
print ('Available devices ', torch.cuda.device_count())
#현재 셋업 되어있는 GPU 넘버
print ('Current cuda device ', torch.cuda.current_device())
#GPU 디바이스의 이름
print(torch.cuda.get_device_name(device))

using cached model
using cached model
Available devices  1
Current cuda device  0
Tesla P100-PCIE-16GB


In [3]:
# path 설정
PATH = '../korean/'

In [4]:
# csv 파일 받아와서 샘플 10개 뽑아보기
new_data = pd.read_csv(PATH + 'sdata11.csv')
new_data.sample(n=10)

Unnamed: 0,Title_Article,sentiment
218986,기준치 초과 어린이 가죽 지갑 제품 리콜납 기준 초과 어린이 가죽 지갑 안전 기준 ...,0
340360,롯데건설 층간 소음 기술 개발 공동 연구 협약 추진 경량 중량 충격음 모두 완충재 ...,1
164392,올댓차이 중국 증시 혼조 마감 서울 문예 중국 증시 혼조 마감 이날 상하 종합 지수...,0
256774,화두 증세 인플레 빅테크 실적 주목 월가 시각머니 뉴욕 임동욱 특파원 사람 법인세 ...,1
89345,보드 업계 법규제 완화 이상 서비스내달 이용가 법개정 주요 업체 자발 연령 제한 최...,0
88295,한전 올해 공정 거래 자율 준수 프로그램 운영한국전력 지속 자율 공정 거래 준수 체...,1
223347,육박 추월 나랏빚 회계 연도 국가 결산 국민 만원 나라살림 관리 재정 수지 적자 역...,0
66230,리테일 공정위 장기 점포 상생 협약 가맹점 파트너십 강화 김아 리테일 서울시 중구 ...,1
340838,현장 연결 부동산 투기 예방 적발 일벌 환수 대책 마련 정부 조금 홍남기 경제 부총...,0
12386,수출 이제 디지털 대전환 코트라 빅데이터 이코리아 간담회코 지나 서울 삼성동 디지털...,1


In [5]:
# data list 만들어 sentiment 숫자값에서 str으로 변환시켜주기(모델 들어가기위해)

data_list = []
for q, label in zip(new_data['Title_Article'], new_data['sentiment'])  :
    data = []
    data.append(q)
    data.append(str(label))

    data_list.append(data)
    
# 만든 data list 형변환 확인
type(data_list[0][1])

str

In [6]:
# Train / Test set 분리
from sklearn.model_selection import train_test_split
dataset_train, dataset_test = train_test_split(data_list, test_size=0.2, random_state=42)

# data 사이즈 확인
print("train shape is:", len(dataset_train))
print("test shape is:", len(dataset_test))
print("total shape is:", len(dataset_train)+len(dataset_test))

train shape is: 311592
test shape is: 77899
total shape is: 389491


In [7]:
# kobert 입력 데이터로 만들기

class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair) 

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

In [8]:
# Setting parameters 파라미터 세팅
max_len = 64 # 해당 길이를 초과하는 단어에 대해선 bert가 학습하지 않음
batch_size = 64
warmup_ratio = 0.1
num_epochs = 9 # 훈련 반복횟수
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-5

In [9]:
# 토큰화
# 기본 Bert tokenizer 사용
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model


In [10]:
# data 패딩화

data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

In [11]:
# 패딩화 잘 되었는지 확인

data_train[0]

#첫 번째는 패딩된 시퀀스
#두 번째는 길이와 타입에 대한 내용
#세 번재는 어텐션 마스크 시퀀스

(array([   2,  517, 6147, 3758, 6510, 7489, 7947, 4152, 4768, 2785, 5678,
        2155, 4768,  517, 6591, 6553, 7209, 6879,  517, 6591, 7533, 4768,
        3533, 5550, 1633, 6280, 2650, 6951, 6896, 6664, 6837, 1370, 5725,
        6432, 2169, 7619, 1267, 4768, 6874, 3468, 6198, 4152, 2337, 2734,
        7234, 2238, 2822, 6406, 1132, 6516, 1211, 5753, 5954, 2195, 2442,
        1283, 3902, 2991,  517, 7922, 6812,  517, 6147,    3], dtype=int32),
 array(64, dtype=int32),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       dtype=int32),
 1)

In [12]:
# torch 형식의 dataset 만들어주기 (pytorch용 DataLoader 사용)

train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

In [13]:
# Kobert 학습모델 만들기

class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 2, # softmax 사용 <- binary일 경우는 2
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)
      

In [14]:
# Bert 모델 불러오기

model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

# optimizer설정 하고 schedule 설정(linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

# 옵티마이저 선언
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss() # softmax용 Loss Function 정하기 <- binary classification도 해당 loss function 사용 가능
#loss_fn = nn.BCEWithLogitsLoss()
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)


In [15]:
# 학습 평가 지표인 accuracy 계산 -> 얼마나 타겟값을 많이 맞추었는가
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc
  
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7fe664740898>

In [16]:
# 모델 학습 시작
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) # gradient clipping
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length 
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
    

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/4869 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 0.7361290454864502 train acc 0.484375
epoch 1 batch id 201 loss 0.6084112524986267 train acc 0.5689521144278606
epoch 1 batch id 401 loss 0.4131428897380829 train acc 0.6837593516209476
epoch 1 batch id 601 loss 0.4295541048049927 train acc 0.7376767886855241
epoch 1 batch id 801 loss 0.2976975739002228 train acc 0.7680633583021224
epoch 1 batch id 1001 loss 0.41340136528015137 train acc 0.7902878371628371
epoch 1 batch id 1201 loss 0.2814720869064331 train acc 0.80564373438801
epoch 1 batch id 1401 loss 0.30909910798072815 train acc 0.817396056388294
epoch 1 batch id 1601 loss 0.16753767430782318 train acc 0.8270612117426608
epoch 1 batch id 1801 loss 0.16925112903118134 train acc 0.8352477790116601
epoch 1 batch id 2001 loss 0.14112988114356995 train acc 0.8417275737131434
epoch 1 batch id 2201 loss 0.3019237816333771 train acc 0.847320820081781
epoch 1 batch id 2401 loss 0.21028688549995422 train acc 0.8514746459808413
epoch 1 batch id 2601 loss 0.10534945130

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/1218 [00:00<?, ?it/s]

epoch 1 test acc 0.9197349884311092


  0%|          | 0/4869 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.3768395185470581 train acc 0.84375
epoch 2 batch id 201 loss 0.33471664786338806 train acc 0.9137904228855721
epoch 2 batch id 401 loss 0.3154512047767639 train acc 0.9121726932668329
epoch 2 batch id 601 loss 0.2415381520986557 train acc 0.913321547420965
epoch 2 batch id 801 loss 0.25615227222442627 train acc 0.9137211298377028
epoch 2 batch id 1001 loss 0.19419129192829132 train acc 0.9149912587412588
epoch 2 batch id 1201 loss 0.22168266773223877 train acc 0.9155651540383014
epoch 2 batch id 1401 loss 0.18514472246170044 train acc 0.9157521413276232
epoch 2 batch id 1601 loss 0.10047618299722672 train acc 0.9160680824484697
epoch 2 batch id 1801 loss 0.1520732194185257 train acc 0.9166435313714603
epoch 2 batch id 2001 loss 0.11278922855854034 train acc 0.9171273738130935
epoch 2 batch id 2201 loss 0.17684194445610046 train acc 0.9179279304861426
epoch 2 batch id 2401 loss 0.2270849496126175 train acc 0.9183413161182841
epoch 2 batch id 2601 loss 0.0871805

  0%|          | 0/1218 [00:00<?, ?it/s]

epoch 2 test acc 0.9269188964770863


  0%|          | 0/4869 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.32480111718177795 train acc 0.90625
epoch 3 batch id 201 loss 0.22982077300548553 train acc 0.9402207711442786
epoch 3 batch id 401 loss 0.2583850920200348 train acc 0.9399158354114713
epoch 3 batch id 601 loss 0.18454599380493164 train acc 0.9410357737104825
epoch 3 batch id 801 loss 0.16833855211734772 train acc 0.9416549625468165
epoch 3 batch id 1001 loss 0.11668340116739273 train acc 0.9422296453546454
epoch 3 batch id 1201 loss 0.09007201343774796 train acc 0.9429121565362198
epoch 3 batch id 1401 loss 0.1510545015335083 train acc 0.9432994289793005
epoch 3 batch id 1601 loss 0.11801674216985703 train acc 0.9435899437851343
epoch 3 batch id 1801 loss 0.15027806162834167 train acc 0.9440762076624097
epoch 3 batch id 2001 loss 0.06770829856395721 train acc 0.9446995252373813
epoch 3 batch id 2201 loss 0.10404449701309204 train acc 0.945280554293503
epoch 3 batch id 2401 loss 0.08755512535572052 train acc 0.9457517700957934
epoch 3 batch id 2601 loss 0.0477

  0%|          | 0/1218 [00:00<?, ?it/s]

epoch 3 test acc 0.9262389908941634


  0%|          | 0/4869 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.18207429349422455 train acc 0.953125
epoch 4 batch id 201 loss 0.1955815702676773 train acc 0.9598880597014925
epoch 4 batch id 401 loss 0.15775258839130402 train acc 0.9610738778054863
epoch 4 batch id 601 loss 0.02484392374753952 train acc 0.9611584858569051
epoch 4 batch id 801 loss 0.15698514878749847 train acc 0.9620396379525593
epoch 4 batch id 1001 loss 0.022893249988555908 train acc 0.9629120879120879
epoch 4 batch id 1201 loss 0.016248833388090134 train acc 0.9637021232306411
epoch 4 batch id 1401 loss 0.09859801083803177 train acc 0.964143915060671
epoch 4 batch id 1601 loss 0.024733737111091614 train acc 0.9644167707682698
epoch 4 batch id 1801 loss 0.05640704184770584 train acc 0.9650801637978901
epoch 4 batch id 2001 loss 0.04003564268350601 train acc 0.965392303848076
epoch 4 batch id 2201 loss 0.060480229556560516 train acc 0.9655980236256247
epoch 4 batch id 2401 loss 0.0348355658352375 train acc 0.9658085172844648
epoch 4 batch id 2601 loss 0.

  0%|          | 0/1218 [00:00<?, ?it/s]

epoch 4 test acc 0.9320630877742947


  0%|          | 0/4869 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.11292776465415955 train acc 0.984375
epoch 5 batch id 201 loss 0.032543398439884186 train acc 0.9770677860696517
epoch 5 batch id 401 loss 0.10513468831777573 train acc 0.9757637157107232
epoch 5 batch id 601 loss 0.009048526175320148 train acc 0.9762635191347754
epoch 5 batch id 801 loss 0.12533579766750336 train acc 0.9769233770287141
epoch 5 batch id 1001 loss 0.038896266371011734 train acc 0.977022977022977
epoch 5 batch id 1201 loss 0.059986911714076996 train acc 0.9774146544546212
epoch 5 batch id 1401 loss 0.05404357239603996 train acc 0.9776498929336188
epoch 5 batch id 1601 loss 0.009149052202701569 train acc 0.9779141942535915
epoch 5 batch id 1801 loss 0.03483099490404129 train acc 0.9782239033870073
epoch 5 batch id 2001 loss 0.07167820632457733 train acc 0.978604447776112
epoch 5 batch id 2201 loss 0.01011965423822403 train acc 0.978851942298955
epoch 5 batch id 2401 loss 0.03113763965666294 train acc 0.9790647126197418
epoch 5 batch id 2601 loss 

  0%|          | 0/1218 [00:00<?, ?it/s]

epoch 5 test acc 0.9328712774294672


  0%|          | 0/4869 [00:00<?, ?it/s]

epoch 6 batch id 1 loss 0.1611035019159317 train acc 0.953125
epoch 6 batch id 201 loss 0.004268002696335316 train acc 0.9847636815920398
epoch 6 batch id 401 loss 0.033363405615091324 train acc 0.9849205112219451
epoch 6 batch id 601 loss 0.05429176986217499 train acc 0.985648918469218
epoch 6 batch id 801 loss 0.007900716736912727 train acc 0.9857599875156055
epoch 6 batch id 1001 loss 0.00561200175434351 train acc 0.9857174075924076
epoch 6 batch id 1201 loss 0.010710650123655796 train acc 0.9856890091590341
epoch 6 batch id 1401 loss 0.022207247093319893 train acc 0.9860256067094932
epoch 6 batch id 1601 loss 0.002719887299463153 train acc 0.9864147407870081
epoch 6 batch id 1801 loss 0.0923747569322586 train acc 0.9866480427540255
epoch 6 batch id 2001 loss 0.12859444320201874 train acc 0.9867410044977512
epoch 6 batch id 2201 loss 0.0016298091504722834 train acc 0.9869022603362109
epoch 6 batch id 2401 loss 0.06406717747449875 train acc 0.9870691899208663
epoch 6 batch id 2601 lo

  0%|          | 0/1218 [00:00<?, ?it/s]

epoch 6 test acc 0.9370405097775788


  0%|          | 0/4869 [00:00<?, ?it/s]

epoch 7 batch id 1 loss 0.16176316142082214 train acc 0.96875
epoch 7 batch id 201 loss 0.041600652039051056 train acc 0.9908271144278606
epoch 7 batch id 401 loss 0.05550410971045494 train acc 0.9909600997506235
epoch 7 batch id 601 loss 0.03635244071483612 train acc 0.9911605657237936
epoch 7 batch id 801 loss 0.028923071920871735 train acc 0.9913779650436954
epoch 7 batch id 1001 loss 0.0026145349256694317 train acc 0.9914928821178821
epoch 7 batch id 1201 loss 0.004382371436804533 train acc 0.991790695253955
epoch 7 batch id 1401 loss 0.0021361028775572777 train acc 0.9919811741613134
epoch 7 batch id 1601 loss 0.0018516301643103361 train acc 0.9921143035602749
epoch 7 batch id 1801 loss 0.0013931415742263198 train acc 0.9924000555247084
epoch 7 batch id 2001 loss 0.0016633388586342335 train acc 0.9924100449775113
epoch 7 batch id 2201 loss 0.0013968677958473563 train acc 0.9924253180372558
epoch 7 batch id 2401 loss 0.0003874879621434957 train acc 0.9925486776343191
epoch 7 batch 

  0%|          | 0/1218 [00:00<?, ?it/s]

epoch 7 test acc 0.9378230426183013


  0%|          | 0/4869 [00:00<?, ?it/s]

epoch 8 batch id 1 loss 0.15451738238334656 train acc 0.96875
epoch 8 batch id 201 loss 0.001598687726072967 train acc 0.9951803482587065
epoch 8 batch id 401 loss 0.011998903006315231 train acc 0.9951683291770573
epoch 8 batch id 601 loss 0.001303145312704146 train acc 0.9953982945091514
epoch 8 batch id 801 loss 0.0008703926578164101 train acc 0.9956694756554307
epoch 8 batch id 1001 loss 0.0006033435929566622 train acc 0.9957854645354646
epoch 8 batch id 1201 loss 0.0004989694571122527 train acc 0.9960579725228976
epoch 8 batch id 1401 loss 0.00044005116797052324 train acc 0.9959403997144897
epoch 8 batch id 1601 loss 0.00145623367279768 train acc 0.9959888351030606
epoch 8 batch id 1801 loss 0.0006515547283925116 train acc 0.9960351887840089
epoch 8 batch id 2001 loss 0.0006973931449465454 train acc 0.9960957021489255
epoch 8 batch id 2201 loss 0.04360019415616989 train acc 0.9961665152203544
epoch 8 batch id 2401 loss 0.0003713849000632763 train acc 0.996225531028738
epoch 8 batch

  0%|          | 0/1218 [00:00<?, ?it/s]

epoch 8 test acc 0.9405939785788925


  0%|          | 0/4869 [00:00<?, ?it/s]

epoch 9 batch id 1 loss 0.1834559589624405 train acc 0.96875
epoch 9 batch id 201 loss 0.0005235691787675023 train acc 0.9971237562189055
epoch 9 batch id 401 loss 0.10129231959581375 train acc 0.99696072319202
epoch 9 batch id 601 loss 0.0006098419544287026 train acc 0.9972181780366056
epoch 9 batch id 801 loss 0.0005149913486093283 train acc 0.997269038701623
epoch 9 batch id 1001 loss 0.0008067622547969222 train acc 0.9973464035964036
epoch 9 batch id 1201 loss 0.0018036914989352226 train acc 0.997463051623647
epoch 9 batch id 1401 loss 0.000481476541608572 train acc 0.9974906316916489
epoch 9 batch id 1601 loss 0.00034411842352710664 train acc 0.9976284353529045
epoch 9 batch id 1801 loss 0.00046695146011188626 train acc 0.9977269572459745
epoch 9 batch id 2001 loss 0.00048422414693050086 train acc 0.9977901674162919
epoch 9 batch id 2201 loss 0.00036519451532512903 train acc 0.9977850976828714
epoch 9 batch id 2401 loss 0.00035326540819369256 train acc 0.99777436484798
epoch 9 bat

  0%|          | 0/1218 [00:00<?, ?it/s]

epoch 9 test acc 0.9406068069861174


In [17]:
# 저장하기
torch.save(model, PATH + 'model11.pt')  # 전체 모델 저장
torch.save(model.state_dict(), PATH + 'model_state_dict11.pt')  # 모델 객체의 state_dict 저장
# torch.save({
#     'model': model.state_dict(),
#     'optimizer': optimizer.state_dict()
#     'loss_fn' : loss_fn.state_dict()
#     't_total' : t_total.state_dict()
#     'warmup_step' : warmup_step.state_dict()
#     'scheduler' : scheduler.state_dict()
    
# }, PATH + 'ver6.tar')  # 여러 가지 값 저장, 학습 중 진행 상황 저장을 위해 epoch, loss 값 등 일반 scalar값 저장 가능



In [18]:
# 감성값 예측하는 함수 만들기

import pandas as pd

# 위에서 설정한 tok, max_len, batch_size, device를 그대로 입력
# comment : 예측하고자 하는 텍스트 데이터 리스트
def getSentimentValue(comment, tok, max_len, batch_size, device):
    commnetslist = [] # 텍스트 데이터를 담을 리스트
    emo_list = [] # 감성 값을 담을 리스트
    for c in comment: # 모든 댓글
        commnetslist.append( [c, 5] ) # [댓글, 임의의 양의 정수값] 설정

    pdData = pd.DataFrame( commnetslist, columns = [['뉴스', '감성']] )
    pdData = pdData.values
    test_set = BERTDataset(pdData, 0, 1, tok, max_len, True, False) 
    test_input = torch.utils.data.DataLoader(test_set, batch_size=batch_size, num_workers=5)

    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_input):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length 
        # 이때, out이 예측 결과 리스트
        
        out = model(token_ids, valid_length, segment_ids)
        print(out)
        # e는 2가지 실수 값으로 구성된 리스트
        # 0번 인덱스가 더 크면 부정, 긍정은 반대
        for e in out:
            if e[0]>e[1]: # 부정
                value = 0
                emo_list.append("부정")
                print('부정')
            else: #긍정
                value = 1
                emo_list.append("긍정")
                print('긍정')
                

    return emo_list # 텍스트 데이터에 1대1 매칭되는 감성값 리스트 반환

# input : 텍스트 데이터 리스트 외 KoBERT 설정 파라미터들
# output : 입력한 텍스트 데이터 리스트와 1대 1 매칭 되는 감성 값 리스트

In [19]:
# 뉴스기사 테스트 함수
def news():
    
    comment = []
    comment.append(input("원하는 기사를 입력하세요\n\n"))

    for c in comment:
        print(f'\n기사 : {c}\n')
        
    return getSentimentValue(comment, tok, max_len, batch_size, device)

In [20]:
news()

원하는 기사를 입력하세요

DD

기사 : DD

tensor([[ 4.4583, -3.8848]], device='cuda:0', grad_fn=<AddmmBackward>)
부정


['부정']

In [21]:
model.eval() # 평가 모드로 변경
    
for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    valid_length= valid_length
    label = label.long().to(device)
    out = model(token_ids, valid_length, segment_ids)
    test_acc += calc_accuracy(out, label)
print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/1218 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
model = torch.load(PATH + 'model.pt')  # 전체 모델을 통째로 불러옴, 클래스 선언 필수
model.load_state_dict(torch.load(PATH + 'model_state_dict.pt'))  # state_dict를 불러 온 후, 모델에 저장

checkpoint = torch.load(PATH + 'all.tar')   # dict 불러오기
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])

In [None]:
# 학습용 호출
model.train()

# 사용용 호출
model.eval()