# 사용 라이브러리 설치 및 import

## pytorch tpu를 사용하기 위한 라이브러리 세팅
- PyTorch 에서 .to(device) 문법을 통해 텐서 변수들과 모델들을 GPU or TPU 같은 device에 올릴 수 있다.
- TPU에 올리기 위해서는 torch_xla에서 제공하는 xm.xla_device()를 통해 PyTorch와 호환되는 device를 지정할 수 있다.

In [None]:
# Install PyTorch/XLA: the Cloud TPU client plus the torch_xla 1.9 wheel built for CPython 3.7.
!pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl

- torch_xla 설치하고 import하면 error가 등장한다.
- 이를 해결하기 위해서는 pytorch version을 downgrade 해주면 된다.(torch-1.8.2+cpu)

In [None]:
# Remove the installed torch, then install the 1.8.2 LTS CPU build with the matching torchvision.
!pip uninstall -y torch
!pip install torch==1.8.2+cpu torchvision==0.9.2+cpu -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html

- version 호환성 맞춰주기 위해 torch-xla-1.8 다시 설치

In [None]:
# Install the torch_xla 1.8 wheel so that its version lines up with torch 1.8.2.
!pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8-cp37-cp37m-linux_x86_64.whl

In [3]:
import torch_xla
import torch_xla.core.xla_model as xm

## KoBERT 사용하기 위한 라이브러리 세팅

In [None]:
# Install the required libraries (transformers is pinned to 3.0.2).
!pip install mxnet
!pip install gluonnlp
!pip install transformers==3.0.2

In [None]:
# Install KoBERT from the master branch of the SKTBrain GitHub repository.
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

In [8]:
# KoBERT 사용 위한 라이브러리
import os
import sys
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

# kobert
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

# transformers
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

# 구글 드라이브, 코랩, GCP, TPU 환경 세팅

In [14]:
# Google Drive <-> Colab integration: install google-drive-ocamlfuse from its PPA.
# Note on the redirections: `2>&1 > /dev/null` points stderr at the original
# stdout first and only then sends stdout to /dev/null, so regular output is
# discarded while error messages still show up in the cell output.
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse

Selecting previously unselected package google-drive-ocamlfuse.
(Reading database ... 155335 files and directories currently installed.)
Preparing to unpack .../google-drive-ocamlfuse_0.7.27-0ubuntu1~ubuntu18.04.1_amd64.deb ...
Unpacking google-drive-ocamlfuse (0.7.27-0ubuntu1~ubuntu18.04.1) ...
Setting up google-drive-ocamlfuse (0.7.27-0ubuntu1~ubuntu18.04.1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


In [15]:
# Google Drive <-> Colab integration: authenticate and authorise google-drive-ocamlfuse.
from google.colab import auth
auth.authenticate_user()

# Application-default credentials provide the client id/secret handed to ocamlfuse below.
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()

import getpass
# First run (stdin closed): only the line containing the authorisation URL is printed.
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Read the verification code without echoing it.
vcode = getpass.getpass()

# Second run: feed the verification code to ocamlfuse on stdin.
# NOTE(review): the client secret and the code are interpolated into shell command lines — confirm this is acceptable here.
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force
··········
Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force
Please enter the verification code: Access token retrieved correctly.


In [16]:
# Create the Gdrive folder and attach Google Drive to it.
!mkdir -p Gdrive

# `-o nonempty` is passed so the mount is attempted even if the folder already has content.
!google-drive-ocamlfuse Gdrive -o nonempty

# List the working directory to check that Gdrive is there.
!ls

adc.json  Gdrive  sample_data


# 데이터 가공 및 준비

In [30]:
import pandas as pd

# Excel workbook with the Q&A samples, located under the Gdrive folder in /content.
FILE_PATH = '/content/Gdrive/ONe/db/Q&A_개발활용가능_2020.10_sample.xlsx'
# Load the workbook into a DataFrame (pandas' default sheet selection is used).
df = pd.read_excel(FILE_PATH)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1068 entries, 0 to 1067
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   번호           1 non-null      float64
 1   종류           1068 non-null   object 
 2   대분류          1068 non-null   object 
 3   중분류          1068 non-null   object 
 4   소분류          578 non-null    object 
 5   Question     1068 non-null   object 
 6   Answer       1065 non-null   object 
 7   Hashtag 2    1061 non-null   object 
 8   Unnamed: 8   4 non-null      object 
 9   건강/의료        1 non-null      float64
 10  환경/생활        1 non-null      float64
 11  교육/심리        1 non-null      float64
 12  단순문의         1 non-null      float64
 13  반려상품         1 non-null      float64
 14  법률           1 non-null      float64
 15  입양/장례        1 non-null      float64
 16  Unnamed: 16  0 non-null      float64
 17  총합           1 non-null      float64
dtypes: float64(10), object(8)
memory usage: 150.3+ K

In [None]:
df.head()

In [None]:
df.tail()

In [31]:
# Rename the columns (species column).
# NOTE(review): this list is the same as the column names df.info() reported
# for the freshly loaded frame, so the assignment appears to leave the names
# unchanged — confirm whether a rename was actually intended.
re_columns = ['번호', '종류', '대분류', '중분류', '소분류', 'Question',
       'Answer', 'Hashtag 2', 'Unnamed: 8', '건강/의료', '환경/생활', '교육/심리',
       '단순문의', '반려상품', '법률', '입양/장례', 'Unnamed: 16', '총합']
df.columns = re_columns

# Show the first two rows to check the result.
df.head(2)

Unnamed: 0,번호,종류,대분류,중분류,소분류,Question,Answer,Hashtag 2,Unnamed: 8,건강/의료,환경/생활,교육/심리,단순문의,반려상품,법률,입양/장례,Unnamed: 16,총합
0,1.0,개,건강/의료,기타,,감기 걸린 강아지도 산책을 시켜야 하나요?,강아지에게 산책은 중요한 부분이지만 컨디션이 저조하거나 감기에 걸린 채로 나가는 건...,"#강아지, #감기, #산책, #놀이",,319.0,265.0,317.0,73.0,21.0,3.0,70.0,,1068.0
1,,개,건강/의료,기타,,암컷 강아지의 생리와 혈뇨는 어떻게 구분하나요?,생리도 혈뇨도 피가 나오기 때문에 혼동이 올 수 있어요. 하지만 두 증상에 확실한 ...,"#강아지, #생리, #혈뇨, #출혈, #차이점, #자궁축농증, #배뇨",,,,,,,,,,


In [None]:
# Parse the per-row classification columns into a category tree.
#
# category_dict nests species -> major -> middle -> minor category and stores
# an integer label at the leaf. category_list maps a label back to a
# "species:major:middle:minor" string; its entry 0 is a placeholder so that
# real labels start at 1.

category_dict = {}
category_list = [0]
idx = 1

for row_idx, row in df.iterrows():
    species = row['종류']
    # Only dog ('개') and cat ('고양이') rows contribute categories.
    if species not in ('개', '고양이'):
        continue

    major, middle, minor = row['대분류'], row['중분류'], row['소분류']

    # Walk down the tree, creating the intermediate levels on first sight.
    leaf = (
        category_dict
        .setdefault(species, {})
        .setdefault(major, {})
        .setdefault(middle, {})
    )
    if minor in leaf:
        continue

    # First occurrence of this full path: hand out the next label.
    leaf[minor] = idx
    category_list.append(':'.join(str(part) for part in (species, major, middle, minor)))
    idx += 1

print(category_dict)
print(category_list)
print(len(category_list))

In [33]:
# Number of classifier outputs: one per category_list entry, which includes
# the placeholder stored at index 0.
NUM_LABELS = len(category_list)
# NUM_LABELS = 76

In [35]:
import re

# Regex patterns used to normalise the text before tokenisation. They are raw
# strings so that backslash sequences such as \w and \s reach the regex engine
# unchanged instead of being parsed as (invalid) string escape sequences,
# which Python reports with a warning.
PATTERN = r'[^\w\s]'             # any character that is neither a word character nor whitespace
PATTERN_MULTI_SPACE = r' +'      # a run of one or more spaces
PATTERN_LINE_BREAKER = r'\n|\r'  # a line feed or a carriage return
REPL = " "                       # replacement text used with every pattern above

def isNaN(string):
    """Report whether ``string`` compares unequal to itself.

    A float NaN is never equal to itself, so the self-comparison is true for
    NaN and false for ordinary strings and numbers.
    """
    differs_from_itself = string != string
    return differs_from_itself

# Column-wise accumulator for the training samples; every key starts with its
# own empty list.
train_raw = {column: [] for column in ('label', 'alpha', 'text')}

# Column-wise accumulator for the rows that could not be turned into a sample.
continued_value = {
    column: [] for column in ('종류', '대분류', '중분류', '소분류', 'Question')
}

# Question: clean every question text and pair it with its category label.
for i, row in df.iterrows():
    # Use a local value instead of writing into the iterated row; a NaN
    # question is treated as a single space.
    question = REPL if isNaN(row['Question']) else row['Question']

    # Rows that are neither dog ('개') nor cat ('고양이') are skipped, unless
    # their question is '' or 'None'.
    # NOTE(review): such rows pass this filter and then fail the category
    # lookup below — confirm whether the condition was meant to be stricter.
    if row['종류'] != '개' and row['종류'] != '고양이' and question != '' and question != 'None':
        continue

    # Replace punctuation and line breaks with spaces, then collapse runs of spaces.
    qu = re.sub(pattern=PATTERN, repl=REPL, string=str(question))
    qu = re.sub(pattern=PATTERN_LINE_BREAKER, repl=REPL, string=qu)
    qu = re.sub(pattern=PATTERN_MULTI_SPACE, repl=REPL, string=qu)
    if qu != REPL:
        qu = qu.strip()

    # Only the category lookup is expected to fail, so only KeyError is
    # caught; anything else is a real bug and should surface.
    try:
        category_label = category_dict[row['종류']][row['대분류']][row['중분류']][row['소분류']]
    except KeyError:
        # Keep the row aside so it can be inspected later.
        continued_value['종류'].append(row['종류'])
        continued_value['대분류'].append(row['대분류'])
        continued_value['중분류'].append(row['중분류'])
        continued_value['소분류'].append(row['소분류'])
        continued_value['Question'].append(question)
        continue

    train_raw['label'].append(category_label)
    train_raw['alpha'].append('a')
    train_raw['text'].append(qu)

# Answer
# for i, row in df.iterrows():
#     if isNaN(row['Answer']): 
#         row['Answer'] = REPL
#     if row['종류'] != '개' and row['종류'] != '고양이' and row['Answer'] != '' and row['Answer'] != 'None':
#         continue
#     else :
#         try:
#             qu = re.sub(pattern=PATTERN, repl=REPL, string=str(row['Answer']))
#             qu = re.sub(pattern=PATTERN_LINE_BREAKER, repl=REPL, string=qu)
#             qu = re.sub(pattern=PATTERN_MULTI_SPACE, repl=REPL, string=qu)
#             if(qu != REPL) : qu = qu.strip()
#             qu = qu[:256] if len(qu) > 256 else qu 
#             train_raw['label'].append(category_dict[row['종류']][row['대분류']][row['중분류']][row['소분류']])
#             train_raw['alpha'].append('a')
#             train_raw['text'].append(qu)
#         except:
#             continued_value['종류'].append(row['종류'])
#             continued_value['대분류'].append(row['대분류'])
#             continued_value['중분류'].append(row['중분류'])
#             continued_value['소분류'].append(row['소분류'])
#             continued_value['Question'].append(row['Question'])

# Turn the collected samples into a DataFrame and expose the positional index
# as an 'id' column (reset_index adds an 'index' column, which is renamed).
train = pd.DataFrame(train_raw).reset_index().rename(columns={'index': 'id'})

# Rows that raised an error while the samples were being collected.
error_values = pd.DataFrame(continued_value)

In [36]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1068 entries, 0 to 1067
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      1068 non-null   int64 
 1   label   1068 non-null   int64 
 2   alpha   1068 non-null   object
 3   text    1068 non-null   object
dtypes: int64(2), object(2)
memory usage: 33.5+ KB


In [38]:
train

Unnamed: 0,id,label,alpha,text
0,0,1,a,감기 걸린 강아지도 산책을 시켜야 하나요
1,1,1,a,암컷 강아지의 생리와 혈뇨는 어떻게 구분하나요
2,2,1,a,강아지도 생리를 하나요
3,3,1,a,강아지가 커피를 먹어도 되나요
4,4,1,a,강아지는 방구를 참지 못하나요
...,...,...,...,...
1063,1063,74,a,고양이 발정기 증상이 뭐가 있나요
1064,1064,74,a,고양이가 계속 우는데 발정기가 온건가요
1065,1065,74,a,고양이 비만인가요 비만인데 어떻게 관리하면 될까요
1066,1066,74,a,고양이 비만인것 같은데 어떻게 관리하면 될까요


In [39]:
# Build the dataset in the shape the PyTorch pipeline expects: one
# [text, label] pair per sample, with the label stored as a string.
data_list = [
    [text, str(label)]
    for text, label in zip(train['text'], train['label'])
]

In [41]:
# Check the data: total sample count plus a few individual samples.
print(len(data_list))
print(data_list[0])
print(data_list[600])
print(data_list[100])
print(data_list[-1])

1068
['감기 걸린 강아지도 산책을 시켜야 하나요', '1']
['강아지를 키울 때 주의해야 하는 건 어떤 게 있나요', '39']
['새끼강아지 유치는 언제 빠질까요', '10']
['고양이를 키울 때 주의해야 하는 건 어떤 게 있나요', '75']


In [42]:
# Split the data: 10% is held out as the test set, with a fixed random_state
# so the split is reproducible.
from sklearn.model_selection import train_test_split
dataset_train, dataset_test = train_test_split(data_list, test_size=0.1, random_state=0)

# Sizes of the two splits.
print(len(dataset_train))
print(len(dataset_test))

961
107


# KoBERT 학습 모델

In [49]:
# Dataset construction class
class BERTDataset(Dataset):
    """Dataset of BERT-transformed sentences paired with integer labels.

    Every record of ``dataset`` is indexed with ``text_idx`` for the sentence
    and ``label_idx`` for the label. Sentences go through gluonnlp's
    ``BERTSentenceTransform``; labels are converted with ``np.int32``.
    """

    def __init__(self, dataset, text_idx, label_idx, bert_tokenizer, max_len, pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer,
            max_seq_length=max_len,
            pad=pad,
            pair=pair,
        )
        # The transform takes a list of sentences, hence the one-element list.
        self.sentences = [to_bert_input([record[text_idx]]) for record in dataset]
        self.labels = [np.int32(record[label_idx]) for record in dataset]

    def __getitem__(self, i):
        """Return the transformed sentence tuple with the label appended."""
        features = self.sentences[i]
        target = (self.labels[i],)
        return features + target

    def __len__(self):
        """Return the number of samples."""
        sample_count = len(self.labels)
        return sample_count

In [48]:
# Parameter settings
max_len = 64          # max_seq_length handed to the BERT sentence transform
batch_size = 16       # batch size of the data loaders
warmup_ratio = 0.1    # share of the total training steps used as scheduler warm-up
num_epochs = 10       # number of passes over the training loader
max_grad_norm = 1     # max norm used for gradient clipping
log_interval = 200    # print training progress every this many batches
learning_rate = 2e-5  # learning rate given to AdamW

In [45]:
# Tokenizer
# BERTSPTokenizer needs the KoBERT vocabulary. `vocab` is otherwise only
# defined by the model-loading cell further down, so running the notebook
# top to bottom raised NameError here; load the model and vocabulary first.
bert_model, vocab = get_pytorch_kobert_model()
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [50]:
# Transform both splits into the input format of the KoBERT model.
# Positional arguments: text index 0, label index 1, tokenizer, max_len, pad=True, pair=False.
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

In [54]:
# 데이터 확인
data_train[0]

(array([   2, 3732, 6730, 3245, 2986, 7811, 1772, 5591, 6999,    3,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1], dtype=int32),
 array(10, dtype=int32),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       dtype=int32),
 74)

In [55]:
# Data loaders
# NOTE(review): no `shuffle` argument is passed, so the training loader
# presumably yields the same batch order every epoch — confirm this is intended.
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

  cpuset_checked))


In [57]:
# BERT classifier class
class BERTClassifier(nn.Module):
    """BERT encoder followed by optional dropout and a linear classification head.

    Args:
        bert: BERT module. ``forward`` unpacks its result as a pair and uses
            the second element (the pooled output).
        hidden_size: Input size of the linear head.
        num_classes: Number of output logits, i.e. the number of labels.
        dr_rate: Dropout probability applied to the pooled output. A falsy
            value (``None`` or ``0``) disables dropout.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=NUM_LABELS,  # ==> number of labels
                 dr_rate=None,
                 params=None):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``.

        Row ``i`` holds 1.0 in its first ``valid_length[i]`` positions and
        0.0 everywhere else.
        """
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the classification logits for a batch."""
        # zeros_like already places the mask on the device of token_ids.
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids=token_ids,
                              token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask)
        # Without dropout the pooled output feeds the head directly. The
        # original left `out` unassigned in that case, which raised
        # UnboundLocalError for the default dr_rate=None.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [None]:
# Model load: get_pytorch_kobert_model() returns the BERT model together with its vocabulary.
bert_model, vocab = get_pytorch_kobert_model()

# TPU setup: the XLA device that tensors and the model are moved to.
device = xm.xla_device()


In [None]:
# Wrap the BERT model in the classifier and move it to the device.
model = BERTClassifier(bert_model, dr_rate=0.5).to(device)

# Optimizer and schedule setup.
# Parameters whose name contains one of these fragments get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
decay_params = []
no_decay_params = []
for param_name, param in model.named_parameters():
    if any(nd in param_name for nd in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': 0.01},
    {'params': no_decay_params, 'weight_decay': 0.0},
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# One scheduler step per training batch; the first warmup_ratio share is warm-up.
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_step,
    num_training_steps=t_total,
)

# Helper for measuring accuracy
def calc_accuracy(X, Y):
    """Return the share of rows of ``X`` whose arg-max along dim 1 equals ``Y``.

    ``X`` holds one row of scores per sample and ``Y`` the expected class
    index per sample; the result is the hit count divided by the row count.
    """
    _, predicted = X.max(1)
    hits = (predicted == Y).sum().data.cpu().numpy()
    return hits / predicted.shape[0]

# `tqdm.notebook.tqdm` is the replacement that tqdm's deprecation warning for
# `tqdm_notebook` asks for. It is imported under an alias so the `tqdm` name
# imported at the top of the notebook is not rebound.
from tqdm.notebook import tqdm as notebook_tqdm

# Train for num_epochs epochs and evaluate on the test loader after each one.
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(notebook_tqdm(train_dataloader)):
        optimizer.zero_grad()
        # valid_length is handed to the model as it comes from the loader;
        # the other tensors are moved to the device.
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        # On TPU the optimizer step goes through torch_xla in place of optimizer.step().
        xm.optimizer_step(optimizer, barrier=True)

        scheduler.step()  # ==> update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print('epoch {} batch id {} loss {} train acc {}'.format(e+1, batch_id+1, loss.item(), train_acc / (batch_id+1)))
    print('epoch {} train acc {}'.format(e+1, train_acc / (batch_id+1)))

    model.eval()
    # Evaluation needs no gradients; no_grad avoids building the autograd graph.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(notebook_tqdm(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print('epoch {} test acc {}'.format(e+1, test_acc/(batch_id+1)))
        

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/61 [00:00<?, ?it/s]

  cpuset_checked))


epoch 1 batch id 1 loss 4.636133670806885 train acc 0.0


# KoBERT 학습 저장

In [None]:
# 학습 모델 google drive에 임시 저장

! mkdir kobert_model

torch.save(model, './kobert_model/KoBERT_test.pt')
torch.save(model.state_dict(), './kobert_model/model_state_dict.pt')
torch.save({
    'model' : model.state_dict(),
    'optimizer' : optimizer.state_dict()
}, './kobert_model/all.tar')

In [None]:
# Store the model files saved above in a Google Cloud Storage bucket.
# NOTE(review): despite its name, GDRIVE_PATH points at a folder under /content, not under the Gdrive folder — confirm.
GDRIVE_PATH = '/content/kobert_model/'
BUCKET_PATH = 'gs://kobert/model/'
# Recursive copy; -m is passed to gsutil for multi-threaded operation.
! gsutil -m cp -r $GDRIVE_PATH $BUCKET_PATH

# Pre-trained KoBert 모델 로드 & 사용

In [None]:
# Model load: get_pytorch_kobert_model() returns the BERT model together with its vocabulary.
bert_model, vocab = get_pytorch_kobert_model()

# TPU setup: the XLA device that the inference inputs are moved to.
device = xm.xla_device()

In [None]:
from operator import ilshift  # NOTE(review): not used anywhere in the visible notebook — looks like an accidental auto-import; confirm.

# Bring the model that was stored in the GCS bucket back to a temporary local location.
! gsutil -m cp -r $BUCKET_PATH ./

# NOTE(review): the files below are read from GDRIVE_PATH, not from the
# directory gsutil just wrote under ./ — confirm that the two coincide.
load_path = GDRIVE_PATH
# torch.load unpickles the file, so only load checkpoints from a trusted source.
load_model = torch.load(load_path + 'KoBERT_test.pt') # Loads the whole pickled model, so the model class must already be defined.
load_model.load_state_dict(torch.load(load_path + 'model_state_dict.pt')) # Load the state_dict and apply it to the model.

# Tokenizer used to encode the sentences passed to predict().
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)


# Softmax variant used with this model
def new_softmax(a):
    """Return the softmax of ``a`` as percentages rounded to three decimals.

    The maximum of ``a`` is subtracted before exponentiating so that large
    inputs do not overflow.
    """
    shifted = a - np.max(a)
    weights = np.exp(shifted)
    percentages = (weights / np.sum(weights)) * 100
    return np.round(percentages, 3)


# Prediction function
def predict(predict_sentence):
    """Return the per-category probabilities (in percent) for one sentence.

    Args:
        predict_sentence: Sentence to classify.

    Returns:
        A list with one value per model output, each a percentage rounded to
        three decimals as produced by ``new_softmax``. The position with the
        largest value is the predicted label. The list is empty if the loader
        yields no batch.
    """
    # BERTDataset needs a label for every sample; '0' is a placeholder that
    # is never looked at.
    dataset_another = [[predict_sentence, '0']]

    another_test = BERTDataset(dataset_another, 0, 1, tok, max_len, True, False)
    # A single sentence does not justify worker processes, so load it in-process.
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=0)

    load_model.eval()

    probability = []
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in test_dataloader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)

            out = load_model(token_ids, valid_length, segment_ids)

            # The loader holds exactly one sample, so this keeps its
            # probabilities (the last row wins, as in the original code).
            for logits in out:
                probability = new_softmax(logits.detach().cpu().numpy()).tolist()

    return probability

In [None]:
# Predict the category of an actual sentence.
prediction = predict('고양이가 갑자기 발을 긁고 이상한 울음소리를 내요')
# Map the position of the highest probability back to its category string.
category_list[np.argmax(prediction)] # ==> 개:교육/심리:훈련/행동교정:nan