# 사용 라이브러리 설치 및 import

## pytorch tpu를 사용하기 위한 라이브러리 세팅
- PyTorch에서는 .to(device) 문법을 통해 텐서 변수와 모델을 GPU나 TPU 같은 device에 올릴 수 있다.
- TPU에 올리기 위해서는 torch_xla에서 제공하는 xm.xla_device()를 통해 PyTorch와 호환되는 device를 지정할 수 있다.

In [1]:
# Install PyTorch/XLA (torch_xla 1.9 wheel for CPython 3.7) together with cloud-tpu-client 0.10
!pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl

Collecting torch-xla==1.9
  Downloading https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl (149.9 MB)
[K     |████████████████████████████████| 149.9 MB 27 kB/s 
[?25hCollecting cloud-tpu-client==0.10
  Downloading cloud_tpu_client-0.10-py3-none-any.whl (7.4 kB)
Collecting google-api-python-client==1.8.0
  Downloading google_api_python_client-1.8.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 2.4 MB/s 
Installing collected packages: google-api-python-client, torch-xla, cloud-tpu-client
  Attempting uninstall: google-api-python-client
    Found existing installation: google-api-python-client 1.12.10
    Uninstalling google-api-python-client-1.12.10:
      Successfully uninstalled google-api-python-client-1.12.10
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
earthengine-api 0.1.301 req

- torch_xla 설치하고 import하면 error가 등장한다.
- 이를 해결하기 위해서는 pytorch version을 downgrade 해주면 된다.(torch-1.8.2+cpu)

In [2]:
# Replace the preinstalled torch with the CPU-only 1.8.2 LTS build (plus the matching torchvision)
!pip uninstall -y torch
!pip install torch==1.8.2+cpu torchvision==0.9.2+cpu -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html

Found existing installation: torch 1.10.0+cu111
Uninstalling torch-1.10.0+cu111:
  Successfully uninstalled torch-1.10.0+cu111
Looking in links: https://download.pytorch.org/whl/lts/1.8/torch_lts.html
Collecting torch==1.8.2+cpu
  Downloading https://download.pytorch.org/whl/lts/1.8/cpu/torch-1.8.2%2Bcpu-cp37-cp37m-linux_x86_64.whl (169.1 MB)
[K     |████████████████████████████████| 169.1 MB 57 kB/s 
[?25hCollecting torchvision==0.9.2+cpu
  Downloading https://download.pytorch.org/whl/lts/1.8/cpu/torchvision-0.9.2%2Bcpu-cp37-cp37m-linux_x86_64.whl (13.3 MB)
[K     |████████████████████████████████| 13.3 MB 778 kB/s 
Installing collected packages: torch, torchvision
  Attempting uninstall: torchvision
    Found existing installation: torchvision 0.11.1+cu111
    Uninstalling torchvision-0.11.1+cu111:
      Successfully uninstalled torchvision-0.11.1+cu111
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is

- version 호환성을 맞춰주기 위해 torch-xla 1.8을 다시 설치

In [3]:
# Install the torch_xla 1.8 wheel so that its version lines up with torch 1.8.2
!pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8-cp37-cp37m-linux_x86_64.whl

Collecting torch-xla==1.8
  Downloading https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8-cp37-cp37m-linux_x86_64.whl (144.6 MB)
[K     |████████████████████████████████| 144.6 MB 75 kB/s 
Installing collected packages: torch-xla
  Attempting uninstall: torch-xla
    Found existing installation: torch-xla 1.9
    Uninstalling torch-xla-1.9:
      Successfully uninstalled torch-xla-1.9
Successfully installed torch-xla-1.8


In [4]:
import torch_xla
import torch_xla.core.xla_model as xm

## KoBERT 사용하기 위한 라이브러리 세팅

In [7]:
# Install the libraries KoBERT needs
# NOTE(review): transformers is pinned to 3.0.2 here, but the pip log of the later
# KoBERT install shows it collecting transformers>=4.8.1 -- confirm which version
# is actually active at import time.
!pip install mxnet
!pip install gluonnlp
!pip install transformers==3.0.2

Collecting mxnet
  Downloading mxnet-1.9.0-py3-none-manylinux2014_x86_64.whl (47.3 MB)
[K     |████████████████████████████████| 47.3 MB 1.6 MB/s 
Collecting graphviz<0.9.0,>=0.8.1
  Downloading graphviz-0.8.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: graphviz, mxnet
  Attempting uninstall: graphviz
    Found existing installation: graphviz 0.10.1
    Uninstalling graphviz-0.10.1:
      Successfully uninstalled graphviz-0.10.1
Successfully installed graphviz-0.8.4 mxnet-1.9.0
Collecting gluonnlp
  Downloading gluonnlp-0.10.0.tar.gz (344 kB)
[K     |████████████████████████████████| 344 kB 12.6 MB/s 
Building wheels for collected packages: gluonnlp
  Building wheel for gluonnlp (setup.py) ... [?25l[?25hdone
  Created wheel for gluonnlp: filename=gluonnlp-0.10.0-cp37-cp37m-linux_x86_64.whl size=595727 sha256=1bf4156563968c8f786be288650331de812741bb96d68735cd85a8f4af844489
  Stored in directory: /root/.cache/pip/wheels/be/b4/06/7f3fdfaf707e6b5e98b79c041e023acffbe395d

In [8]:
# Install KoBERT from the master branch of the SKTBrain GitHub repository
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

Collecting git+https://****@github.com/SKTBrain/KoBERT.git@master
  Cloning https://****@github.com/SKTBrain/KoBERT.git (to revision master) to /tmp/pip-req-build-zo5pvcfy
  Running command git clone -q 'https://****@github.com/SKTBrain/KoBERT.git' /tmp/pip-req-build-zo5pvcfy
Collecting boto3
  Downloading boto3-1.21.18-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 12.6 MB/s 
Collecting onnxruntime==1.8.0
  Downloading onnxruntime-1.8.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.5 MB)
[K     |████████████████████████████████| 4.5 MB 48.6 MB/s 
Collecting transformers>=4.8.1
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 42.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.0 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-ma

In [9]:
# KoBERT 사용 위한 라이브러리
import os
import sys
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

# kobert
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

# transformers
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

# 구글 드라이브, 코랩, GCP, TPU 환경 세팅

In [10]:
# Install google-drive-ocamlfuse (and fuse) from the alessandro-strada PPA.
# With `2>&1 > /dev/null` stdout is discarded while stderr still reaches the cell output.
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse

Selecting previously unselected package google-drive-ocamlfuse.
(Reading database ... 155335 files and directories currently installed.)
Preparing to unpack .../google-drive-ocamlfuse_0.7.27-0ubuntu1~ubuntu18.04.1_amd64.deb ...
Unpacking google-drive-ocamlfuse (0.7.27-0ubuntu1~ubuntu18.04.1) ...
Setting up google-drive-ocamlfuse (0.7.27-0ubuntu1~ubuntu18.04.1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


In [11]:
# Link Google Drive using GCP credentials
from google.colab import auth
# Authenticate the current Colab user
auth.authenticate_user()

from oauth2client.client import GoogleCredentials
# Application-default credentials; their client id / secret are handed to ocamlfuse below
creds = GoogleCredentials.get_application_default()

import getpass
# First headless run with stdin closed: only the output line containing the authorization URL is kept
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Read the verification code without echoing it
vcode = getpass.getpass()

# Second headless run: feed the verification code on stdin
# NOTE(review): the client secret and the verification code are interpolated into
# shell command lines here -- confirm that is acceptable for this environment.
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force
··········
Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force
Please enter the verification code: Access token retrieved correctly.


In [12]:
# Create the Gdrive folder and mount Google Drive on it inside the Colab runtime
!mkdir -p Gdrive

!google-drive-ocamlfuse Gdrive -o nonempty

# List the working directory to check that the mount point exists
!ls

adc.json  Gdrive  sample_data


# 데이터 가공 및 준비

In [21]:
import pandas as pd

# Q&A spreadsheet stored on the mounted Google Drive
FILE_PATH = '/content/Gdrive/오내피플/DB/Q&A_개발 활용 가능_2020.10.xlsx'
# Read the workbook into a DataFrame (read_excel defaults are used)
df = pd.read_excel(FILE_PATH)

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1068 entries, 0 to 1067
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   번호                1 non-null      float64
 1   종류
(개/고양이)        1068 non-null   object 
 2   대분류               1068 non-null   object 
 3   중분류               1068 non-null   object 
 4   소분류               578 non-null    object 
 5   Question
(질문 정리)  1068 non-null   object 
 6   Answer2 (답변 정리)   1065 non-null   object 
 7   Hashtag 2         1061 non-null   object 
 8   Unnamed: 8        4 non-null      object 
 9   건강/의료             1 non-null      float64
 10  환경/생활             1 non-null      float64
 11  교육/심리             1 non-null      float64
 12  단순문의              1 non-null      float64
 13  반려상품              1 non-null      float64
 14  법률                1 non-null      float64
 15  입양/장례             1 non-null      float64
 16  Unnamed: 16       0 non-null      float64


In [23]:
df.head()

Unnamed: 0,번호,종류\n(개/고양이),대분류,중분류,소분류,Question\n(질문 정리),Answer2 (답변 정리),Hashtag 2,Unnamed: 8,건강/의료,환경/생활,교육/심리,단순문의,반려상품,법률,입양/장례,Unnamed: 16,총합
0,1.0,개,건강/의료,기타,,감기 걸린 강아지도 산책을 시켜야 하나요?,강아지에게 산책은 중요한 부분이지만 컨디션이 저조하거나 감기에 걸린 채로 나가는 건...,"#강아지, #감기, #산책, #놀이",,319.0,265.0,317.0,73.0,21.0,3.0,70.0,,1068.0
1,,개,건강/의료,기타,,암컷 강아지의 생리와 혈뇨는 어떻게 구분하나요?,생리도 혈뇨도 피가 나오기 때문에 혼동이 올 수 있어요. 하지만 두 증상에 확실한 ...,"#강아지, #생리, #혈뇨, #출혈, #차이점, #자궁축농증, #배뇨",,,,,,,,,,
2,,개,건강/의료,기타,,강아지도 생리를 하나요?,강아지도 사람과 마찬가지로 생리를 해요. 보통 생후 6개월~12개월 사이에 시작하고...,"#강아지, #초경, #생리, #생리주기, #생리기간, #생리대, #기저귀, #폐경",,,,,,,,,,
3,,개,건강/의료,기타,,강아지가 커피를 먹어도 되나요?,커피는 카페인 함량이 높아 동물들에게 치명적이에요. 섭취한 양을 확인하신 후 동물병...,"#강아지, #커피, #이물섭식, #카페인, #위험 #치명적 #섭취불가",,,,,,,,,,
4,,개,건강/의료,기타,,강아지는 방구를 참지 못하나요?,강아지는 방귀를 참기 어려울수 있어요.,#강아지방귀,,,,,,,,,,


In [24]:
df.tail()

Unnamed: 0,번호,종류\n(개/고양이),대분류,중분류,소분류,Question\n(질문 정리),Answer2 (답변 정리),Hashtag 2,Unnamed: 8,건강/의료,환경/생활,교육/심리,단순문의,반려상품,법률,입양/장례,Unnamed: 16,총합
1063,,고양이,환경/생활,양육,,고양이 발정기 증상이 뭐가 있나요?,고양이 발정기 증상은\n남자고양이는 특히 스프레이 라는 행동을 해요. 주변을 돌아다...,#암컷고양이 #수컷고양이 #남자고양이 #여자고양이 #고양이발정기증상 #발정기 #발정...,,,,,,,,,,
1064,,고양이,환경/생활,양육,,고양이가 계속 우는데 발정기가 온건가요?,고양이 발정기 증상은\n남자고양이는 특히 스프레이 라는 행동을 해요. 주변을 돌아다...,#암컷고양이 #수컷고양이 #남자고양이 #여자고양이 #고양이발정기증상 #발정기 #발정...,,,,,,,,,,
1065,,고양이,환경/생활,양육,,고양이 비만인가요? 비만인데 어떻게 관리하면 될까요?,비만도를 체크하려면 고양이를 위에서 봤을 때 타원형 체형이면 비만이라고 보면 돼요....,#비만고양이 #뚱냥이 #고양이위에서타원형비만 #뼈만져지면정상 #식단관리 #식단조절 ...,,,,,,,,,,
1066,,고양이,환경/생활,양육,,고양이 비만인것 같은데 어떻게 관리하면 될까요?,고양이 건강을 위해서 자율 급식보다는 제한 급식으로 식단을 조절해야 해요.\n\n고...,#비만고양이 #뚱냥이 #식단관리 #식단조절 #제한급식 #캣휠 #고양이장난감 #고양이...,,,,,,,,,,
1067,,고양이,환경/생활,환경/생활,기타,고양이를 키울 때 주의해야 하는 건 어떤 게 있나요?,"① 빗질, 발톱 다듬기 등 건강과 미용을 위한 피모관리\n고양이는 강아지와 다르게 ...","#고양이, #기본관리, #주의사항",,,,,,,,,,


In [25]:
# Replace the spreadsheet headers (some contain line breaks and parenthesised
# hints) with short column names; the list is positional, one name per column.
re_columns = [
    '번호',
    '종류',
    '대분류',
    '중분류',
    '소분류',
    'Question',
    'Answer',
    'Hashtag 2',
    'Unnamed: 8',
    '건강/의료',
    '환경/생활',
    '교육/심리',
    '단순문의',
    '반려상품',
    '법률',
    '입양/장례',
    'Unnamed: 16',
    '총합',
]
df.columns = re_columns

df.head(2)

Unnamed: 0,번호,종류,대분류,중분류,소분류,Question,Answer,Hashtag 2,Unnamed: 8,건강/의료,환경/생활,교육/심리,단순문의,반려상품,법률,입양/장례,Unnamed: 16,총합
0,1.0,개,건강/의료,기타,,감기 걸린 강아지도 산책을 시켜야 하나요?,강아지에게 산책은 중요한 부분이지만 컨디션이 저조하거나 감기에 걸린 채로 나가는 건...,"#강아지, #감기, #산책, #놀이",,319.0,265.0,317.0,73.0,21.0,3.0,70.0,,1068.0
1,,개,건강/의료,기타,,암컷 강아지의 생리와 혈뇨는 어떻게 구분하나요?,생리도 혈뇨도 피가 나오기 때문에 혼동이 올 수 있어요. 하지만 두 증상에 확실한 ...,"#강아지, #생리, #혈뇨, #출혈, #차이점, #자궁축농증, #배뇨",,,,,,,,,,


In [26]:
# Parse the category columns into a nested tree
#   category_dict[species][major][middle][minor] -> integer label
# and a flat list where category_list[label] is the "species:major:middle:minor" path.

category_dict = {}
# Index 0 is a placeholder so that list positions line up with labels, which start at 1
category_list = [0]
idx = 1

for _, record in df.iterrows():
    species = record['종류']
    # Only dog ('개') and cat ('고양이') rows take part in the tree
    if species not in ('개', '고양이'):
        continue

    major = record['대분류']
    middle = record['중분류']
    minor = record['소분류']

    # Walk down the tree, creating missing levels on the way
    leaves = category_dict.setdefault(species, {}).setdefault(major, {}).setdefault(middle, {})
    if minor in leaves:
        continue

    # First time this full path is seen: hand out the next label
    leaves[minor] = idx
    category_list.append(':'.join(str(part) for part in (species, major, middle, minor)))
    idx += 1

print(category_dict)
print(category_list)
print(len(category_list))

{'개': {'건강/의료': {'기타': {nan: 1}, '치료': {'간단건강체크': 2, '구조/구호': 3, '수술': 4, '예방/약': 5, '예방접종': 6, '응급처치': 7, '중성화': 8, '질병/증상': 9, '질병/치료': 10}}, '교육/심리': {'감정/심리': {nan: 11}, '놀이/산책': {nan: 12}, '행동': {nan: 13}, '훈련/행동교정': {nan: 14}}, '단순문의': {'기타': {nan: 15}, '보호자 고민': {nan: 16}, '품종문의': {nan: 17}}, '반려상품': {'기본용품': {'목욕/미용용품': 18}, '식품': {'건강관리': 19, '사료/간식': 20}}, '법률': {'사람/동물': {nan: 21}, '의료사고': {nan: 22}}, '입양/장례': {'기타': {nan: 23}, '입양/분양': {'기타': 24, '입양 받을 때': 25, '입양 보낼 때': 26, '입양받을때': 27}, '장례': {'장례': 28}}, '환경/생활': {'교배/출산': {'교배': 29, '임신/출산': 30}, '급여/간식': {'급여방법': 31, '기타': 32}, '기타': {nan: 33}, '놀이/산책': {nan: 34}, '양육': {'기타': 35, '목욕/미용': 36, '양육': 37, nan: 38}, '환경/생활': {'기타': 39}}}, '고양이': {'건강/의료': {'기타': {nan: 40}, '치료': {'간단건강체크': 41, '구조/구호': 42, '수술': 43, '예방/약': 44, '예방접종': 45, '응급': 46, '응급처치': 47, '중성화': 48, '질병/증상': 49, '질병/치료': 50}}, '교육/심리': {'감정/심리': {nan: 51}, '기타': {nan: 52}, '놀이/산책': {nan: 53}, '행동': {nan: 54}, '훈련/행동교정': {nan: 55}}, '단순문의': {'기타': {

In [27]:
# Size of the classifier output layer. category_list carries a placeholder 0 at
# index 0, so this is the number of distinct category paths plus one.
NUM_LABELS = len(category_list)
# NUM_LABELS = 76

In [28]:
import re

# Everything that is neither a word character nor whitespace (i.e. punctuation).
# Raw string: in a normal literal `\w` and `\s` are invalid escape sequences,
# which newer Python versions warn about.
PATTERN = r'[^\w\s]'
# One or more consecutive spaces
PATTERN_MULTI_SPACE = r' +'
# A literal newline or carriage return (kept as a normal string so the value is unchanged)
PATTERN_LINE_BREAKER = '\n|\r'
# Replacement text used with all three patterns
REPL = " "


def isNaN(string):
    """Return True when *string* is NaN.

    Relies on NaN being the only value that is not equal to itself, so it
    works for any type without importing math or numpy.
    """
    return string != string

# Columns of the training table, filled row by row below
train_raw = {
    'label': [],
    'alpha': [],
    'text': []
}

# Rows whose category path could not be resolved to a label, kept for inspection
continued_value = {
    '종류' : [],
    '대분류' : [],
    '중분류' : [],
    '소분류' : [],
    'Question' : []
}

# Question
for _, row in df.iterrows():
    # iterrows() yields a copy of each row, so keep the cleaned-up question in a
    # local variable instead of writing it back into `row`.
    question = REPL if isNaN(row['Question']) else row['Question']
    species = row['종류']

    # Skip rows that are neither dog ('개') nor cat ('고양이'), unless the question
    # is '' or 'None' (those fall through and end up in continued_value).
    if species != '개' and species != '고양이' and question != '' and question != 'None':
        continue

    # Normalise the text: punctuation -> space, line breaks -> space, collapse runs of spaces
    qu = re.sub(pattern=PATTERN, repl=REPL, string=str(question))
    qu = re.sub(pattern=PATTERN_LINE_BREAKER, repl=REPL, string=qu)
    qu = re.sub(pattern=PATTERN_MULTI_SPACE, repl=REPL, string=qu)
    # A lone space (the placeholder for a missing question) is kept as is
    if qu != REPL:
        qu = qu.strip()

    # Only the category lookup can fail; catch exactly that instead of everything
    try:
        label_id = category_dict[species][row['대분류']][row['중분류']][row['소분류']]
    except KeyError:
        continued_value['종류'].append(species)
        continued_value['대분류'].append(row['대분류'])
        continued_value['중분류'].append(row['중분류'])
        continued_value['소분류'].append(row['소분류'])
        continued_value['Question'].append(question)
        continue

    train_raw['label'].append(label_id)
    train_raw['alpha'].append('a')
    train_raw['text'].append(qu)

# Answer
# for i, row in df.iterrows():
#     if isNaN(row['Answer']): 
#         row['Answer'] = REPL
#     if row['종류'] != '개' and row['종류'] != '고양이' and row['Answer'] != '' and row['Answer'] != 'None':
#         continue
#     else :
#         try:
#             qu = re.sub(pattern=PATTERN, repl=REPL, string=str(row['Answer']))
#             qu = re.sub(pattern=PATTERN_LINE_BREAKER, repl=REPL, string=qu)
#             qu = re.sub(pattern=PATTERN_MULTI_SPACE, repl=REPL, string=qu)
#             if(qu != REPL) : qu = qu.strip()
#             qu = qu[:256] if len(qu) > 256 else qu 
#             train_raw['label'].append(category_dict[row['종류']][row['대분류']][row['중분류']][row['소분류']])
#             train_raw['alpha'].append('a')
#             train_raw['text'].append(qu)
#         except:
#             continued_value['종류'].append(row['종류'])
#             continued_value['대분류'].append(row['대분류'])
#             continued_value['중분류'].append(row['중분류'])
#             continued_value['소분류'].append(row['소분류'])
#             continued_value['Question'].append(row['Question'])

# Build the training table; the positional index is materialised as an `id` column
train = (
    pd.DataFrame(train_raw)
    .reset_index()
    .rename(columns={'index': 'id'})
)

# Rows that could not be labelled
error_values = pd.DataFrame(continued_value)

In [29]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1068 entries, 0 to 1067
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      1068 non-null   int64 
 1   label   1068 non-null   int64 
 2   alpha   1068 non-null   object
 3   text    1068 non-null   object
dtypes: int64(2), object(2)
memory usage: 33.5+ KB


In [30]:
train

Unnamed: 0,id,label,alpha,text
0,0,1,a,감기 걸린 강아지도 산책을 시켜야 하나요
1,1,1,a,암컷 강아지의 생리와 혈뇨는 어떻게 구분하나요
2,2,1,a,강아지도 생리를 하나요
3,3,1,a,강아지가 커피를 먹어도 되나요
4,4,1,a,강아지는 방구를 참지 못하나요
...,...,...,...,...
1063,1063,74,a,고양이 발정기 증상이 뭐가 있나요
1064,1064,74,a,고양이가 계속 우는데 발정기가 온건가요
1065,1065,74,a,고양이 비만인가요 비만인데 어떻게 관리하면 될까요
1066,1066,74,a,고양이 비만인것 같은데 어떻게 관리하면 될까요


In [31]:
# Build the dataset rows for PyTorch: one [text, label-as-string] pair per training
# example (BERTDataset is later told to read text at index 0 and label at index 1).
data_list = [[text, str(label)] for text, label in zip(train['text'], train['label'])]

In [32]:
# 데이터 확인
print(len(data_list))
print(data_list[0])
print(data_list[600])
print(data_list[100])
print(data_list[-1])

1068
['감기 걸린 강아지도 산책을 시켜야 하나요', '1']
['강아지를 키울 때 주의해야 하는 건 어떤 게 있나요', '39']
['새끼강아지 유치는 언제 빠질까요', '10']
['고양이를 키울 때 주의해야 하는 건 어떤 게 있나요', '75']


In [33]:
# Split the data: 10% is held out for testing, with a fixed random_state so the split is reproducible
from sklearn.model_selection import train_test_split
dataset_train, dataset_test = train_test_split(data_list, test_size=0.1, random_state=0)

# Report the size of each part
print(len(dataset_train))
print(len(dataset_test))

961
107


# KoBERT 학습 모델

In [None]:
# Load the pretrained KoBERT model together with its vocabulary
bert_model, vocab = get_pytorch_kobert_model()

# Choose the compute device: CUDA when available, otherwise the CPU
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [17]:
# Dataset wrapper
class BERTDataset(Dataset):
    """Dataset of BERT-transformed sentences paired with integer labels.

    Args:
        dataset: sequence of rows; each row is indexable by text_idx / label_idx.
        text_idx: position of the sentence inside a row.
        label_idx: position of the label inside a row (converted with np.int32).
        bert_tokenizer: tokenizer handed to nlp.data.BERTSentenceTransform.
        max_len: forwarded as max_seq_length.
        pad: forwarded as pad.
        pair: forwarded as pair.
    """

    def __init__(self, dataset, text_idx, label_idx, bert_tokenizer, max_len, pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair
        )
        self.sentences = []
        self.labels = []
        for record in dataset:
            # The transform receives the sentence wrapped in a one-element list
            self.sentences.append(to_bert_input([record[text_idx]]))
            self.labels.append(np.int32(record[label_idx]))

    def __getitem__(self, i):
        """Return the transformed sentence tuple with the label appended as last element."""
        features = self.sentences[i]
        return features + (self.labels[i],)

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

In [41]:
# Hyperparameters
max_len = 64  # passed to BERTDataset, which forwards it as max_seq_length
batch_size = 16  # batch size of the data loaders
warmup_ratio = 0.1  # fraction of the total training steps used as scheduler warmup
num_epochs = 10  # number of passes over the training loader
max_grad_norm = 1  # gradient-norm clipping threshold
log_interval = 200  # print a progress line every this many batches
learning_rate = 2e-5  # learning rate given to AdamW

In [19]:
# Tokenizer: wrap the KoBERT tokenizer and vocabulary in a gluonnlp BERTSPTokenizer (no lower-casing)
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [34]:
# Wrap both splits: text at row index 0, label at row index 1, pad=True, pair=False
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

In [35]:
# 데이터 확인
data_train[0]

(array([   2, 3732, 6730, 3245, 2986, 7811, 1772, 5591, 6999,    3,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1], dtype=int32),
 array(10, dtype=int32),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       dtype=int32),
 74)

In [36]:
# Data loaders
# The training loader reshuffles every epoch so the model does not see the same
# batches in the same order each time; evaluation order stays fixed.
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=2, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=2)

In [37]:
# BERT classifier
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder.

    Args:
        bert: encoder module; called with input_ids, token_type_ids and
            attention_mask and expected to return a 2-tuple.
        hidden_size: width of the encoder output fed to the linear layer.
        num_classes: number of output classes (label count).
        dr_rate: dropout probability applied before the linear layer;
            None or 0 disables dropout.
        params: unused, kept for interface compatibility.
    """

    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=NUM_LABELS,  # ==> number of labels
                 dr_rate=None,
                 params=None):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like token_ids: 1 for the first valid_length[i] positions of row i, else 0."""
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the classifier logits for a batch."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        # Only the second element of the encoder output is used
        # (presumably the pooled sentence representation -- verify against the encoder).
        _, pooler = self.bert(input_ids=token_ids,
                              token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask.float().to(token_ids.device))
        # Without dropout the pooled output goes straight to the classifier;
        # previously `out` was left unassigned in that case and forward() crashed.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [42]:
# Build the classifier on top of the loaded KoBERT model and move it to the device
model = BERTClassifier(bert_model, dr_rate=0.5).to(device)

# Optimizer and schedule.
# Parameters whose name contains one of these markers get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
decayed_params = []
undecayed_params = []
for param_name, param in model.named_parameters():
    if any(marker in param_name for marker in no_decay):
        undecayed_params.append(param)
    else:
        decayed_params.append(param)

optimizer_grouped_parameters = [
    {'params': decayed_params, 'weight_decay': 0.01},
    {'params': undecayed_params, 'weight_decay': 0.0},
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# Total number of optimizer steps over the whole run, and the warmup share of it
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_step,
    num_training_steps=t_total,
)

# Accuracy helper
def calc_accuracy(X, Y):
    """Return the fraction of rows of X whose highest-scoring column equals the label in Y."""
    _, predicted = torch.max(X, 1)
    n_correct = (predicted == Y).sum().detach().cpu().numpy()
    n_rows = predicted.size()[0]
    return n_correct / n_rows

# tqdm_notebook is deprecated in favour of tqdm.notebook.tqdm; import it under an
# alias so the plain `tqdm` imported at the top of the notebook is not shadowed.
from tqdm.notebook import tqdm as notebook_tqdm

for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    # Training pass
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(notebook_tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step() # ==> update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print('epoch {} batch id {} loss {} train acc {}'.format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print('epoch {} train acc {}'.format(e+1, train_acc / (batch_id+1)))

    # Evaluation pass: no gradients are needed, so do not build the autograd graph
    model.eval()
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(notebook_tqdm(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print('epoch {} test acc {}'.format(e+1, test_acc/(batch_id+1)))
        

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/61 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 4.4305033683776855 train acc 0.0
epoch 1 train acc 0.0471311475409836


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 1 test acc 0.057629870129870135


  0%|          | 0/61 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 4.231025695800781 train acc 0.0625
epoch 2 train acc 0.11168032786885246


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 2 test acc 0.07548701298701299


  0%|          | 0/61 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 4.029917240142822 train acc 0.0625
epoch 3 train acc 0.1567622950819672


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 3 test acc 0.12905844155844157


  0%|          | 0/61 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 3.829477310180664 train acc 0.0625
epoch 4 train acc 0.2336065573770492


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 4 test acc 0.17775974025974026


  0%|          | 0/61 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 3.6173460483551025 train acc 0.125
epoch 5 train acc 0.26434426229508196


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 5 test acc 0.1737012987012987


  0%|          | 0/61 [00:00<?, ?it/s]

epoch 6 batch id 1 loss 3.4714388847351074 train acc 0.125
epoch 6 train acc 0.29200819672131145


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 6 test acc 0.19561688311688313


  0%|          | 0/61 [00:00<?, ?it/s]

epoch 7 batch id 1 loss 3.266418933868408 train acc 0.25
epoch 7 train acc 0.3073770491803279


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 7 test acc 0.23133116883116883


  0%|          | 0/61 [00:00<?, ?it/s]

epoch 8 batch id 1 loss 3.147491455078125 train acc 0.3125
epoch 8 train acc 0.32684426229508196


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 8 test acc 0.24025974025974026


  0%|          | 0/61 [00:00<?, ?it/s]

epoch 9 batch id 1 loss 3.172420024871826 train acc 0.1875
epoch 9 train acc 0.33709016393442626


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 9 test acc 0.2491883116883117


  0%|          | 0/61 [00:00<?, ?it/s]

epoch 10 batch id 1 loss 3.061682939529419 train acc 0.25
epoch 10 train acc 0.3432377049180328


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 10 test acc 0.2491883116883117


# KoBERT Pre-trained model 저장

In [45]:
# Directory that receives the saved model files. Despite its name this is a local
# path on the runtime; it is copied to the GCS bucket in a later cell.
GDRIVE_PATH = '/content/kobert/'

# Create the directory through its absolute path and tolerate re-runs
# (a bare `mkdir kobert` depends on the working directory and fails if it exists).
os.makedirs(GDRIVE_PATH, exist_ok=True)

# Save the whole model object
torch.save(model, GDRIVE_PATH + 'KoBERT_test.pt')
# Save only the state dict
torch.save(model.state_dict(), GDRIVE_PATH + 'model_state_dict.pt')
# Save a checkpoint bundling the model and optimizer state dicts
torch.save({
    'model' : model.state_dict(),
    'optimizer' : optimizer.state_dict()
}, GDRIVE_PATH + 'all.tar')

In [47]:
# Copy the saved model directory to the GCS bucket
BUCKET_PATH = 'gs://nlp_pretrianed_model/model'
!gsutil -m cp -r $GDRIVE_PATH $BUCKET_PATH # ==> the three files above end up inside the 'gs://nlp_pretrianed_model/model/kobert' folder

Copying file:///content/kobert/all.tar [Content-Type=application/x-tar]...
/ [0/3 files][    0.0 B/  1.7 GiB]   0% Done                                    ==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

Copying file:///content/kobert/model_state_dict.pt [Content-Type=application/octet-stream]...
Copying file:///content/kobert/KoBERT_test.pt [Content-Type=application/oc

# Pre-trained KoBert 모델 로드 & 예측


In [50]:
# Copy the saved model from the GCS bucket into the current directory
!gsutil -m cp -r $BUCKET_PATH ./

Copying gs://nlp_pretrianed_model/model/kobert/KoBERT_test.pt...
/ [0/4 files][    0.0 B/  1.7 GiB]   0% Done                                    Copying gs://nlp_pretrianed_model/model/kobert/all.tar...
Copying gs://nlp_pretrianed_model/model/kobert/model_state_dict.pt...


In [56]:
from operator import ilshift

new_path = '/content/model/kobert/'
# The whole pickled model is loaded, so the BERTClassifier class must already be defined.
# map_location remaps the stored tensors onto the current device, which lets the
# checkpoint load on a runtime whose hardware differs from the one it was saved on.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you produced yourself.
load_model = torch.load(new_path + 'KoBERT_test.pt', map_location=device)
# Load the separately saved state dict into the model
load_model.load_state_dict(torch.load(new_path + 'model_state_dict.pt', map_location=device))
# Inputs are moved to `device` at prediction time, so the model has to live there too
load_model.to(device)

tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

# Softmax variant used with this model
def new_softmax(a):
    """Return the softmax of *a* as percentages rounded to three decimals.

    The maximum is subtracted before exponentiating, and the normalised
    values are multiplied by 100.
    """
    shifted_exp = np.exp(a - np.max(a))
    percentages = (shifted_exp / np.sum(shifted_exp)) * 100
    return np.round(percentages, 3)


# Prediction helper
def predict(predict_sentence):
    """Classify one sentence with the reloaded model.

    Args:
        predict_sentence: the text to classify.

    Returns:
        A list with one percentage (0-100, rounded to three decimals) per
        class, as produced by new_softmax. The list is also printed. An empty
        list is returned if the loader yields no batch.
    """
    # BERTDataset needs a label column; '0' is a dummy value that is never used
    dataset_another = [[predict_sentence, '0']]

    another_test = BERTDataset(dataset_another, 0, 1, tok, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=2)

    load_model.eval()

    probability = []
    # Inference only: skip building the autograd graph
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in test_dataloader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)

            # Run the reloaded model (the one switched to eval mode above),
            # not the in-memory training model.
            out = load_model(token_ids, valid_length, segment_ids)

            for logits in out:
                # Cast to float64 first so the rounded percentages stay clean
                # when converted to Python floats.
                scores = logits.cpu().numpy().astype(np.float64)
                probability = new_softmax(scores).tolist()

                print(probability)

    return probability

using cached model. /content/.cache/kobert_v1.zip
using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece
using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [None]:
prediction = predict('고양이가 갑자기 발을 긁고 이상한 울음소리를 내요')
category_list[np.argmax(prediction)]