# 지도 학습(supervised learning) 기반 형태소 분석

> 사람이 직접 표시한 형태소 경계와 품사 정보를 정답으로 주어 학습시킨 모델



In [None]:
# Install Mecab (mecab-ko, "Eunjeon Hannip") together with KoNLPy.
# NOTE(review): apt-get install is run without -y — confirm it does not stop
# at the confirmation prompt in this environment.
!apt-get update
!apt-get install g++ openjdk-8-jdk 
# NOTE(review): JPype1-py3 looks like a legacy fork of JPype1 — confirm it is
# still needed next to konlpy.
!pip3 install konlpy JPype1-py3
# Download the mecab.sh script from the konlpy GitHub repository and run it with bash.
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

In [None]:
from konlpy.tag import Mecab
# Create a Mecab tokenizer; the `tokenizer` global is reused by the next cell.
tokenizer = Mecab()
# Split an unspaced sample sentence into morphemes; as the last expression of
# the cell, its value is what the notebook displays.
tokenizer.morphs('아버지가방에들어가신다')

['아버지', '가', '방', '에', '들어가', '신다']

In [None]:
tokenizer.pos('아버지가방에들어가신다') # inspect the part-of-speech tag of each morpheme

[('아버지', 'NNG'),
 ('가', 'JKS'),
 ('방', 'NNG'),
 ('에', 'JKB'),
 ('들어가', 'VV'),
 ('신다', 'EP+EC')]

In [None]:
# Install the Korean morphological analyzers (Mecab excluded).
# Runtime -> Manage sessions -> Terminate -> Reconnect
!pip install konlpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 1.3 MB/s 
Collecting JPype1>=0.7.0
  Downloading JPype1-1.4.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (453 kB)
[K     |████████████████████████████████| 453 kB 14.9 MB/s 
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.0 konlpy-0.6.0


In [None]:
from konlpy.tag import Okt, Mecab, Hannanum, Kkma, Komoran

def get_tokenizer(tokenizer_name):
    """Return a new KoNLPy tagger instance selected by name.

    Recognized names are 'komoran', 'okt', 'hannanum' and 'kkma'. They are
    compared with ``==``, so matching is case-sensitive. Any other value
    yields a Mecab instance.
    """
    known_taggers = (
        ('komoran', Komoran),
        ('okt', Okt),
        ('hannanum', Hannanum),
        ('kkma', Kkma),
    )
    for name, tagger_cls in known_taggers:
        if tokenizer_name == name:
            return tagger_cls()
    # Unrecognized names fall back to Mecab.
    return Mecab()

# Pick the analyzer to try; swap in one of the commented-out lines below to
# run the same sentence through a different analyzer.
tokenizer = get_tokenizer('komoran')
# tokenizer = get_tokenizer('okt')
# tokenizer = get_tokenizer('hannanum')
# tokenizer = get_tokenizer('kkma')

# Tokenize the unspaced sample sentence with the selected analyzer; as the
# last expression of the cell, its value is what the notebook displays.
tokenizer.morphs('아버지가방에들어가신다')

['아버지', '가방', '에', '들어가', '시', 'ㄴ다']

In [None]:
# Install khaiii (Kakao Hangul Analyzer III).
# : uses a CNN (convolutional neural network), implemented in C++, works without a GPU, fast
!git clone https://github.com/kakao/khaiii.git
!pip install cmake
# NOTE(review): the relative paths here and the absolute /content/... paths
# below only agree when the working directory is /content — confirm.
!mkdir build
!cd build && cmake /content/khaiii
# Run the make targets in order: all, resource, install, package_python.
!cd /content/build/ && make all
!cd /content/build/ && make resource
!cd /content/build && make install
!cd /content/build && make package_python
# Install the Python package from the package_python directory.
!pip install /content/build/package_python

In [None]:
from khaiii import KhaiiiApi

# Analyze the unspaced sample sentence with khaiii and collect the surface
# form of every morpheme, word by word.
tokenizer = KhaiiiApi()
data = tokenizer.analyze('아버지가방에들어가신다')
tokens = []
for word in data:
    # Read the surface form from `lex` instead of splitting str(m) on '/':
    # a morpheme whose surface form is itself '/' renders as '//SP', and
    # split('/')[0] would then return an empty string.
    tokens.extend(m.lex for m in word.morphs)
print(tokens)

['아버지', '가', '방', '에', '들어가', '시', 'ㄴ다']


In [None]:
# Collect every morpheme as its string form (surface/tag, as shown in the
# cell output) from the khaiii result held in `data`.
tokens = []
for word in data:
    tokens += map(str, word.morphs)
print(tokens)

['아버지/NNG', '가/JKS', '방/NNG', '에/JKB', '들어가/VV', '시/EP', 'ㄴ다/EC']
