In [1]:
# Make sure curl and git are available on the host (shell escape, needs sudo).
!sudo apt-get install curl git
# Download KoNLPy's mecab.sh installer script from GitHub and run it with bash.
# NOTE(review): this executes a remote script fetched from the `master` branch as-is.
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

Reading package lists... Done
Building dependency tree       
Reading state information... Done
curl is already the newest version (7.58.0-2ubuntu3.13).
git is already the newest version (1:2.17.1-1ubuntu0.8).
0 upgraded, 0 newly installed, 0 to remove and 40 not upgraded.
Installing automake (A dependency for mecab-ko)
Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Get:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease
Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [697 B]
Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubunt

In [2]:
pip install konlpy

Collecting konlpy
  Downloading konlpy-0.5.2-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 1.3 MB/s 
[?25hCollecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 51.0 MB/s 
Collecting colorama
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting beautifulsoup4==4.6.0
  Downloading beautifulsoup4-4.6.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.3 MB/s 
Installing collected packages: JPype1, colorama, beautifulsoup4, konlpy
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.6.3
    Uninstalling beautifulsoup4-4.6.3:
      Successfully uninstalled beautifulsoup4-4.6.3
Successfully installed JPype1-1.3.0 beautifulsoup4-4.6.0 colorama-0.4.4 konlpy-0.5.2


In [3]:
import sys, re, argparse
from konlpy.tag import Okt, Komoran, Mecab, Hannanum, Kkma

In [4]:
def get_tokenizer(tokenizer_name):
    """Create a morphological analyzer instance for the given name.

    Args:
        tokenizer_name: One of "komoran", "okt", "mecab", "hannanum",
            "kkma" or "khaiii". Any other value falls back to Mecab.

    Returns:
        A newly constructed tokenizer object.

    Raises:
        ImportError: If "khaiii" is requested but the khaiii package
            cannot be imported.
    """
    if tokenizer_name == "khaiii":  # Khaiii
        # KhaiiiApi is not imported with the konlpy taggers at the top of the
        # notebook, so import it lazily here; the other tokenizers keep
        # working even when khaiii is not installed.
        from khaiii import KhaiiiApi
        return KhaiiiApi()
    if tokenizer_name == "komoran":  # Komoran
        return Komoran()
    if tokenizer_name == "okt":  # Okt
        return Okt()
    if tokenizer_name == "hannanum":  # Hannanum
        return Hannanum()
    if tokenizer_name == "kkma":  # Kkma
        return Kkma()
    # "mecab" (Eunjeon Hannip) and every unrecognized name use Mecab.
    return Mecab()

In [5]:

def tokenize(tokenizer_name, corpus_fname, output_fname, pos=False):
    """Tokenize a text file line by line and write the result to another file.

    Each input line is stripped, analyzed with the tokenizer returned by
    ``get_tokenizer(tokenizer_name)``, passed through ``post_processing``
    (defined outside this section), joined with single spaces and written
    as one output line.

    Args:
        tokenizer_name: Name handed to ``get_tokenizer``. The value "khaiii"
            selects the ``analyze``-based code path; every other value uses
            the ``morphs`` / ``pos`` methods of the tokenizer.
        corpus_fname: Path of the UTF-8 input file, one sentence per line.
        output_fname: Path of the UTF-8 output file (overwritten).
        pos: If True, emit "morph/tag" tokens; otherwise emit morphs only.
    """
    tokenizer = get_tokenizer(tokenizer_name)
    # The tokenizer kind never changes inside the loop, so decide once.
    use_khaiii = tokenizer_name == "khaiii"
    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2:
        for line in f1:
            # strip() already removes the trailing newline of the line.
            sentence = line.strip()
            if use_khaiii:
                tokens = []
                for word in tokenizer.analyze(sentence):
                    if pos:
                        tokens.extend(str(m) for m in word.morphs)
                    else:
                        # The code treats str(m) as "lex/tag". Cut at the LAST
                        # slash so a morpheme that is itself "/" ("//SP")
                        # yields "/" rather than an empty token.
                        tokens.extend(
                            str(m).rsplit("/", 1)[0] for m in word.morphs)
            elif pos:
                tokens = [morph + "/" + tag
                          for morph, tag in tokenizer.pos(sentence)]
            else:
                tokens = tokenizer.morphs(sentence)
            tokenized_sent = ' '.join(post_processing(tokens))
            # write(), not writelines(): the payload is a single string.
            f2.write(tokenized_sent + '\n')

In [6]:
# Build a Komoran tokenizer; the `tokenizer` variable is reused by the next cell.
tokenizer = get_tokenizer('komoran')
tokenizer.morphs('아버지가방에들어가신다') # tokenize (morphological analysis)

['아버지', '가방', '에', '들어가', '시', 'ㄴ다']

In [7]:
tokenizer.pos('아버지가방에들어가신다') # inspect part-of-speech tags

[('아버지', 'NNG'),
 ('가방', 'NNP'),
 ('에', 'JKB'),
 ('들어가', 'VV'),
 ('시', 'EP'),
 ('ㄴ다', 'EC')]

In [8]:
# Adding a user dictionary
# NOTE(review): this cell only tokenizes with the default Mecab instance; no
# user dictionary is registered here — confirm whether that step is missing.
tokenizer = get_tokenizer('mecab') # Eunjeon Hannip (mecab)
tokenizer.morphs('가우스전자 텔레비전 정말 좋네요')


['가우스', '전자', '텔레비전', '정말', '좋', '네요']

In [9]:
# Re-create the Mecab tokenizer and analyze a single word.
tokenizer = get_tokenizer('mecab')
tokenizer.morphs('희망') # tokenize (morphological analysis)

['희망']