# Word Tokenization

## 영어의 Word Tokenization

#### **NLTK의 토크나이저 1. word_tokenize**

In [2]:
import nltk

In [3]:
# Download NLTK's 'punkt' package (presumably the data the tokenizers below rely on; confirm).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Sample sentence containing apostrophes ("Don't", "Jone's"), used to compare the tokenizers below.
sentence = "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."

#### Q. 아포스트로피가 들어간 상황에서 단어 토큰화

In [5]:
from nltk.tokenize import word_tokenize

# Tokenize the sample sentence with word_tokenize and show the resulting token list.
word_tokens = word_tokenize(sentence)
print(word_tokens)

['Do', "n't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr.', 'Jone', "'s", 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


#### A. Don't를 Do와 n't로 분리, Jone's는 Jone과 's로 분리

-----------------------------

#### **NLTK의 토크나이저 2. WordPunctTokenizer**

In [6]:
from nltk.tokenize import WordPunctTokenizer

# Build the tokenizer first, then print the tokens it produces for the sample sentence.
punct_tokenizer = WordPunctTokenizer()
print(punct_tokenizer.tokenize(sentence))

['Don', "'", 't', 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr', '.', 'Jone', "'", 's', 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


#### Don't를 Don, ', t로 분리. Jone's를 Jone, ', s로 분리

-----------------

#### **NLTK의 토크나이저 3. TreebankWordTokenizer**

Penn Treebank Tokenizer의 규칙\
규칙1. 하이픈으로 구성된 단어는 하나로 유지\
규칙2. doesn't와 같이 아포스트로피로 '접어'가 함께하는 단어는 분리

In [7]:
from nltk.tokenize import TreebankWordTokenizer

# Example text with a hyphenated word ("home-based") and a contraction ("doesn't").
text = "Starting a home-based restaurant may be an ideal. it doesn't have a food chain or restaurant of their own."
tokenizer = TreebankWordTokenizer()
treebank_tokens = tokenizer.tokenize(text)
print(treebank_tokens)

['Starting', 'a', 'home-based', 'restaurant', 'may', 'be', 'an', 'ideal.', 'it', 'does', "n't", 'have', 'a', 'food', 'chain', 'or', 'restaurant', 'of', 'their', 'own', '.']


## 한글의 Word Tokenization(KoNLPy)

In [1]:
# Import each analyzer explicitly instead of using a wildcard, so the names
# this notebook depends on are visible at a glance.
from konlpy.tag import Hannanum, Kkma, Komoran, Mecab, Okt

# One instance per analyzer; the comparison cells that follow reuse these objects.
hannanum = Hannanum()
kkma = Kkma()
komoran = Komoran()
okt = Okt()
mecab = Mecab()

**위 형태소 분석기들은 공통적으로 아래의 함수를 제공합니다.**\
nouns: 명사 추출\
morphs: 형태소 추출\
pos: 품사 부착

In [10]:
# Korean sample sentence passed to each morphological analyzer in the cells below.
sentence_ = "열심히 코딩한 당신, 연휴에는 여행을 가봐요"

### 형태소 분석기 Okt

In [11]:
# Run Okt's noun extraction, morpheme split and POS tagging on the sample, in that order.
for okt_method in (okt.nouns, okt.morphs, okt.pos):
    print(okt_method(sentence_))

['코딩', '당신', '연휴', '여행']
['열심히', '코딩', '한', '당신', ',', '연휴', '에는', '여행', '을', '가봐요']
[('열심히', 'Adverb'), ('코딩', 'Noun'), ('한', 'Josa'), ('당신', 'Noun'), (',', 'Punctuation'), ('연휴', 'Noun'), ('에는', 'Josa'), ('여행', 'Noun'), ('을', 'Josa'), ('가봐요', 'Verb')]


### 형태소 분석기 꼬꼬마

In [12]:
# Run Kkma's noun extraction, morpheme split and POS tagging on the sample, in that order.
for kkma_method in (kkma.nouns, kkma.morphs, kkma.pos):
    print(kkma_method(sentence_))

['코딩', '당신', '연휴', '여행']
['열심히', '코딩', '하', 'ㄴ', '당신', ',', '연휴', '에', '는', '여행', '을', '가보', '아요']
[('열심히', 'MAG'), ('코딩', 'NNG'), ('하', 'XSV'), ('ㄴ', 'ETD'), ('당신', 'NP'), (',', 'SP'), ('연휴', 'NNG'), ('에', 'JKM'), ('는', 'JX'), ('여행', 'NNG'), ('을', 'JKO'), ('가보', 'VV'), ('아요', 'EFN')]


### 형태소 분석기 코모란

In [13]:
# Run Komoran's noun extraction, morpheme split and POS tagging on the sample, in that order.
for komoran_method in (komoran.nouns, komoran.morphs, komoran.pos):
    print(komoran_method(sentence_))

['코', '당신', '연휴', '여행']
['열심히', '코', '딩', '하', 'ㄴ', '당신', ',', '연휴', '에', '는', '여행', '을', '가', '아', '보', '아요']
[('열심히', 'MAG'), ('코', 'NNG'), ('딩', 'MAG'), ('하', 'XSV'), ('ㄴ', 'ETM'), ('당신', 'NNP'), (',', 'SP'), ('연휴', 'NNG'), ('에', 'JKB'), ('는', 'JX'), ('여행', 'NNG'), ('을', 'JKO'), ('가', 'VV'), ('아', 'EC'), ('보', 'VX'), ('아요', 'EC')]


### 형태소 분석기 한나눔

In [14]:
# Run Hannanum's noun extraction, morpheme split and POS tagging on the sample, in that order.
for hannanum_method in (hannanum.nouns, hannanum.morphs, hannanum.pos):
    print(hannanum_method(sentence_))

['코딩', '당신', '연휴', '여행']
['열심히', '코딩', '하', 'ㄴ', '당신', ',', '연휴', '에는', '여행', '을', '가', '아', '보', '아']
[('열심히', 'M'), ('코딩', 'N'), ('하', 'X'), ('ㄴ', 'E'), ('당신', 'N'), (',', 'S'), ('연휴', 'N'), ('에는', 'J'), ('여행', 'N'), ('을', 'J'), ('가', 'P'), ('아', 'E'), ('보', 'P'), ('아', 'E')]


### 형태소 분석기 Mecab

In [15]:
# Run Mecab's noun extraction, morpheme split and POS tagging on the sample, in that order.
for mecab_method in (mecab.nouns, mecab.morphs, mecab.pos):
    print(mecab_method(sentence_))

['코딩', '당신', '연휴', '여행']
['열심히', '코딩', '한', '당신', ',', '연휴', '에', '는', '여행', '을', '가', '봐요']
[('열심히', 'MAG'), ('코딩', 'NNG'), ('한', 'XSA+ETM'), ('당신', 'NP'), (',', 'SC'), ('연휴', 'NNG'), ('에', 'JKB'), ('는', 'JX'), ('여행', 'NNG'), ('을', 'JKO'), ('가', 'VV'), ('봐요', 'EC+VX+EC')]


# Sentence Tokenization

## 영어의 Sentence Tokenization(NLTK)

In [16]:
# Three-sentence English paragraph used by the naive period-based split below.
temp = 'Yonsei University is a private research university in Seoul, South Korea. Yonsei University is deemed as one of the three most prestigious institutions in the country. It is particularly respected in the studies of medicine and business administration.'

In [18]:
# Naive sentence split on the literal '. ' separator; the separator is dropped from every piece except the last.
temp.split('. ')

['Yonsei University is a private research university in Seoul, South Korea',
 'Yonsei University is deemed as one of the three most prestigious institutions in the country',
 'It is particularly respected in the studies of medicine and business administration.']

**"온점을 기준으로 문장을 구분할 경우에는 예외사항이 너무 많다."**

In [21]:
from nltk.tokenize import sent_tokenize

# Multi-sentence paragraph: split it into sentences first, then print the resulting list.
text = "His barber kept his word. But keeping such a huge secret to himself was driving him crazy. Finally, the barber went up a mountain and almost to the edge of a cliff. He dug a hole in the midst of some reeds. He looked about, to mae sure no one was near."
barber_sentences = sent_tokenize(text)
print(barber_sentences)

['His barber kept his word.', 'But keeping such a huge secret to himself was driving him crazy.', 'Finally, the barber went up a mountain and almost to the edge of a cliff.', 'He dug a hole in the midst of some reeds.', 'He looked about, to mae sure no one was near.']


In [22]:
# "Ph.D." contains periods that do not mark the end of a sentence.
text = "I am actively looking for Ph.D. students. and you are a Ph.D student."
phd_sentences = sent_tokenize(text)
print(phd_sentences)

['I am actively looking for Ph.D. students.', 'and you are a Ph.D student.']


## 한국어의 Sentence Tokenization(KSS)

In [24]:
#pip install kss

In [25]:
import kss

# Korean paragraph to be split into sentences with KSS; the resulting list is printed.
text = '딥 러닝 자연어 처리가 재미있기는 합니다. 그런데 문제는 영어보다 한국어로 할 때 너무 어려워요. 이제 해보면 알걸요?'
korean_sentences = kss.split_sentences(text)
print(korean_sentences)


[Kss]: Oh! You have mecab in your environment. Kss will take this as a backend! :D



['딥 러닝 자연어 처리가 재미있기는 합니다.', '그런데 문제는 영어보다 한국어로 할 때 너무 어려워요.', '이제 해보면 알걸요?']


### IMDB 리뷰 데이터를 이용한 정수 인코딩과 패딩

In [21]:
import sklearn

ModuleNotFoundError: No module named 'sklearn'

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import torch
import urllib.request
from tqdm import tqdm
from collections import Counter
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'sklearn'