### 토큰화(Tokenization)

In [1]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence

In [2]:
input_words = "Don't forget to bring Choi's pen"

# Run the same sentence through three tokenizers so that their handling of
# apostrophes ("Don't", "Choi's") can be compared side by side.
w1 = word_tokenize(input_words)                  # Method 1: NLTK word_tokenize
w2 = WordPunctTokenizer().tokenize(input_words)  # Method 2: NLTK WordPunctTokenizer
w3 = text_to_word_sequence(input_words)          # Method 3: Keras text_to_word_sequence

print("using word_tokenize: ", w1)
print("using WordPunctTokenizer: ", w2)
print("using text_to_word_sequence: ", w3)

using word_tokenize:  ['Do', "n't", 'forget', 'to', 'bring', 'Choi', "'s", 'pen']
using WordPunctTokenizer:  ['Don', "'", 't', 'forget', 'to', 'bring', 'Choi', "'", 's', 'pen']
using text_to_word_sequence:  ["don't", 'forget', 'to', 'bring', "choi's", 'pen']


In [4]:
# Tokenize a sample sentence with NLTK's TreebankWordTokenizer.
from nltk.tokenize import TreebankWordTokenizer

text = "Hi, I'm a student and 18-year-old. It's beautiful day today!"

# Build the tokenizer once, then split the sample into tokens.
tokenizer = TreebankWordTokenizer()
t1 = tokenizer.tokenize(text)

print("result: ", t1)

result:  ['Hi', ',', 'I', "'m", 'a', 'student', 'and', '18-year-old.', 'It', "'s", 'beautiful', 'day', 'today', '!']


In [13]:
# Sentence tokenization: split a two-sentence English text with NLTK.
from nltk.tokenize import sent_tokenize

# Adjacent string literals inside parentheses are concatenated by the
# compiler, so the source indentation of the second line never ends up in the
# string. (A backslash continuation placed inside the quotes would embed the
# leading spaces of the continued line into the text.)
text = (
    "e.g stands for 'exempli gratia', meaning 'for example'. "
    "This is used to provide specific examples that fall under a more general category."
)
print("result: ", sent_tokenize(text))

result:  ["e.g stands for 'exempli gratia', meaning 'for example'.", 'This is used to provide specific examples that fall under a more general category.']


In [1]:
import kss

In [3]:
# Split Korean text into sentences with KSS. The sample contains an IP
# address and an e-mail address, both of which include periods.
text = "IP 192.196.60.3 서버에서 확인 가능해. 로그 파일을 aaa@naver.com으로 보내줘."
kss_sentences = kss.split_sentences(text)
print("결과 : ", kss_sentences)

결과 :  ['IP 192.196.60.3 서버에서 확인 가능해.', '로그 파일을 aaa@naver.com으로 보내줘.']


In [8]:
import nltk

# Download the averaged-perceptron tagger data into the local nltk_data
# directory (one-time setup before running NLTK part-of-speech tagging).
# NOTE(review): newer NLTK releases may look for a differently named resource
# ('averaged_perceptron_tagger_eng') -- confirm against the installed version.
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/chy/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [10]:
# Part-of-speech tagging example (English, NLTK).
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

text = "I am actively looking for Ph.D. students. I'm glad to know you are a Ph.D. student."

# Tokenize first: pos_tag is given the token list, not the raw string.
result = word_tokenize(text)
print('토큰화 결과 : ', result)

# Pair every token with its part-of-speech tag.
tagged = pos_tag(result)
print('품사 태깅 : ', tagged)

토큰화 결과 :  ['I', 'am', 'actively', 'looking', 'for', 'Ph.D.', 'students', '.', 'I', "'m", 'glad', 'to', 'know', 'you', 'are', 'a', 'Ph.D.', 'student', '.']
품사 태깅 :  [('I', 'PRP'), ('am', 'VBP'), ('actively', 'RB'), ('looking', 'VBG'), ('for', 'IN'), ('Ph.D.', 'NNP'), ('students', 'NNS'), ('.', '.'), ('I', 'PRP'), ("'m", 'VBP'), ('glad', 'JJ'), ('to', 'TO'), ('know', 'VB'), ('you', 'PRP'), ('are', 'VBP'), ('a', 'DT'), ('Ph.D.', 'NNP'), ('student', 'NN'), ('.', '.')]


In [11]:
# Korean part-of-speech tagging example (KoNLPy Okt analyzer).
from konlpy.tag import Okt

text = "IP 192.196.60.3 서버에서 확인 가능해. 로그 파일을 aaa@naver.com으로 보내줘."
okt = Okt()

# Run the three analyses on the same text before printing them.
morphemes = okt.morphs(text)  # morpheme-level tokens
pos_pairs = okt.pos(text)     # (token, tag) pairs
noun_list = okt.nouns(text)   # nouns only

print('OKT 형태소 분석 : ', morphemes)
print('OKT 품사 태깅 : ', pos_pairs)
print('OKT 명사 추출 : ', noun_list)

OKT 형태소 분석 :  ['IP', '192.196', '.', '60.3', '서버', '에서', '확인', '가능해', '.', '로그', '파일', '을', 'aaa@naver.com', '으로', '보내줘', '.']
OKT 품사 태깅 :  [('IP', 'Alpha'), ('192.196', 'Number'), ('.', 'Punctuation'), ('60.3', 'Number'), ('서버', 'Noun'), ('에서', 'Josa'), ('확인', 'Noun'), ('가능해', 'Adjective'), ('.', 'Punctuation'), ('로그', 'Noun'), ('파일', 'Noun'), ('을', 'Josa'), ('aaa@naver.com', 'Email'), ('으로', 'Josa'), ('보내줘', 'Verb'), ('.', 'Punctuation')]
OKT 명사 추출 :  ['서버', '확인', '로그', '파일']
