### [ 자연어 처리용 형태소 분석 - NLTK ]

[1] 모듈 로딩 <hr>

In [57]:
# ================================================================================
# [1-1] 모듈 로딩
# ================================================================================
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize import WordPunctTokenizer, PunktSentenceTokenizer

[2] 토큰화 (단어 / 문장 단위 분리) <hr>

In [58]:
# ================================================================================
# [2-1] Word-level splitting
# ================================================================================
text = "Happy, New Year! Don't stop. "

# word_tokenize(): function-style API. Splitting rule is based on punctuation
# and sentence tokenization; contractions are split too ("Don't" -> "Do", "n't").
wordList1 = word_tokenize(text)

# Class-style API: create a tokenizer instance, then call its tokenize() method.
# NOTE(review): PunktSentenceTokenizer is a sentence tokenizer, so wordList2
# ends up holding sentence strings rather than words, despite its name and this
# section's title -- confirm whether WordPunctTokenizer was intended here.
psTK = PunktSentenceTokenizer()
wordList2 = psTK.tokenize(text)

print("wordList1 : ", wordList1)
print("wordList2 : ", wordList2)

# ================================================================================
# [2-2] Sentence-level splitting
# ================================================================================
# Adjacent string literals are concatenated into one single-line text.
text = (
    'The Matrix is everywhere its all around us, here even in this room. '
    'You can see it out your window or on your television. '
    'You feel it when you go to work, or go to church or pay your taxes.'
)

# sent_tokenize(): function-style API that returns one string per sentence.
sentList1 = sent_tokenize(text)

# NOTE(review): WordPunctTokenizer splits into word and punctuation tokens, so
# sentList2 ends up holding individual tokens rather than sentences, despite
# its name -- confirm whether PunktSentenceTokenizer was intended here.
wpTK = WordPunctTokenizer()
sentList2 = wpTK.tokenize(text)

print("sentList1 : ", sentList1)
print("sentList2 : ", sentList2)

wordList1 :  ['Happy', ',', 'New', 'Year', '!', 'Do', "n't", 'stop', '.']
wordList2 :  ['Happy, New Year!', "Don't stop."]
sentList1 :  ['The Matrix is everywhere its all around us, here even in this room.', 'You can see it out your window or on your television.', 'You feel it when you go to work, or go to church or pay your taxes.']
sentList2 :  ['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.', 'You', 'can', 'see', 'it', 'out', 'your', 'window', 'or', 'on', 'your', 'television', '.', 'You', 'feel', 'it', 'when', 'you', 'go', 'to', 'work', ',', 'or', 'go', 'to', 'church', 'or', 'pay', 'your', 'taxes', '.']


[3] 텍스트의 토큰화 처리 <hr>

In [63]:
# Sample text: three sentences joined into one single-line string.
text = (
    'The Matrix is everywhere its all around us, here even in this room. '
    'You can see it out your window or on your television. '
    'You feel it when you go to work, or go to church or pay your taxes.'
)

# Step 1: text -> list of sentences.
sentList = sent_tokenize(text)

# Result containers:
#   sentTK : one list of word tokens per sentence, in sentence order
#   allTK  : every distinct token seen anywhere in the text
sentTK = []
allTK = set()

# Step 2: each sentence -> word tokens, recorded in both containers.
for sent in sentList:
    wordTK = word_tokenize(sent)
    sentTK += [wordTK]
    allTK |= set(wordTK)

# Show the per-sentence token lists, one per line.
for tk in sentTK:
    print(tk)

# Show the distinct tokens and how many there are. allTK is a set, so the
# order in which its tokens are printed is not fixed.
print(f"\n모든 토큰들 : {allTK}, \n{len(allTK)}개 ")

['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.']
['You', 'can', 'see', 'it', 'out', 'your', 'window', 'or', 'on', 'your', 'television', '.']
['You', 'feel', 'it', 'when', 'you', 'go', 'to', 'work', ',', 'or', 'go', 'to', 'church', 'or', 'pay', 'your', 'taxes', '.']

모든 토큰들 : {'is', 'even', '.', 'your', 'window', 'you', 'all', 'here', 'pay', 'go', 'church', 'this', 'Matrix', 'room', 'around', 'us', 'everywhere', 'it', 'work', ',', 'out', 'to', 'You', 'or', 'see', 'The', 'in', 'television', 'on', 'can', 'taxes', 'its', 'when', 'feel'}, 
34개 
