In [4]:
# Korean sample paragraph (three sentences) that the KoNLPy cells analyze.
# Written as adjacent string literals, one per sentence; Python joins them
# into a single string at compile time.
TEXT = (
    "특히, 행렬식을 계산할 수 있고 행렬식의 여러 성질을 알고 역행렬을 계산 할 수 있다. "
    "연립방정식의 풀이법으로 가우스 소거법을 알고 계수정리를 이해한다. "
    "또한, 활용이 높은 대각화를 할 수 있다."
)

In [1]:
from nltk.tokenize import word_tokenize

In [2]:
# NLTK word_tokenize on a sentence with contractions and an abbreviation.
# Per the recorded output below: "Don't" becomes 'Do' + "n't", "Jone's"
# becomes 'Jone' + "'s", and 'Mr.' is kept as a single token.
print(word_tokenize("Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."))

['Do', "n't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr.', 'Jone', "'s", 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


In [3]:
from nltk.tokenize import WordPunctTokenizer

In [4]:
# Same sentence through WordPunctTokenizer. Per the recorded output below,
# punctuation is split off as separate tokens: "Don't" becomes
# 'Don' + "'" + 't', and 'Mr.' becomes 'Mr' + '.'.
print(WordPunctTokenizer().tokenize("Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."))

['Don', "'", 't', 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr', '.', 'Jone', "'", 's', 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


In [5]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence

In [6]:
# Same sentence through Keras' text_to_word_sequence. Per the recorded
# output below, tokens are lowercased, commas and periods are dropped, and
# the apostrophes inside "don't" / "jone's" are preserved.
print(text_to_word_sequence("Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."))

["don't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', 'mr', "jone's", 'orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop']


### Penn Treebank Tokenization

규칙 1. 하이픈으로 구성된 단어는 하나로 유지  
규칙 2. doesn't와 같이 apostrophe로 '접어'가 함께하는 단어는 분리

In [7]:
from nltk.tokenize import TreebankWordTokenizer

# Sample with a hyphenated word ("home-based") and a contraction ("doesn't").
text = "Starting a home-based restaurant may be an ideal. it doesn't have a food chain or restaurant of their own."

# Per the recorded output below: 'home-based' stays one token, "doesn't"
# becomes 'does' + "n't", and the mid-text period stays attached ('ideal.').
tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize(text))

['Starting', 'a', 'home-based', 'restaurant', 'may', 'be', 'an', 'ideal.', 'it', 'does', "n't", 'have', 'a', 'food', 'chain', 'or', 'restaurant', 'of', 'their', 'own', '.']


### Sentence Tokenization

In [8]:
from nltk.tokenize import sent_tokenize

# Five-sentence passage, written as one literal per sentence; the adjacent
# literals are concatenated into a single string.
text = (
    "His barber kept his word. "
    "But keeping such a huge secret to himself was driving him crazy. "
    "Finally, the barber went up a mountain and almost to the edge of a cliff. "
    "He dug a hole in the midst of some reeds. "
    "He looked about, to mae sure no one was near."
)

# Split the passage into a list of sentences and print it.
print(sent_tokenize(text))

['His barber kept his word.', 'But keeping such a huge secret to himself was driving him crazy.', 'Finally, the barber went up a mountain and almost to the edge of a cliff.', 'He dug a hole in the midst of some reeds.', 'He looked about, to mae sure no one was near.']


In [9]:
# Sentence splitting when periods also appear inside an abbreviation.
# Per the recorded output below, "Ph.D." does not end a sentence; the text
# is split only after "students.".
text = "I am actively looking for Ph.D. students. and you are a Ph.D student."
print(sent_tokenize(text))

['I am actively looking for Ph.D. students.', 'and you are a Ph.D student.']


### Part-of-speech tagging (품사 태깅)

In [10]:
from nltk.tokenize import word_tokenize

# Word-level tokens for the sentence that the tagging cell works on.
# Per the recorded output below, 'Ph.D.' is kept as a single token.
text = "I am actively looking for Ph.D. students. and you are a Ph.D. student."
print(word_tokenize(text))

['I', 'am', 'actively', 'looking', 'for', 'Ph.D.', 'students', '.', 'and', 'you', 'are', 'a', 'Ph.D.', 'student', '.']


In [12]:
from nltk.tag import pos_tag

# Tokenize `text` (bound by an earlier cell) and tag each token.
# The tagged list is the cell's last expression, so the notebook displays
# it as (token, tag) pairs without an explicit print.
x = word_tokenize(text)
pos_tag(x)

[('I', 'PRP'),
 ('am', 'VBP'),
 ('actively', 'RB'),
 ('looking', 'VBG'),
 ('for', 'IN'),
 ('Ph.D.', 'NNP'),
 ('students', 'NNS'),
 ('.', '.'),
 ('and', 'CC'),
 ('you', 'PRP'),
 ('are', 'VBP'),
 ('a', 'DT'),
 ('Ph.D.', 'NNP'),
 ('student', 'NN'),
 ('.', '.')]

PRP : 인칭대명사  
VBP : 동사 (현재형, 3인칭 단수 제외)  
RB : 부사  
VBG : 동명사 또는 현재분사  
IN : 전치사  
NNP : 고유명사  
NNS : 복수형 명사  
CC : 등위 접속사  
DT : 관사  
NN : 단수 명사

In [5]:
from konlpy.tag import Okt

# Morpheme-level split of TEXT with the Okt analyzer.
# Per the recorded output below, Okt breaks '연립' into '연' + '립' and
# keeps conjugated forms such as '할' and '있다' whole.
okt = Okt()
print(okt.morphs(TEXT))

['특히', ',', '행렬식', '을', '계산', '할', '수', '있고', '행렬식', '의', '여러', '성질', '을', '알', '고', '역행렬', '을', '계산', '할', '수', '있다', '.', '연', '립', '방정식', '의', '풀이', '법', '으로', '가우스', '소거법', '을', '알', '고', '계수', '정리', '를', '이해', '한', '다', '.', '또한', ',', '활용', '이', '높은', '대각', '화', '를', '할', '수', '있다', '.']


In [7]:
# POS-tag TEXT with Okt; the recorded output below is a list of
# (morpheme, tag) pairs with tags such as Noun, Josa, Verb, Adjective.
print(okt.pos(TEXT))  

[('특히', 'Adverb'), (',', 'Punctuation'), ('행렬식', 'Noun'), ('을', 'Josa'), ('계산', 'Noun'), ('할', 'Verb'), ('수', 'Noun'), ('있고', 'Adjective'), ('행렬식', 'Noun'), ('의', 'Josa'), ('여러', 'Noun'), ('성질', 'Noun'), ('을', 'Josa'), ('알', 'Noun'), ('고', 'Josa'), ('역행렬', 'Noun'), ('을', 'Josa'), ('계산', 'Noun'), ('할', 'Verb'), ('수', 'Noun'), ('있다', 'Adjective'), ('.', 'Punctuation'), ('연', 'Modifier'), ('립', 'Noun'), ('방정식', 'Noun'), ('의', 'Josa'), ('풀이', 'Noun'), ('법', 'Noun'), ('으로', 'Josa'), ('가우스', 'Noun'), ('소거법', 'Noun'), ('을', 'Josa'), ('알', 'Noun'), ('고', 'Josa'), ('계수', 'Noun'), ('정리', 'Noun'), ('를', 'Josa'), ('이해', 'Noun'), ('한', 'Josa'), ('다', 'Adverb'), ('.', 'Punctuation'), ('또한', 'Noun'), (',', 'Punctuation'), ('활용', 'Noun'), ('이', 'Josa'), ('높은', 'Adjective'), ('대각', 'Noun'), ('화', 'Suffix'), ('를', 'Josa'), ('할', 'Verb'), ('수', 'Noun'), ('있다', 'Adjective'), ('.', 'Punctuation')]


In [8]:
# Keep only the nouns from TEXT; the recorded output below matches the
# tokens tagged 'Noun' by okt.pos for the same text.
print(okt.nouns(TEXT))  

['행렬식', '계산', '수', '행렬식', '여러', '성질', '알', '역행렬', '계산', '수', '립', '방정식', '풀이', '법', '가우스', '소거법', '알', '계수', '정리', '이해', '또한', '활용', '대각', '수']


In [10]:
from konlpy.tag import Kkma

# Morpheme-level split of TEXT with the Kkma analyzer.
# Per the recorded output below, Kkma keeps '연립' whole and decomposes
# conjugated forms into stem + ending (e.g. '하' + 'ㄹ', '있' + '다').
kkma = Kkma()
print(kkma.morphs(TEXT))

['특히', ',', '행렬식', '을', '계산', '하', 'ㄹ', '수', '있', '고', '행렬식', '의', '여러', '성질', '을', '알', '고', '역행렬', '을', '계산', '하', 'ㄹ', '수', '있', '다', '.', '연립', '방정식', '의', '풀이', '법', '으로', '가우스', '소거법', '을', '알', '고', '계수', '정리', '를', '이해', '하', 'ㄴ다', '.', '또', '한', ',', '활용', '이', '높', '은', '대각', '화', '를', '하', 'ㄹ', '수', '있', '다', '.']
