# Word Tokenization

In [19]:
import nltk
from nltk.tokenize import word_tokenize, WordPunctTokenizer

In [20]:
# Sample sentence with a contraction ("Don't"), a possessive ("Jone's") and an abbreviation ("Mr.").
corpus = "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."

# Tokenize with NLTK's word_tokenize; as the last expression of the cell, the token list is displayed.
word_tokenize(corpus)

['Do',
 "n't",
 'be',
 'fooled',
 'by',
 'the',
 'dark',
 'sounding',
 'name',
 ',',
 'Mr.',
 'Jone',
 "'s",
 'Orphanage',
 'is',
 'as',
 'cheery',
 'as',
 'cheery',
 'goes',
 'for',
 'a',
 'pastry',
 'shop',
 '.']

In [21]:
WordPunctTokenizer().tokenize(corpus)  # punctuation is split off into separate tokens

['Don',
 "'",
 't',
 'be',
 'fooled',
 'by',
 'the',
 'dark',
 'sounding',
 'name',
 ',',
 'Mr',
 '.',
 'Jone',
 "'",
 's',
 'Orphanage',
 'is',
 'as',
 'cheery',
 'as',
 'cheery',
 'goes',
 'for',
 'a',
 'pastry',
 'shop',
 '.']

## Penn Treebank Tokenization

- 표준으로 쓰이는 토큰화 방법
- 규칙
    1. 하이픈(-)으로 구성된 단어는 하나로 유지 (예: home-based)
    2. 아포스트로피(')로 접어(clitic)가 함께하는 단어는 분리 (예: doesn't → does, n't)

In [22]:
from nltk.tokenize import TreebankWordTokenizer

# Sample containing a hyphenated word ("home-based") and a contraction ("doesn't").
corpus = "Starting a home-based restaurant may be an ideal. it doesn't have a food chain or restaurant of their own."

# Apply Penn Treebank tokenization; the resulting token list is displayed as the cell output.
TreebankWordTokenizer().tokenize(corpus)

['Starting',
 'a',
 'home-based',
 'restaurant',
 'may',
 'be',
 'an',
 'ideal.',
 'it',
 'does',
 "n't",
 'have',
 'a',
 'food',
 'chain',
 'or',
 'restaurant',
 'of',
 'their',
 'own',
 '.']

# Sentence Tokenization

## nltk.sent_tokenize

In [23]:
from nltk.tokenize import sent_tokenize

# Multi-sentence sample in which every sentence ends with a period.
corpus = "His barber kept his word. But keeping such a huge secret to himself was driving him crazy. Finally, the barber went up a mountain and almost to the edge of a cliff. He dug a hole in the midst of some reeds. He looked about, to make sure no one was near."

# Split the text into sentences; the resulting list is displayed as the cell output.
sent_tokenize(corpus)

['His barber kept his word.',
 'But keeping such a huge secret to himself was driving him crazy.',
 'Finally, the barber went up a mountain and almost to the edge of a cliff.',
 'He dug a hole in the midst of some reeds.',
 'He looked about, to make sure no one was near.']

In [24]:
# Sample where periods also appear inside a token ("Ph.D."), not only at sentence ends.
corpus = "I am actively looking for Ph.D. students. and you are a Ph.D student."

sent_tokenize(corpus)  # sentences are still separated correctly despite the mid-sentence periods

['I am actively looking for Ph.D. students.', 'and you are a Ph.D student.']

# 품사 tagging

## 영어

In [25]:
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [26]:
# Tokenize first: pos_tag is given the token list rather than the raw string.
corpus = "I am actively looking for Ph.D. students. And you are a Ph.D. student"
tokenized_sentence = word_tokenize(corpus)

# Print the tokens, then the (token, tag) pairs returned by pos_tag.
print(f"단어 토큰화: {tokenized_sentence}")
print(f"품사 tagging: {pos_tag(tokenized_sentence)}")

단어 토큰화: ['I', 'am', 'actively', 'looking', 'for', 'Ph.D.', 'students', '.', 'And', 'you', 'are', 'a', 'Ph.D.', 'student']
품사 tagging: [('I', 'PRP'), ('am', 'VBP'), ('actively', 'RB'), ('looking', 'VBG'), ('for', 'IN'), ('Ph.D.', 'NNP'), ('students', 'NNS'), ('.', '.'), ('And', 'CC'), ('you', 'PRP'), ('are', 'VBP'), ('a', 'DT'), ('Ph.D.', 'NNP'), ('student', 'NN')]


## 한국어

- Okt(Open Korea Text)
- Mecab
- Komoran
- Hannanum
- Kkma

In [27]:
from konlpy.tag import Okt

# Create the Okt analyzer once; the following cell calls morphs/pos/nouns on this instance.
okt = Okt()



In [28]:
# Informal Korean sample that ends with an elongated word and bare jamo characters.
corpus = "열심히 코딩한, 당신, 연휴에는 여행을 가봐요. 근데 졸려어어어ㅓㅓㅓㅓㅓㅓㅓㅓ ㅠㅠ"

# morphs -> list of morphemes, pos -> (morpheme, tag) pairs, nouns -> nouns only.
print(f"OKT 형태소 분석: {okt.morphs(corpus)}")
print(f"OKT 품사 tagging: {okt.pos(corpus)}")
print(f"명사 추출: {okt.nouns(corpus)}")

OKT 형태소 분석: ['열심히', '코딩', '한', ',', '당신', ',', '연휴', '에는', '여행', '을', '가봐요', '.', '근데', '졸려어어어', 'ㅓㅓㅓㅓㅓㅓㅓㅓ', 'ㅠㅠ']
OKT 품사 tagging: [('열심히', 'Adverb'), ('코딩', 'Noun'), ('한', 'Josa'), (',', 'Punctuation'), ('당신', 'Noun'), (',', 'Punctuation'), ('연휴', 'Noun'), ('에는', 'Josa'), ('여행', 'Noun'), ('을', 'Josa'), ('가봐요', 'Verb'), ('.', 'Punctuation'), ('근데', 'Adverb'), ('졸려어어어', 'Verb'), ('ㅓㅓㅓㅓㅓㅓㅓㅓ', 'KoreanParticle'), ('ㅠㅠ', 'KoreanParticle')]
명사 추출: ['코딩', '당신', '연휴', '여행']


# Lemmatization(표제어 추출)

## 영어

In [29]:
from nltk.stem import WordNetLemmatizer

# Lemmatize a mixed list of nouns and verb forms without passing a part-of-speech hint.
lemmatizer = WordNetLemmatizer()
words = ["policy", "doing", "organization", "have", "going", "love", "lives", "fly", "dies", "watched", "has", "starting"]

print([lemmatizer.lemmatize(word) for word in words])  # problematic results: 'dies' -> 'dy', 'has' -> 'ha', 'watched' left unchanged

['policy', 'doing', 'organization', 'have', 'going', 'love', 'life', 'fly', 'dy', 'watched', 'ha', 'starting']


In [30]:
lemmatizer.lemmatize("dies", "v")  # fixed: passing "v" as the second argument yields 'die'

'die'

In [31]:
lemmatizer.lemmatize("watched", "v")  # fixed: passing "v" as the second argument yields 'watch'

'watch'

In [32]:
lemmatizer.lemmatize("has", "v")  # fixed: passing "v" as the second argument yields 'have'

'have'

# Stemming(어간 추출)

In [33]:
from nltk.stem import PorterStemmer # 포터
from nltk.tokenize import word_tokenize

# Stem every token of the sample sentence with the Porter stemmer.
stemmer = PorterStemmer()

sentence = "This was not the map we found in Billy Bones's chest, but and accurate copy, complete in all things--names and heights and soundings--with the single exception of the red crosses and the written notes."
tokenized_sentence = word_tokenize(sentence)

print([stemmer.stem(word) for word in tokenized_sentence])  # note: trailing "s" is stripped indiscriminately ('This' -> 'thi', 'was' -> 'wa')

['thi', 'wa', 'not', 'the', 'map', 'we', 'found', 'in', 'billi', 'bone', "'s", 'chest', ',', 'but', 'and', 'accur', 'copi', ',', 'complet', 'in', 'all', 'thing', '--', 'name', 'and', 'height', 'and', 'sound', '--', 'with', 'the', 'singl', 'except', 'of', 'the', 'red', 'cross', 'and', 'the', 'written', 'note', '.']


In [34]:
from nltk.stem import LancasterStemmer # 랭커스터

# Rebind `stemmer` to a Lancaster stemmer and re-stem the tokens produced in the previous cell.
stemmer = LancasterStemmer()

print([stemmer.stem(word) for word in tokenized_sentence])

['thi', 'was', 'not', 'the', 'map', 'we', 'found', 'in', 'bil', 'bon', "'s", 'chest', ',', 'but', 'and', 'acc', 'cop', ',', 'complet', 'in', 'al', 'thing', '--', 'nam', 'and', 'height', 'and', 'sound', '--', 'with', 'the', 'singl', 'exceiv', 'of', 'the', 'red', 'cross', 'and', 'the', 'writ', 'not', '.']


# Stopwords(불용어)

자주 등장하지만 분석을 하는 데 있어 큰 도움이 되지 않는 단어들. 예를 들어, I, my, me, over, 조사, 접미사 ...

In [35]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from konlpy.tag import Okt

## 영어

In [36]:
# Display NLTK's English stopword list (all entries are lowercase).
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 ...]

In [37]:
# Remove English stopwords from a tokenized sentence.
corpus = "Family is not an important thing. It's everything."
# A set gives O(1) membership tests in the filter below.
stop_words = set(stopwords.words("english"))
# TreebankWordTokenizer comes from the import in the Penn Treebank cell earlier in the notebook.
word_tokens = TreebankWordTokenizer().tokenize(corpus)
# NOTE(review): the membership test is case-sensitive and the stopword list is lowercase,
# so the capitalized token 'It' is kept; 'thing.' is also kept because the tokenizer
# leaves the mid-text period attached. Lowercasing tokens before the lookup would drop 'It'.
res = [word for word in word_tokens if word not in stop_words]

res

['Family', 'important', 'thing.', 'It', "'s", 'everything', '.']

## 한국어
보통 txt나 csv 형식으로 불용어를 정의함. https://www.ranks.nl/stopwords/korean 참고


# Regular Expression(정규 표현식)

|특수 문자|설명|
|---|---|
|.|한 개의 임의의 문자|
|a?|a가 0개 또는 1개|
|a*|a가 0개 이상|
|a+|a가 1개 이상|
|^a|문자열이 a로 시작|
|a$|문자열이 a로 끝|
|a{m}|a를 m번 반복|
|a{m,n}|a를 m번 이상 n번 이하 반복 (쉼표 뒤에 공백을 넣지 않음)|
|a{m,}|a를 m번 이상 반복|
|[text]|대괄호 안의 문자들 중 한 개의 문자와 매치|
|[^a]|a를 제외한 문자와 매치|
|\||OR 연산자(파이프 문자), A\|B는 A 또는 B 중 하나와 매치|

|문자 규칙|설명|
|---|---|
|\\\ |역 슬래시|
|\\d|모든 숫자, [0-9]와 동일|
|\\D|숫자를 제외한 모든 문자, [^0-9]와 동일|
|\\s|모든 공백, [ \t\n\r\f\v]와 동일|
|\\S|공백을 제외한 문자, [^ \t\n\r\f\v]와 동일|
|\\w|문자, 숫자 또는 밑줄(_), ASCII 기준 [a-zA-Z0-9_]와 동일|
|\\W|문자, 숫자, 밑줄(_)이 아닌 문자, ASCII 기준 [^a-zA-Z0-9_]와 동일|

|re 모듈 함수|설명|
|---|---|
|re.compile()|정규 표현식을 컴파일하여 재사용할 수 있는 패턴 객체를 반환|
|re.search()|문자열 전체에 대해 정규 표현식과 매치되는지 검색|
|re.match()|문자열의 처음이 정규 표현식과 매치되는지 검사|
|re.split()|정규 표현식을 기준으로 문자열을 분리하여 list 형태로 반환|
|re.findall()|문자열에서 정규 표현식과 매치되는 모든 경우의 문자열을 찾아 리스트로 반환, 만약 없다면 빈 리스트를 반환|

In [38]:
import re

In [39]:
# "." stands for exactly one arbitrary character, so "a.c" needs one character between "a" and "c".
regex = re.compile("a.c")

# Try each sample in turn; search() returns a Match object or None.
for candidate in ("kkk", "ac", "adc", "addc"):
    print(regex.search(candidate))

None
None
<re.Match object; span=(0, 3), match='adc'>
None


In [40]:
# "b?" makes the "b" optional: zero or one occurrence between "a" and "c".
regex = re.compile("ab?c")

for candidate in ("abc", "abcc", "abbc"):
    print(regex.search(candidate))

<re.Match object; span=(0, 3), match='abc'>
<re.Match object; span=(0, 3), match='abc'>
None


In [41]:
# "b*" allows zero or more "b" characters, but the trailing "c" is still required.
regex = re.compile("ab*c")

for candidate in ("ab", "abc", "abcccc"):
    print(regex.search(candidate))

None
<re.Match object; span=(0, 3), match='abc'>
<re.Match object; span=(0, 3), match='abc'>


In [42]:
# "b+" requires at least one "b" before the "c".
regex = re.compile("ab+c")

for candidate in ("ab", "abd", "abc", "abcc"):
    print(regex.search(candidate))

None
None
<re.Match object; span=(0, 3), match='abc'>
<re.Match object; span=(0, 3), match='abc'>


In [43]:
# "^" anchors the pattern to the start of the string.
regex = re.compile("^ab")

for candidate in ("a", "ab", "abccc"):
    print(regex.search(candidate))

None
<re.Match object; span=(0, 2), match='ab'>
<re.Match object; span=(0, 2), match='ab'>


In [44]:
# "$" anchors the pattern to the end of the string.
regex = re.compile("ab$")

for candidate in ("db", "cab"):
    print(regex.search(candidate))

None
<re.Match object; span=(1, 3), match='ab'>


In [45]:
# "b{2}" requires exactly two "b" characters between "a" and "c".
regex = re.compile("ab{2}c")

for candidate in ("abc", "abbc", "abbbc"):
    print(regex.search(candidate))

None
<re.Match object; span=(0, 4), match='abbc'>
None


In [46]:
# "b{2,8}" accepts between two and eight "b" characters (inclusive).
regex = re.compile("ab{2,8}c")

for candidate in ("abc", "abbc", "abbbc", "abbbbbbbbbbbbbbc"):
    print(regex.search(candidate))

None
<re.Match object; span=(0, 4), match='abbc'>
<re.Match object; span=(0, 5), match='abbbc'>
None


In [47]:
# "b{2,}" requires at least two "b" characters, with no upper limit.
regex = re.compile("ab{2,}c")

for candidate in ("abc", "abbc", "abbbc", "abbbbbbbc"):
    print(regex.search(candidate))

None
<re.Match object; span=(0, 4), match='abbc'>
<re.Match object; span=(0, 5), match='abbbc'>
<re.Match object; span=(0, 9), match='abbbbbbbc'>


In [48]:
# "[ab]" matches a single character that is either "a" or "b"; search() reports the first hit.
regex = re.compile("[ab]")

for candidate in ("a", "b", "ab", "abc", "dfa", "dfc"):
    print(regex.search(candidate))

<re.Match object; span=(0, 1), match='a'>
<re.Match object; span=(0, 1), match='b'>
<re.Match object; span=(0, 1), match='a'>
<re.Match object; span=(0, 1), match='a'>
<re.Match object; span=(2, 3), match='a'>
None


In [49]:
# "[^ab]" matches a single character that is neither "a" nor "b".
regex = re.compile("[^ab]")

for candidate in ("a", "b", "ca", "cd"):
    print(regex.search(candidate))

None
None
<re.Match object; span=(0, 1), match='c'>
<re.Match object; span=(0, 1), match='c'>
