In [None]:
# 'NLP in Action'
# 2.2.3 Improving tokens

In [16]:
import re

In [40]:
sentence = """Thomas Jefferson began building Monticello at the age of 26."""

In [47]:
tokens = re.split(r'[-\s.,;!?]+', sentence)

In [48]:
tokens

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '']

In [33]:
pattern = re.compile(r'([-\s.,;!?])+')

In [34]:
tokens = pattern.split(sentence)

In [35]:
tokens[-10:]

[' ', 'the', ' ', 'age', ' ', 'of', ' ', '26', '.', '']

In [36]:
# Drop empty strings and the separator characters captured by the split
# pattern. The filter string mirrors the pattern's character class, including
# ';', so a semicolon separator is not left behind as a token.
[tok for tok in tokens if tok and tok not in '- \t\n.,;!?']

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26']

In [37]:
# Same filtering as the list comprehension, written with filter(): keep a
# token only if it is non-empty and is not one of the separator characters
# (';' included, matching the split pattern's character class).
list(filter(lambda tok: tok and tok not in '- \t\n.,;!?', tokens))

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26']

In [2]:
from nltk.tokenize import RegexpTokenizer

In [5]:
# Alternatives are tried left to right at each position:
#   \w+        runs of word characters
#   \$[0-9.]+  dollar amounts; `$` must be escaped, otherwise it is the
#              end-of-string anchor and this alternative can never match
#   \S+        any other run of non-whitespace (uppercase \S, so whitespace
#              itself never becomes a token)
tokenizer = RegexpTokenizer(r'\w+|\$[0-9.]+|\S+')

In [6]:
tokenizer.tokenize(sentence)

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '.']

In [7]:
from nltk.tokenize import TreebankWordTokenizer

In [11]:
sentence = """Monticello wasn't designated as UNESCO World Heritage Stie until 1987."""

In [12]:
toknenizer = TreebankWordTokenizer()

In [13]:
tokenizer.tokenize(sentence) # wansn't => wasn, 't로 분리되는데 책에서는 was, n't?

['Monticello',
 'wasn',
 "'t",
 'designated',
 'as',
 'UNESCO',
 'World',
 'Heritage',
 'Stie',
 'until',
 '1987',
 '.']

In [14]:
# 2.2.4 Extending the vocabulary with n-grams

In [29]:
def tokenize_2grams(text):
    """Return the 2-grams of ``text`` as space-joined strings, in order.

    The text is split on runs of whitespace and the punctuation characters
    ``- . , ; ! ?``; empty pieces are discarded. Each pair of adjacent words
    becomes one ``"first second"`` string.

    Args:
        text: The sentence to tokenize.

    Returns:
        A list of 2-gram strings; empty when ``text`` has fewer than two words.
    """
    words = [word for word in re.split(r'[-\s.,;!?]+', text) if word]
    # zip() against the list shifted by one yields every adjacent pair.
    return [' '.join(pair) for pair in zip(words, words[1:])]


tokenize_2grams("Thomas Jefferson began building Monticello at the age of 26.")

In [54]:
# Rebuild the clean token list for the original sentence (`sentence` was
# reassigned to a different example earlier in the notebook).
sentence = "Thomas Jefferson began building Monticello at the age of 26."
pattern = re.compile(r'([-\s.,;!?])+')
# Because of the capturing group, split() also returns the separators, so
# empty strings and separator characters are filtered out afterwards.
tokens = [
    piece
    for piece in pattern.split(sentence)
    if piece and piece not in '- \t\n.,;!?'
]
tokens

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26']

In [55]:
from nltk.util import ngrams

In [56]:
list(ngrams(tokens, 2))

[('Thomas', 'Jefferson'),
 ('Jefferson', 'began'),
 ('began', 'building'),
 ('building', 'Monticello'),
 ('Monticello', 'at'),
 ('at', 'the'),
 ('the', 'age'),
 ('age', 'of'),
 ('of', '26')]

In [57]:
list(ngrams(tokens, 3))

[('Thomas', 'Jefferson', 'began'),
 ('Jefferson', 'began', 'building'),
 ('began', 'building', 'Monticello'),
 ('building', 'Monticello', 'at'),
 ('Monticello', 'at', 'the'),
 ('at', 'the', 'age'),
 ('the', 'age', 'of'),
 ('age', 'of', '26')]

In [58]:
two_grams = list(ngrams(tokens, 2))

In [59]:
[" ".join(x) for x in two_grams]

['Thomas Jefferson',
 'Jefferson began',
 'began building',
 'building Monticello',
 'Monticello at',
 'at the',
 'the age',
 'age of',
 'of 26']

In [4]:
# Stop words
import nltk

In [61]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chaek\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [71]:
stop_words= nltk.corpus.stopwords.words('english')

In [72]:
len(stop_words)

179

In [74]:
stop_words[:7]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours']

In [67]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words

In [68]:
len(sklearn_stop_words)

318

In [75]:
len(stop_words)

179

In [88]:
# Not in the textbook: the NLTK stop words are a plain list here, and lists
# have no union(), so convert to a set before combining with sklearn's set.
stop_words = set(stop_words)
len(stop_words | sklearn_stop_words)

378

In [89]:
len(stop_words.intersection(sklearn_stop_words))

119

In [90]:
tokens= ['House', 'Visitor', 'Center']

In [91]:
normalized_tokens = [x.lower() for x in tokens]

In [92]:
print(normalized_tokens)

['house', 'visitor', 'center']


In [96]:
# Stemming
def stem(phrase):
    """Naively stem every word of ``phrase`` by removing one trailing "s".

    The phrase is lower-cased and split on whitespace. For each word the
    regex keeps the whole word when it ends in "ss" (e.g. "glass", "boss");
    otherwise it drops a single final "s" if there is one. Apostrophes left
    at either end of the stem (as in "house's" -> "house'") are stripped.

    Args:
        phrase: Text containing zero or more whitespace-separated words.

    Returns:
        The stemmed words joined by single spaces.
    """
    stems = []
    for word in phrase.lower().split():
        # `.*ss` must come first so words ending in a double "s" stay intact;
        # the lazy `.*?` then leaves a single trailing "s" to the optional group.
        root = re.match(r'^(.*ss|.*?)(s)?$', word).group(1)
        stems.append(root.strip("'"))
    return ' '.join(stems)

In [97]:
stem('houses')

'house'

In [98]:
stem("Docter house's calls")

'docter house call'

In [16]:
# Porter Stemmer
from nltk.stem.porter import PorterStemmer

In [18]:
stemmer = PorterStemmer()
# PorterStemmer keeps a trailing apostrophe, so strip it explicitly.
' '.join(
    stemmer.stem(word).strip("'")
    for word in "dish washer's washed dishes".split()
)

'dish washer wash dish'

In [20]:
# Lemmatization
#nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

In [7]:
lemmatizer = WordNetLemmatizer()

In [8]:
lemmatizer.lemmatize("better")

'better'

In [11]:
lemmatizer.lemmatize("good", pos="a")

'good'

In [12]:
lemmatizer.lemmatize("goods", pos="a")

'goods'

In [13]:
lemmatizer.lemmatize("goods", pos="n")

'good'

In [14]:
lemmatizer.lemmatize("goodness", pos="n")

'goodness'

In [15]:
lemmatizer.lemmatize("best", pos="a")

'best'

In [19]:
stemmer.stem('goodness')

'good'