In [4]:
# !pip install nltk

from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import TreebankWordTokenizer

import nltk
import re

In [6]:
sentence = 'I was wondering if anyone out there could enlighten me on this car.'

# Approach 1: clean the sentence by deleting short words (one or two characters).
# The leading \W* makes the match swallow the whitespace in front of each short
# word as well, so removing a word from the middle of the sentence does not
# leave a doubled space behind.
shortword = re.compile(r'\W*\b\w{1,2}\b')
data = shortword.sub('', sentence)

print('짧은 단어 삭제:', data)

# Approach 2: word-based tokenization.
# The pattern is written as a raw string so that \w is handed to the regex
# engine verbatim; in a normal string literal "\w" is an invalid escape
# sequence, which Python reports with a warning.
tokenizer = RegexpTokenizer(r'[\w]+')
sentence = 'Time is an illusion. Lunchtime double so!'
tokens = tokenizer.tokenize(sentence)

print('\n단어 토큰화 1:', tokens)

# Only runs of word characters survive, so hyphens and apostrophes behave as
# separators here: "home-based" and "doesn't" are each split into two tokens.
sentence = "Starting a home-based restaurant may be an ideal. it doesn't have a food chain or restaurant of their own."
tokens = tokenizer.tokenize(sentence)
print('\n단어 토큰화 2:', tokens)

짧은 단어 삭제:  was wondering anyone out there could enlighten this car.

단어 토큰화 1: ['Time', 'is', 'an', 'illusion', 'Lunchtime', 'double', 'so']

단어 토큰화 2: ['Starting', 'a', 'home', 'based', 'restaurant', 'may', 'be', 'an', 'ideal', 'it', 'doesn', 't', 'have', 'a', 'food', 'chain', 'or', 'restaurant', 'of', 'their', 'own']


In [8]:
# Download the tagger model needed for part-of-speech analysis.
nltk.download('averaged_perceptron_tagger')

# Tokenize the sentence left over from the previous cell with the Treebank
# tokenizer, then attach a part-of-speech tag to every token.
# Note: the Treebank tokenizer detaches only the sentence-final period here,
# so the mid-text token 'ideal.' keeps its trailing dot.
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(sentence)
postag = pos_tag(tokens)

print('트리뱅크 워드토크나이저 :', tokens)
print('품사 태깅 :', postag)

# Collect the tokens whose tag is exactly 'NN'; other noun tags are not matched.
noun_list = [word for word, tag in postag if tag == 'NN']
noun_list

트리뱅크 워드토크나이저 : ['Starting', 'a', 'home-based', 'restaurant', 'may', 'be', 'an', 'ideal.', 'it', 'does', "n't", 'have', 'a', 'food', 'chain', 'or', 'restaurant', 'of', 'their', 'own', '.']
품사 태깅 : [('Starting', 'VBG'), ('a', 'DT'), ('home-based', 'JJ'), ('restaurant', 'NN'), ('may', 'MD'), ('be', 'VB'), ('an', 'DT'), ('ideal.', 'NN'), ('it', 'PRP'), ('does', 'VBZ'), ("n't", 'RB'), ('have', 'VB'), ('a', 'DT'), ('food', 'NN'), ('chain', 'NN'), ('or', 'CC'), ('restaurant', 'NN'), ('of', 'IN'), ('their', 'PRP$'), ('own', 'JJ'), ('.', '.')]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nsun5\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


['restaurant', 'ideal.', 'food', 'chain', 'restaurant']

In [10]:
stemmer = PorterStemmer()
tokenizer = TreebankWordTokenizer()

sentence = "This was not the map we found in Billy Bones's chest, but an accurate copy, complete in all things with the single exception of the red crosses and the written notes."
tokens = tokenizer.tokenize(sentence)

# Apply the Porter stemmer to every token, preserving token order.
data = list(map(stemmer.stem, tokens))

print('어간 추출 전 :', tokens)
print('\n어간 추출 후 :', data)
print('\n=====================\n')

# Side-by-side comparison with another stemmer: the same word list is run
# through both the Porter and the Lancaster algorithm.
wordlist = [
    'policy', 'doing', 'organization', 'have', 'going', 'love',
    'lives', 'fly', 'dies', 'watched', 'has', 'starting',
]

ps = PorterStemmer()
ls = LancasterStemmer()

porterdata = list(map(ps.stem, wordlist))
lancasterdata = list(map(ls.stem, wordlist))

print('어간 추출 전 :', wordlist)
print('\n포터 스테머의 어간 추출 후:', porterdata)
print('\n랭커스터 스테머의 어간 추출 후:', lancasterdata)

어간 추출 전 : ['This', 'was', 'not', 'the', 'map', 'we', 'found', 'in', 'Billy', 'Bones', "'s", 'chest', ',', 'but', 'an', 'accurate', 'copy', ',', 'complete', 'in', 'all', 'things', 'with', 'the', 'single', 'exception', 'of', 'the', 'red', 'crosses', 'and', 'the', 'written', 'notes', '.']

어간 추출 후 : ['thi', 'wa', 'not', 'the', 'map', 'we', 'found', 'in', 'billi', 'bone', "'s", 'chest', ',', 'but', 'an', 'accur', 'copi', ',', 'complet', 'in', 'all', 'thing', 'with', 'the', 'singl', 'except', 'of', 'the', 'red', 'cross', 'and', 'the', 'written', 'note', '.']


어간 추출 전 : ['policy', 'doing', 'organization', 'have', 'going', 'love', 'lives', 'fly', 'dies', 'watched', 'has', 'starting']

포터 스테머의 어간 추출 후: ['polici', 'do', 'organ', 'have', 'go', 'love', 'live', 'fli', 'die', 'watch', 'ha', 'start']

랭커스터 스테머의 어간 추출 후: ['policy', 'doing', 'org', 'hav', 'going', 'lov', 'liv', 'fly', 'die', 'watch', 'has', 'start']
