### 표제어 추출

In [2]:
import nltk
# Fetch the 'omw-1.4' NLTK data package into the local nltk_data directory.
# NOTE(review): the WordNetLemmatizer used in the next cell presumably also
# needs the 'wordnet' data package - confirm it is present on a fresh machine.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /Users/chy/nltk_data...


True

In [3]:
from nltk.stem import WordNetLemmatizer

# Sample vocabulary to run through the lemmatizer.
words = [
    'policy', 'doing', 'organization', 'have', 'love', 'lives',
    'fly', 'dies', 'watched', 'has', 'starting',
]

lemmatizer = WordNetLemmatizer()

# Each word is lemmatized on its own; no part-of-speech argument is supplied.
lemmas = list(map(lemmatizer.lemmatize, words))

print('Lemmatizer: ', lemmas)

Lemmatizer:  ['policy', 'doing', 'organization', 'have', 'love', 'life', 'fly', 'dy', 'watched', 'ha', 'starting']


In [4]:
# Same word as in the list above, but this time with an explicit
# part-of-speech tag ('v') passed to the lemmatizer.
lemmatizer.lemmatize('dies', pos='v')

'die'

In [5]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()

# Build the sentence from adjacent string literals. A backslash continuation
# *inside* a string literal keeps the next line's leading indentation as part
# of the string, which silently embeds long runs of spaces in the text.
sentence = (
    "This was not the map we found in Billy Bones's chest, "
    "but an accurate copy, complete in all things--names and heights "
    "and soundings--with "
    "the single exception of the red crosses and the written notes."
)

# Split the sentence into word/punctuation tokens, then stem each token.
tokenized_sentence = word_tokenize(sentence)

print('After stemming: ', [stemmer.stem(w) for w in tokenized_sentence])

After stemming:  ['thi', 'wa', 'not', 'the', 'map', 'we', 'found', 'in', 'billi', 'bone', "'s", 'chest', ',', 'but', 'an', 'accur', 'copi', ',', 'complet', 'in', 'all', 'thing', '--', 'name', 'and', 'height', 'and', 'sound', '--', 'with', 'the', 'singl', 'except', 'of', 'the', 'red', 'cross', 'and', 'the', 'written', 'note', '.']


In [6]:
# Stem a few derived word forms with the existing `stemmer` instance.
words = ['formalize', 'allowance', 'electricical']
stems = list(map(stemmer.stem, words))

print('result : ', stems)

result :  ['formal', 'allow', 'electric']


### 불용어 (stopword)

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from konlpy.tag import Okt

In [3]:
# Inspect the English stopword list that ships with NLTK.
stop_words_list = stopwords.words('english')
stopword_count = len(stop_words_list)
first_ten = stop_words_list[:10]

print('num of stopwords: ', stopword_count)
print('examples of stopwords: ', first_ten)

num of stopwords:  179
examples of stopwords:  ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [5]:
# Stopword removal example (English).
example = "Drank too much, got the sickness. Pray to God and his son for forgiveness."
# A set gives constant-time membership checks while filtering.
stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(example)

# Compare the lower-cased token: the stopword entries shown earlier are all
# lowercase, so a capitalised stopword (e.g. "The" at the start of a
# sentence) would otherwise never match. Kept tokens retain their original
# casing.
result = [w for w in word_tokens if w.lower() not in stop_words]

print('불용어 제거 전: ', word_tokens)
print('불용어 제거 후: ', result)


불용어 제거 전:  ['Drank', 'too', 'much', ',', 'got', 'the', 'sickness', '.', 'Pray', 'to', 'God', 'and', 'his', 'son', 'for', 'forgiveness', '.']
불용어 제거 후:  ['Drank', 'much', ',', 'got', 'sickness', '.', 'Pray', 'God', 'son', 'forgiveness', '.']


In [6]:
# Stopword removal for Korean text.
okt = Okt()
example = "잠깐 시간 될까 만날 수 있을까? 별일은 아니고 그냥 보고 싶어 그래."

# Hand-picked stopwords, written as one space-separated string and turned
# into a set.
stop_words = set("까 은 수 그냥".split(' '))

# Tokenize with Okt's morphs(), then keep only tokens that are not stopwords.
word_tokens = okt.morphs(example)
result = [w for w in word_tokens if w not in stop_words]

print('불용어 제거 전: ', word_tokens)
print('불용어 제거 후: ', result)

불용어 제거 전:  ['잠깐', '시간', '될까', '만날', '수', '있을까', '?', '별일', '은', '아니고', '그냥', '보고', '싶어', '그래', '.']
불용어 제거 후:  ['잠깐', '시간', '될까', '만날', '있을까', '?', '별일', '아니고', '보고', '싶어', '그래', '.']
