<a href="https://colab.research.google.com/github/jumbokh/nknu-class/blob/main/NLP/notebooks/11_03_%E5%AD%97%E8%A9%9E%E5%89%8D%E7%BD%AE%E8%99%95%E7%90%86.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 字詞前置處理

In [None]:
# 載入相關套件
import nltk

In [None]:
# Sample paragraph used by the tokenization examples
text = (
    "Today is a great day. It is even better than yesterday."
    " And yesterday was the best day ever."
)

## 分割字句

In [None]:
# Sentence segmentation: split the paragraph into a list of sentence strings
nltk.sent_tokenize(text)

['Today is a great day.',
 'It is even better than yesterday.',
 'And yesterday was the best day ever.']

## 分詞

In [None]:
# Word tokenization: split the paragraph into tokens; punctuation marks such
# as '.' come back as separate tokens
nltk.word_tokenize(text)

['Today',
 'is',
 'a',
 'great',
 'day',
 '.',
 'It',
 'is',
 'even',
 'better',
 'than',
 'yesterday',
 '.',
 'And',
 'yesterday',
 'was',
 'the',
 'best',
 'day',
 'ever',
 '.']

## 詞形還原

In [None]:
# Stemming: reduce every word to its stem with the Porter stemmer
text = 'My system keeps crashing his crashed yesterday, ours crashes daily'
ps = nltk.porter.PorterStemmer()
# Words are separated on whitespace only, so punctuation stays attached to
# its word (e.g. "yesterday,").
stemmed_words = map(ps.stem, text.split())
' '.join(stemmed_words)

'my system keep crash hi crash yesterday, our crash daili'

In [None]:
# Lemmatization: dictionary (WordNet) based reduction to a base form
text = 'My system keeps crashing his crashed yesterday, ours crashes daily'
lem = nltk.WordNetLemmatizer()
# lemmatize() is called with the word only (no part-of-speech argument);
# verb forms such as "crashing" / "crashed" come back unchanged in this setup.
lemmatized_words = map(lem.lemmatize, text.split())
' '.join(lemmatized_words)

'My system keep crashing his crashed yesterday, ours crash daily'

## 停用詞(Stopwords)

In [None]:
# Punctuation characters provided by the standard library
import string
print('標點符號:', string.punctuation)

# Sample paragraph
text = (
    "Today is a great day. It is even better than yesterday."
    " And yesterday was the best day ever."
)
# Stop-word set: NLTK's English stop words plus every character of
# string.punctuation (each punctuation character is its own entry)
stopword_list = set(nltk.corpus.stopwords.words('english'))
stopword_list.update(string.punctuation)

# Remove stop words
def remove_stopwords(text, is_lower_case=False):
    """Tokenize *text* and drop stop words and punctuation tokens.

    Tokens are produced by ``nltk.word_tokenize`` and filtered against the
    module-level ``stopword_list``.

    Args:
        text: The string to filter.
        is_lower_case: When True, *text* is lower-cased before tokenizing,
            so the returned tokens are lower-case. When False, the returned
            tokens keep their original casing.

    Returns:
        A ``(filtered_text, filtered_tokens)`` tuple: the surviving tokens
        joined by single spaces, and the same tokens as a list.
    """
    if is_lower_case:
        text = text.lower()
    tokens = [token.strip() for token in nltk.word_tokenize(text)]
    # Match case-insensitively: comparing the raw token let capitalized stop
    # words such as "It" and "And" slip through when the text was not
    # lower-cased first.
    filtered_tokens = [
        token for token in tokens
        if token.lower() not in stopword_list
    ]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text, filtered_tokens

# Filter the sample paragraph with the default is_lower_case=False and
# display the joined result
filtered_text, filtered_tokens = remove_stopwords(text) 
filtered_text

標點符號: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


'Today great day It even better yesterday And yesterday best day ever'

## BOW 測試

In [None]:
# Load the test article and build a bag-of-words frequency table
import collections

# The file is only read, so open it read-only; 'r+' additionally requires
# write permission and fails on read-only files.
with open('../NLP_data/news.txt', 'r', encoding='UTF-8') as f:
    text = f.read()

# Lower-case the article and drop stop words / punctuation tokens
filtered_text, filtered_tokens = remove_stopwords(text, True)

# Vocabulary with occurrence counts; Counter tallies the iterable directly
word_freqs = collections.Counter(filtered_tokens)
print(word_freqs.most_common(20))

[('’', 35), ('stores', 15), ('convenience', 14), ('one', 8), ('—', 8), ('even', 8), ('seoul', 8), ('city', 7), ('korea', 6), ('korean', 6), ('cities', 6), ('people', 5), ('summer', 4), ('new', 4), ('also', 4), ('find', 4), ('store', 4), ('would', 4), ('like', 4), ('average', 4)]


In [None]:
# Remove stop words (regex tokenizer + lemmatization)
lem = nltk.WordNetLemmatizer()
def remove_stopwords_regex(text, is_lower_case=False):
    """Tokenize *text* with a ``\\w+`` regex, drop stop words, lemmatize.

    Uses the module-level ``lem`` lemmatizer and ``stopword_list``.

    Args:
        text: The string to filter.
        is_lower_case: When True, *text* is lower-cased before tokenizing,
            so the returned tokens are lower-case. When False, the original
            casing is passed on to the lemmatizer and kept in the output.

    Returns:
        A ``(filtered_text, filtered_tokens)`` tuple: the surviving
        lemmatized tokens joined by single spaces, and the same tokens as a
        list.
    """
    if is_lower_case:
        text = text.lower()
    # Keep only runs of word characters; punctuation never becomes a token
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    filtered_tokens = []
    for token in tokenizer.tokenize(text):
        token = token.strip()
        # Check the raw token first: lemmatizing before filtering turned
        # stop words such as "has" into "ha", which then escaped the filter.
        if token.lower() in stopword_list:
            continue
        lemma = lem.lemmatize(token)
        # A token whose lemma is a stop word is still dropped, as before.
        if lemma.lower() in stopword_list:
            continue
        filtered_tokens.append(lemma)
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text, filtered_tokens

# Re-run the frequency count with the regex tokenizer and lemmatization
filtered_text, filtered_tokens = remove_stopwords_regex(text, True)
# Counter tallies the token list directly
word_freqs = collections.Counter(filtered_tokens)
print(word_freqs.most_common(20))

[('store', 19), ('convenience', 14), ('city', 13), ('one', 8), ('even', 8), ('seoul', 8), ('korea', 6), ('korean', 6), ('night', 6), ('food', 5), ('ha', 5), ('people', 5), ('summer', 4), ('new', 4), ('life', 4), ('also', 4), ('find', 4), ('would', 4), ('like', 4), ('chain', 4)]


In [None]:
# Show the lemma the WordNet lemmatizer returns for 'korean'
lem.lemmatize('korean')

'korean'

## 相似詞(Synonyms)

In [None]:
# Look up the WordNet synsets for 'love'; the result is a list of Synset
# objects covering noun and verb senses
synonyms = nltk.corpus.wordnet.synsets('love')
synonyms

[Synset('love.n.01'),
 Synset('love.n.02'),
 Synset('beloved.n.01'),
 Synset('love.n.04'),
 Synset('love.n.05'),
 Synset('sexual_love.n.02'),
 Synset('love.v.01'),
 Synset('love.v.02'),
 Synset('love.v.03'),
 Synset('sleep_together.v.01')]

In [None]:
# Definition text of the first synset found for 'love'
synonyms[0].definition()

'a strong positive emotion of regard and affection'

In [None]:
# Example phrases for the first synset found for 'love'
synonyms[0].examples()

['his love for his work', 'children need a lot of love']

## 相反詞(Antonyms)
#### 必須先呼叫 lemmas() 取得同義詞集(Synset)中的詞元(Lemma)，再對每個詞元呼叫 antonyms()

In [None]:
# Collect antonyms of 'ugly': antonyms are attached to the lemmas of a
# synset, so walk every lemma of every synset
antonyms = []
for synset in nltk.corpus.wordnet.synsets('ugly'):
    for lemma in synset.lemmas():
        opposites = lemma.antonyms()
        if not opposites:
            continue
        # Only the first antonym of each lemma is recorded
        antonyms.append(opposites[0].name())
antonyms

['beautiful']

## 詞性標籤(POS Tagging)
#### 依照句子結構，顯示每個單字的詞性。

In [None]:
# Part-of-speech tagging: tag each sentence's tokens and print the
# resulting (token, tag) pairs
text = 'I am a human being, capable of doing terrible things'
sentences = nltk.sent_tokenize(text)
for sent in sentences:
    sentence_tokens = nltk.word_tokenize(sent)
    tagged_tokens = nltk.pos_tag(sentence_tokens)
    print(tagged_tokens)

[('I', 'PRP'), ('am', 'VBP'), ('a', 'DT'), ('human', 'JJ'), ('being', 'VBG'), (',', ','), ('capable', 'JJ'), ('of', 'IN'), ('doing', 'VBG'), ('terrible', 'JJ'), ('things', 'NNS')]


### POS Tagging: https://baike.baidu.com/item/%E8%AF%8D%E6%80%A7%E6%A0%87%E6%B3%A8/2783103
### https://verbs.colorado.edu/chinese/posguide.3rd.ch.pdf