In [1]:
from nltk.stem import WordNetLemmatizer 
from nltk.stem.porter import PorterStemmer
import nltk

## Create the stemmer (shared by the stemming cells below)
ps = PorterStemmer()

## Create the lemmatizer (shared by the lemmatization cells below)
lemmatizer = WordNetLemmatizer() 

In [2]:
# Fetch the WordNet corpus data (the lemmatizer created above is WordNet-based).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/jiaping/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

---
### Example

In [3]:
# Run the same word through both tools; the lemmatizer is told to treat it as a verb (pos='v').
print('Stemming amusing : {}'.format(ps.stem('amusing')))
print('lemmatization amusing : {}'.format(lemmatizer.lemmatize('amusing',pos = 'v')))

Stemming amusing : amus
lemmatization amusing : amuse


---
### Tokenize + Stemming 提取每個單詞的詞幹

In [4]:
# Define the sentence to be stemmed
sentence = "The striped bats are hanging on their feet for best"

# Tokenize: split the sentence into a list of word tokens
# NOTE(review): word_tokenize relies on NLTK tokenizer data that no cell in this notebook
# downloads — confirm it is installed (e.g. nltk.download('punkt')) before a fresh run.
word_list = nltk.word_tokenize(sentence)
print(word_list)

['The', 'striped', 'bats', 'are', 'hanging', 'on', 'their', 'feet', 'for', 'best']


In [7]:
# Stem every token and glue the stems back together with single spaces.
stemming_output = ' '.join(map(ps.stem, word_list))
print(stemming_output)

the stripe bat are hang on their feet for best


### Tokenize + Lemmatize 提取每個單詞的 Lemma

In [8]:
# Define the sentence to be lemmatized
# NOTE(review): this cell repeats the tokenization cell of the stemming section verbatim;
# `word_list` from that cell already holds the same tokens.
sentence = "The striped bats are hanging on their feet for best"

# Tokenize: split the sentence into a list of word tokens
word_list = nltk.word_tokenize(sentence)
print(word_list)

['The', 'striped', 'bats', 'are', 'hanging', 'on', 'their', 'feet', 'for', 'best']


In [9]:
# Lemmatize each token without passing a POS argument, then join the results with spaces.
lemmatized_output = ' '.join(map(lemmatizer.lemmatize, word_list))
print(lemmatized_output)
#> The striped bat are hanging on their foot for best

The striped bat are hanging on their foot for best


---
### 有時單詞的 Lemma 會隨著詞性而有所改變

In [10]:
# Same word, two POS hints: the lemma differs depending on the part of speech passed in.
print('lemmatization amusing : {}'.format(lemmatizer.lemmatize('amusing',pos = 'v'))) ## verb
print('lemmatization amusing : {}'.format(lemmatizer.lemmatize('amusing',pos = 'a'))) ## adjective

lemmatization amusing : amuse
lemmatization amusing : amusing


---
### pos_tag + Lemmatize 提取每個單詞的 Lemma

In [12]:
# Fetch the averaged-perceptron tagger model (presumably what nltk.pos_tag uses below — confirm).
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jiaping/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [15]:
# Pitfall demo: passing a bare string makes pos_tag tag each character separately
# (see the per-letter output below). The next cell shows the correct call.
word = 'using'
tag = nltk.pos_tag(word)
tag

[('u', 'JJ'), ('s', 'NN'), ('i', 'NN'), ('n', 'VBP'), ('g', 'NN')]

In [16]:
# Correct usage: wrap the word in a list so it is tagged as one token.
word = 'using'
tag = nltk.pos_tag([word]) # the input must be a list of tokens
tag

[('using', 'VBG')]

In [20]:
# Drill into the tagger result: (word, tag) pair -> tag string -> first letter of the tag.
print(tag[0])
print(tag[0][1])
print(tag[0][1][0]) # grab the first letter, which identifies the part of speech

('using', 'VBG')
VBG
V


In [22]:
from nltk.corpus import wordnet
# Lookup table: first letter of a pos_tag tag -> the POS constant WordNet expects.
tag_dict = dict(
    J=wordnet.ADJ,
    N=wordnet.NOUN,
    V=wordnet.VERB,
    R=wordnet.ADV,
)

In [23]:
# Display the mapping; the output shows the wordnet constants are single-letter strings.
tag_dict

{'J': 'a', 'N': 'n', 'V': 'v', 'R': 'r'}

In [28]:
# Look up the tag's first letter in tag_dict; letters that are not keys fall back to
# wordnet.NOUN. Equivalent to:
#   tag_dict[letter] if letter in tag_dict else wordnet.NOUN

tag_dict.get(tag[0][1][0], wordnet.NOUN)

'v'

In [29]:
# Lemmatize with POS Tag
def get_wordnet_pos(word):
    """Map the pos_tag result for ``word`` to the POS format the lemmatizer accepts.

    The word is tagged on its own (as a one-element token list), so the tagger
    sees no surrounding sentence. Only the first letter of the returned tag is
    used to pick the WordNet constant.

    Args:
        word: A single token to tag.

    Returns:
        ``wordnet.ADJ``, ``wordnet.VERB`` or ``wordnet.ADV`` when the tag starts
        with "J", "V" or "R" respectively; ``wordnet.NOUN`` for every other tag
        (including tags that start with "N").
    """
    # pos_tag returns a list of (token, tag) pairs; there is exactly one token here.
    tagged = nltk.pos_tag([word])
    penn_tag = tagged[0][1]
    initial = penn_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and any unmapped letter both resolve to noun.
    return wordnet.NOUN

In [30]:
# Lemmatize a single word, passing the POS that get_wordnet_pos derives for it.
word = 'using'
print(lemmatizer.lemmatize(word, get_wordnet_pos(word)))

use


### Lemmatize 字串中每個單詞並加入 POS tag

In [33]:
sentence = "The striped bats are hanging on their feet for best"

# Tokenize the sentence, then lemmatize each token with the POS looked up for that token.
print([
    lemmatizer.lemmatize(token, pos=get_wordnet_pos(token))
    for token in nltk.word_tokenize(sentence)
])

['The', 'strip', 'bat', 'be', 'hang', 'on', 'their', 'foot', 'for', 'best']
