In [5]:
from nltk.stem import WordNetLemmatizer 
from nltk.stem.porter import PorterStemmer
import nltk
# Fetch the NLTK data packages this notebook relies on ('wordnet' and 'punkt').
for package in ('wordnet', 'punkt'):
    nltk.download(package)


## resource: https://www.machinelearningplus.com/nlp/lemmatization-examples-python/

## Shared lemmatizer and stemmer instances reused by the cells below
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### Example: stemming vs. lemmatization of a single word

In [3]:
# Run the same word through the stemmer and through the lemmatizer (as a verb)
# so the two results can be compared side by side.
target = 'amusing'
stem_result = stemmer.stem(target)
print('Stemming amusing : {}'.format(stem_result))
lemma_result = lemmatizer.lemmatize(target, pos='v')
print('lemmatization amusing : {}'.format(lemma_result))

Stemming amusing : amus
lemmatization amusing : amuse


### 運用tokenize技巧結合stemming提取每個單詞的詞幹

In [8]:
# Define the sentence to be stemmed
sentence = "We went out often, hiding from sight, desperately searching for food."

# Tokenize: split the sentence into word and punctuation tokens
word_list = nltk.word_tokenize(sentence)
print(word_list)
#> ['We', 'went', 'out', 'often', ',', 'hiding', 'from', 'sight', ',', 'desperately', 'searching', 'for', 'food', '.']

# Stem every token with the PorterStemmer instance (`stemmer`) created in the
# first cell, then join the stems back together with single spaces.
stemming_output = ' '.join([stemmer.stem(w) for w in word_list])
print(stemming_output)
#> We went out often , hide from sight , desper search for food .

['We', 'went', 'out', 'often', ',', 'hiding', 'from', 'sight', ',', 'desperately', 'searching', 'for', 'food', '.']
We went out often , hide from sight , desper search for food .


### 運用tokenize技巧結合lemmatize提取每個單詞的lemma

In [9]:
# Sentence whose tokens will be lemmatized
sentence = "We went out often, hiding from sight, desperately searching for food."

# Break the sentence into word and punctuation tokens
word_list = nltk.word_tokenize(sentence)
print(word_list)
#> ['We', 'went', 'out', 'often', ',', 'hiding', 'from', 'sight', ',', 'desperately', 'searching', 'for', 'food', '.']

# Lemmatize token by token and glue the results back together with spaces.
# No POS argument is passed here; note that the recorded output leaves
# 'went' and 'hiding' unchanged.
lemmas = map(lemmatizer.lemmatize, word_list)
lemmatized_output = ' '.join(lemmas)
print(lemmatized_output)
#> We went out often , hiding from sight , desperately searching for food .

['We', 'went', 'out', 'often', ',', 'hiding', 'from', 'sight', ',', 'desperately', 'searching', 'for', 'food', '.']
We went out often , hiding from sight , desperately searching for food .


### 有時單詞的lemma會隨著詞性而有所改變

In [10]:
# Same word, two part-of-speech hints: the lemma returned depends on the POS.
for pos_code in ('v', 'a'):  # 'v' = verb, 'a' = adjective
    print('lemmatization amusing : {}'.format(lemmatizer.lemmatize('amusing', pos=pos_code)))

lemmatization amusing : amuse
lemmatization amusing : amusing


### 運用pos_tag技巧結合lemmatize提取每個單詞的lemma

In [14]:
# Download the NLTK 'averaged_perceptron_tagger' package. Presumably this is the
# model that nltk.pos_tag relies on in the following cells -- TODO confirm.
# As the last expression of the cell, the call's return value is displayed.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [15]:
# Lemmatize with POS Tag
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map the ``nltk.pos_tag`` result for ``word`` to a WordNet POS constant.

    The word is tagged on its own (as a one-token list) and only the first
    letter of the resulting tag, upper-cased, is used for the lookup:
    ``J`` -> ``wordnet.ADJ``, ``N`` -> ``wordnet.NOUN``,
    ``V`` -> ``wordnet.VERB``, ``R`` -> ``wordnet.ADV``.

    Args:
        word: A single token to tag.

    Returns:
        The matching ``wordnet`` POS constant, or ``wordnet.NOUN`` when the
        tag's first letter is not one of the four handled above.
    """
    # pos_tag returns a list of (token, tag) pairs; we passed exactly one token.
    _token, pos_label = nltk.pos_tag([word])[0]
    initial = pos_label[0].upper()

    wordnet_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    if initial in wordnet_by_initial:
        return wordnet_by_initial[initial]
    # Unhandled tag initials fall back to noun.
    return wordnet.NOUN


In [16]:
# Look up the POS for a single word first, then lemmatize with that hint.
word = 'using'
word_pos = get_wordnet_pos(word)
print(lemmatizer.lemmatize(word, word_pos))

use


### Lemmatize 字串中每個單詞並加入 POS tag

In [17]:
sentence = "We went out often, hiding from sight, desperately searching for food."
# Tokenize once, then lemmatize each token using the POS looked up for that token alone.
tokens = nltk.word_tokenize(sentence)
pos_aware_lemmas = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
print(pos_aware_lemmas)
#> ['We', 'go', 'out', 'often', ',', 'hiding', 'from', 'sight', ',', 'desperately', 'search', 'for', 'food', '.']

['We', 'go', 'out', 'often', ',', 'hiding', 'from', 'sight', ',', 'desperately', 'search', 'for', 'food', '.']
