In [1]:
import nltk
from nltk.tag import pos_tag
# Download the "averaged_perceptron_tagger" NLTK data package
# (presumably the model that pos_tag relies on -- TODO confirm for the installed NLTK version).
nltk.download("averaged_perceptron_tagger")

# Example sentence, already split into word tokens.
tokenised_sent = ["their", "decision", "makes", "no", "economic", "sense"]

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\johns\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Part-of-speech tagging: pair every token with its tag.
pos_tagged = pos_tag(tokenised_sent)
print(pos_tagged)

[('their', 'PRP$'), ('decision', 'NN'), ('makes', 'VBZ'), ('no', 'DT'), ('economic', 'JJ'), ('sense', 'NN')]


In [3]:
# Stemming
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer  # three stemming algorithms
tokens = ["the", "spectators", "all", "stood", "and", "sang", "the", "national", "anthem"]

# Run every token through each stemmer in turn.
port = PorterStemmer()
stemmed_port = list(map(port.stem, tokens))

lanca = LancasterStemmer()
stemmed_lanca = list(map(lanca.stem, tokens))

snow = SnowballStemmer('english')
stemmed_snow = list(map(snow.stem, tokens))

# One result list per line, in the order Porter, Lancaster, Snowball.
print(stemmed_port, stemmed_lanca, stemmed_snow, sep="\n")
# Stemming strips word endings and keeps only the stem. That mechanism struggles
# with many inflected forms, including irregular past tense (e.g. sing => sang).

['the', 'spectat', 'all', 'stood', 'and', 'sang', 'the', 'nation', 'anthem']
['the', 'spect', 'al', 'stood', 'and', 'sang', 'the', 'nat', 'anthem']
['the', 'spectat', 'all', 'stood', 'and', 'sang', 'the', 'nation', 'anthem']


In [4]:

# Lemmatisation
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# Download the NLTK data packages named "wordnet" and "omw-1.4".
# NOTE(review): omw-1.4 is presumably needed by the WordNet reader in some NLTK
# releases -- confirm whether the installed version still requires it.
nltk.download('wordnet')
nltk.download('omw-1.4')

def form_speech_dict(pos):
    """Build a word -> tag lookup from a sequence of (word, tag) pairs.

    Args:
        pos: iterable of indexable items whose element 0 is the word and
            element 1 is its tag.

    Returns:
        dict mapping each word to its tag. When a word occurs more than
        once, the tag of its last occurrence is the one that is kept.
    """
    return {pair[0]: pair[1] for pair in pos}
def get_part_of_speech(pose):
    """Translate POS tags into WordNet part-of-speech constants, one per token.

    Args:
        pose: iterable of (word, tag) pairs, e.g. the list returned by pos_tag.

    Returns:
        A list aligned with ``pose``. Each entry is wordnet.ADJ, wordnet.VERB,
        wordnet.NOUN or wordnet.ADV when the token's tag starts with 'J', 'V',
        'N' or 'R' respectively, and None for every other tag.
    """
    # Keyed by the first character of the tag.
    prefix_to_wordnet = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Use each token's own tag. Looking the tag up by word (as a word -> tag
    # dict would) gives a repeated word the tag of its last occurrence at
    # every position, which is wrong when the same word is tagged differently.
    # tag[:1] is '' for an empty tag, which falls through to None.
    return [prefix_to_wordnet.get(pair[1][:1]) for pair in pose]
print(pos_tag(tokens))
lemmatiser = WordNetLemmatizer()
# First attempt: lemmatise every token without passing a part of speech.
lemmatised = list(map(lemmatiser.lemmatize, tokens))
print(lemmatised)
# The attempt above cannot recover every base form; passing the POS (below) can.
s = get_part_of_speech(pos_tag(tokens))
result_sen = []
for i, wn_pos in enumerate(s):
    if wn_pos is None:
        # No WordNet category for this tag: fall back to the plain call.
        result_sen.append(lemmatiser.lemmatize(tokens[i]))
        continue
    result_sen.append(lemmatiser.lemmatize(tokens[i], pos=wn_pos))
# NOTE(review): the result depends on the tagger -- in the recorded output
# 'stood' is tagged NN and therefore stays 'stood'.
print(result_sen)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\johns\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\johns\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


[('the', 'DT'), ('spectators', 'NNS'), ('all', 'DT'), ('stood', 'NN'), ('and', 'CC'), ('sang', 'VBD'), ('the', 'DT'), ('national', 'JJ'), ('anthem', 'NN')]
['the', 'spectator', 'all', 'stood', 'and', 'sang', 'the', 'national', 'anthem']
['the', 'spectator', 'all', 'stood', 'and', 'sing', 'the', 'national', 'anthem']
