In [1]:
!pip install -q nltk

WordNet 的維基百科說明： https://zh.wikipedia.org/wiki/WordNet

In [2]:
import nltk

In [3]:
# Download the 'wordnet' data package into the local nltk_data directory.
# The call returns True, which is what the cell displays.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...


True

In [5]:
from nltk.corpus import wordnet as wn

### 查詢car

In [6]:
# Look up every synset (sense) that the word 'car' belongs to.
wn.synsets('car')

[Synset('car.n.01'),
 Synset('car.n.02'),
 Synset('car.n.03'),
 Synset('car.n.04'),
 Synset('cable_car.n.01')]

這裡輸出了五個不同的synset (同義詞集合), 表示car 有五種不同的意思(不同的同義詞群組)

In [7]:
# Fetch one specific sense by its full synset name: lemma 'car', part of
# speech 'n' (noun), sense number 01.
car = wn.synset('car.n.01')

In [8]:
# Show the gloss (short dictionary definition) of this sense.
car.definition()

'a motor vehicle with four wheels; usually propelled by an internal combustion engine'

In [9]:
# Second noun sense of 'car', kept under its own name for comparison.
car_2 = wn.synset('car.n.02')

In [10]:
# Gloss of the second sense; the recorded output is the railway-car meaning.
car_2.definition()

'a wheeled vehicle adapted to the rails of railroad'

In [11]:
# List the lemma names grouped under this synset, i.e. the words WordNet
# treats as synonyms for this sense.
car.lemma_names()

['car', 'auto', 'automobile', 'machine', 'motorcar']

利用 lemma_names() 可以看出，WordNet 把 car、auto、automobile、machine、motorcar 歸在同一個 synset，也就是視為同義詞

### 接下來看 "上位詞"

In [12]:
# hypernym_paths() yields one list per path; show only the first. In the
# recorded output the path starts at 'entity.n.01' and ends at 'car.n.01'.
car.hypernym_paths()[0]

[Synset('entity.n.01'),
 Synset('physical_entity.n.01'),
 Synset('object.n.01'),
 Synset('whole.n.02'),
 Synset('artifact.n.01'),
 Synset('instrumentality.n.03'),
 Synset('container.n.01'),
 Synset('wheeled_vehicle.n.01'),
 Synset('self-propelled_vehicle.n.01'),
 Synset('motor_vehicle.n.01'),
 Synset('car.n.01')]

愈往上，愈抽象；愈往下，愈具體。

接下來看 WordNet 的詞義相似度

In [13]:
# Same lookup as the earlier `car` assignment; re-binds the name to the
# identical 'car.n.01' synset before the similarity comparisons.
car = wn.synset('car.n.01')

In [14]:
# First noun sense of 'novel', used as a comparison target for `car`.
novel = wn.synset('novel.n.01')

In [15]:
# First noun sense of 'dog', used as a comparison target for `car`.
dog = wn.synset('dog.n.01')

In [16]:
# First noun sense of 'motorcycle', used as a comparison target for `car`.
motorcycle = wn.synset('motorcycle.n.01')

In [17]:
# Path similarity scores two senses by the shortest path connecting them in
# the hypernym/hyponym taxonomy; higher means closer. car vs. novel scores low.
car.path_similarity(novel)

0.05555555555555555

In [18]:
# car vs. dog: in the recorded output this is slightly higher than car vs. novel.
car.path_similarity(dog)

0.07692307692307693

In [19]:
# car vs. motorcycle: the highest score among the comparisons recorded here.
car.path_similarity(motorcycle)

0.3333333333333333

In [20]:
# First noun sense of 'cat', another comparison target for `car`.
cat = wn.synset('cat.n.01')

In [21]:
# car vs. cat: the recorded score equals the car vs. novel score.
car.path_similarity(cat)

0.05555555555555555

In [22]:
# Check whether WordNet knows the word "covid-19".
# - A hyphen is not allowed in a Python identifier (`covid-19` parses as the
#   subtraction `covid - 19`), so the result is bound to a valid name.
# - wn.synset() expects a full 'lemma.pos.nn' name; wn.synsets() takes a plain
#   word and returns a list, which is empty when WordNet has no entry for it.
covid_19_synsets = wn.synsets('covid-19')
covid_19_synsets

[]

### 分詞

In [23]:
from nltk.tokenize import word_tokenize

In [24]:
# Sample sentence for the tokenizing, tagging and stemming cells below.
# Source: https://www.space.com/norad-tracks-santa-claus-trip-to-international-space-station
string = "NORAD regularly tracks Santa's trip around the world each Christmas, but this year is a bit different."

In [25]:
# Download the 'punkt' tokenizer data before tokenizing.
# NOTE(review): newer NLTK releases reportedly load 'punkt_tab' instead of
# 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [26]:
# Split the sentence into word tokens. In the recorded output punctuation
# becomes separate tokens and "Santa's" is split into 'Santa' and "'s".
word_tokenize(string)

['NORAD',
 'regularly',
 'tracks',
 'Santa',
 "'s",
 'trip',
 'around',
 'the',
 'world',
 'each',
 'Christmas',
 ',',
 'but',
 'this',
 'year',
 'is',
 'a',
 'bit',
 'different',
 '.']

### 詞性

##### 詞性說明: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [27]:
from nltk import pos_tag

In [28]:
# Download the 'averaged_perceptron_tagger' data before calling pos_tag.
# NOTE(review): newer NLTK releases may expect
# 'averaged_perceptron_tagger_eng' — confirm against the installed version.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [29]:
# Tag each token with its part of speech; the result is a list of
# (token, tag) pairs using the Penn Treebank tags linked above.
pos_tag(word_tokenize(string))

[('NORAD', 'NNP'),
 ('regularly', 'RB'),
 ('tracks', 'VBZ'),
 ('Santa', 'NNP'),
 ("'s", 'POS'),
 ('trip', 'NN'),
 ('around', 'IN'),
 ('the', 'DT'),
 ('world', 'NN'),
 ('each', 'DT'),
 ('Christmas', 'NNP'),
 (',', ','),
 ('but', 'CC'),
 ('this', 'DT'),
 ('year', 'NN'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('bit', 'RB'),
 ('different', 'JJ'),
 ('.', '.')]

### 做 stemming

In [30]:
from nltk.stem.porter import PorterStemmer

In [31]:
# Create one Porter stemmer instance, reused by the stemming cells below.
porter = PorterStemmer()

In [32]:
# Stem every token of the sample sentence. Stems are not always real words:
# the recorded output contains 'regularli', 'christma' and 'thi'.
list(map(porter.stem, word_tokenize(string)))

['norad',
 'regularli',
 'track',
 'santa',
 "'s",
 'trip',
 'around',
 'the',
 'world',
 'each',
 'christma',
 ',',
 'but',
 'thi',
 'year',
 'is',
 'a',
 'bit',
 'differ',
 '.']

In [33]:
# Tag the stemmed tokens instead of the original ones. Compared with the
# unstemmed run, several tags change in the recorded output (for example
# 'tracks' was VBZ, while its stem 'track' is tagged NN).
pos_tag(list(map(porter.stem, word_tokenize(string))))

[('norad', 'JJ'),
 ('regularli', 'NN'),
 ('track', 'NN'),
 ('santa', 'NN'),
 ("'s", 'POS'),
 ('trip', 'NN'),
 ('around', 'IN'),
 ('the', 'DT'),
 ('world', 'NN'),
 ('each', 'DT'),
 ('christma', 'NN'),
 (',', ','),
 ('but', 'CC'),
 ('thi', 'JJ'),
 ('year', 'NN'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('bit', 'NN'),
 ('differ', 'NN'),
 ('.', '.')]

### 分句

In [34]:
# Two-sentence sample for sentence splitting. The second sentence contains the
# abbreviation "Dec." whose period must not be treated as a sentence end.
strs = "NORAD regularly tracks Santa's trip around the world each Christmas, but this year is a bit different. On Wednesday (Dec. 23), the Federal Aviation Administration gave Santa and his reindeer-powered sleigh an official commercial space license for launches and landings"

In [35]:
from nltk.tokenize import sent_tokenize

In [36]:
# Split the text into sentences. The recorded output has exactly two
# sentences, so the period in "Dec." was not taken as a boundary.
sent_tokenize(strs)

["NORAD regularly tracks Santa's trip around the world each Christmas, but this year is a bit different.",
 'On Wednesday (Dec. 23), the Federal Aviation Administration gave Santa and his reindeer-powered sleigh an official commercial space license for launches and landings']