In [1]:
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
from collections import Counter, defaultdict
from more_itertools import unique_everseen
import re

In [3]:
'''WordNet is a semantically-oriented dictionary of English, similar to a traditional thesaurus but 
with a richer structure. NLTK includes the English WordNet, with 155,287 words and 117,659 synonym sets. 
We'll begin by looking at synonyms and how they are accessed in WordNet.'''

print(wn.synsets('car'))


[Synset('car.n.01'), Synset('car.n.02'), Synset('car.n.03'), Synset('car.n.04'), Synset('cable_car.n.01')]


In [4]:
for synset in wn.synsets('car'):
    print(synset.lemma_names())

['car', 'auto', 'automobile', 'machine', 'motorcar']
['car', 'railcar', 'railway_car', 'railroad_car']
['car', 'gondola']
['car', 'elevator_car']
['cable_car', 'car']


In [5]:
wn.synset('car.n.01').definition()

'a motor vehicle with four wheels; usually propelled by an internal combustion engine'

In [6]:
wn.synset('car.n.01').examples()

['he needs a car to get to work']

In [7]:
'''Navigate between concepts with hyponyms'''
car = wn.synset('car.n.01')
types_of_cars = car.hyponyms()

# Flatten the lemma names of every hyponym synset into one list, then show
# them alphabetically.
hyponym_lemma_names = []
for hyponym in types_of_cars:
    hyponym_lemma_names.extend(lemma.name() for lemma in hyponym.lemmas())
print(sorted(hyponym_lemma_names))

['Model_T', 'S.U.V.', 'SUV', 'Stanley_Steamer', 'ambulance', 'beach_waggon', 'beach_wagon', 'bus', 'cab', 'compact', 'compact_car', 'convertible', 'coupe', 'cruiser', 'electric', 'electric_automobile', 'electric_car', 'estate_car', 'gas_guzzler', 'hack', 'hardtop', 'hatchback', 'heap', 'horseless_carriage', 'hot-rod', 'hot_rod', 'jalopy', 'jeep', 'landrover', 'limo', 'limousine', 'loaner', 'minicar', 'minivan', 'pace_car', 'patrol_car', 'phaeton', 'police_car', 'police_cruiser', 'prowl_car', 'race_car', 'racer', 'racing_car', 'roadster', 'runabout', 'saloon', 'secondhand_car', 'sedan', 'sport_car', 'sport_utility', 'sport_utility_vehicle', 'sports_car', 'squad_car', 'station_waggon', 'station_wagon', 'stock_car', 'subcompact', 'subcompact_car', 'taxi', 'taxicab', 'tourer', 'touring_car', 'two-seater', 'used-car', 'waggon', 'wagon']


In [8]:
'''You can also navigate up the hierarchy by visiting hypernyms. 
Some words have multiple paths, because they can be classified in more than one way. 
There are two paths between car.n.01 and entity.n.01 because 
wheeled_vehicle.n.01 can be classified as both a vehicle and a container.'''

print(car.hypernyms())
paths = car.hypernym_paths()
len(paths)


[Synset('motor_vehicle.n.01')]


2

In [9]:
print([synset.name() for synset in paths[0]])
print('---------------------------------------')
print([synset.name() for synset in paths[1]])

['entity.n.01', 'physical_entity.n.01', 'object.n.01', 'whole.n.02', 'artifact.n.01', 'instrumentality.n.03', 'container.n.01', 'wheeled_vehicle.n.01', 'self-propelled_vehicle.n.01', 'motor_vehicle.n.01', 'car.n.01']
---------------------------------------
['entity.n.01', 'physical_entity.n.01', 'object.n.01', 'whole.n.02', 'artifact.n.01', 'instrumentality.n.03', 'conveyance.n.03', 'vehicle.n.01', 'wheeled_vehicle.n.01', 'self-propelled_vehicle.n.01', 'motor_vehicle.n.01', 'car.n.01']


In [10]:
'''We can get the most general hypernyms (or root hypernyms) of a synset as follows'''
car.root_hypernyms()

[Synset('entity.n.01')]

In [11]:
'''More Lexical Relations'''
wn.synset('tree.n.01').part_meronyms()

[Synset('burl.n.02'),
 Synset('crown.n.07'),
 Synset('limb.n.02'),
 Synset('stump.n.01'),
 Synset('trunk.n.01')]

In [12]:
#Meronyms
wn.synset('tree.n.01').substance_meronyms()

[Synset('heartwood.n.01'), Synset('sapwood.n.01')]

In [13]:
#Holonyms
wn.synset('tree.n.01').member_holonyms()

[Synset('forest.n.01')]

In [14]:
for synset in wn.synsets('mint', wn.NOUN):
    print(synset.name() + ':', synset.definition())

batch.n.02: (often followed by `of') a large number or amount or extent
mint.n.02: any north temperate plant of the genus Mentha with aromatic leaves and small mauve flowers
mint.n.03: any member of the mint family of plants
mint.n.04: the leaves of a mint plant used fresh or candied
mint.n.05: a candy that is flavored with a mint oil
mint.n.06: a plant where money is coined by authority of the government


In [15]:
print(wn.synset('mint.n.04').part_holonyms())
print(wn.synset('mint.n.04').substance_holonyms())

[Synset('mint.n.02')]
[Synset('mint.n.05')]


In [16]:
#Entailment
print(wn.synset('walk.v.01').entailments())
#print(wn.synset('eat.v.01').entailments())

[Synset('step.v.01')]


In [17]:
#Antonyms
print(wn.lemma('supply.n.02.supply').antonyms())
print(wn.lemma('rush.v.01.rush').antonyms())
print(wn.lemma('horizontal.a.01.horizontal').antonyms())
print(wn.lemma('staccato.r.01.staccato').antonyms())


[Lemma('demand.n.02.demand')]
[Lemma('linger.v.04.linger')]
[Lemma('inclined.a.02.inclined'), Lemma('vertical.a.01.vertical')]
[Lemma('legato.r.01.legato')]


In [18]:
'''Semantic Similarity'''
right = wn.synset('right_whale.n.01')
orca = wn.synset('orca.n.01')
minke = wn.synset('minke_whale.n.01')
tortoise = wn.synset('tortoise.n.01')
novel = wn.synset('novel.n.01')

# Compare the right whale against increasingly unrelated concepts and show,
# for each one, the most specific ancestor the two synsets share.
comparisons = (
    ("Minke hypernym:", minke),
    ("Orca: hypernym: ", orca),
    ("Tortoise hypernym: ", tortoise),
    ("Novel hypernym: ", novel),
)
for label, other in comparisons:
    print(label)
    print(right.lowest_common_hypernyms(other))

Minke hypernym:
[Synset('baleen_whale.n.01')]
Orca: hypernym: 
[Synset('whale.n.02')]
Tortoise hypernym: 
[Synset('vertebrate.n.01')]
Novel hypernym: 
[Synset('entity.n.01')]


In [19]:
# Synset depth: show min_depth() for baleen_whale.n.01, the lowest common
# hypernym printed above for the right whale / minke whale pair. This is the
# only live expression, so it is the value the cell displays.
wn.synset('baleen_whale.n.01').min_depth()

# The other common hypernyms printed above can be inspected the same way;
# uncomment one (as the last expression) to display its depth instead.
#wn.synset('whale.n.02').min_depth()

#wn.synset('vertebrate.n.01').min_depth()

#wn.synset('entity.n.01').min_depth()


14

In [20]:
'''Similarity Scores'''
# Path similarity between the right whale and the minke whale; this is the
# only live expression, so it is the value the cell displays.
right.path_similarity(minke)

# Same measure against the other synsets defined earlier (orca, tortoise,
# novel); uncomment one to display its score instead.
# right.path_similarity(orca)

# right.path_similarity(tortoise)

# right.path_similarity(novel)

0.25

In [21]:
'''Example: Using WordNet to find organisms in biology texts'''

senses = (wn.synsets('plant', pos="n"))
print(senses)

[Synset('plant.n.01'), Synset('plant.n.02'), Synset('plant.n.03'), Synset('plant.n.04')]


In [22]:
# Pull out the four noun senses of "plant" and their dictionary definitions.
s1, s2, s3, s4 = senses[0], senses[1], senses[2], senses[3]
d1, d2, d3, d4 = s1.definition(), s2.definition(), s3.definition(), s4.definition()

# Print the definitions numbered from 1, in sense order.
for sense_number, sense_definition in enumerate((d1, d2, d3, d4), start=1):
    print("Def " + str(sense_number) + ": " + str(sense_definition))

Def 1: buildings for carrying on industrial labor
Def 2: (botany) a living organism lacking the power of locomotion
Def 3: an actor situated in the audience whose acting is rehearsed but seems spontaneous to the audience
Def 4: something planted secretly for discovery by another


In [24]:
#Load text
def readMe(file):
    """Return the entire contents of a text file as one string.

    Parameters
    ----------
    file : str or path-like
        Path of the file to read. It is opened in text mode ('r') with the
        platform's default encoding, exactly as before.

    Returns
    -------
    str
        The full text of the file.

    Raises
    ------
    OSError
        If the file cannot be opened (for example, it does not exist).
    """
    # The context manager closes the handle as soon as the read finishes,
    # even if read() raises; previously the handle was never closed.
    with open(file, 'r') as handle:
        return handle.read()

# NOTE(review): absolute, machine-specific path -- this cell only runs on the
# original author's machine. Consider a path relative to a configurable data
# directory so the notebook survives Restart & Run All elsewhere.
text = readMe("/Users/hclent/Desktop/NLTK-tutorial/texts/18952863_4.txt")
#print(fcorpus)


In [25]:
def formatCorpus(corpus):
    """Lower-case, tokenize and stop-word-filter a raw text.

    Parameters
    ----------
    corpus : str
        The raw document text.

    Returns
    -------
    list of str
        The tokens produced by ``nltk.word_tokenize`` on the lower-cased
        text, in document order, with English stop words removed. Every
        underscore in the input is first replaced by the literal word
        "underscore".
    """
    # Per the original author's note: underscores cause problems for a
    # downstream buildDict step and are unimportant for this NER.
    corpus = corpus.replace("_", "underscore")

    tokens = nltk.word_tokenize(corpus.lower())

    # Build the stop-word collection once as a set so each membership test is
    # a hash lookup instead of a scan over the whole list. The local name is
    # deliberately not `stopwords`, to avoid shadowing the corpus reader
    # imported under that name at the top of the notebook.
    english_stopwords = set(nltk.corpus.stopwords.words('english'))

    # .lower() is kept from the original filter; the text is already
    # lower-cased before tokenizing, so this is only a guard.
    return [token for token in tokens if token.lower() not in english_stopwords]

untagged_corpus = formatCorpus(text)
#print(untagged_corpus)


In [26]:
#Use WordNet to find organisms
def wordNetNER(document, max_senses=3, min_similarity=0.2):
    """Pick out the tokens of a document that WordNet relates to living organisms.

    A token is kept when at least one of its first ``max_senses`` noun
    synsets (a) has a definition containing "plant(s)", "organism(s)" or
    "animal(s)" followed by whitespace, and (b) has a path similarity of at
    least ``min_similarity`` to the botanical sense of "plant".

    Parameters
    ----------
    document : iterable of str
        Tokens to examine, e.g. the output of ``formatCorpus``.
    max_senses : int, optional
        How many noun synsets to inspect per token (default 3, the value that
        was previously hard-coded).
    min_similarity : float, optional
        Minimum path similarity to the reference synset (default 0.2, the
        value that was previously hard-coded).

    Returns
    -------
    list of str
        The matching tokens, de-duplicated, in order of first appearance.
    """
    # Reference concept: the second noun sense of "plant", i.e.
    # "(botany) a living organism lacking the power of locomotion". #hardcoded
    plant = wn.synsets('plant', pos="n")[1]

    # One pattern, compiled once, equivalent to searching for any of the three
    # original patterns. A raw string is used because '\s' in a normal string
    # literal is an invalid escape sequence that newer Pythons warn about.
    organism_pattern = re.compile(r'(?:plant|organism|animal)s?\s')

    def is_organism_sense(sense):
        # Cheap textual filter first; only then compute the similarity.
        if not organism_pattern.search(sense.definition()):
            return False
        score = sense.path_similarity(plant)
        # NLTK documents that path_similarity may return None when no path
        # connects the two synsets; comparing None with a float would raise
        # TypeError, so treat it as "not similar".
        return score is not None and score >= min_similarity

    # De-duplicate before querying WordNet so each distinct token is looked up
    # once; the result (unique tokens in first-seen order) is unchanged.
    # Slicing replaces the old index-and-catch-IndexError loop.
    return [
        word
        for word in unique_everseen(document)
        if any(is_organism_sense(sense)
               for sense in wn.synsets(word, pos="n")[:max_senses])
    ]

wordnet_names = wordNetNER(untagged_corpus)


In [27]:
print("WORDNET: ")
print(wordnet_names)

WORDNET: 
['lupin', 'legumes', 'bacteria', 'clones', 'simple', 'legume', 'plant', 'bean', 'cell', 'cells', 'crops', 'plants', 'clone', 'peanut', 'recombinant', 'lupins', 'eukaryotes', 'diploid', 'animal']
