In [1]:
from nltk.corpus import wordnet as wn

In [2]:
wn.synsets('dog')

[Synset('dog.n.01'),
 Synset('frump.n.01'),
 Synset('dog.n.03'),
 Synset('cad.n.01'),
 Synset('frank.n.02'),
 Synset('pawl.n.01'),
 Synset('andiron.n.01'),
 Synset('chase.v.01')]

In [3]:
dog = wn.synsets('dog')[0]

In [8]:
dog.lemma_names()

[u'dog', u'domestic_dog', u'Canis_familiaris']

In [16]:
def flatten(l):
    return [item for sublist in l for item in sublist]

def lemmatize(syn):
    return syn.lemma_names()

In [11]:
[syn.lemma_names() for syn in dog.hypernyms()]

[[u'canine', u'canid'], [u'domestic_animal', u'domesticated_animal']]

In [12]:
dog.hyponyms()

[Synset('basenji.n.01'),
 Synset('corgi.n.01'),
 Synset('cur.n.01'),
 Synset('dalmatian.n.02'),
 Synset('great_pyrenees.n.01'),
 Synset('griffon.n.02'),
 Synset('hunting_dog.n.01'),
 Synset('lapdog.n.01'),
 Synset('leonberg.n.01'),
 Synset('mexican_hairless.n.01'),
 Synset('newfoundland.n.01'),
 Synset('pooch.n.01'),
 Synset('poodle.n.01'),
 Synset('pug.n.01'),
 Synset('puppy.n.01'),
 Synset('spitz.n.01'),
 Synset('toy_dog.n.01'),
 Synset('working_dog.n.01')]

In [17]:
flatten([lemmatize(syn) for syn in dog.hyponyms()])

[u'basenji',
 u'corgi',
 u'Welsh_corgi',
 u'cur',
 u'mongrel',
 u'mutt',
 u'dalmatian',
 u'coach_dog',
 u'carriage_dog',
 u'Great_Pyrenees',
 u'griffon',
 u'Brussels_griffon',
 u'Belgian_griffon',
 u'hunting_dog',
 u'lapdog',
 u'Leonberg',
 u'Mexican_hairless',
 u'Newfoundland',
 u'Newfoundland_dog',
 u'pooch',
 u'doggie',
 u'doggy',
 u'barker',
 u'bow-wow',
 u'poodle',
 u'poodle_dog',
 u'pug',
 u'pug-dog',
 u'puppy',
 u'spitz',
 u'toy_dog',
 u'toy',
 u'working_dog']

In [23]:
print dog.member_holonyms()
print dog.substance_holonyms()
print dog.part_holonyms()

[Synset('canis.n.01'), Synset('pack.n.06')]
[]
[]


In [24]:
print dog.member_meronyms()
print dog.substance_meronyms()
print dog.part_meronyms()

[]
[]
[Synset('flag.n.07')]


In [26]:
print dog.similar_tos()
print dog.topic_domains()
print dog.region_domains()

[]
[]
[]


In [55]:
# ALL related wordnet words

def get_related_wordnet_lemmas(syn):
    '''
    Given a synset 'syn', return all lemmas of that synset, as well as all lemmas of *related* synsets
    '''  

    relatedness_methods = ['hypernyms','instance_hypernyms','hyponyms','instance_hyponyms','member_holonyms','substance_holonyms','part_holonyms','member_meronyms','substance_meronyms','part_meronyms','topic_domains','region_domains','usage_domains','attributes','entailments','causes','also_sees','verb_groups','similar_tos']
    
    lemmas = set(lemmatize(syn))
    for method in relatedness_methods:
        related_synsets = getattr(syn, method)()
        lemmas.update(flatten([lemmatize(s) for s in related_synsets]))
        
    return lemmas

In [56]:
get_related_wordnet_lemmas(dog)

{u'Belgian_griffon',
 u'Brussels_griffon',
 u'Canis',
 u'Canis_familiaris',
 u'Great_Pyrenees',
 u'Leonberg',
 u'Mexican_hairless',
 u'Newfoundland',
 u'Newfoundland_dog',
 u'Welsh_corgi',
 u'barker',
 u'basenji',
 u'bow-wow',
 u'canid',
 u'canine',
 u'carriage_dog',
 u'coach_dog',
 u'corgi',
 u'cur',
 u'dalmatian',
 u'dog',
 u'doggie',
 u'doggy',
 u'domestic_animal',
 u'domestic_dog',
 u'domesticated_animal',
 u'flag',
 u'genus_Canis',
 u'griffon',
 u'hunting_dog',
 u'lapdog',
 u'mongrel',
 u'mutt',
 u'pack',
 u'pooch',
 u'poodle',
 u'poodle_dog',
 u'pug',
 u'pug-dog',
 u'puppy',
 u'spitz',
 u'toy',
 u'toy_dog',
 u'working_dog'}

In [57]:
sad = wn.synsets('sad')[0]
get_related_wordnet_lemmas(sad)

{u'bittersweet',
 u'doleful',
 u'heavyhearted',
 u'melancholic',
 u'melancholy',
 u'mournful',
 u'pensive',
 u'sad',
 u'tragic',
 u'tragical',
 u'tragicomic',
 u'tragicomical',
 u'wistful'}

In [122]:
sad = wn.synsets('hungry')[0]
get_related_wordnet_lemmas(sad)

{u'empty',
 u'empty-bellied',
 u'esurient',
 u'famished',
 u'hungry',
 u'peckish',
 u'ravenous',
 u'sharp-set',
 u'starved',
 u'supperless'}

In [58]:
monster = wn.synsets('monster')[0]
get_related_wordnet_lemmas(monster)

{u'bogeyman',
 u'booger',
 u'boogeyman',
 u'bugaboo',
 u'bugbear',
 u'imaginary_being',
 u'imaginary_creature',
 u'monster',
 u'mythical_creature',
 u'mythical_monster'}

In [59]:
rhino = wn.synsets('rhino')[0]
get_related_wordnet_lemmas(rhino)

{u'Ceratotherium_simum',
 u'Diceros_bicornis',
 u'Diceros_simus',
 u'Indian_rhinoceros',
 u'Rhinoceros_antiquitatis',
 u'Rhinoceros_unicornis',
 u'Rhinocerotidae',
 u'black_rhinoceros',
 u'family_Rhinocerotidae',
 u'odd-toed_ungulate',
 u'perissodactyl',
 u'perissodactyl_mammal',
 u'rhino',
 u'rhinoceros',
 u'rhinoceros_family',
 u'white_rhinoceros',
 u'woolly_rhinoceros'}

In [60]:
hotel = wn.synsets('hotel')[0]
get_related_wordnet_lemmas(hotel)

{u'Ritz',
 u'auberge',
 u'building',
 u'court',
 u'edifice',
 u'fleabag',
 u'holiday_resort',
 u'hostel',
 u'hostelry',
 u'hotel',
 u'hotel_room',
 u'inn',
 u'lodge',
 u'motor_hotel',
 u'motor_inn',
 u'motor_lodge',
 u'resort',
 u'resort_hotel',
 u'ski_lodge',
 u'spa',
 u'tourist_court'}

In [61]:
hungry = wn.synsets('hungry')[0]
get_related_wordnet_lemmas(hungry)

{u'empty',
 u'empty-bellied',
 u'esurient',
 u'famished',
 u'hungry',
 u'peckish',
 u'ravenous',
 u'sharp-set',
 u'starved',
 u'supperless'}

In [62]:
chair = wn.synsets('chair')[0]
get_related_wordnet_lemmas(chair)

{u'Eames_chair',
 u'armchair',
 u'back',
 u'backrest',
 u'barber_chair',
 u'chair',
 u'chair_of_state',
 u'chaise',
 u'chaise_longue',
 u'daybed',
 u'feeding_chair',
 u'fighting_chair',
 u'folding_chair',
 u'garden_chair',
 u'highchair',
 u'ladder-back',
 u'ladder-back_chair',
 u'lawn_chair',
 u'leg',
 u'rocker',
 u'rocking_chair',
 u'seat',
 u'side_chair',
 u'straight_chair',
 u'swivel_chair',
 u'tablet-armed_chair',
 u'wheelchair'}

In [63]:
Stalin = wn.synsets('Stalin')[0]
get_related_wordnet_lemmas(Stalin)

{u'Iosif_Vissarionovich_Dzhugashvili',
 u'Joseph_Stalin',
 u'Stalin',
 u'commie',
 u'communist'}

In [64]:
airplane = wn.synsets('airplane')[0]
get_related_wordnet_lemmas(airplane)

{u'accelerator',
 u'accelerator_pedal',
 u'aeroplane',
 u'airliner',
 u'airplane',
 u'amphibian',
 u'amphibious_aircraft',
 u'attack_aircraft',
 u'biplane',
 u'bomber',
 u'bonnet',
 u'cowl',
 u'cowling',
 u'delta_wing',
 u'escape_hatch',
 u'fighter',
 u'fighter_aircraft',
 u'fuel_pod',
 u'fuselage',
 u'gas',
 u'gas_pedal',
 u'gun',
 u'hangar_queen',
 u'heavier-than-air_craft',
 u'hood',
 u'hydroplane',
 u'jet',
 u'jet-propelled_plane',
 u'jet_plane',
 u'landing_gear',
 u'monoplane',
 u'multiengine_airplane',
 u'multiengine_plane',
 u'navigation_light',
 u'plane',
 u'pod',
 u'propeller_plane',
 u'radar_dome',
 u'radome',
 u'reconnaissance_plane',
 u'seaplane',
 u'ski-plane',
 u'tanker_plane',
 u'throttle',
 u'windscreen',
 u'windshield',
 u'wing'}

In [65]:
Batman = wn.synsets('Batman')[0]
get_related_wordnet_lemmas(Batman)

{u'attendant', u'attender', u'batman', u'tender'}

In [66]:
angry = wn.synsets('angry')[0]
get_related_wordnet_lemmas(angry)

{u'aggravated',
 u'angered',
 u'angry',
 u'black',
 u'choleric',
 u'enraged',
 u'furious',
 u'hot_under_the_collar',
 u'huffy',
 u'incensed',
 u'indignant',
 u'infuriated',
 u'irascible',
 u'irate',
 u'ireful',
 u'livid',
 u'mad',
 u'maddened',
 u'outraged',
 u'provoked',
 u'smoldering',
 u'smouldering',
 u'sore',
 u'umbrageous',
 u'wrathful',
 u'wroth',
 u'wrothful'}

In [67]:
comedy = wn.synsets('comedy')[0]
get_related_wordnet_lemmas(comedy)

{u'black_comedy',
 u'comedy',
 u"commedia_dell'arte",
 u'dark_comedy',
 u'drama',
 u'farce',
 u'farce_comedy',
 u'high_comedy',
 u'low_comedy',
 u'melodrama',
 u'seriocomedy',
 u'sitcom',
 u'situation_comedy',
 u'slapstick',
 u'tragicomedy',
 u'travesty'}

In [68]:
murderer = wn.synsets('murderer')[0]
get_related_wordnet_lemmas(murderer)

{u'Jack_the_Ripper',
 u'assassin',
 u'assassinator',
 u'bravo',
 u'butcher',
 u'criminal',
 u'crook',
 u'cutthroat',
 u'felon',
 u'fratricide',
 u'gun',
 u'gun_for_hire',
 u'gunman',
 u'gunslinger',
 u'hatchet_man',
 u'hired_gun',
 u'hit_man',
 u'hitman',
 u'iceman',
 u'infanticide',
 u'killer',
 u'liquidator',
 u'malefactor',
 u'manslayer',
 u'mass_murderer',
 u'murderer',
 u'murderess',
 u'outlaw',
 u'parricide',
 u'ripper',
 u'serial_killer',
 u'serial_murderer',
 u'shooter',
 u'slayer',
 u'torpedo',
 u'triggerman'}

In [73]:
wn.synsets('dOg')

[Synset('dog.n.01'),
 Synset('frump.n.01'),
 Synset('dog.n.03'),
 Synset('cad.n.01'),
 Synset('frank.n.02'),
 Synset('pawl.n.01'),
 Synset('andiron.n.01'),
 Synset('chase.v.01')]

In [82]:
wn.synsets('spy')[0].lemma_names()

[u'spy', u'undercover_agent']

In [86]:
wn.synsets('spies')[0].lemma_names()

[u'spy', u'undercover_agent']

In [87]:
wn.synsets('spied')[0].lemma_names()

[u'descry', u'spot', u'espy', u'spy']

In [88]:
wn.synsets('spying')[0].lemma_names()

[u'spying']

In [98]:
wn.synsets('tailor')

[Synset('tailor.n.01'),
 Synset('tailor.v.01'),
 Synset('cut.v.07'),
 Synset('sew.v.02')]

In [110]:
from nltk.corpus import wordnet
syns = wordnet.synsets('black_rhinoceros')
for s in syns:
    for l in s.lemmas():
        print str(l.name()) + " " + str(l.count())


black_rhinoceros 0
Diceros_bicornis 0


In [103]:
l

Lemma('tailor.n.01.tailor')

In [104]:
l.count()

0

In [105]:
l.name()

u'tailor'

In [112]:
from nltk.stem import WordNetLemmatizer

lmt = WordNetLemmatizer()

In [115]:
lmt.lemmatize('smoldering')

'smoldering'

In [116]:
lmt.lemmatize('smouldering')

'smouldering'

In [117]:
lmt.lemmatize('superduperuglyduckling')

'superduperuglyduckling'

In [118]:
lmt.lemmatize('duckling')

'duckling'

In [121]:
lmt.lemmatize('ducks')

u'duck'

In [126]:
for syn in wn.synsets('sad'):
    print get_related_wordnet_lemmas(syn)

set([u'melancholic', u'bittersweet', u'melancholy', u'pensive', u'tragical', u'heavyhearted', u'sad', u'tragic', u'wistful', u'tragicomic', u'mournful', u'doleful', u'tragicomical'])
set([u'sorrowful', u'sad'])
set([u'distressing', u'sad', u'bad', u'pitiful', u'lamentable', u'deplorable', u'sorry'])


In [125]:
wn.synsets('sad')

[Synset('sad.a.01'), Synset('sad.s.02'), Synset('deplorable.s.01')]

In [127]:
for syn in wn.synsets('spy'):
    print get_related_wordnet_lemmas(syn)

set([u'armed_forces', u'spy', u'armed_services', u'infiltrator', u'espionage_agent', u'intelligence_agent', u'secret_agent', u'counterspy', u'double_agent', u'foreign_agent', u'military_machine', u'war_machine', u'intelligence_officer', u'sleeper', u'undercover_agent', u'Margarete_Gertrud_Zelle', u'military', u'mole', u'operative', u'Mata_Hari'])
set([u'spy', u'snoop', u'looker', u'viewer', u'shadower', u'watcher', u'snooper', u'tail', u'spectator', u'shadow', u'witness'])
set([u'espy', u'spot', u'sight', u'descry', u'spy'])
set([u'sleuth', u'monitor', u'snoop', u'spy', u'inquire', u'stag', u'investigate', u'enquire', u'supervise'])
set([u'spy', u'notice', u'detect', u'sight', u'spot', u'perceive', u'comprehend', u'discover', u'espy', u'observe', u'descry', u'find'])
set([u'inquire', u'spy', u'investigate', u'enquire'])


In [128]:
lmt.lemmatize('spied')

'spied'

In [129]:
lmt.lemmatize('spied', 'v')

u'spy'

In [134]:
wn.synsets('impROPriety')

[Synset('impropriety.n.01'),
 Synset('impropriety.n.02'),
 Synset('indecency.n.02'),
 Synset('familiarity.n.05')]

In [133]:
lmt.lemmatize('impropriety', 's')

'impropriety'

In [147]:
for pos in ['n','v','a','s','r']:
    print lmt.lemmatize('peddler', pos)

peddler
peddler
peddler
peddler
peddler


In [148]:
wn.synsets('peddler')

[Synset('peddler.n.01'), Synset('pusher.n.02')]

In [149]:
def get_related_wordnet_lemmas(grapheme):
    '''
    Return *all* lemmas for *all* synsets within distance-1 of *all* of the grapheme's synsets
    Don't need to muck around with cases, because WordNet is smart enough to handle that itself
    '''
    if not wn.synsets(grapheme):
        return []
    
    # These are all of the non-antonym non-POS synset relationships recognized by WordNet
    relationship_types = [
        'hypernyms','instance_hypernyms','hyponyms','instance_hyponyms', \
        'member_holonyms','substance_holonyms','part_holonyms','member_meronyms', \
        'substance_meronyms','part_meronyms','topic_domains','region_domains', \
        'usage_domains','attributes','entailments','causes','also_sees','verb_groups','similar_tos']
    
    lemmas = set()
    for synset in wn.synsets(grapheme):
        # distance-0 lemmas
        lemmas.update(synset.lemma_names())
        # distance-1 lemmas
        for relationship in relationship_types:
            related_synsets = getattr(synset, relationship)()
            lemmas.update(flatten([s.lemma_names() for s in related_synsets]))
        
    # Convert from unicode with str, and return the results as a list
    return list(map(str, lemmas))

In [153]:
gs = get_related_wordnet_lemmas('seller')
gs

['trafficker',
 'pedlar',
 'cosmetician',
 'dealer',
 'peddler',
 'ticket_agent',
 'seller',
 'booking_clerk',
 'cheap-jack',
 'vender',
 'selling_agent',
 'marketer',
 'merchant',
 'merchandiser',
 'vendor',
 'pitchman',
 'underseller',
 'hawker',
 'flower_girl',
 'fruiterer',
 'packman',
 'huckster']

In [158]:
def get_shortest_lemma(grapheme, lemmatizer=WordNetLemmatizer()):
    '''
    Need to check all possible parts of parts of speech to make sure that we identify the *shortest* lemma
    If don't manually check all parts of speech, it defaults to using the lemmas of the first synset, which may
    not be what we want.

    The parts of speech are:
      n : NOUN
      v : VERB
      a : ADJECTIVE
      s : ADJECTIVE SATELLITE
      r : ADVERB

    Note: be sure to predefine/pass in the lemmatizer in advance, so that it doesn't need to be recreated on each run
    '''
    shortest_lemma = grapheme
    for pos in ['n','v','a','s','r']:
        lemma = lemmatizer.lemmatize(grapheme, pos)
        if len(lemma) < len(shortest_lemma):
            shortest_lemma = lemma
    return shortest_lemma





In [159]:
gs = get_related_wordnet_lemmas('seller')
for g in gs:
    print g, get_shortest_lemma(g)

trafficker trafficker
pedlar pedlar
cosmetician cosmetician
dealer dealer
peddler peddler
ticket_agent ticket_agent
seller seller
booking_clerk booking_clerk
cheap-jack cheap-jack
vender vender
selling_agent selling_agent
marketer marketer
merchant merchant
merchandiser merchandiser
vendor vendor
pitchman pitchman
underseller underseller
hawker hawker
flower_girl flower_girl
fruiterer fruiterer
packman packman
huckster huckster


In [161]:
get_shortest_lemma('spied')

u'spy'

In [171]:
gr = get_related_wordnet_lemmas('sad')
for g in gr:
    gl = get_shortest_lemma(g)
    gs = porter_stemmer.stem(gl)
    if not wn.synsets(gs):
        gs = ''
    print g, gl, gs

melancholic melancholic 
bittersweet bittersweet bittersweet
melancholy melancholy 
pensive pensive 
tragical tragical tragic
heavyhearted heavyhearted 
sorrowful sorrowful sorrow
bad bad bad
sad sad sad
deplorable deplorable 
distressing distress distress
sorry sorry 
tragic tragic tragic
wistful wistful 
tragicomic tragicomic 
mournful mournful mourn
lamentable lamentable lament
doleful doleful dole
pitiful pitiful 
tragicomical tragicomical 


In [166]:
# stemming is ONLY valid if the stem is present in wordnet
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()