In [1]:
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
import re

In [2]:
def _tidy(lines):
    '''
    Normalises raw dictionary lines: strips the ' (1)' marker, tabs and newlines,
    then lower-cases. Returns a new list in the same order as the input.
    '''
    return [
        line.replace(' (1)', '').replace('\t', '').replace('\n', '').lower()
        for line in lines
    ]


# The line offsets below are hard-coded for one specific copy of Words.cat.
# NOTE(review): indices 21255 and 25039 fall between the slices and are never
# loaded - presumably section separators; confirm against the data file.
with open('WordNet2/Words.cat', 'r') as file:
    data = file.readlines()
    nouns = data[25040:94946]
    adjectives = data[0:21255]
    adverbs = data[21256:25039]

# One cleaned list per part of speech; find_classes scans these in file order.
tidy_nouns = _tidy(nouns)
tidy_adjs = _tidy(adjectives)
tidy_advs = _tidy(adverbs)

def _scan_classes(word, entries, header_prefix):
    '''
    Walks a tidied dictionary list and collects every class under which word is listed.

    An entry containing header_prefix (e.g. 'noun.food') switches the current class to
    the text after the first dot; any other entry equal to word records the current
    class. Classes are returned once each, in order of first appearance.
    '''
    current_class = ''
    classes = []
    for entry in entries:
        if header_prefix in entry:
            # header line: every following word belongs to this class until the next header
            current_class = entry.split('.')[1]
        elif entry == word and current_class not in classes:
            classes.append(current_class)
    return classes


def find_classes(word_tagged):
    '''
    Classifies a word using WordStat. Returns a list of classes the word belongs to.

    word_tagged is a (token, POS tag) pair. Noun, adjective and adverb tags are looked
    up in tidy_nouns, tidy_adjs and tidy_advs respectively; any other tag, or a token
    with no word characters, gives an empty list.
    '''
    tag = word_tagged[1]

    # Pick the lemmatizer POS, the dictionary list and its header prefix from the tag.
    if tag in ('NN', 'NNS', 'NNPS', 'NNP'):
        pos, entries, header_prefix = 'n', tidy_nouns, 'noun.'
    elif tag in ('JJ', 'JJR', 'JJS'):
        pos, entries, header_prefix = 'a', tidy_adjs, 'adj.'
    elif tag in ('RB', 'RBR', 'RBS'):
        pos, entries, header_prefix = 'r', tidy_advs, 'adv.'
    else:
        return []

    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    bare = re.sub(r'\W+', '', word_tagged[0]).lower()
    if not bare:
        # Pure punctuation would otherwise compare equal to blank dictionary lines.
        return []

    # Lemmatize with the POS matching the tag; the default treats every word as a noun.
    word = WordNetLemmatizer().lemmatize(bare, pos=pos)

    # A single pass is enough: a word that is absent simply collects no classes.
    return _scan_classes(word, entries, header_prefix)

In [3]:
# POS tags that categorize_words reports on: adjectives, adverbs, then nouns.
targets = [
    'JJ', 'JJR', 'JJS',
    'RB', 'RBR', 'RBS',
    'NN', 'NNS', 'NNPS', 'NNP',
]

def categorize_words(sentence):
    '''
    Prints the categories of every noun, adjective and adverb in a sentence.

    The sentence is split on whitespace and POS-tagged with nltk; for each token whose
    tag is listed in targets, the token and the result of find_classes are printed.
    '''
    for tagged in nltk.pos_tag(sentence.split()):
        token, tag = tagged
        if tag not in targets:
            continue
        print('Word:', token)
        print('Categories:', find_classes(tagged))

In [4]:
# Demo: print the categories found for each targeted word of a sample sentence.
categorize_words('Look at all those beautiful dandelions!')

Word: beautiful
Categories: ['all']
Word: dandelions!
Categories: ['plant']
