In [3]:
# use natural language toolkit
import nltk
# Fetch only the tokenizer data that the nltk.word_tokenize() calls in this
# notebook need. A bare nltk.download() opens NLTK's interactive downloader,
# which blocks a non-interactive "Restart Kernel -> Run All".
# NOTE(review): newer NLTK releases may expect the resource name 'punkt_tab'
# instead of 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
# word stemmer (shared by every cell that normalises tokens)
stemmer = LancasterStemmer()


showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [28]:
# labelled example sentences for the two classes ("eat" / "noeat");
# each entry is a dict with a "class" label and the raw "sentence" text
training_data = [
    {"class": label, "sentence": text}
    for label, text in [
        ("eat", "how about a lunch?"),
        ("eat", "up for a snack?"),
        ("eat", "let's go out for a lunch"),
        ("eat", "let's go for a bite"),
        ("noeat", "i got some work to do"),
        ("noeat", "i'm not hungry"),
        ("noeat", "i'm full, someother time"),
        ("noeat", "not now"),
        ("noeat", "lets not eat"),
    ]
]

print("%s sentences of training data" % len(training_data))

9 sentences of training data


In [25]:
# Build two lookup tables from the training sentences:
#   corpus_words: stem -> how many times it occurs across all sentences
#   class_words:  class label -> list of stems seen in that class (repeats kept)
corpus_words = {}
# round-trip through a set to drop duplicate class labels
classes = list(set(a['class'] for a in training_data))
# every class starts out with an empty list of stems
class_words = {c: [] for c in classes}

# walk every training sentence token by token
for data in training_data:
    for word in nltk.word_tokenize(data['sentence']):
        # the question mark and the "'s" clitic carry no signal; skip them
        if word in ("?", "'s"):
            continue
        # normalise: lowercase first, then stem
        stemmed_word = stemmer.stem(word.lower())
        # bump the corpus-wide count (a first sighting starts from zero)
        corpus_words[stemmed_word] = corpus_words.get(stemmed_word, 0) + 1
        # record the stem under the sentence's class
        class_words[data['class']].append(stemmed_word)

# each stem with its number of occurrences in the training corpus (the word's commonality)
print("Corpus words and counts: %s \n" % corpus_words)
# all stems grouped by class
print("Class words: %s" % class_words)


Corpus words and counts: {'let': 3, 'do': 1, 'go': 2, 'snack': 1, 'ful': 1, "'m": 2, 'tim': 1, 'someoth': 1, 'i': 3, 'not': 3, 'lunch': 2, 'work': 1, 'up': 1, 'hungry': 1, 'som': 1, 'out': 1, 'to': 1, 'now': 1, 'how': 1, 'about': 1, 'got': 1, 'for': 3, 'a': 4, 'eat': 1, 'bit': 1, ',': 1} 

Class words: {'noeat': ['i', 'got', 'som', 'work', 'to', 'do', 'i', "'m", 'not', 'hungry', 'i', "'m", 'ful', ',', 'someoth', 'tim', 'not', 'now', 'let', 'not', 'eat'], 'eat': ['how', 'about', 'a', 'lunch', 'up', 'for', 'a', 'snack', 'let', 'go', 'out', 'for', 'a', 'lunch', 'let', 'go', 'for', 'a', 'bit']}


In [26]:
# a new sentence (not part of the training data) that the scoring cells
# below evaluate against every class
sentence = "good day for us to have lunch?"

# score a sentence against a single class, one point per matching token
def calculate_class_score(sentence, class_name, show_details=True):
    """Count the tokens of `sentence` whose stem appears in one class's word list.

    sentence     -- raw text; tokenized with nltk.word_tokenize
    class_name   -- key into the module-level `class_words` dict
    show_details -- when true, print every matching stem as it is found

    Returns the number of matching tokens (0 when nothing matches).
    """
    matches = 0
    for token in nltk.word_tokenize(sentence):
        # lowercase + stem once, then reuse for both the lookup and the printout
        stem = stemmer.stem(token.lower())
        if stem not in class_words[class_name]:
            continue
        # every match carries the same weight
        matches += 1
        if show_details:
            print("   match: %s" % stem)
    return matches

In [27]:
# score the sentence against every class so the highest score can be read off
for c in class_words:
    # scoring prints its own "match" lines before the summary line below
    class_score = calculate_class_score(sentence, c)
    print("Class: %s  Score: %s \n" % (c, class_score))

   match: to
Class: noeat  Score: 1 

   match: for
   match: lunch
Class: eat  Score: 2 



In [20]:
# score a sentence against a single class, weighting each match by word commonality
def calculate_class_score_commonality(sentence, class_name, show_details=True):
    """Sum, over matching tokens, the reciprocal of the stem's corpus count.

    A token matches when its stem appears in `class_words[class_name]`; each
    match adds 1 / corpus_words[stem], so stems that occur more often in the
    training corpus contribute less.

    sentence     -- raw text; tokenized with nltk.word_tokenize
    class_name   -- key into the module-level `class_words` dict
    show_details -- when true, print every matching stem with its weight

    Returns the accumulated score (0 when nothing matches).
    """
    total = 0
    for token in nltk.word_tokenize(sentence):
        # lowercase + stem once, then reuse everywhere below
        stem = stemmer.stem(token.lower())
        if stem not in class_words[class_name]:
            continue
        # relative weight: the more common the stem, the smaller its share
        weight = 1 / corpus_words[stem]
        total += weight
        if show_details:
            print("   match: %s (%s)" % (stem, weight))
    return total

In [21]:
# score the sentence against every class, this time with commonality weighting
for c in class_words:
    # scoring prints its own "match" lines before the summary line below
    class_score = calculate_class_score_commonality(sentence, c)
    print("Class: %s  Score: %s \n" % (c, class_score))

   match: to (1.0)
Class: noeat  Score: 1.0 

   match: day (1.0)
   match: for (0.5)
   match: lunch (0.5)
Class: eat  Score: 2.0 



In [22]:
# pick the best-scoring class for a sentence
def classify(sentence):
    """Return `(class_name, score)` for the class with the highest commonality score.

    Every key of the module-level `class_words` is scored with
    calculate_class_score_commonality (details suppressed). A class replaces
    the current best only when its score is strictly greater, so on a tie the
    class encountered first is kept. If no class scores above zero the result
    is `(None, 0)`.
    """
    best = (None, 0)
    for label in class_words:
        candidate = calculate_class_score_commonality(sentence, label, show_details=False)
        # strict comparison: equal scores never displace the current best
        if candidate > best[1]:
            best = (label, candidate)
    return best

In [23]:
# try the classifier on a sentence that mixes vocabulary from both classes;
# the returned (class, score) tuple is shown as the cell's output
classify("lets go out and not eat")

('eat', 1.8333333333333333)