In [2]:
# use the Natural Language Toolkit (NLTK) for tokenizing and stemming
import nltk
from nltk.stem import PorterStemmer
# shared Porter stemmer instance, used by the cells below to reduce words to their stems
stemmer = PorterStemmer()


In [1]:
# Labelled example sentences for the 3 classes, grouped per class.
# Every entry becomes a {"class": ..., "sentence": ...} record, in the order listed.
training_data = [
    {"class": label, "sentence": text}
    for label, examples in (
        ("greeting", (
            "how are you?",
            "how is your day?",
            "good day",
            "how is it going today?",
        )),
        ("goodbye", (
            "have a nice day",
            "see you later",
            "have a nice day",  # listed twice, so its words are counted twice
            "talk to you soon",
        )),
        ("booking", (
            "Book a theater performance in San Francisco 5-7 July",
            "What events are available in Chicago Dec 6",
            "Do you have any events available in New York",
            "San Diego 6-10 July",
        )),
    )
    for text in examples
]
print("%s sentences of training data" % len(training_data))

12 sentences of training data


In [10]:
# capture unique stemmed words in the training corpus
from nltk.corpus import stopwords
# English stop words; built here but NOT applied by the loop below
# NOTE(review): the saved cell output looks like it came from a run that filtered
# on stop_words instead of the ["?", "'s"] list - re-run the cell to refresh it
stop_words = set(stopwords.words('english'))

# stem -> number of occurrences across the whole training corpus (the word's commonality)
corpus_words = {}

# De-duplicate the class names while keeping first-seen order. Going through set()
# makes the order depend on string hashing, which is not stable between interpreter
# runs; that order becomes the key order of class_words, and classify() keeps the
# first class it sees on a tie. Relies on insertion-ordered dicts (Python 3.7+).
classes = list(dict.fromkeys(a['class'] for a in training_data))

# class name -> list of stems seen in that class's sentences (duplicates kept)
class_words = {c: [] for c in classes}

# loop through each sentence in our training data
for data in training_data:
    # tokenize each sentence into words
    for word in nltk.word_tokenize(data['sentence']):
        # ignore a few tokens that carry no meaning for classification
        if word in ("?", "'s"):
            continue
        # stem and lowercase each word
        stemmed_word = stemmer.stem(word.lower())
        # count the stem, starting from 0 the first time it is seen
        corpus_words[stemmed_word] = corpus_words.get(stemmed_word, 0) + 1
        # add the stem to the list of words for this sentence's class
        class_words[data['class']].append(stemmed_word)

# we now have each stemmed word and the number of occurrences of the word in our training corpus
print ("Corpus words and counts: %s \n" % corpus_words)
# also we have all words in each class
print ("Class words: %s" % class_words)


Corpus words and counts: {'new': 1, 'go': 1, 'talk': 1, 'nice': 2, '6': 1, 'later': 1, 'book': 1, 'today': 1, 'avail': 2, 'dec': 1, 'perform': 1, 'what': 1, 'good': 1, '5-7': 1, 'san': 2, 'diego': 1, 'york': 1, 'day': 4, 'do': 1, 'event': 2, 'see': 1, '?': 3, 'chicago': 1, 'juli': 2, 'theater': 1, '6-10': 1, 'francisco': 1, 'soon': 1} 

Class words: {'greeting': ['?', 'day', '?', 'good', 'day', 'go', 'today', '?'], 'goodbye': ['nice', 'day', 'see', 'later', 'nice', 'day', 'talk', 'soon'], 'booking': ['book', 'theater', 'perform', 'san', 'francisco', '5-7', 'juli', 'what', 'event', 'avail', 'chicago', 'dec', '6', 'do', 'event', 'avail', 'new', 'york', 'san', 'diego', '6-10', 'juli']}


In [11]:
# calculate a score for a given class taking into account word commonality
def calculate_class_score(sentence, class_name, show_details=True):
    """Score how well `sentence` matches the words collected for `class_name`.

    Each token of the sentence is lowercased and stemmed. Every token whose stem
    occurs in ``class_words[class_name]`` adds ``1 / corpus_words[stem]`` to the
    score, so stems that occur often in the training corpus contribute less.

    Args:
        sentence: text to score.
        class_name: key into ``class_words``.
        show_details: when true, print each matching stem and its weight.

    Returns:
        The summed weight of all matching tokens (0 when nothing matches).

    Raises:
        KeyError: if ``class_name`` is not a key of ``class_words``; raised when
            the first token is checked.
    """
    score = 0
    # tokenize each word in our new sentence
    for word in nltk.word_tokenize(sentence):
        # stem once per token and reuse it for the lookup, the weight and the printout
        stem = stemmer.stem(word.lower())
        if stem not in class_words[class_name]:
            continue
        # treat each word with relative weight: rarer stems count for more
        weight = 1 / corpus_words[stem]
        score += weight

        if show_details:
            print ("   match: %s (%s)" % (stem, weight))
    return score

In [12]:
# return the class with highest score for sentence
def classify(sentence):
    """Pick the best-scoring class for `sentence`.

    Every key of ``class_words`` is scored with ``calculate_class_score`` (details
    suppressed). A class only replaces the current best when its score is strictly
    greater, so the earliest class wins a tie and a sentence that matches nothing
    yields ``(None, 0)``.

    Returns:
        A ``(class name, score)`` tuple.
    """
    best = (None, 0)
    for label in class_words:
        candidate = calculate_class_score(sentence, label, show_details=False)
        # strict comparison: zero scores and ties never displace the current best
        if candidate > best[1]:
            best = (label, candidate)

    return best

In [13]:
# classify a sentence that is not one of the training sentences;
# classify() returns a (class name, score) tuple, which is printed as-is
print(classify('Find events in Miami'))

('booking', 0.5)
