In [57]:
import pandas as pd

In [58]:
# Load the labelled examples: one row per training example, with the columns
# named "label" (the intent classification) and "text" (the command as a
# single string).  Supplying `names` means no row is consumed as a header.
df = pd.read_csv(
    'training_data.csv',
    header=None,
    names=["label", "text"],
)

In [59]:
# Quick sanity check on the load: (number of examples, number of columns).
df.shape

(108, 2)

In [60]:
# Peek at the first few rows to confirm the label and text columns look right.
df.head()

Unnamed: 0,label,text
0,move_forward,"sucky, move 3 spaces forward"
1,move_forward,go forward 4 squares
2,move_forward,go 2 spaces forward
3,move_forward,move 3 spaces forward
4,move_forward,"sucky, move yourself 4 spaces forward"


In [61]:
# Number of training examples per intent label, most frequent first.
df['label'].value_counts()

agent_info       17
get_gold         16
go_to            12
go_home          10
find_gold        10
suck_dirt         8
move_backward     7
move_forward      7
find_dirt         6
turn_left         6
turn_right        6
turn_around       3
Name: label, dtype: int64

In [62]:
#  To recognize coordinates in the text, for example (2,5) and
#  convert to a tuple

from ast import literal_eval
def to_coord(word):
    """Parse a coordinate token such as ``'(2,5)'`` into a tuple.

    Args:
        word: The text to parse.

    Returns:
        The tuple obtained by evaluating ``word`` as a Python literal, or
        ``None`` when ``word`` is not a valid literal or evaluates to
        something other than a tuple (a bare number, a list, ...).
    """
    try:
        value = literal_eval(word)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval raises for malformed or
        # pathological input.  Naming them (instead of a bare ``except:``)
        # lets KeyboardInterrupt and SystemExit propagate normally.
        return None
    return value if isinstance(value, tuple) else None
    
def is_coord(word):
    """Return True when ``to_coord(word)`` yields a tuple, False otherwise."""
    # Identity comparison is the idiomatic test for None and cannot be
    # affected by a value's own __eq__/__ne__ implementation.
    return to_coord(word) is not None

In [64]:
# Smoke-test the parser on one coordinate token and one ordinary word.
for sample in ('(3,5)', 'abc'):
    print(to_coord(sample))

(3, 5)
None


In [65]:
# Three features of an input command that might be useful, but we can't do
# in the vectorization phase
#   1.  Does the command end with a question mark -- if so, maybe it's 
#        a request for information rather than a command to do something
#   2.  Does the command contain at least one coordinate.  All the commands
#        in the training set have either 0 or 1 coordinate, but for a richer
#        command set, counting the number of coordinates might be useful
#   3.  Does the command contain at least one number.  For simplicity we're 
#        only detecting numbers as digit sequences, so "three" is not a number unfortunately

# `endswith` also copes with an empty command, where indexing with t[-1]
# would raise IndexError.
df['ends_with_question'] = df.text.apply(lambda t: t.endswith('?'))

# Tokenise on whitespace into a local Series rather than a temporary df
# column, so there is nothing to drop afterwards.
words = df.text.apply(lambda s: s.split())

# any() stops at the first matching token instead of building a full list.
df['has_coord'] = words.apply(lambda ww: any(is_coord(w) for w in ww))
df['has_number'] = words.apply(lambda ww: any(w.isdigit() for w in ww))

In [66]:
# Vectorize the training set -- input is the raw text for each training example
#  and output is a vector with one element/column per term
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
term_counts = cv.fit_transform(df['text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Reuse df's index so the later column-wise concat with df lines up row for
# row even if df's index is not the default 0..n-1 range.
dfVec = pd.DataFrame(term_counts.toarray(), columns=feature_names, index=df.index)


In [67]:
# Inspect the term-count matrix: one column per vocabulary term.
dfVec.head()

Unnamed: 0,and,any,anywhere,are,around,at,back,backward,backwards,base,...,travel,turn,up,want,what,where,yo,you,your,yourself
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [68]:
# Concatenate our three non-vectorized features and split into X (the input)
# and y (the 'correct' classification).   X is capitalized because it is a matrix, 
# y is a vector.   y must have the same number of elements as the number of rows in X.

dfVectorized = pd.concat([df, dfVec], axis=1)

# Pick the inputs by name instead of "everything except label/text":
#   * any other column that ends up on df (for example a column of
#     predictions) would otherwise leak into X when this cell is re-run, and
#   * a vocabulary term that happens to be spelled "label" or "text" would be
#     dropped from X and would make dfVectorized['label'] a two-column frame.
# Column order is unchanged: the three hand-built features, then the terms.
extra_features = ['ends_with_question', 'has_coord', 'has_number']
X = pd.concat([df[extra_features], dfVec], axis=1)
y = df['label']

In [69]:
# It's important to measure the classifier's accuracy on data it has not seen
# before.  So we break our X and y into a "training set" and a "test set"
# We train the classifier on the training set then evaluate its accuracy on 
# the test set.   
#
# This training set is too small for comfort (approx 100 cases). If we take away 10%, we're only 
# testing on 10 examples, which is probably too small to get good coverage of all the commands
# the agent will be seeing in practice

from sklearn.model_selection import train_test_split
# Fix the seed so the split -- and therefore every accuracy figure below -- is
# the same on every run.  The split is deliberately not stratified:
# stratifying needs at least one test example per class, and an 11-row test
# set cannot cover 12 intent labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [73]:
# Report the shape of each piece of the split: training inputs and labels
# first, then the test inputs and labels.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(97, 72)
(97,)
(11, 72)
(11,)


In [74]:
# This is our classifier -- we show it the training set only
from sklearn.naive_bayes import MultinomialNB
# Create the Naive Bayes model and fit it on the training portion only; the
# test rows stay unseen until evaluation.
nb = MultinomialNB()
nb.fit(X=X_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [77]:
# Predict an intent for every held-out example.  Each prediction is one of
# the class labels seen in training, i.e. the predicted intent.
test_predictions = nb.predict(X_test)
print(test_predictions)

['move_forward' 'get_gold' 'move_backward' 'turn_right' 'turn_right'
 'get_gold' 'find_gold' 'move_forward' 'turn_left' 'go_to' 'go_to']


In [78]:
# Now compute the accuracy of the classifier on the test set
# The accuracy is just the fraction of the time the classifier
#  prediction agrees with the case label
from sklearn.metrics import accuracy_score
# Share of test examples whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=nb.predict(X_test))

0.9090909090909091

In [79]:
# Want to see cases where the prediction was wrong
# Score every example (training and test rows alike) on a copy of df, so df
# itself is not changed by viewing the errors and the cell can be re-run
# in any order without affecting how X is built.
df_scored = df.assign(predicted=nb.predict(X))
df_scored[df_scored.predicted != df_scored.label]

Unnamed: 0,label,text,ends_with_question,has_coord,has_number,predicted
87,find_dirt,where is some dirt,False,False,False,find_gold


In [80]:
# Right before exporting, train the classifier on the whole
# data set, not just the training set.
nb.fit(X, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [81]:
# We need to save both the classifier and the vectorizer, since
# when we get a command text input, we need to prepare it for the 
# classifier the same way we prepared the test data.  

# Note file location is important -- the file commandprocessorml.py looks
# for the models in a subfolder agents/intent_classifier which is relative
# to the root of the code tree
import pickle
# Context managers make sure each file is flushed and closed as soon as its
# dump finishes, rather than leaving the handle open until it is collected.
# NOTE(review): nb was fitted on X, i.e. the three hand-built boolean columns
# followed by the vectorizer's term counts, so whatever loads these files has
# to rebuild those three columns in the same order -- the vectorizer alone
# does not reproduce X.
with open('classifier.sav', 'wb') as classifier_file:
    pickle.dump(nb, classifier_file)
with open('vectorizer.sav', 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)