In [None]:
import pandas as pd  

In [None]:
path = 'pokemon_data.csv'
# The 'abilities' column is stored in the CSV as text; the converter runs
# pd.eval on every cell so that column is loaded as evaluated values
# rather than raw strings.
# NOTE(review): ast.literal_eval is the more common converter for list
# literals - consider switching once the output is confirmed identical.
pokemon = pd.read_csv(
    path,
    converters=dict(abilities=pd.eval),
)

In [None]:
def move_features(abilities):
    """Turn a sequence of ability names into a classifier feature dict.

    Only the first two abilities are used; any further entries are ignored.
    A missing first or second ability is represented by the string 'None'.

    The input sequence is never modified, so the lists stored in the
    DataFrame stay untouched when this is applied to them.

    Args:
        abilities: sequence of ability names (may be empty).

    Returns:
        dict with the keys 'first_move' and 'second_move'.
    """
    # Index with bounds checks instead of appending a placeholder to the
    # caller's list (which would silently change the source data).
    first_move = abilities[0] if len(abilities) > 0 else 'None'
    second_move = abilities[1] if len(abilities) > 1 else 'None'
    return {'first_move': first_move, 'second_move': second_move}

move_features(['Overgrow', 'Chlorophyll'])

In [None]:
# Now we want to combine abilities with type.
# zip pairs the two columns element by element; each tuple has the format:
# (['ability1', 'ability2'], type)
# zip() on its own returns a one-shot iterator, which would be empty the
# second time a later cell loops over it - so materialise it as a list to
# keep the downstream cells safe to re-run.
zipped_features = list(zip(pokemon.abilities, pokemon.type1))

In [None]:
# I now need the ability lists in the form a classifier expects:
# a list of (feature_dict, label) pairs.
# Our move_features function builds the feature dict for each pokemon.
featuresets = [(move_features(abilities), p_type) for abilities, p_type in zipped_features]
# Preview only the first few entries rather than dumping the whole list.
featuresets[:5]

In [None]:
import random

# Fixed seed so the shuffle (and therefore the train/test split and the
# reported accuracy) is reproducible on a fresh Restart & Run All.
SHUFFLE_SEED = 42

# Best to shuffle before splitting: the train/test split takes contiguous
# slices, so an ordered list could leave some types (e.g. all the bugs)
# in training while others (e.g. the water types) are never trained on.
# A dedicated Random instance keeps the global random state untouched.
random.Random(SHUFFLE_SEED).shuffle(featuresets)

In [None]:
import nltk
import math

# Index of the first test sample: the first 80% of the feature sets are
# used for training, the remaining 20% for testing.
split_num = math.floor(len(featuresets)*.8)

# Split feature sets into training and test sets.
# The test slice starts exactly at split_num; starting at split_num+1
# would drop one sample from both sets.
train_set, test_set = featuresets[:split_num], featuresets[split_num:]

# Every sample must end up in exactly one of the two sets.
assert len(train_set) + len(test_set) == len(featuresets)

In [None]:
# Fit a Naive Bayes classifier on the labeled training feature sets
classifier = nltk.NaiveBayesClassifier.train(labeled_featuresets=train_set)

In [None]:
# Build the feature dict for an ad-hoc ability list, then ask the trained
# classifier which type it predicts for it.
sample_features = move_features(['Stench', 'Sticky Hold', 'Aftermath'])
classifier.classify(sample_features)

In [None]:
# Let's evaluate the classifier on the held-out test set (data it has not
# been trained on) to gauge the robustness of our model.
test_accuracy = nltk.classify.accuracy(classifier, test_set)
print(test_accuracy)

In [None]:
# Precision: the number of True Positives divided by the sum of True Positives and False Positives
# --> i.e. the number of correct positive predictions divided by the total number of positive predictions made
# --> effectively a measure of a classifier's exactness (low precision indicates a large number of False Positives)


# Recall: the number of True Positives divided by the sum of True Positives and False Negatives
# --> i.e. the number of correct positive predictions divided by the number of actual positive class values in the test data
# --> effectively a measure of a classifier's completeness (low recall indicates many False Negatives)


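# F1 Score: (also called F measure) the harmonic mean of precision and recall:
# --> F1 = 2 * (precision * recall) / (precision + recall)
# --> conveys the balance between the precision and recall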

In [None]:
# Finally, inspect the trained classifier to see which features it found
# most effective for distinguishing the pokemon type.
#
# The 'informativeness' of a feature ``(fname, fval)`` is the ratio between
# the highest value of P(fname=fval|label) over all labels and the lowest
# value of P(fname=fval|label) over all labels.

# Print the 12 most informative features
classifier.show_most_informative_features(n=12)