In [30]:
'''
In this notebook, we try to predict whether a pokemon is legendary
based on its stats. The dataset contains the id, name, types, total, hp,
attack, defense, sp attack, sp defense, speed, generation, and legendary
status of each pokemon.
'''

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

In [2]:
#Load the csv file into a pandas dataframe and preview its first ten rows
POKEMON_CSV = 'pokemon.csv'
pokemon_raw = pd.read_csv(POKEMON_CSV)
pokemon_raw.head(n=10)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False
5,5,Charmeleon,Fire,,405,58,64,58,80,65,80,1,False
6,6,Charizard,Fire,Flying,534,78,84,78,109,85,100,1,False
7,6,CharizardMega Charizard X,Fire,Dragon,634,78,130,111,130,85,100,1,False
8,6,CharizardMega Charizard Y,Fire,Flying,634,78,104,78,159,115,100,1,False
9,7,Squirtle,Water,,314,44,48,65,50,64,43,1,False


In [3]:
#Exploratory data analysis
#The dataset has 800 rows. The '#' column only goes up to 721 because
#alternate forms (e.g. the Mega evolutions) share a number with their base pokemon
print(pokemon_raw.describe())

#Which columns contain missing values
#Printed explicitly: a bare expression in the middle of a cell is never displayed
print(pokemon_raw.columns[pokemon_raw.isnull().any()].tolist())

#Replacing every missing value with the word NONE
#(Type 2 is empty for single-type pokemon such as Charmander)
#np.nan is used because the np.NaN alias was removed in NumPy 2.0
#This new dataframe will be called pokemon_df
pokemon_df = pokemon_raw.replace(np.nan, 'NONE')

                #      Total          HP      Attack     Defense     Sp. Atk  \
count  800.000000  800.00000  800.000000  800.000000  800.000000  800.000000   
mean   362.813750  435.10250   69.258750   79.001250   73.842500   72.820000   
std    208.343798  119.96304   25.534669   32.457366   31.183501   32.722294   
min      1.000000  180.00000    1.000000    5.000000    5.000000   10.000000   
25%    184.750000  330.00000   50.000000   55.000000   50.000000   49.750000   
50%    364.500000  450.00000   65.000000   75.000000   70.000000   65.000000   
75%    539.250000  515.00000   80.000000  100.000000   90.000000   95.000000   
max    721.000000  780.00000  255.000000  190.000000  230.000000  194.000000   

          Sp. Def       Speed  Generation  
count  800.000000  800.000000   800.00000  
mean    71.902500   68.277500     3.32375  
std     27.828916   29.060474     1.66129  
min     20.000000    5.000000     1.00000  
25%     50.000000   45.000000     2.00000  
50%     70.0000

In [4]:
#Share of rows flagged as Legendary (the mean of a True/False column)
pokemon_df['Legendary'].mean()

0.08125

In [5]:
#Predictors: both types, the total and the six individual stats
#(hp, attack, defense, sp atk, sp def, speed)
feature_columns = [
    'Type 1', 'Type 2', 'Total',
    'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed',
]
#Target: whether the pokemon is legendary or not
target_columns = ['Legendary']

X = pokemon_df[feature_columns]
Y = pokemon_df[target_columns]

In [6]:
#Changing the type strings into integers systematically
from sklearn import preprocessing

#Work on numpy arrays from here on; the True/False target becomes 1/0
X = np.asarray(X)
Y = np.asarray(Y).astype('int')

#Turning strings into integers, each unique string is a unique integer.
#Every type column gets its OWN encoder: refitting a single encoder for the
#second column would throw away the mapping learned for the first one, so the
#codes could no longer be translated back with inverse_transform.
#Type 1 and Type 2 are the first two columns of X.
type_encoders = {}
for column_index, column_name in enumerate(['Type 1', 'Type 2']):
    encoder = preprocessing.LabelEncoder()
    X[:, column_index] = encoder.fit_transform(X[:, column_index])
    type_encoders[column_name] = encoder

#Kept for backward compatibility: this cell used to leave `le` fitted on Type 2
le = type_encoders['Type 2']

#X was an object array because it mixed strings and numbers. Now that every
#column is numeric, store it with a numeric dtype.
#NOTE(review): assumes no stat column contained missing values (describe()
#reports a count of 800 for each of them) - confirm if the dataset changes
X = X.astype('float')

In [7]:
'''
Making train and test data sets
'''
from sklearn.model_selection import train_test_split

#Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable
trainX, testX, trainY, testY = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=0,
)

#Proportion of legendary in full dataset is 0.08
#Compare it with the share of legendary pokemon that ended up in each split
train_legendary_share = sum(trainY) / len(trainY)
test_legendary_share = sum(testY) / len(testY)
print('The legendary proportion in the train set is %s' % (train_legendary_share))
print('The legendary proportion in the test set is %s' % (test_legendary_share))

The legendary proportion in the train set is [0.08194444]
The legendary proportion in the test set is [0.075]


## Decision Tree

In [8]:
#Setting up a decision tree to classify our pokemon
from sklearn import tree

#random_state is fixed so that refitting on the same data always grows the
#same tree; without it, ties between equally good splits are broken at random
#and the reported scores can change from run to run
tree_clf = tree.DecisionTreeClassifier(random_state=0)
tree_clf = tree_clf.fit(trainX, trainY)

In [31]:
#Making Predictions and assessing accuracy
tree_predictions = tree_clf.predict(testX)
tree_acc = accuracy_score(testY,tree_predictions)
print('The accuracy of the decision tree is %s percent'%(100*tree_acc))

#Confusion matrix (rows are the true classes, columns the predicted classes)
#Printed explicitly: a bare expression in the middle of a cell is never displayed
tree_confusion = confusion_matrix(testY, tree_predictions)
print('The confusion matrix of the decision tree is')
print(tree_confusion)

#AUC Score
#roc_auc_score ranks samples by a score, so it is given the predicted
#probability of the positive (legendary) class instead of the hard 0/1 labels
tree_scores = tree_clf.predict_proba(testX)[:, 1]
tree_auc = roc_auc_score(testY, tree_scores)
print('The AUC of the decision tree is %.3f'%(tree_auc))

The accuracy of the decision tree is 97.5 percent
The AUC of the decision tree is 0.910
