In [None]:
'''
In this notebook, I'm stepping up the difficulty a bit and seeing whether or not the predicter can see
what stage a Pokémon is at. Unlike the Legendary classification that only had 2 classifications, we're using 5
this time. Let's start with the straight forward ones. 1 indicates that a Pokémon is not fully evolved, meaning that
it is capable of change and it has not reached its final form. 2 indicates a fully evolved Pokémon, meaning that it
evolved from a previous form and can no longer evolve. 3 indicates a legendary Pokémon, which is a title given to
a Pokémon from the people who make the game. They tend to be stronger and a lot bigger than a regular Pokémon. 0 and 5
are special cases. 0 is 1-stage Pokémon, meaning that nothing evolves into it, and it doesn't evolve into anything.
4 is a Mythical Pokémon, which is a subset of Legendaries. Mythical Pokémon are typically about as strong (maybe a bit
weaker) than the typical Legendary, but they tend to be small in size.

'''

In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

pok = pd.read_csv('pokemon.csv')

pok

Unnamed: 0,#,Name,Stage,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Heights (m),Weight (kg),Generation
0,1.0,Bulbasaur,1,Grass,Poison,318,45,49,49,65,65,45,0.7,6.9,1
1,2.0,Ivysaur,1,Grass,Poison,405,60,62,63,80,80,60,1.0,13.0,1
2,3.0,Venusaur,2,Grass,Poison,525,80,82,83,100,100,80,2.0,100.0,1
3,4.0,Charmander,1,Fire,,309,39,52,43,60,50,65,0.6,8.5,1
4,5.0,Charmeleon,1,Fire,,405,58,64,58,80,65,80,1.1,19.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
979,,Glastrier,3,Ice,,580,100,145,130,65,110,30,2.2,800.0,8
980,,Spectrier,3,Ghost,,580,100,65,60,145,80,130,2.0,44.5,8
981,,Calyrex,4,Psychic,Grass,500,100,80,80,80,80,80,1.1,7.7,8
982,,Calyrex,3,Psychic,Ice,680,100,165,150,85,130,50,2.4,809.1,8


In [2]:
type(pok)

pandas.core.frame.DataFrame

In [3]:
poke = pok.to_numpy()

In [4]:
X = poke[:,3:14]

In [5]:
'''
I'm using one hot encoding to turn the types into features here. The loop below is used to help the machine
differentiate between a Pokémon's primary and secondary typing. "Type 2 Blank" indicates that a Pokémon does not have
a secondary typing, and is kind of a placeholder feature.
'''

import numpy as np
'''
ONLY RUN THIS CELL ONCE
'''
for i in range(len(X)):
    X[i,0] = 'Type 1 ' + X[i,0]
    if isinstance(X[i,1], str):
        X[i,1] = 'Type 2 ' + X[i,1]
    else:
        X[i,1] = 'Type 2 Blank'

In [6]:
'''
The one-hot encoding is done here. This splits each of a Pokémon's typing into 18 different features, 
asking yes or no if a Pokémon is a certain type. For example, Bulbasaur, the first Pokémon is Grass/Poison. This
means it will have a 1 in the "Type 1 Grass" column and "Type 2 Poison" column.
'''

type1 = pd.get_dummies(X[:,0])

type2 = pd.get_dummies(X[:,1])


In [7]:
typing = pd.concat([type1, type2], axis=1)

typing

Unnamed: 0,Type 1 Bug,Type 1 Dark,Type 1 Dragon,Type 1 Electric,Type 1 Fairy,Type 1 Fighting,Type 1 Fire,Type 1 Flying,Type 1 Ghost,Type 1 Grass,...,Type 2 Ghost,Type 2 Grass,Type 2 Ground,Type 2 Ice,Type 2 Normal,Type 2 Poison,Type 2 Psychic,Type 2 Rock,Type 2 Steel,Type 2 Water
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
979,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
980,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
981,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
982,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [8]:
stats = pok.loc[:,['Total','HP','Attack','Defense','Sp. Atk', 'Sp. Def', 'Speed','Heights (m)','Weight (kg)']]

stats

Unnamed: 0,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Heights (m),Weight (kg)
0,318,45,49,49,65,65,45,0.7,6.9
1,405,60,62,63,80,80,60,1.0,13.0
2,525,80,82,83,100,100,80,2.0,100.0
3,309,39,52,43,60,50,65,0.6,8.5
4,405,58,64,58,80,65,80,1.1,19.0
...,...,...,...,...,...,...,...,...,...
979,580,100,145,130,65,110,30,2.2,800.0
980,580,100,65,60,145,80,130,2.0,44.5
981,500,100,80,80,80,80,80,1.1,7.7
982,680,100,165,150,85,130,50,2.4,809.1


In [9]:
'''
Here I just merged the tables to create the big feature table for all the important information about each Pokémon.
It denotes their typing and has all their stats and physical featues like height and weight.
'''
Xf = pd.concat([typing, stats], axis=1)
Xf

Unnamed: 0,Type 1 Bug,Type 1 Dark,Type 1 Dragon,Type 1 Electric,Type 1 Fairy,Type 1 Fighting,Type 1 Fire,Type 1 Flying,Type 1 Ghost,Type 1 Grass,...,Type 2 Water,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Heights (m),Weight (kg)
0,0,0,0,0,0,0,0,0,0,1,...,0,318,45,49,49,65,65,45,0.7,6.9
1,0,0,0,0,0,0,0,0,0,1,...,0,405,60,62,63,80,80,60,1.0,13.0
2,0,0,0,0,0,0,0,0,0,1,...,0,525,80,82,83,100,100,80,2.0,100.0
3,0,0,0,0,0,0,1,0,0,0,...,0,309,39,52,43,60,50,65,0.6,8.5
4,0,0,0,0,0,0,1,0,0,0,...,0,405,58,64,58,80,65,80,1.1,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
979,0,0,0,0,0,0,0,0,0,0,...,0,580,100,145,130,65,110,30,2.2,800.0
980,0,0,0,0,0,0,0,0,1,0,...,0,580,100,65,60,145,80,130,2.0,44.5
981,0,0,0,0,0,0,0,0,0,0,...,0,500,100,80,80,80,80,80,1.1,7.7
982,0,0,0,0,0,0,0,0,0,0,...,0,680,100,165,150,85,130,50,2.4,809.1


In [10]:
type(Xf)

pandas.core.frame.DataFrame

In [11]:
X_final = Xf.to_numpy()
X_final

array([[0.000e+00, 0.000e+00, 0.000e+00, ..., 4.500e+01, 7.000e-01,
        6.900e+00],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 6.000e+01, 1.000e+00,
        1.300e+01],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 8.000e+01, 2.000e+00,
        1.000e+02],
       ...,
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 8.000e+01, 1.100e+00,
        7.700e+00],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 5.000e+01, 2.400e+00,
        8.091e+02],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 1.500e+02, 2.400e+00,
        5.360e+01]])

In [12]:
y = poke[:,2]

y = y.astype('int')

y

array([1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2,
       1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1,
       2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1,
       1, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 0, 1, 2, 1, 2, 1,
       2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2,
       1, 1, 1, 1, 0, 1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 1, 0, 0, 1, 2, 0, 0,
       1, 2, 2, 2, 1, 1, 2, 1, 2, 0, 2, 3, 3, 3, 1, 1, 2, 3, 4, 1, 1, 2,
       1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1,
       1, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 2, 2, 1,
       2, 1, 0, 2, 2, 1, 2, 0, 1, 2, 1, 2, 0, 2, 0, 0, 1, 1, 2, 1, 2, 1,
       1, 0, 1, 2, 0, 2, 0, 1, 2, 2, 1, 2, 1, 0, 0, 1, 2, 1, 1, 1, 0, 2,
       3, 3, 3, 1, 1, 2, 3, 3, 4, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2,
       1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2,
       1, 1, 2, 1, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2,

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test =\
        train_test_split(X_final, y, test_size=0.3, random_state=1, stratify=y)

In [14]:
'''
I decided to use a Random Forest Classifier because it's good at classification and handling data with
high-dimensionality. We're working with 40+ features here, so this is definitely the way to go.
'''

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100,max_depth=7)

model.fit(X_train,y_train)

RandomForestClassifier(max_depth=7)

In [15]:
y_predict = model.predict(X_test)

In [16]:
'''
These results are a lot better than I thought. That means the Random Forest Classifier was able to predict what
stage a Pokémon is currently at with 80% accuracy. We can see where it went wrong using a confusion matrix in the
next cell.
'''

accuracy_score(y_test, y_predict)

0.8141891891891891

In [18]:
'''
The results here are actually really interesting. So, the rows indicate the true labels while the columns are the
predicted labels. Knowing this, we can clearly see which classification the algorithm struggled with the most: Single
Staged Pokémon. Looking at the first row, it only guessed 3 of them correctly out of the 30 data points there were, 
giving it a 10% accuracy. Looking at the incorrect predictions, it seems like the forest couldn't decide whether
most single staged Pokémon were fully evolved or not (classifications 1 and 2). This makes a ton of since. Most single
stage Pokémon have somewhat middling stats. They tend to be stronger than non-fully evolved Pokémon, but not quite as
strong as a fully evolved Pokémon. Of course there are exceptions, but it makes sense why the machine struggled to 
classify them correctly since they lie in this "statistical limbo" of sorts. Another interesting aspect to note is 
that the three correct predictions for Single-Staged Pokémon were the ONLY predictions. No other classification
has an error that predicted single stage.

The machine was VERY accurate when predicting whether a Pokémon was not fully evolved or not. Looking at rows 1 and 2,
there are few mistakes and a lot of correct predictions. The machine predicted that 118 Pokémon were not fully evolved
correctly, and made the mistake of classifying some of them as fully evolved, which makes sense, since some not fully
evolved Pokémon are just as strong as some fully evolved Pokémon. The machine only made 3 mistakes when predicting 
fully evolved Pokémon, 2 as not fully evolved and 1 as a legendary. 

When it comes to the legendaries, it's a mixed bag. Pure legendaries were predicted correctly 14 out of 24 times, and
the 10 mistakes come from incorrect fully evolved predictions, which makes sense, as there are some weaker legendaries
and super strong fully evolved Pokémon, which puts them in the same ballpark. Interestingly enough, no legendaries 
were misclassified as mythicals. 

Mythical Pokémon had an accuracy of 50% with 4 out of 8 correct predictions.
'''
confusion_matrix(y_test, y_predict)



array([[  3,  11,  16,   0,   0],
       [  0, 118,  11,   0,   0],
       [  0,   2, 102,   1,   0],
       [  0,   0,  10,  14,   0],
       [  0,   0,   3,   1,   4]])

In [20]:
importances = model.feature_importances_
std = np.std([tree.feature_importances_ for tree in model.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

Feature ranking:
1. feature 37 (0.299417)
2. feature 44 (0.113274)
3. feature 42 (0.094877)
4. feature 38 (0.073321)
5. feature 45 (0.072391)
6. feature 39 (0.068332)
7. feature 40 (0.067298)
8. feature 41 (0.065594)
9. feature 43 (0.058951)
10. feature 3 (0.011511)
11. feature 18 (0.006262)


In [22]:
'''
When comparing this to the Legendary Predicter "most important" features, we see a lot of familiar ones. Total is still
the top decider, and Weight is in about the same place. However, Height became a lot more important and it totally 
makes sense. As a Pokémon evolves, it tends to get bigger, so it's a safe assumption to say that most small Pokémon
are not fully evolved. Also, height is important when making distinctions between fully evolved, legendaries, and 
mythical Pokémon too. Fully evolved Pokémon are normally medium sized, legendaries are humongous, and mythicals are
usually quite small. The data is consistent with actual trends seen in the Pokémon games.
'''
most_important = Xf.iloc[:,[37,44,42,38,45,39,40,41,43,3,18]]
most_important


Unnamed: 0,Total,Heights (m),Sp. Def,HP,Weight (kg),Attack,Defense,Sp. Atk,Speed,Type 1 Electric,Type 2 Blank
0,318,0.7,65,45,6.9,49,49,65,45,0,0
1,405,1.0,80,60,13.0,62,63,80,60,0,0
2,525,2.0,100,80,100.0,82,83,100,80,0,0
3,309,0.6,50,39,8.5,52,43,60,65,0,1
4,405,1.1,65,58,19.0,64,58,80,80,0,1
...,...,...,...,...,...,...,...,...,...,...,...
979,580,2.2,110,100,800.0,145,130,65,30,0,1
980,580,2.0,80,100,44.5,65,60,145,130,0,1
981,500,1.1,80,100,7.7,80,80,80,80,0,0
982,680,2.4,130,100,809.1,165,150,85,50,0,0
