Tarefa: utilizar Machine Learning para classificar Pokémon pelo tipo primário (Grama, Fogo etc.) a partir de seus atributos.

Importando as bibliotecas

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.utils import shuffle

Leitura do dataset

In [2]:
# Load the Pokémon stats table from a CSV located next to the notebook.
csv_path = "Pokemon.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)
df.head(n=5)  # preview the first five rows

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


Criar uma categoria numérica para o tipo secundário

In [3]:

# Encode the secondary type as an integer column (`Tipo2`).
# Single-type Pokémon have NaN in 'Type 2'; give them the explicit category "sem" ("none").
# Only that column is filled, so a missing value in any other column is not silently
# turned into a string (and no removed-in-NumPy-2.0 `np.NaN` alias is needed).
df['Type 2'] = df['Type 2'].fillna("sem")

# LabelEncoder assigns codes following the sorted order of the distinct labels.
le = preprocessing.LabelEncoder()
df['Tipo2'] = le.fit_transform(df['Type 2'])
df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Tipo2
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False,13
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False,13
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False,13
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False,13
4,4,Charmander,Fire,sem,309,39,52,43,60,50,65,1,False,18


Embaralhar o dataset

In [4]:
# Shuffle all rows before the positional train/test split.
# The seed is fixed so that "Restart & Run All" reproduces the same split and accuracies.
df = df.sample(frac=1, random_state=42)
df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Tipo2
662,601,Klinklang,Steel,sem,520,60,100,115,70,85,90,5,False,18
31,26,Raichu,Electric,sem,485,60,90,55,90,80,110,1,False,18
699,638,Cobalion,Steel,Fighting,580,91,90,129,90,72,108,5,True,5
102,94,GengarMega Gengar,Ghost,Poison,600,60,65,80,170,95,130,1,False,13
116,108,Lickitung,Normal,sem,385,90,55,75,60,75,30,1,False,18


In [5]:
# Target vector: the primary type label of every row.
y = df['Type 1'].to_numpy()

# Feature matrix: every column except the text ones, cast to int
# (the boolean `Legendary` flag therefore becomes 0/1).
# NOTE(review): the '#' id column is kept as a feature here - confirm that is intended.
colunas_texto = ['Name', 'Type 1', 'Type 2']
x = df.drop(columns=colunas_texto).astype(int).to_numpy()


In [6]:
# Positional hold-out split of the shuffled data:
# rows 0..699 are used for training, rows 700..799 for testing.
fatia_treino = slice(0, 700)
fatia_teste = slice(700, 800)

x_treino, y_treino = x[fatia_treino], y[fatia_treino]
x_teste, y_teste = x[fatia_teste], y[fatia_teste]

In [7]:
# Gaussian Naive Bayes baseline.
# GaussianNB and accuracy_score come from the notebook's top import cell, so this cell
# and the later model cells no longer depend on each other's imports.
clf = GaussianNB()
clf.fit(x_treino, y_treino)

# Fraction of held-out rows whose primary type is predicted exactly.
y_pred = clf.predict(x_teste)
y_true = y_teste
accuracy_score(y_true, y_pred)

0.19

In [8]:
# Decision tree baseline.
# min_samples_split=40 stops splitting nodes with fewer than 40 samples.
# random_state is fixed because scikit-learn permutes the features at every split, so
# equally good splits can otherwise be chosen differently from run to run.
# `tree` and `accuracy_score` come from the notebook's top import cell.
clf = tree.DecisionTreeClassifier(min_samples_split=40, random_state=42)
clf.fit(x_treino, y_treino)

# Fraction of held-out rows whose primary type is predicted exactly.
y_pred = clf.predict(x_teste)
y_true = y_teste
accuracy_score(y_true, y_pred)

0.23999999999999999

In [9]:

# Support Vector Machine (linear kernel) - DISABLED: every line of this cell is commented
# out, so it does not run and contributes no result.
# NOTE(review): the reason for disabling it is not recorded in the notebook - confirm
# before re-enabling or deleting.
#from sklearn import svm
#clf = svm.SVC(kernel = 'linear')
#clf.fit(x_treino, y_treino) 
#y_pred = clf.predict(x_teste)
#y_true = y_teste
#accuracy_score(y_true, y_pred)

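A tarefa é difícil para técnicas simples de Machine Learning, pois há pouca correlação entre o tipo e os pontos de atributo. Para referência: com 18 tipos primários, um palpite aleatório acertaria cerca de 5,6% das vezes (1/18); as acurácias registradas nesta execução (0,19 com Naive Bayes e 0,24 com árvore de decisão) ficam acima disso, mas ainda são baixas. Talvez com redes neurais profundas (deep neural networks) os resultados melhorem.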

...

Algumas análises exploratórias:

In [10]:
# Add a numeric code column (`Tipo`) for the primary type.
# No NaN replacement is done here: the missing secondary types were already filled with
# "sem" earlier in the notebook, and filling leftovers with the literal "Tipo" would only
# inject a bogus value (besides relying on `np.NaN`, which NumPy 2.0 removed).
# `le` is kept as a notebook-level name because a later cell reads `le.classes_`.
le = preprocessing.LabelEncoder()
df['Tipo'] = le.fit_transform(df['Type 1'])
df.head()


Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Tipo2,Tipo
662,601,Klinklang,Steel,sem,520,60,100,115,70,85,90,5,False,18,16
31,26,Raichu,Electric,sem,485,60,90,55,90,80,110,1,False,18,3
699,638,Cobalion,Steel,Fighting,580,91,90,129,90,72,108,5,True,5,16
102,94,GengarMega Gengar,Ghost,Poison,600,60,65,80,170,95,130,1,False,13,8
116,108,Lickitung,Normal,sem,385,90,55,75,60,75,30,1,False,18,12


In [11]:
# Add one-hot indicator columns, one per primary type.
# dtype=float pins the indicators to 0.0/1.0 regardless of the pandas version
# (recent versions default to bool), which the plots and correlations below rely on.
dummy_variable_1 = pd.get_dummies(df["Type 1"], dtype=float)

# Drop indicator columns left by a previous run of this cell so that re-executing it
# does not append duplicate columns.
df = pd.concat(
    [df.drop(columns=dummy_variable_1.columns, errors="ignore"), dummy_variable_1],
    axis=1,
)
df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,...,Ghost,Grass,Ground,Ice,Normal,Poison,Psychic,Rock,Steel,Water
662,601,Klinklang,Steel,sem,520,60,100,115,70,85,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
31,26,Raichu,Electric,sem,485,60,90,55,90,80,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
699,638,Cobalion,Steel,Fighting,580,91,90,129,90,72,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
102,94,GengarMega Gengar,Ghost,Poison,600,60,65,80,170,95,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
116,108,Lickitung,Normal,sem,385,90,55,75,60,75,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


Tipos primários

In [12]:

# Primary type names; a name's position in the list is its integer code in `Tipo`.
Tipos = [*le.classes_]
for codigo, nome in enumerate(Tipos):
    print(codigo, nome)
Tipos

0 Bug
1 Dark
2 Dragon
3 Electric
4 Fairy
5 Fighting
6 Fire
7 Flying
8 Ghost
9 Grass
10 Ground
11 Ice
12 Normal
13 Poison
14 Psychic
15 Rock
16 Steel
17 Water


['Bug',
 'Dark',
 'Dragon',
 'Electric',
 'Fairy',
 'Fighting',
 'Fire',
 'Flying',
 'Ghost',
 'Grass',
 'Ground',
 'Ice',
 'Normal',
 'Poison',
 'Psychic',
 'Rock',
 'Steel',
 'Water']

O tipo primário de código 2 na coluna `Tipo` (Dragon/Dragão) se sobressai na média do total de pontos

In [None]:
# Mean `Total` for each primary-type code.
# `Tipo` is a nominal label code (codes follow the alphabetical order of the type names),
# so no regression line is fitted: its slope would only reflect that arbitrary ordering.
sns.regplot(x="Tipo", y="Total", data=df, x_estimator=np.mean, fit_reg=False)

Os tipos de código 15 e 16 (Rock e Steel) têm defesa média muito maior que a dos outros tipos

In [None]:
# Mean `Defense` for each primary-type code.
# `Tipo` is a nominal label code (codes follow the alphabetical order of the type names),
# so no regression line is fitted: its slope would only reflect that arbitrary ordering.
sns.regplot(x="Tipo", y="Defense", data=df, x_estimator=np.mean, fit_reg=False)

In [None]:
# Compare Defense between Rock / non-Rock and Steel / non-Steel Pokémon.
# Work on a float copy so plotting and correlation behave the same whether the indicator
# columns are stored as bool, integer or float.
indicadores = df[["Rock", "Steel", "Defense"]].astype(float)

# One panel per type: drawn on a single shared axes the two fits overlap and only the
# last x-label survives.
fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=True)
for ax, tipo in zip(axes, ("Rock", "Steel")):
    sns.regplot(x=tipo, y="Defense", data=indicadores, x_estimator=np.mean, ax=ax)
    ax.set_title(tipo)
plt.show()

# Correlation between each indicator and Defense (Pearson is the pandas default).
print(indicadores[["Steel", "Defense"]].corr())
print(indicadores[["Rock", "Defense"]].corr('spearman'))
print(indicadores[["Rock", "Defense"]].corr('pearson'))
print(indicadores[["Rock", "Defense"]].corr('kendall'))

In [None]:
# Per-type mean of every numeric attribute.
# numeric_only=True skips the text columns (Name, Type 1, Type 2); without it,
# pandas >= 2.0 raises instead of silently dropping them.
df.groupby(by="Tipo").mean(numeric_only=True)

In [None]:
# Pairwise correlation table of the numeric columns.
# numeric_only=True skips the text columns (Name, Type 1, Type 2); without it,
# pandas >= 2.0 raises when it tries to convert them to float.
df.corr(numeric_only=True)

In [None]:
# Summary statistics of the DataFrame as returned by pandas' default describe().
df.describe()