# Exploring Classification Methods with Pokemon

**Classification Alogirthims**
- Decision Tree
- Naive Bayes
- Random Forest Algorithim

### Import Preliminaries

In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.metrics import accuracy_score 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import KFold

from keras import models
from keras import layers
from keras import optimizers
from sklearn.model_selection import cross_val_score

from sklearn.naive_bayes import GaussianNB
warnings.filterwarnings('ignore')

Using TensorFlow backend.


### Import Data

In [7]:
# Import Data
df = pd.read_csv('Data/pokemon.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 13 columns):
#             800 non-null int64
Name          800 non-null object
Type 1        800 non-null object
Type 2        414 non-null object
Total         800 non-null int64
HP            800 non-null int64
Attack        800 non-null int64
Defense       800 non-null int64
Sp. Atk       800 non-null int64
Sp. Def       800 non-null int64
Speed         800 non-null int64
Generation    800 non-null int64
Legendary     800 non-null bool
dtypes: bool(1), int64(9), object(3)
memory usage: 75.9+ KB


In [9]:
# Cleaning Data
print(f"DataFrame Shape: {df.shape[0],df.shape[1]}")
cdf = df[df['Legendary'] == False]
cdf[['Type 1', 'Type 2']] = cdf[['Type 1', 'Type 2']].fillna(value='')
cdf['Type'] = cdf.loc[:,'Type 1'] + " " + cdf.loc[:,'Type 2']
cdf = cdf.drop(['Legendary','Name','#','Type 1','Type 2'], axis=1)
cdf['Type'] =  cdf['Type'].astype('category').cat.codes
#cdf['Type 2'] =  cdf['Type 2'].astype('category').cat.codes
print(f"DataFrame Shape: {cdf.shape[0],cdf.shape[1]}")

DataFrame Shape: (800, 13)
DataFrame Shape: (735, 9)


In [10]:
train_y = cdf.Type.values
train_x = cdf.drop(['Type'], axis=1).values

### Decision Tree

In [11]:
model = DecisionTreeClassifier()
model.fit(train_x, train_y)
y_pred = model.predict(train_x)

crossvalidation = KFold(n_splits=10, random_state=1)
scores = cross_val_score(model, train_x, train_y, scoring ='accuracy', 
                         cv=crossvalidation, n_jobs =1)

print('Folds: %i, accuracy: %.4f std: %.2f' % 
      (len(scores), np.mean(np.abs(scores)), np.std(scores)))

Folds: 10, accuracy: 0.0652 std: 0.04


### Naive Bayes

In [12]:
model = GaussianNB()
model.fit(train_x, train_y)
y_pred = model.predict(train_x)

crossvalidation = KFold(n_splits=10, random_state=1)
scores = cross_val_score(model, train_x, train_y, scoring ='accuracy', 
                         cv=crossvalidation, n_jobs =1)

print('Folds: %i, accuracy: %.8f std: %.2f' % 
      (len(scores), np.mean(np.abs(scores)), np.std(scores)))

Folds: 10, accuracy: 0.04616809 std: 0.03


### Random Forest

In [13]:

model = RandomForestClassifier()
model.fit(train_x, train_y)
y_pred = model.predict(train_x)

crossvalidation = KFold(n_splits=10, random_state=1)
scores = cross_val_score(model, train_x, train_y, scoring ='accuracy', 
                         cv=crossvalidation, n_jobs =1)

print('Folds: %i, accuracy: %.4f std: %.2f' % 
      (len(scores), np.mean(np.abs(scores)), np.std(scores)))

Folds: 10, accuracy: 0.0816 std: 0.03


### CNN Classifier

In [14]:
def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1
    return results
  
def to_one_hot(labels, dimension=46):
    results = np.zeros((len(labels), dimension))
    for i, label in enumerate(labels):
        results[i, labels] = 1
    return results

In [15]:
from keras.utils.np_utils import to_categorical
train_y = to_categorical(train_y)

print(f'Training Data Shape: {train_x.shape[0],train_x.shape[1]}')
print(f'Training Data Shape: {train_y.shape[0],train_y.shape[1]}')

Training Data Shape: (735, 8)
Training Data Shape: (735, 139)


In [18]:
model = models.Sequential()
model.add(layers.Dense(4, activation='relu',input_shape=(train_x.shape[1],)))
model.add(layers.Dense(139, activation='relu'))
model.add(layers.Dense(train_y.shape[1], activation='softmax'))

model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
             metrics=['accuracy'])

model.fit(train_x, train_y, epochs=20, batch_size=10, verbose=0)

<keras.callbacks.History at 0x1a250cd780>

In [19]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 4)                 36        
_________________________________________________________________
dense_5 (Dense)              (None, 139)               695       
_________________________________________________________________
dense_6 (Dense)              (None, 139)               19460     
Total params: 20,191
Trainable params: 20,191
Non-trainable params: 0
_________________________________________________________________


In [33]:
from keras.wrappers.scikit_learn import KerasClassifier

skmodel = KerasClassifier(build_fn=model, 
                                 epochs=10, 
                                 batch_size=100, 
                                 verbose=0)

# Evaluate neural network using three-fold cross-validation
cross_val_score(skmodel, train_x, train_y, cv=10, n_jobs=1)

TypeError: can't pickle _thread.RLock objects