# Predicting Pokemon Types
- Encode categorical features that will be used as predictor and response variables.
- Try out different models and see which ones have the best accuracy scores out of the box.
- Pick one or two of the best performers and tune the models to improve accuracy.

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from skmultilearn.problem_transform import LabelPowerset
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Prevent warnings from appearing
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Lift pandas' display limits so printed dataframes show every row and every column
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [4]:
# Load the merged pokedex and replace every missing value with the string 'None';
# later cells compare against that string to detect absent secondary types / egg groups
pokedex = pd.read_csv('../data/pokedex_merged.csv')
pokedex = pokedex.where(pokedex.notna(), 'None')

## Feature Engineering
1. Need to encode both primary and secondary types to use as response variables.
2. Need to encode egg groups to use as predictor variables.

In [5]:
# Collect each Pokemon's types and egg groups into per-row lists and store them as the
# new 'type' and 'egg' columns of pokedex (the input format MultiLabelBinarizer expects).
# Missing values were replaced with the string 'None' when the CSV was loaded.

# Types: always keep the primary type, add the secondary type only when present
types_list_of_lists = []
for primary, secondary in zip(pokedex['primary_type'], pokedex['secondary_type']):
    if secondary == 'None':
        types_list_of_lists.append([primary])
    else:
        types_list_of_lists.append([primary, secondary])

# Egg groups: a Pokemon may have two, one, or no egg group at all.
# The no-group case must yield an empty list; previously a missing `else` overwrote the
# empty list with ['None'], turning the placeholder into a spurious egg-group label.
egg_list_of_lists = []
for group_1, group_2 in zip(pokedex['egg_group_1'], pokedex['egg_group_2']):
    if group_2 != 'None':
        egg_list = [group_1, group_2]
    elif group_1 != 'None':
        egg_list = [group_1]
    else:
        egg_list = []
    egg_list_of_lists.append(egg_list)

pokedex['type'] = types_list_of_lists
pokedex['egg'] = egg_list_of_lists

In [6]:
# One binarizer per label family, so each keeps its own classes_ for naming columns later
mlb, mlb2 = MultiLabelBinarizer(), MultiLabelBinarizer()

# Indicator matrices: one row per Pokemon, one column per distinct type / egg group
type_mlb = mlb.fit_transform(pokedex['type'])
egg_mlb = mlb2.fit_transform(pokedex['egg'])

In [7]:
# Wrap both indicator matrices in DataFrames whose columns are named after the fitted
# classes, then append them to pokedex (join aligns on the row index).
type_indicators = pd.DataFrame(type_mlb, columns=list(mlb.classes_))
egg_indicators = pd.DataFrame(egg_mlb, columns=list(mlb2.classes_))

pokedex = pokedex.join(type_indicators)
# Egg-group columns whose name clashes with an existing column get an '_egg' suffix
pokedex = pokedex.join(egg_indicators, rsuffix='_egg')

## Models to Build and Test
1. Decision Tree 
2. K-nearest Neighbors
3. Logistic Regression
4. Multinomial Naive Bayes
5. Neural Network
6. Random Forest
7. Support Vector Machines

In [8]:
def build_model(model, mlb_estimator, X_train, y_train, X_test):
    """Wrap ``model`` in a multi-label estimator, train it and predict on the test set.

    Parameters
    ----------
    model : base estimator handed to ``mlb_estimator``.
    mlb_estimator : callable that takes ``model`` and returns an object exposing
        ``fit(X, y)`` and ``predict(X)`` (e.g. ``LabelPowerset``).
    X_train, y_train : training predictors and responses passed to ``fit``.
    X_test : predictors passed to ``predict``.

    Returns
    -------
    Whatever the wrapped estimator's ``predict`` returns for ``X_test``.
    """
    wrapped = mlb_estimator(model)
    wrapped.fit(X_train, y_train)
    return wrapped.predict(X_test)

In [9]:
# Settings for the train/test split; `seed` is reused later by the randomized search
tst_size = 0.3
seed = 9815

# Select predictor (X) and response (y) columns by position.
# NOTE(review): these slices depend on the exact column order of the CSV plus the joined
# indicator columns — presumably 38:56 are the 18 type indicators; verify before reuse.
column_names = list(pokedex.columns)
predictor_columns = column_names[17:35] + column_names[56:]
response_columns = column_names[38:56]
X = pokedex[predictor_columns]
y = pokedex[response_columns]

# Hold out `tst_size` of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=tst_size, random_state=seed
)

In [10]:
# Fit each out-of-the-box classifier on the training set (wrapped in LabelPowerset)
# and keep its predictions for the test set
def _powerset_predictions(model):
    """Train ``model`` through LabelPowerset on the training split and predict X_test."""
    return build_model(model, LabelPowerset, X_train, y_train, X_test)

dec_tree_pred = _powerset_predictions(DecisionTreeClassifier())
knn_pred = _powerset_predictions(KNeighborsClassifier())
logreg_pred = _powerset_predictions(LogisticRegression())
multinb_pred = _powerset_predictions(MultinomialNB())
nn_pred = _powerset_predictions(MLPClassifier())
rand_forest_pred = _powerset_predictions(RandomForestClassifier())
svc_pred = _powerset_predictions(SVC())

In [11]:
# Score every baseline model on the held-out set and print accuracy and macro F1
def _score_and_report(model_label, predictions):
    """Print accuracy and macro F1 (as percentages) for one model and return both."""
    accuracy_pct = round(accuracy_score(y_test, predictions) * 100, 2)
    f1_pct = round(f1_score(y_test, predictions, average='macro') * 100, 2)
    print('The accuracy score we have achieved using ' + model_label + ' is: ' + str(accuracy_pct) + '%')
    print('The F1-score we have achieved using ' + model_label + ' is: ' + str(f1_pct) + '%')
    return accuracy_pct, f1_pct

dec_tree_acc, dec_tree_f1 = _score_and_report('Decision Tree', dec_tree_pred)
print('\n')

knn_acc, knn_f1 = _score_and_report('K Nearest Neighbors', knn_pred)
print('\n')

logreg_acc, logreg_f1 = _score_and_report('Logistic Regression', logreg_pred)
print('\n')

multinb_acc, multinb_f1 = _score_and_report('Multinomial Naive Bayes', multinb_pred)
print('\n')

nn_acc, nn_f1 = _score_and_report('Neural Network', nn_pred)
print('\n')

rand_forest_acc, rand_forest_f1 = _score_and_report('Random Forest', rand_forest_pred)
print('\n')

# No trailing blank lines after the last model, matching the original output
svc_acc, svc_f1 = _score_and_report('Support Vector Classification', svc_pred)

The accuracy score we have achieved using Decision Tree is: 51.7%
The F1-score we have achieved using Decision Tree is: 68.85%


The accuracy score we have achieved using K Nearest Neighbors is: 44.9%
The F1-score we have achieved using K Nearest Neighbors is: 70.44%


The accuracy score we have achieved using Logistic Regression is: 59.52%
The F1-score we have achieved using Logistic Regression is: 77.56%


The accuracy score we have achieved using Multinomial Naive Bayes is: 53.06%
The F1-score we have achieved using Multinomial Naive Bayes is: 75.83%


The accuracy score we have achieved using Neural Network is: 63.61%
The F1-score we have achieved using Neural Network is: 80.63%


The accuracy score we have achieved using Random Forest is: 61.9%
The F1-score we have achieved using Random Forest is: 79.12%


The accuracy score we have achieved using Support Vector Classification is: 49.32%
The F1-score we have achieved using Support Vector Classification is: 72.52%


### Results from Different Models
#### Conclusion:
- Neural Network is consistently the best-performing model for predicting Pokemon types across multiple seeds. Will tune the hyperparameters of the Neural Network model to improve accuracy.

## Model Tuning
### Will tune the following hyperparameters for the Neural Network model:
- Number of Hidden Layers and Neurons per Layer (hidden_layer_sizes) = Keep a single hidden layer but vary the number of neurons to see if we can better fit the data and improve accuracy.
- Activation Function (activation) = Since this is a non-linear classification, try out different activation functions to manipulate the weights as they are leaving neurons.
- Solver = Choose algorithm for weight optimization across nodes.
- Learning rate (learning_rate and learning_rate_init) = Stabilize the training process by picking a learning rate that helps the network converge. It needs to be low enough that training converges to something useful, but high enough that training doesn't take a lot of time.
- Momentum (momentum) = Control the speed of gradient descent (only used when the solver is 'sgd'). Improve training time while maintaining accuracy.
- Number of epochs (max_iter) = Purpose is to increase the number of times the whole training set is shown to the network while training.
- Batch size (batch_size) = Control the size of the minibatches used to train on the dataset.

In [17]:
# Candidate values for each MLPClassifier hyperparameter sampled by the randomized search
param_grid = dict(
    hidden_layer_sizes=[(neurons,) for neurons in (100, 200, 300)],  # one hidden layer
    activation=['identity', 'logistic', 'tanh', 'relu'],
    solver=['sgd', 'adam'],
    learning_rate=['constant', 'adaptive', 'invscaling'],
    learning_rate_init=[0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
    momentum=[0.5, 0.6, 0.7, 0.8, 0.9],
    max_iter=[200, 1000, 5000, 10000],
    batch_size=[2 ** exponent for exponent in range(5, 9)],  # 32, 64, 128, 256
)

In [18]:
# Base network whose hyperparameters the randomized search will vary
nn = MLPClassifier()

# NOTE(review): the saved output of the fit cell reports "2 folds for each of 100
# candidates", which does not match n_iter=50 / cv=3 below — re-run to refresh it.
search_settings = dict(
    n_iter=50,          # parameter combinations sampled from param_grid
    cv=3,               # cross-validation folds per combination
    verbose=2,
    random_state=seed,  # same seed as the train/test split
    n_jobs=-1,
)
nn_random = RandomizedSearchCV(estimator=nn,
                               param_distributions=param_grid,
                               **search_settings)

In [19]:
# Run the randomized search through LabelPowerset on the training split.
# NOTE(review): nn_random.best_params_ is read afterwards, so this presumably fits
# nn_random itself rather than a copy — confirm for the skmultilearn version in use.
powerset_search = LabelPowerset(nn_random)
powerset_search.fit(X_train, y_train);

Fitting 2 folds for each of 100 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed: 12.0min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 21.5min finished


In [20]:
# Show the hyperparameter combination selected by the randomized search
nn_random.best_params_

{'solver': 'adam',
 'momentum': 0.8,
 'max_iter': 200,
 'learning_rate_init': 0.0003,
 'learning_rate': 'adaptive',
 'hidden_layer_sizes': (200,),
 'batch_size': 32,
 'activation': 'logistic'}

In [21]:
# Refit a network with the best hyperparameters found by the search and score it on the
# held-out test set (overwrites the baseline nn_pred / nn_acc / nn_f1 values)
tuned_nn = MLPClassifier(**nn_random.best_params_)
nn_pred = build_model(tuned_nn, LabelPowerset, X_train, y_train, X_test)

nn_accuracy_raw = accuracy_score(y_test, nn_pred)
nn_f1_raw = f1_score(y_test, nn_pred, average='macro')
nn_acc = round(nn_accuracy_raw * 100, 2)
nn_f1 = round(nn_f1_raw * 100, 2)

print('The accuracy score we have achieved using Neural Network is: ' + str(nn_acc) + '%')
print('The F1-score we have achieved using Neural Network is: ' + str(nn_f1) + '%')
print('\n')

The accuracy score we have achieved using Neural Network is: 62.59%
The F1-score we have achieved using Neural Network is: 80.51%


