In [1]:
# See module unit 17.2.3 for original example

#At this point, we would have had a clean data set and functioning database
#Goal: Create a classification supervised machine learning model
#Data pre-processing----- to clean data, we need to encode the data into binary values (?) by encoding the data and then scaling and normalizing it
#See https://courses.bootcampspot.com/courses/532/pages/17-dot-6-1-encode-labels-with-pandas?module_item_id=147211 


In [2]:
# import libraries
import numpy as np
import pandas as pd
import joblib

# database
from config import pokedb_password

# machine learning libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

# Database Setup

In [3]:
#Reference: https://courses.bootcampspot.com/courses/532/pages/9-dot-1-5-getting-started-with-sqlalchemy?module_item_id=145577

#Create Engine
db_string = f"postgresql://postgres:{pokedb_password}@localhost:5432/pokemon_421_db"
engine = create_engine(db_string)

# reflect an existing database into a new model
Base = automap_base()
# reflect the tables
Base.prepare(engine, reflect=True)

In [4]:
Base.classes.keys()

['poke_data', 'poketypes', 'pokeagainst']

In [5]:
#Save References to each table
poke_data = Base.classes.poke_data

In [6]:
#Create session link from Python to the DB
session = Session(engine)

In [7]:
##########################
## Ready to query data! ##
##########################

In [8]:
#Test query
poke_data_results = []
poke_data_results = session.query(
    poke_data.english_name,
    poke_data.pokedex_number,
    poke_data.generation,
    poke_data.status,
    poke_data.species,
    poke_data.type_number,
    poke_data.type_1,
    poke_data.type_2,
    poke_data.height_m,
    poke_data.weight_kg,
    poke_data.abilities_number,
    poke_data.ability_1,
    poke_data.ability_2,
    poke_data.ability_hidden,
    poke_data.total_points,
    poke_data.hp,
    poke_data.attack,
    poke_data.defense,
    poke_data.sp_attack,
    poke_data.sp_defense,
    poke_data.speed,
    poke_data.catch_rate,
    poke_data.base_friendship,
    poke_data.base_experience,
    poke_data.growth_rate,
    poke_data.egg_type_number,
    poke_data.egg_type_1,
    poke_data.egg_type_2,
    poke_data.percentage_male,
    poke_data.egg_cycles,
    poke_data.against_normal,
    poke_data.against_fire,
    poke_data.against_water,
    poke_data.against_electric,
    poke_data.against_grass,
    poke_data.against_ice,
    poke_data.against_fight,
    poke_data.against_poison,
    poke_data.against_ground,
    poke_data.against_flying,
    poke_data.against_psychic,
    poke_data.against_bug,
    poke_data.against_rock,
    poke_data.against_ghost,
    poke_data.against_dragon,
    poke_data.against_dark,
    poke_data.against_steel,
    poke_data.against_fairy,
)
print(poke_data_results.all())



In [9]:
# Create a dataframe with all data
poke_df = pd.DataFrame(poke_data_results, columns=[
    'english_name',
    'pokedex_number',
    'generation',
    'status',
    'species',
    'type_number',
    'type_1',
    'type_2',
    'height_m',
    'weight_kg',
    'abilities_number',
    'ability_1',
    'ability_2',
    'ability_hidden',
    'total_points',
    'hp',
    'attack',
    'defense',
    'sp_attack',
    'sp_defense',
    'speed',
    'catch_rate',
    'base_friendship',
    'base_experience',
    'growth_rate',
    'egg_type_number',
    'egg_type_1',
    'egg_type_2',
    'percentage_male',
    'egg_cycles',
    'against_normal',
    'against_fire',
    'against_water',
    'against_electric',
    'against_grass',
    'against_ice',
    'against_fight',
    'against_poison',
    'against_ground',
    'against_flying',
    'against_psychic',
    'against_bug',
    'against_rock',
    'against_ghost',
    'against_dragon',
    'against_dark',
    'against_steel',
    'against_fairy'
])
poke_df

Unnamed: 0,english_name,pokedex_number,generation,status,species,type_number,type_1,type_2,height_m,weight_kg,...,against_ground,against_flying,against_psychic,against_bug,against_rock,against_ghost,against_dragon,against_dark,against_steel,against_fairy
0,Rattata,19,1,Normal,Mouse Pokémon,1,Normal,no_type_2,0.3,3.5,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
1,Caterpie,10,1,Normal,Worm Pokémon,1,Bug,no_type_2,0.3,2.9,...,0.5,2.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0
2,Psyduck,54,1,Normal,Duck Pokémon,1,Water,no_type_2,0.8,19.6,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.5,1.0
3,Golduck,55,1,Normal,Duck Pokémon,1,Water,no_type_2,1.7,76.6,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.5,1.0
4,Mankey,56,1,Normal,Pig Monkey Pokémon,1,Fighting,no_type_2,0.5,28.0,...,1.0,2.0,2.0,0.5,0.5,1.0,1.0,0.5,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1040,Urshifu Rapid Strike Style,892,8,Sub Legendary,Wushu Pokémon,2,Fighting,Water,1.9,105.0,...,1.0,2.0,2.0,0.5,0.5,1.0,1.0,0.5,0.5,2.0
1041,Zarude,893,8,Mythical,Rogue Monkey Pokémon,2,Dark,Grass,1.8,70.0,...,0.5,2.0,0.0,4.0,1.0,0.5,1.0,0.5,1.0,2.0
1042,Calyrex,898,8,Legendary,King Pokémon,2,Psychic,Grass,1.1,7.7,...,0.5,2.0,0.5,4.0,1.0,2.0,1.0,2.0,1.0,1.0
1043,Calyrex Ice Rider,898,8,Legendary,High King Pokémon,2,Psychic,Ice,2.4,809.1,...,1.0,1.0,0.5,2.0,2.0,2.0,1.0,2.0,2.0,1.0


## Clean data for machine learning module

In [10]:
# Ensure no null values

## Which columns have null values
poke_df.isnull().sum()

english_name          0
pokedex_number        0
generation            0
status                0
species               0
type_number           0
type_1                0
type_2                0
height_m              0
weight_kg             1
abilities_number      0
ability_1             0
ability_2             0
ability_hidden        0
total_points          0
hp                    0
attack                0
defense               0
sp_attack             0
sp_defense            0
speed                 0
catch_rate           18
base_friendship     115
base_experience     120
growth_rate           0
egg_type_number       0
egg_type_1            3
egg_type_2          760
percentage_male     173
egg_cycles            1
against_normal        0
against_fire          0
against_water         0
against_electric      0
against_grass         0
against_ice           0
against_fight         0
against_poison        0
against_ground        0
against_flying        0
against_psychic       0
against_bug     

In [11]:
# Change null values

poke_df['weight_kg'] = poke_df['weight_kg'].fillna(0)
poke_df['catch_rate'] = poke_df['catch_rate'].fillna(0)
poke_df['base_friendship'] = poke_df['base_friendship'].fillna(0)
poke_df['base_experience'] = poke_df['base_experience'].fillna(0)
poke_df['egg_type_1'] = poke_df['egg_type_1'].fillna('no_egg_type_1')
poke_df['egg_type_2'] = poke_df['egg_type_2'].fillna('no_egg_type_2')
poke_df['percentage_male'] = poke_df['percentage_male'].fillna(0)
poke_df['egg_cycles'] = poke_df['egg_cycles'].fillna(0)

In [12]:
# Confirm no more null values

poke_df.isnull().sum()

english_name        0
pokedex_number      0
generation          0
status              0
species             0
type_number         0
type_1              0
type_2              0
height_m            0
weight_kg           0
abilities_number    0
ability_1           0
ability_2           0
ability_hidden      0
total_points        0
hp                  0
attack              0
defense             0
sp_attack           0
sp_defense          0
speed               0
catch_rate          0
base_friendship     0
base_experience     0
growth_rate         0
egg_type_number     0
egg_type_1          0
egg_type_2          0
percentage_male     0
egg_cycles          0
against_normal      0
against_fire        0
against_water       0
against_electric    0
against_grass       0
against_ice         0
against_fight       0
against_poison      0
against_ground      0
against_flying      0
against_psychic     0
against_bug         0
against_rock        0
against_ghost       0
against_dragon      0
against_da

# Data Preprocessing for Machine Learning

In [13]:
#Drop columns irrelevant to machine learning
poke_df = poke_df.drop(columns=['english_name','pokedex_number', 'generation','type_number','abilities_number'])

In [14]:
#Define the dependent variable (variable Y) and independent variables (x). Note: Our Y-variable is a categorical variable.

y = poke_df['type_1']
x = poke_df.drop(columns=['type_1'])

In [15]:
# Ensure no mix of string and number values in any column

def ismixed(a):
    try:
        max(a)
        return False
    except TypeError as e: # we take this to imply mixed type
        msg, fst, and_, snd = str(e).rsplit(' ', 3)
        assert msg=="'>' not supported between instances of"
        assert and_=="and"
        assert fst!=snd
        return True
    except ValueError as e: # catch empty arrays
        assert str(e)=="max() arg is an empty sequence"
        return False
    
for column in list(x.columns):
    print(ismixed(x[column]),column)

False status
False species
False type_2
False height_m
False weight_kg
False ability_1
False ability_2
False ability_hidden
False total_points
False hp
False attack
False defense
False sp_attack
False sp_defense
False speed
False catch_rate
False base_friendship
False base_experience
False growth_rate
False egg_type_number
False egg_type_1
False egg_type_2
False percentage_male
False egg_cycles
False against_normal
False against_fire
False against_water
False against_electric
False against_grass
False against_ice
False against_fight
False against_poison
False against_ground
False against_flying
False against_psychic
False against_bug
False against_rock
False against_ghost
False against_dragon
False against_dark
False against_steel
False against_fairy


In [16]:
# Encode all data

oe = OrdinalEncoder()
x_enc = oe.fit_transform(x)

In [17]:
####################################
# Create Training and Testing Sets #
####################################
#Split data into X and Y train and test sets using sklearn.model's train_test_split method. a total of 4 sets.

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_enc,
   y,  random_state=42)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(783, 42)
(262, 42)
(783,)
(262,)


In [18]:
x_train

array([[  2.,  47.,  18., ...,   2.,   2.,   3.],
       [  1., 575.,  18., ...,   2.,   2.,   3.],
       [  2.,  88.,   6., ...,   4.,   2.,   2.],
       ...,
       [  2., 367.,  18., ...,   4.,   3.,   3.],
       [  2., 297.,  18., ...,   2.,   2.,   3.],
       [  2., 282.,  18., ...,   1.,   3.,   4.]])

# Training Machine Learning Model


In [19]:
#Nearest Neighbor : https://scikit-learn.org/stable/modules/neighbors.html
#from sklearn.neighbors import NearestNeighbors

In [20]:
####################
# Random Forests #
####################

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report



In [21]:
# Create a random forest classifier.
#grid search cv 

rf_model = RandomForestClassifier(n_estimators=128, random_state=78)  

## There are 651 unique species of pokemon. Does that change the n_estimators?

In [22]:
x_enc

array([[  2., 375.,  18., ...,   2.,   3.,   3.],
       [  2., 647.,  18., ...,   2.,   3.,   3.],
       [  2., 173.,  18., ...,   2.,   2.,   3.],
       ...,
       [  0., 306.,   9., ...,   4.,   3.,   3.],
       [  0., 268.,  11., ...,   4.,   5.,   3.],
       [  0., 268.,   8., ...,   6.,   3.,   3.]])

In [23]:
# Fitting the model
rf_model = rf_model.fit(x_train, y_train)

In [24]:
# Making predictions using the testing data.
rf_predictions = rf_model.predict(x_test)
print(rf_predictions)

['Bug' 'Water' 'Fighting' 'Psychic' 'Bug' 'Bug' 'Normal' 'Water' 'Psychic'
 'Ghost' 'Normal' 'Water' 'Psychic' 'Grass' 'Electric' 'Poison' 'Rock'
 'Steel' 'Dragon' 'Water' 'Electric' 'Poison' 'Bug' 'Grass' 'Water'
 'Normal' 'Normal' 'Normal' 'Rock' 'Dark' 'Fire' 'Bug' 'Bug' 'Fire' 'Bug'
 'Water' 'Psychic' 'Normal' 'Rock' 'Poison' 'Normal' 'Grass' 'Fighting'
 'Electric' 'Fire' 'Water' 'Ice' 'Water' 'Normal' 'Normal' 'Water'
 'Ground' 'Bug' 'Ground' 'Bug' 'Water' 'Dragon' 'Rock' 'Rock' 'Grass'
 'Grass' 'Fighting' 'Grass' 'Fire' 'Fire' 'Water' 'Dark' 'Fighting'
 'Ground' 'Normal' 'Psychic' 'Rock' 'Bug' 'Psychic' 'Normal' 'Rock' 'Dark'
 'Water' 'Rock' 'Fire' 'Fighting' 'Water' 'Water' 'Water' 'Grass'
 'Electric' 'Dark' 'Grass' 'Ice' 'Steel' 'Steel' 'Electric' 'Poison'
 'Grass' 'Steel' 'Fire' 'Fire' 'Fairy' 'Ground' 'Poison' 'Grass' 'Poison'
 'Water' 'Dark' 'Ground' 'Bug' 'Water' 'Fighting' 'Normal' 'Electric'
 'Water' 'Rock' 'Rock' 'Grass' 'Fighting' 'Grass' 'Electric' 'Water'
 'Grass' 'Ps

In [25]:
# Calculating the confusion matrix.
## pass either a dictionary of arrays or an array of dictionaries as rf_cm


rf_cm = confusion_matrix(y_test, rf_predictions)
#rf_cm = {rf_cm[0]}


# Create a DataFrame from the confusion matrix.
rf_cm_df = pd.DataFrame(
    rf_cm)

rf_cm_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0
2,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,15,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,1,0,0,0,13,0,0,0,0,0,0,0,0,1,0,0,1
6,0,0,0,0,0,0,13,0,0,0,0,0,0,0,0,1,0,0
7,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,10,1,1,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,20,0,0,0,0,0,0,0,0


In [26]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, rf_predictions)
print(acc_score)

0.9236641221374046


In [27]:
# Displaying results
print("Confusion Matrix")
display(rf_cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, rf_predictions))

Confusion Matrix


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0
2,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,15,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,1,0,0,0,13,0,0,0,0,0,0,0,0,1,0,0,1
6,0,0,0,0,0,0,13,0,0,0,0,0,0,0,0,1,0,0
7,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,10,1,1,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,20,0,0,0,0,0,0,0,0


Accuracy Score : 0.9236641221374046
Classification Report
              precision    recall  f1-score   support

         Bug       0.84      1.00      0.91        16
        Dark       0.90      0.82      0.86        11
      Dragon       1.00      1.00      1.00         7
    Electric       1.00      0.88      0.94        17
       Fairy       1.00      1.00      1.00         4
    Fighting       1.00      0.81      0.90        16
        Fire       0.76      0.93      0.84        14
      Flying       1.00      1.00      1.00         3
       Ghost       1.00      0.83      0.91        12
       Grass       0.91      1.00      0.95        20
      Ground       0.92      0.92      0.92        12
         Ice       0.86      0.75      0.80         8
      Normal       1.00      1.00      1.00        35
      Poison       1.00      1.00      1.00         9
     Psychic       0.93      0.76      0.84        17
        Rock       0.88      1.00      0.94        15
       Steel       0.64

In [28]:
# save trained machine learning module
joblib.dump(rf_model,"rf.joblib")

['rf.joblib']