# Creating a Machine Learning model to predict Pokemon types

### Import packages

In [1]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

### Create pandas dataframe from database

In [2]:
# Specify the path to the .db file
db_path = "C:\\Users\\agomu\\Python\\github\\poke_api\\pokemon.db"

# Establish a connection to the SQLite database
conn = sqlite3.connect(db_path)

# Write your SQL query to select data from the desired table(s)
query = """SELECT * FROM pokemon_tbl
        WHERE generation_name IN ('generation-i', 'generation-ii', 'generation-iii')
"""

# Use pandas.read_sql() to execute the query and retrieve the data as a DataFrame
df = pd.read_sql(query, conn)

# Close the database connection
conn.close()



### Inspect dataframe

In [3]:
df.describe()

Unnamed: 0,id,height,pokemon_species_id,weight,hp,attack,defense,special-attack,special-defense,speed,evolution_chain_id,gender_rate,capture_rate,base_happiness,hatch_counter
count,509.0,509.0,509.0,509.0,509.0,509.0,509.0,509.0,509.0,509.0,509.0,509.0,509.0,509.0,509.0
mean,2590.544204,18.35167,181.842829,855.640472,66.660118,77.084479,72.011788,70.447937,70.555992,69.277014,87.277014,3.131631,101.090373,53.722986,26.29666
std,4251.776344,38.222928,116.325597,1782.387057,27.133898,32.335208,32.829251,32.692912,28.185328,29.879744,61.360782,1.9922,77.977534,17.324482,22.779074
min,1.0,2.0,1.0,1.0,1.0,5.0,5.0,10.0,20.0,5.0,1.0,-1.0,3.0,0.0,5.0
25%,128.0,6.0,79.0,104.0,50.0,55.0,50.0,45.0,50.0,45.0,31.0,2.0,45.0,50.0,20.0
50%,255.0,11.0,173.0,325.0,65.0,75.0,69.0,65.0,70.0,70.0,79.0,4.0,60.0,50.0,20.0
75%,382.0,16.0,283.0,750.0,80.0,95.0,90.0,90.0,85.0,90.0,140.0,4.0,190.0,70.0,20.0
max,10253.0,350.0,386.0,10000.0,255.0,190.0,230.0,194.0,230.0,180.0,202.0,8.0,255.0,140.0,120.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509 entries, 0 to 508
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   name                     509 non-null    object
 1   id                       509 non-null    int64 
 2   height                   509 non-null    int64 
 3   base_experience          509 non-null    object
 4   pokemon_species_id       509 non-null    int64 
 5   weight                   509 non-null    int64 
 6   hp                       509 non-null    int64 
 7   attack                   509 non-null    int64 
 8   defense                  509 non-null    int64 
 9   special-attack           509 non-null    int64 
 10  special-defense          509 non-null    int64 
 11  speed                    509 non-null    int64 
 12  evolution_chain_id       509 non-null    int64 
 13  evolves_from_species_id  509 non-null    object
 14  gender_rate              509 non-null    i

### Add new features needed for model

In [23]:
# Total type is the concatenation of type_name_1 and type_name_2, so we can look at the overall pokemon type. i.e Bug/Poision
df['totaltype'] = df.apply(lambda row: row['type_name_1'] + ('/' + row['type_name_2'] if row['type_name_2'] else ''), axis=1)

print(df['totaltype'])

0      grass/poison
1      grass/poison
2      grass/poison
3              fire
4              fire
           ...     
493    ghost/poison
494           water
495       water/ice
496          normal
497          normal
Name: totaltype, Length: 498, dtype: object


In [24]:
# twotypes is a boolean column indicating whether or not a pokemon has one type or two types
df['twotypes'] = df.apply(lambda row: 1 if row['type_name_1'] and row['type_name_2'] else 0, axis=1)
print(df['twotypes'])

0      1
1      1
2      1
3      0
4      0
      ..
493    1
494    0
495    1
496    0
497    0
Name: twotypes, Length: 498, dtype: int64


In [5]:
df.head()

Unnamed: 0,name,id,height,base_experience,pokemon_species_id,weight,hp,attack,defense,special-attack,...,capture_rate,base_happiness,hatch_counter,growth_rate,generation_name,habitat_name,type_name_1,type_name_2,totaltype,twotypes
0,bulbasaur,1,7,64,1,69,45,49,49,65,...,45,50,20,medium-slow,generation-i,grassland,grass,poison,"grass,poison",True
1,ivysaur,2,10,142,2,130,60,62,63,80,...,45,50,20,medium-slow,generation-i,grassland,grass,poison,"grass,poison",True
2,venusaur,3,20,263,3,1000,80,82,83,100,...,45,50,20,medium-slow,generation-i,grassland,grass,poison,"grass,poison",True
3,charmander,4,6,62,4,85,39,52,43,60,...,45,50,20,medium-slow,generation-i,mountain,fire,,fire,False
4,charmeleon,5,11,142,5,190,58,64,58,80,...,45,50,20,medium-slow,generation-i,mountain,fire,,fire,False


### Data cleaning

In [25]:
# removing null rows from base_experience column
df = df[df['base_experience'] != '']

In [17]:
empty_counts = (df == '').sum()
print(empty_counts)

name                         0
id                           0
height                       0
base_experience              0
pokemon_species_id           0
weight                       0
hp                           0
attack                       0
defense                      0
special-attack               0
special-defense              0
speed                        0
evolution_chain_id           0
evolves_from_species_id    235
gender_rate                  0
has_gender_differences       0
is_baby                      0
is_legendary                 0
is_mythical                  0
capture_rate                 0
base_happiness               0
hatch_counter                0
growth_rate                  0
generation_name              0
habitat_name                 0
type_name_1                  0
type_name_2                251
totaltype                    0
twotypes                     0
dtype: int64


## The model creation and evaluation

In [22]:
# preprocess data
X = df[['height', 'weight', 'base_experience', 'hp', 'attack', 'defense', 'special-attack', 'special-defense', 'speed', 'capture_rate', 'twotypes']]
y = df['type_name_1']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

         bug       0.50      0.33      0.40        12
        dark       0.00      0.00      0.00         4
      dragon       1.00      0.67      0.80         3
    electric       0.75      0.86      0.80         7
       fairy       0.00      0.00      0.00         2
    fighting       1.00      0.33      0.50         3
        fire       0.29      0.25      0.27         8
       ghost       0.00      0.00      0.00         4
       grass       0.20      0.25      0.22         8
      ground       0.00      0.00      0.00         3
         ice       0.00      0.00      0.00         3
      normal       0.50      0.54      0.52        13
      poison       0.00      0.00      0.00         4
     psychic       0.22      0.50      0.31         4
        rock       0.60      0.50      0.55         6
       steel       0.00      0.00      0.00         2
       water       0.15      0.29      0.20        14

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Things to add:
- Evolved: yes or no
    - if evolve_from_species_id is null, 0, else 1
- Baby, Legendary, Mythical tagging
    - 3 columns: baby, legendary, mythical
        - 1 if yes, 0 if no
- Quantify growth rate
    - Fast, Medium, Medium-slow, slow
       -  4, 3, 2, 1 respectively
- Quantify generation_name
   -  Gen 1, Gen 2, Gen 3
        - 1, 2, 3 respectively