In [18]:
import os
import shutil

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import to_categorical
from keras.preprocessing import image
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import math

import matplotlib.pyplot as plt

import re
from tqdm import tqdm

In [3]:
# Process Pokémon and one-hot encode the types

pokemon = pd.read_csv('../data/external/stats/pokemon.csv')
pokemon["Variety"] = float("NaN")
pokemon = pokemon[["#", "Name", "Variety", "Type 1", "Type 2"]]

# Create one-hot columns for each type
types = set(pokemon["Type 1"])
for t in types:
    pokemon["is" + str(t)] = 0

# Iterate over Pokémon
for i, p in pokemon.iterrows():
    
    # Clean up name
    pokemonName = pokemon.loc[i, "Name"]
    pokemonName = pokemonName.replace(" ", "")
    
    pokemonNameComponents = re.findall('[A-Z][^A-Z0-9]*', pokemonName)

    pokemon.loc[i, "Name"] = pokemonNameComponents[0]
    
    # Set variety (Mega etc.)
    if len(pokemonNameComponents) > 1:
        pokemonNameComponents = [value for value in pokemonNameComponents if value != pokemonNameComponents[0]]
        pokemon.loc[i, "Variety"] = str(pokemonNameComponents).lower()
    else:
        pokemon.loc[i, "Variety"] = "normal"
    
    #  Set one-hot columns to 1 for relevant types
    pokemon.loc[i, "is" + p["Type 1"]] = 1

    if not pd.isna(p["Type 2"]):
        pokemon.loc[i, "is" + p["Type 2"]] = 1

# Ditch all Pokemon with varieties (e.g. Megas):
pokemon = pokemon[pokemon["Variety"] == "normal"]


# Save output
pokemon.to_csv('../data/processed/pokemon.csv', index=False)

pokemon.head()

Unnamed: 0,#,Name,Variety,Type 1,Type 2,isGhost,isFighting,isFire,isFlying,isGround,...,isGrass,isNormal,isSteel,isBug,isDark,isPoison,isDragon,isElectric,isFairy,isIce
0,1,Bulbasaur,normal,Grass,Poison,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
1,2,Ivysaur,normal,Grass,Poison,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
2,3,Venusaur,normal,Grass,Poison,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
4,4,Charmander,normal,Fire,,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,5,Charmeleon,normal,Fire,,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Process image data

data_directory_images = "../data/external/images-gen-1-6"
output_directory = "../data/processed/pokemon"

pokemon = pd.read_csv('../data/processed/pokemon.csv')
pokemon["imagePath"] = np.nan

# Remove processed folder and create empty new one
try:
    shutil.rmtree(output_directory)
    os.mkdir(output_directory)
except:
    os.mkdir(output_directory)

# Copy images to processed folder
for image in os.listdir(data_directory_images):
    pokemon_id = image.split('.')[0]

    # Images with no variety (e.g. "211.png")
    if pokemon_id.isnumeric():
        
        # Copy to processed folder
        src = data_directory_images + "/" + image
        dst = output_directory + "/" + pokemon_id + ".png"

        shutil.copyfile(src, dst)

        # Set image path in data frame
        pokemon.loc[pokemon["#"] == int(pokemon_id), 'imagePath'] = dst
        

# Drop Pokemon without image path
pokemon = pokemon.dropna(subset=["imagePath"])

# Save pokemon.csv with image paths
pokemon.to_csv('../data/processed/pokemon-with-image-paths.csv', index=False)

pokemon.head()

Unnamed: 0,#,Name,Variety,Type 1,Type 2,isGhost,isFighting,isFire,isFlying,isGround,...,isNormal,isSteel,isBug,isDark,isPoison,isDragon,isElectric,isFairy,isIce,imagePath
0,1,Bulbasaur,normal,Grass,Poison,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,../data/processed/pokemon/1.png
1,2,Ivysaur,normal,Grass,Poison,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,../data/processed/pokemon/2.png
2,3,Venusaur,normal,Grass,Poison,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,../data/processed/pokemon/3.png
3,4,Charmander,normal,Fire,,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,../data/processed/pokemon/4.png
4,5,Charmeleon,normal,Fire,,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,../data/processed/pokemon/5.png


In [10]:
# Load images as training data
train_image = []

for i in tqdm(range(pokemon.shape[0])):
    
    img = tf.keras.utils.load_img(pokemon.iloc[i]["imagePath"])
    img = tf.keras.utils.img_to_array(img)
    img = img/255
    train_image.append(img)
X = np.array(train_image)

X.shape

100%|████████████████████████████████████████████████████████████████████████████████| 693/693 [00:01<00:00, 550.47it/s]


(693, 256, 256, 3)

In [16]:
# Create labels

y = np.array(pokemon.drop(['#', 'Name', 'Variety', 'Type 1', 'Type 2', 'imagePath'], axis=1))

y.shape

(693, 18)

In [19]:
# Train test split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.1)