# Creating Pokedex

Scrapes Pokemon data from PokemonDB and fills out the Pokedex with each Pokemon's name, Pokedex number, generation, training/testing image counts, and sprite image URL.

In [1]:
import requests
import re
import json

def get_generation(pokedex_num):
    """Map a national Pokedex number to the generation it belongs to.

    Each generation is described by the highest Pokedex number it contains;
    the first upper bound that is >= ``pokedex_num`` decides the result.

    Returns:
        The generation (1-8), or None when ``pokedex_num`` is greater than
        898, the last number covered by the table.
    """
    last_number_per_generation = (151, 251, 386, 493, 649, 721, 809, 898)
    for generation, last_number in enumerate(last_number_per_generation, start=1):
        if pokedex_num <= last_number:
            return generation
    return None


def get_pokemon():
    """Scrape PokemonDB's national Pokedex page into a dictionary.

    Returns:
        dict: Maps each Pokedex number (int) to a dict with the keys
        "Name", "Generation", "# of Training Images", "# of Testing Images"
        (both counters start at 0) and "Image URL" (the card image's
        ``data-src`` attribute).

    Raises:
        requests.HTTPError: If the page request returns an error status.
    """
    response = requests.get("https://pokemondb.net/pokedex/national", timeout=30)
    response.raise_for_status()
    page = response.text

    # Every single-line <div ...>...</div> is a candidate card; divs that do
    # not carry all three fields are skipped below instead of raising.
    poke_cards = re.findall(r'<div [^>]+>(.+?)(?=</div>)', page)
    pokedex = {}
    for card in poke_cards:
        img_match = re.search(r'data-src="([^"]+)', card)
        name_match = re.search(r'class="ent-name"[^>]+>([^<]+)', card)
        num_match = re.search(r'#([^<]+)', card)
        if not (img_match and name_match and num_match):
            continue
        try:
            # int() already ignores leading zeros ("#001" -> 1).
            pokedex_num = int(num_match.group(1))
        except ValueError:
            # The "#..." text was not a Pokedex number, so this is not a card.
            continue
        pokedex[pokedex_num] = {
            "Name": name_match.group(1).strip(),
            "Generation": get_generation(pokedex_num),
            "# of Training Images": 0,
            "# of Testing Images": 0,
            "Image URL": img_match.group(1),
        }
    return pokedex


# Build the Pokedex once; later cells read and mutate this module-level dict.
pokedex = get_pokemon()
# Persist a snapshot of the freshly scraped data (image counters are still 0).
with open('pokemon_data.json', 'w', encoding='utf-8') as json_file:
    json_file.write(json.dumps(pokedex, indent=2, ensure_ascii=False))


# Save functions 
Functions that save each collected image under a file name made of the Pokemon's Pokedex number followed by the current image count for that Pokemon, so every image gets a unique name and nothing is overwritten.

In [2]:
import os

def create_folder(folder_name):
    """Create ``folder_name`` if it does not already exist.

    Missing parent folders are created as well, and an already existing
    folder is left untouched.

    Raises:
        FileExistsError: If ``folder_name`` exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(folder_name, exist_ok=True)

def update_pokedex_counts(pokedex_num, key_name):
    """Increment an image counter in the global ``pokedex`` and return its old value.

    The returned value (the count before incrementing) is used as the
    file-name suffix for the next saved image.

    Args:
        pokedex_num: Key of the Pokemon entry in ``pokedex``.
        key_name: Counter to update (e.g. "# of Training Images"). A falsy
            value means "no counter": nothing is updated.

    Returns:
        The counter value before the increment, or '' when ``key_name`` is falsy.
    """
    if not key_name:
        return ''
    entry = pokedex[pokedex_num]
    previous_count = entry[key_name]
    entry[key_name] = previous_count + 1
    return previous_count


def save_image(img, save_directory, pokedex_num, num_image_key):
    """Save a 224x224 copy of ``img`` as ``<pokedex_num>_<count>.png``.

    Args:
        img: Image object providing ``resize`` and ``save`` (callers pass
            Pillow images).
        save_directory: Folder (relative to the working directory) to save
            into; created when missing.
        pokedex_num: Pokedex number used as the file-name prefix.
        num_image_key: Counter key passed to ``update_pokedex_counts``; a
            falsy key yields an empty suffix, so the file is ``<num>_.png``.
    """
    create_folder(save_directory)
    image_index = update_pokedex_counts(pokedex_num, num_image_key)
    target_path = f"./{save_directory}/{pokedex_num}_{image_index}.png"
    # 224x224 is the training resolution.
    img.resize((224, 224)).save(target_path)


# Getting PokemonDB sprites

In [3]:
import cv2
from PIL import Image
from io import BytesIO
import numpy as np
from tqdm import tqdm
import random


def create_template(image):
    """Return a two-colour silhouette of ``image`` as a data augmentation.

    Pixels that are pure black or fully transparent are treated as
    background and painted with one random RGBA colour; every other pixel is
    painted with a second random RGBA colour.

    Args:
        image: Image whose array form has four channels (callers pass RGBA
            Pillow images).

    Returns:
        A new Pillow image built from the recoloured pixel array.
    """
    # Draw order is kept: four background channels first, then four for the pokemon.
    background_color = [random.randint(0, 255) for _ in range(4)]
    pokemon_color = [random.randint(0, 255) for _ in range(4)]

    pixels = np.array(image)
    red, green, blue, alpha = cv2.split(pixels)  # one 2-D array per channel

    transparent = alpha == 0
    pure_black = (red == 0) & (green == 0) & (blue == 0)
    background_mask = pure_black | transparent
    pokemon_mask = ((red > 0) | (green > 0) | (blue > 0)) & (alpha != 0)

    pixels[background_mask] = background_color
    pixels[pokemon_mask] = pokemon_color
    return Image.fromarray(pixels)

def get_pokemonDB_images():
    """Download every Pokedex sprite and add it to the training data.

    For each entry of the global ``pokedex`` the image at "Image URL" is
    downloaded and saved twice under ``training_data``: once as-is and once
    as a randomly coloured silhouette produced by ``create_template``.

    Raises:
        requests.HTTPError: If a sprite request returns an error status.
    """
    for pokedex_num, item in tqdm(pokedex.items()):
        response = requests.get(item['Image URL'], timeout=30)
        # Fail loudly on an error page instead of trying to decode it as an image.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert('RGBA')
        hidden = create_template(img)
        save_image(img, 'training_data', pokedex_num, '# of Training Images')
        save_image(hidden, 'training_data', pokedex_num, '# of Training Images')


# Download the PokemonDB sprites into ./training_data (one request per Pokedex entry).
get_pokemonDB_images()


100%|██████████| 898/898 [01:41<00:00,  8.84it/s]


# Veekun Images
Loops through the Veekun image directories, resizes each labeled image, and saves it to the training data.

In [4]:

import re

def get_pokedex_num(file_name):
    """Return the Pokedex number a Veekun file name starts with, if any.

    Veekun images are labeled by Pokedex number. Names that do not start
    with a digit, or that start with a zero, are treated as unlabeled.

    Args:
        file_name: Bare file name, e.g. "25.png".

    Returns:
        The leading number as an int, or None when the name is unlabeled.
    """
    # (?!0) rejects a leading zero; re.match anchors at the start of the name.
    match = re.match(r'(?!0)\d+', file_name)
    return int(match.group()) if match else None


def save_veekun_images(path):
    """Recursively collect labeled Veekun PNG sprites as training data.

    Walks ``path`` and all of its sub-directories. Every ``.png`` file whose
    name starts with a Pokedex number is saved to ``training_data``; images
    found under a path containing 'black-white' additionally get two randomly
    coloured silhouettes. Entries that are neither directories nor ``.png``
    files (e.g. ``.svg`` sprites) are ignored.

    Args:
        path: Directory to scan.
    """
    for entry in os.listdir(path):
        entry_path = f"{path}/{entry}"
        if os.path.isdir(entry_path):
            save_veekun_images(entry_path)
            continue
        if not entry.endswith('.png'):
            # Only recurse into real directories; stray non-PNG files are skipped.
            continue
        pokedex_num = get_pokedex_num(entry)
        if pokedex_num is None:
            continue
        # convert() returns a new image, so the source file can be closed right away.
        with Image.open(entry_path) as source:
            img = source.convert('RGBA')
        save_image(img, 'training_data', pokedex_num, "# of Training Images")
        # A form of data augmentation that loosely resembles a "Who's that Pokemon" card
        if 'black-white' in path:
            for _ in range(2):
                hidden = create_template(img)
                save_image(hidden, 'training_data', pokedex_num, "# of Training Images")
# Scan the local 'veekun_images' folder tree and add its labeled sprites to the training data.
save_veekun_images('veekun_images')

# Brave Images
Collects more varied images by scraping Brave image search results.

In [15]:
from selenium import webdriver
import time

prior_urls = [] # URLs already saved; read and appended to by loop_through_images to avoid duplicates

browser = webdriver.Firefox() # Opens a Firefox window; the functions below use this module-level driver

def loop_through_images(pokemon_name, img_urls, titles, save_path, pokedex_num):
    """Download and save the search results whose title mentions the Pokemon.

    An image is saved only when ``pokemon_name`` occurs in its lower-cased
    title and its URL is not yet in the global ``prior_urls`` list.
    Downloads that fail or cannot be decoded as an image are skipped.

    Args:
        pokemon_name: Lower-case Pokemon name to look for in each title.
        img_urls: Image URLs, parallel to ``titles``.
        titles: Image titles, parallel to ``img_urls``.
        save_path: Directory the images are saved to.
        pokedex_num: Pokedex number used for file naming and counting.
    """
    for img_url, title in zip(img_urls, titles):
        if pokemon_name not in title.lower() or img_url in prior_urls:
            continue
        try:
            content = requests.get(img_url, timeout=30).content
            img = Image.open(BytesIO(content)).convert('RGBA')
        except Exception:
            # Best effort: bad URLs, network errors and non-image payloads are
            # skipped. Unlike a bare except, Ctrl-C still interrupts the run.
            continue
        save_image(img, save_path, pokedex_num, "# of Training Images")
        prior_urls.append(img_url)  # remember saved URLs to avoid duplicates


def get_images_and_titles(num_images):
    """Yield ``(image_url, title)`` for the first ``num_images`` results on the current page.

    Reads the page currently loaded in the module-level ``browser``: each
    element with class 'box' is a result container, its 'image' element
    provides the ``src`` URL and its 'img-title' element the title text.

    Args:
        num_images: Maximum number of result containers to read.

    Yields:
        tuple: ``(img_url, title)`` for each container.
    """
    # find_element(s)_by_class_name was removed in Selenium 4.3; the By-based
    # form works on both Selenium 3 and 4.
    from selenium.webdriver.common.by import By

    containers = browser.find_elements(By.CLASS_NAME, 'box')[:max(num_images, 0)]
    for container in containers:
        img_url = container.find_element(By.CLASS_NAME, 'image').get_attribute('src')
        title = container.find_element(By.CLASS_NAME, 'img-title').text
        yield img_url, title


def get_and_save_brave_images(search_txt, n, pokemon_name, pokedex_num, save_path, max_retries=None):
    """Run one Brave image search and save the matching results.

    Loads the search page in the module-level ``browser``, reads up to ``n``
    results and hands them to ``loop_through_images``. When the page yields
    no results (e.g. a captcha is shown) it waits 35 seconds, leaving time to
    solve the captcha by hand, and loads the search again.

    Args:
        search_txt: Query string, already joined with '+'.
        n: Maximum number of results to read from the page.
        pokemon_name: Lower-case name that must appear in a result's title.
        pokedex_num: Pokedex number used for file naming and counting.
        save_path: Directory the images are saved to.
        max_retries: Maximum number of reloads after an empty page. ``None``
            (the default) retries indefinitely, as before.
    """
    retries = 0
    # Iterative retry: a long run of empty pages cannot exhaust the call stack.
    while True:
        browser.get(f"https://search.brave.com/images?q={search_txt}")  # getting page data
        time.sleep(3)  # sleeps to load page
        elements = tuple(get_images_and_titles(n))
        if elements:
            break
        if max_retries is not None and retries >= max_retries:
            return
        retries += 1
        time.sleep(35)  # Wait so there is enough time to fill in the captcha

    img_urls, titles = zip(*elements)
    loop_through_images(pokemon_name, img_urls, titles, save_path, pokedex_num)
    time.sleep(2)

def get_search_txts(pokemon_name):
    """Yield the three search queries used for one Pokemon.

    The queries are 'pokemon <name> art', 'pokemon <name> card' and
    'pokemon <name> wallpaper', with every space replaced by '+'
    (e.g. 'pokemon+bulbasaur+art'). More suffixes could be added, but these
    worked well in the author's tests.
    """
    suffixes = ('art', 'card', 'wallpaper')
    yield from (
        f"pokemon {pokemon_name} {suffix}".strip().replace(' ', '+')
        for suffix in suffixes
    )


# Number of images to attempt to save per search, aligned with the three
# queries from get_search_txts (art, card, wallpaper).
num_images = (20,5,3)
for pokedex_num, item in tqdm(pokedex.items()):
    pokemon_name = item['Name'].strip().lower()
    # Fresh list per Pokemon: duplicates are only tracked within one Pokemon's searches.
    prior_urls = []
    search_txts = list(get_search_txts(pokemon_name))
    for search_txt, n in zip(search_txts, num_images):
        get_and_save_brave_images(search_txt, n, pokemon_name, pokedex_num,  'training_data')


100%|██████████| 898/898 [27:34<00:00,  1.84s/it]


# Test data
Converts SVG sprites from Veekun into PNG images, with some data augmentation, for testing.

In [13]:
import cairosvg

def create_template(image, background, pokemon_color):
    """Build the "hidden" and "revealed" versions of a Pokemon image.

    Pixels that are near-black (r, g and b all <= 2) or near-transparent
    (alpha <= 2) count as background and are painted with ``background``.

    Args:
        image: Pillow image; converted to RGBA before processing.
        background: Four-channel colour used for background pixels.
        pokemon_color: Four-channel colour used for all remaining pixels in
            the hidden image.

    Returns:
        tuple: ``(hidden, revealed)``. ``revealed`` keeps the original
        Pokemon pixels on the new background; ``hidden`` additionally
        replaces the Pokemon pixels with ``pokemon_color``.
    """
    pixels = np.array(image.convert('RGBA'))
    red, green, blue, alpha = cv2.split(pixels)  # one 2-D array per channel

    near_transparent = alpha <= 2
    near_black = (blue <= 2) & (green <= 2) & (red <= 2)
    background_mask = near_black | near_transparent
    pokemon_mask = ((blue > 2) | (green > 2) | (red > 2)) & (alpha > 2)

    pixels[background_mask] = background
    # Snapshot taken before the Pokemon pixels are overwritten below.
    revealed = Image.fromarray(pixels).copy()
    pixels[pokemon_mask] = pokemon_color
    hidden = Image.fromarray(pixels)
    return hidden, revealed

def get_PIL_image(path, file_name):
    """Render an SVG file to 'temp.png' and return it as a Pillow image.

    Args:
        path: Directory containing the SVG.
        file_name: Name of the SVG file inside ``path``.

    Returns:
        A fully loaded Pillow image that no longer depends on 'temp.png'.
    """
    cairosvg.svg2png(url=f'{path}/{file_name}', write_to='temp.png')
    # Image.open is lazy and keeps the file open. Copying inside the context
    # manager loads the pixels and releases the handle before the next call
    # overwrites temp.png.
    with Image.open('temp.png') as rendered:
        return rendered.copy()


def get_testing_images(path):
    """Create the testing images from the SVG sprites in ``path``.

    For every ``.svg`` file whose name starts with a Pokedex number, the
    sprite is rendered and saved four times:

    * the rendered sprite and its hidden silhouette into ``testing_data``
      (counted under "# of Testing Images"),
    * the hidden silhouette into ``WhosThatPokemon_hidden``,
    * the revealed sprite on the red background into
      ``WhosThatPokemon_revealed``.

    SVG files without a leading number are skipped.

    Args:
        path: Directory containing the SVG sprites.
    """
    background_color = [217, 19, 17, 255]  # shade of red
    # NOTE(review): the alpha channel is 0 here, so the "black" silhouette
    # pixels are saved fully transparent -- confirm this is intended.
    pokemon_color = [0, 0, 0, 0]  # black
    for file_name in os.listdir(path):
        if not file_name.endswith(".svg"):
            continue
        number_match = re.match(r'\d+', file_name)
        if number_match is None:
            # No leading Pokedex number: the sprite cannot be labeled.
            continue
        pokedex_num = int(number_match.group())
        img = get_PIL_image(path, file_name)
        hidden, revealed = create_template(img, background_color, pokemon_color)

        # Saving Images:
        save_image(img, 'testing_data', pokedex_num, '# of Testing Images')
        save_image(hidden, 'testing_data', pokedex_num, '# of Testing Images')
        save_image(hidden, './WhosThatPokemon_hidden/', pokedex_num, '')
        save_image(revealed, './WhosThatPokemon_revealed/', pokedex_num, '')

# Build the testing set and the "Who's That Pokemon" hidden/revealed images from ./test_svgs.
get_testing_images('./test_svgs')