In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import os
import multiprocessing

In [2]:
base_url = 'https://bulbapedia.bulbagarden.net'
url = base_url + '/wiki/Pok%C3%A9mon_Trading_Card_Game'

# Extract series and links

In [3]:
def find_links(series):
    """
    Return all links associated with a list of series
    
    """
    all_links = []
    for serie in series:
        all_links.extend(serie.find_all('li'))
    
    return all_links

In [4]:
def extract_serie_link(links):
    """
    Find the main link for all card sets
    
    """
    sets = [l.find_all('a')[-1] for l in links]
    return {s.text: base_url + s['href'] for s in sets}

In [5]:
req_data = requests.get(url)
soup = BeautifulSoup(req_data.content)
series = soup.find_all('table')[1:3]

all_links = find_links(series)
sets = extract_serie_link(all_links)

In [6]:
sets

{'Base Set': 'https://bulbapedia.bulbagarden.net/wiki/Base_Set_(TCG)',
 'Jungle': 'https://bulbapedia.bulbagarden.net/wiki/Jungle_(TCG)',
 'Fossil': 'https://bulbapedia.bulbagarden.net/wiki/Fossil_(TCG)',
 'Base Set 2': 'https://bulbapedia.bulbagarden.net/wiki/Base_Set_2_(TCG)',
 'Team Rocket': 'https://bulbapedia.bulbagarden.net/wiki/Team_Rocket_(TCG)',
 'Gym Heroes': 'https://bulbapedia.bulbagarden.net/wiki/Gym_Heroes_(TCG)',
 'Gym Challenge': 'https://bulbapedia.bulbagarden.net/wiki/Gym_Challenge_(TCG)',
 'Neo Genesis': 'https://bulbapedia.bulbagarden.net/wiki/Neo_Genesis_(TCG)',
 'Neo Discovery': 'https://bulbapedia.bulbagarden.net/wiki/Neo_Discovery_(TCG)',
 'Neo Revelation': 'https://bulbapedia.bulbagarden.net/wiki/Neo_Revelation_(TCG)',
 'Neo Destiny': 'https://bulbapedia.bulbagarden.net/wiki/Neo_Destiny_(TCG)',
 'Legendary Collection': 'https://bulbapedia.bulbagarden.net/wiki/Legendary_Collection_(TCG)',
 'Expedition Base Set': 'https://bulbapedia.bulbagarden.net/wiki/Expeditio

In [10]:
print(sets["Cosmic Eclipse"])

https://bulbapedia.bulbagarden.net/wiki/Cosmic_Eclipse_(TCG)


Remove a couple of bad datasets

In [11]:
# for bad_set in [
#     "Legendary Collection",
#     "Base Set 2"
# ]:
#     try:
#         del sets[bad_set]
#     except:
#         pass

# Path functions

In [12]:
def get_set_path(set_name):
    return os.path.join(
        'data', 
        'pictures', 
        os.path.basename(set_name)
    )

def get_card_image_file_name(setname, image_link):
    return os.path.join(
        get_set_path(setname),
        os.path.basename(image_link)
    )

def make_dir_for_serie(set_name):
    """
    Creates a directory for storing card images   
    """
    os.makedirs(get_set_path(set_name), exist_ok=True)

# Pull in cards from series

In [13]:
def get_image_name(tag):
    image = tag.find('img')
    if image is None:
        res = tag.text
    else:
        res = image['alt']
    return res.strip()

In [14]:
def get_card_details(card_row, setname):
    """
    Extracts the details of a single card from a card row
    """
    tds = card_row.find_all('td')
    
    link_field = tds[2]
    
    res = {
        'id': tds[0].text.strip(),
        'rarity': get_image_name(tds[3])
    }
    
    
    if link_field is not None:
        res['name'] = link_field.text.strip()
        links = link_field.find_all('a')       
        link = next((l for l in links if "File:" not in l["href"]))
        if link is not None:
            res['link'] = base_url + link['href']
            res['card_file_name'] = get_card_image_file_name(setname, res['link'])
    
    res['type'] = get_image_name(card_row.find('th'))
    return(res)

In [15]:
def extract_card_details_for_set(set_name, set_link):
    """
    Generates all card links for a single series
    """
    card_soup = BeautifulSoup(requests.get(set_link).content)
    cards = pd.DataFrame()
    
    try:
        print(set_name)
        card_table = card_soup.find('table', class_ = 'multicol').find('table', width = '100%')
        card_rows = card_table.find_all('tr')[1:-1]

        make_dir_for_serie(set_name)
        cards = pd.DataFrame([get_card_details(cr, set_name) for cr in card_rows])
    finally:
        return cards
    

In [18]:
def extract_all_card_details(sets: dict):
    res = []
    for setname, link in sets.items():
        card_set = extract_card_details_for_set(setname, link)
        card_set["setname"] = setname
        res.append(card_set)
    return pd.concat(res, axis=0).reset_index(drop=True)

In [19]:
cards = extract_all_card_details(sets)

Base Set
Jungle
Fossil
Base Set 2
Team Rocket
Gym Heroes
Gym Challenge
Neo Genesis
Neo Discovery
Neo Revelation
Neo Destiny
Legendary Collection
Expedition Base Set
Aquapolis
Skyridge
Southern Islands
Sample Set
Best of Game Cards
Wizards Black Star Promos
W Promotional cards
Miscellaneous Promotional cards (TCG)/1999-2008
Crosstrainer
Unnamed Wizards Set
Jamboree
Legendary Collection 2
EX Ruby & Sapphire
EX Sandstorm
EX Dragon
EX Team Magma vs Team Aqua
EX Hidden Legends
EX FireRed & LeafGreen
EX Team Rocket Returns
EX Deoxys
EX Emerald
EX Unseen Forces
EX Delta Species
EX Legend Maker
EX Holon Phantoms
EX Crystal Guardians
EX Dragon Frontiers
EX Power Keepers
Diamond & Pearl
Mysterious Treasures
Secret Wonders
Great Encounters
Majestic Dawn
Legends Awakened
Stormfront
Platinum
Rising Rivals
Supreme Victors
Arceus
HeartGold & SoulSilver
Unleashed
Undaunted
Triumphant
Call of Legends
Black & White
Emerging Powers
Noble Victories
Next Destinies
Dark Explorers
Dragons Exalted
Dragon Vaul

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys


In [20]:
cards

Unnamed: 0,card_file_name,id,link,name,rarity,setname,type
0,data/pictures/Base Set/Alakazam_(Base_Set_1),1/102,https://bulbapedia.bulbagarden.net/wiki/Alakaz...,Alakazam,Rare Holo,Base Set,Psychic
1,data/pictures/Base Set/Blastoise_(Base_Set_2),2/102,https://bulbapedia.bulbagarden.net/wiki/Blasto...,Blastoise,Rare Holo,Base Set,Water
2,data/pictures/Base Set/Chansey_(Base_Set_3),3/102,https://bulbapedia.bulbagarden.net/wiki/Chanse...,Chansey,Rare Holo,Base Set,Colorless
3,data/pictures/Base Set/Charizard_(Base_Set_4),4/102,https://bulbapedia.bulbagarden.net/wiki/Chariz...,Charizard,Rare Holo,Base Set,Fire
4,data/pictures/Base Set/Clefairy_(Base_Set_5),5/102,https://bulbapedia.bulbagarden.net/wiki/Clefai...,Clefairy,Rare Holo,Base Set,Colorless
...,...,...,...,...,...,...,...
9937,data/pictures/Pokémon Rumble/Lucario_(Pok%C3%A...,12/16,https://bulbapedia.bulbagarden.net/wiki/Lucari...,Lucario,—,Pokémon Rumble,Fighting
9938,data/pictures/Pokémon Rumble/Skuntank_(Pok%C3%...,13/16,https://bulbapedia.bulbagarden.net/wiki/Skunta...,Skuntank,—,Pokémon Rumble,Darkness
9939,data/pictures/Pokémon Rumble/Bastiodon_(Pok%C3...,14/16,https://bulbapedia.bulbagarden.net/wiki/Bastio...,Bastiodon,—,Pokémon Rumble,Metal
9940,data/pictures/Pokémon Rumble/Rattata_(Pok%C3%A...,15/16,https://bulbapedia.bulbagarden.net/wiki/Rattat...,Rattata,—,Pokémon Rumble,Colorless


# Data cleaning

In [21]:
def replace_value_in_column(df: pd.DataFrame, column: str, old_value, new_value):
    df[column][df[column] == old_value] = new_value

In [22]:
replace_value_in_column(cards, "type", "I", "T")
replace_value_in_column(cards, "type", "Su", "T [Su]")
replace_value_in_column(cards, "type", "St", "T [St]")

replace_value_in_column(cards, "rarity", "—", "Unknown rarity")
replace_value_in_column(cards, "rarity", "[[Image:Rarity_.png|]]", "Unknown rarity")

Store results

In [23]:
cards.to_csv("data/metadata.csv", index=False)

# Download images

## A multithreaded function to retrieve multiple urls

In [122]:
def get_card_image_url(request_result):
    """
    Retrieves the page of a card and fetches the url of the card image
    """
    soup = BeautifulSoup(request_result.content)
    try:
        global tmp
        all_images = soup.find("div", id = 'mw-content-text').find('table').find_all('a', class_ = 'image')
        all_images = [img.find("img") for img in all_images]
        tmp = all_images
        for img in all_images:
            if int(img["width"]) > 50:
                return 'http:' + img['src']
    except:
        return None
    
def save_card_image(request_result, file_name):
    if request_result is not None:
        with open(file_name, "wb") as f:
            f.write(request_result.content)
        
def get_url_if_not_none(url):
    return None if url is None else requests.get(url)

In [25]:
def get_multiple_urls(urls, processes=8):
    with multiprocessing.Pool(processes=processes) as pool:
        pool_outputs = pool.map(get_url_if_not_none, urls)
    return pool_outputs

# This will take some time

Download card pages

In [133]:
card_urls = cards["link"].tolist()
card_url_responses = get_multiple_urls(card_urls, processes=15)

Get card image links

In [None]:
card_image_links = [get_card_image_url(resp) for resp in card_url_responses]

Download card image pages

In [None]:
card_image_responses = get_multiple_urls(card_image_links, processes=15)

In [131]:
for response in card_image_responses:
    assert response is None or response.status_code == 200, "AAAAH"

In [129]:
for card_image_response, card_file_name in zip(card_image_responses, cards["card_file_name"]):
    save_card_image(card_image_response, card_file_name)