In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import os
import re
import unicodedata
import psutil
import time
import concurrent.futures

In [10]:
yugioh_booster_web_page = 'https://yugioh.fandom.com/wiki/Booster_Pack'

cutoff_booster_pack = 'Eternity Code' # Current cutoff that will change over time once new english booster packs come out 
                                      # or new banlist is updated
                                      # Date of update: 26 July 2020

In [11]:
# Function to scrape the specific yugioh web page that contains the urls of all the booster packs up to the current meta
# and returns them in a list format
def get_booster_urls(url, cutoff):
    yugioh_booster_source = requests.get(url)

    if yugioh_booster_source.status_code == 200:
        yugioh_booster_site_html = BeautifulSoup(yugioh_booster_source.text.encode('utf-8'), 'html')

    booster_pack_url_list = list()

    # Appending the urls to a list
    for table in yugioh_booster_site_html.find_all('table', attrs = {'class': "nowraplinks navbox-subgroup"}):
        for ul in table.find_all('ul'):
            for a in ul.find_all('a'):
                booster_pack_url_list.append('https://yugioh.fandom.com' + a['href'])
                if a['title'] == cutoff:
                    break
            if a['title'] == cutoff:
                break
        if a['title'] == cutoff:
            break

    return booster_pack_url_list

In [12]:
booster_pack_url_list = get_booster_urls(yugioh_booster_web_page, cutoff_booster_pack)
print(len(booster_pack_url_list))
booster_pack_url_list

107


['https://yugioh.fandom.com/wiki/Vol.1',
 'https://yugioh.fandom.com/wiki/Vol.2',
 'https://yugioh.fandom.com/wiki/Vol.3',
 'https://yugioh.fandom.com/wiki/Vol.4',
 'https://yugioh.fandom.com/wiki/Vol.5',
 'https://yugioh.fandom.com/wiki/Vol.6',
 'https://yugioh.fandom.com/wiki/Vol.7',
 'https://yugioh.fandom.com/wiki/Booster_1',
 'https://yugioh.fandom.com/wiki/Booster_2',
 'https://yugioh.fandom.com/wiki/Booster_3',
 'https://yugioh.fandom.com/wiki/Booster_4',
 'https://yugioh.fandom.com/wiki/Booster_5',
 'https://yugioh.fandom.com/wiki/Booster_6',
 'https://yugioh.fandom.com/wiki/Booster_7',
 'https://yugioh.fandom.com/wiki/Magic_Ruler_(Japanese)',
 'https://yugioh.fandom.com/wiki/Pharaoh%27s_Servant_(Japanese)',
 'https://yugioh.fandom.com/wiki/Curse_of_Anubis_(set)',
 'https://yugioh.fandom.com/wiki/Thousand_Eyes_Bible',
 'https://yugioh.fandom.com/wiki/Spell_of_Mask',
 'https://yugioh.fandom.com/wiki/Labyrinth_of_Nightmare_(Japanese)',
 'https://yugioh.fandom.com/wiki/Struggle_of

In [13]:
all_other_packs_url = 'https://yugioh.fandom.com/wiki/Template:Packs'

# These are denied card pack URLs that have not been updated with the latest cards from Konami
# or they are typos and have no cards in them
denied_urls = ['https://yugioh.fandom.com/wiki/Phantom_Rage_%2B1_Bonus_Pack',
               'https://yugioh.fandom.com/wiki/Rise_of_the_Duelist_%2B1_Bonus_Pack',
               'https://yugioh.fandom.com/wiki/Blazing_Vortex_%2B1_Bonus_Pack',
               'https://yugioh.fandom.com/wiki/Maximum_Gold',
               'https://yugioh.fandom.com/wiki/EX_Value_The_Gold_Box_%2B_ABYR%26CBLZ',
               'https://yugioh.fandom.com/wiki/EX_Value_The_Gold_Box_%2B_GS2013',
               'https://yugioh.fandom.com/wiki/Legendary_Duelists:_Rage_of_Ra',
               'https://yugioh.fandom.com/wiki/World_Premiere_Pack_2020',
               'https://yugioh.fandom.com/wiki/Dragons_of_Legend:_The_Complete_Series',
               'https://yugioh.fandom.com/wiki/Deck_Build_Pack:_Genesis_Impactors']

In [14]:
# Function is used specifically for card packs that are not booster packs like duelist packs, event packs to get the urls
# of all these other card packs from the Original Template web page containing everything about card packs
def get_other_cardpacks_url(url, denied_urls):
    all_other_packs_source = requests.get(url)
    
    if all_other_packs_source.status_code == 200:
        all_other_packs_html = BeautifulSoup(all_other_packs_source.text.encode('utf-8'), 'html')
        
    card_packs_url_list = list()
    for li in all_other_packs_html.find('table', attrs = {'class': "navbox"}).find_all('li'):
        for a in li.find_all('a'):
            card_packs_url_list.append('https://yugioh.fandom.com' + a['href'])
    card_packs_url_list.pop(0)        
    card_packs_url_list = list(set(card_packs_url_list))
    
    card_packs_url_list = [link for link in card_packs_url_list if link not in denied_urls]
    
    return card_packs_url_list

In [15]:
other_card_packs_url_list = get_other_cardpacks_url(all_other_packs_url, denied_urls)

In [16]:
# Function to filter the true card pack URLs from the URLs that have no table of cards in them
# or just random hyperlinks in general
def card_pack_url_filter(url, index):
    try:
        pd.read_html(url, attrs = {'class': 'wikitable'})
    except ValueError: 
        return index

In [17]:
time1 = time.perf_counter()

if __name__ == '__main__':
    # Multi-Threading to speed up the webscraping and filtering the actual card pack URLs
    with concurrent.futures.ThreadPoolExecutor(max_workers = 5) as executor:
        popped_indexes_iterable = executor.map(card_pack_url_filter, other_card_packs_url_list, range(len(other_card_packs_url_list)))
    print(f'{time.perf_counter() - time1} seconds')
    popped_indexes = list(popped_indexes_iterable)
    popped_indexes = [i for i in popped_indexes if i != None]

for index in sorted(popped_indexes, reverse = True):
    del other_card_packs_url_list[index]

print(len(other_card_packs_url_list))
other_card_packs_url_list

170.87979810000002 seconds
276


['https://yugioh.fandom.com/wiki/Premium_Pack_Vol.14_1st_Wave',
 'https://yugioh.fandom.com/wiki/Premium_Pack_17',
 'https://yugioh.fandom.com/wiki/Destiny_Soldiers',
 'https://yugioh.fandom.com/wiki/Star_Pack_2013',
 'https://yugioh.fandom.com/wiki/Surpassing_10000_Cards_Commemoration_Special_Pack',
 'https://yugioh.fandom.com/wiki/Legendary_Collection_4:_Joey%27s_World_Mega_Pack',
 'https://yugioh.fandom.com/wiki/Speed_Duel:_Arena_of_Lost_Souls',
 'https://yugioh.fandom.com/wiki/The_Infinity_Chasers',
 'https://yugioh.fandom.com/wiki/Premium_Pack_5',
 'https://yugioh.fandom.com/wiki/Promotion_Pack_2017',
 'https://yugioh.fandom.com/wiki/Extra_Pack_2015',
 'https://yugioh.fandom.com/wiki/Legendary_Collection_Kaiba_Mega_Pack',
 'https://yugioh.fandom.com/wiki/Premium_Pack_10',
 'https://yugioh.fandom.com/wiki/20th_Secret_Rare_Special_Pack',
 'https://yugioh.fandom.com/wiki/Extra_Pack_Volume_2',
 'https://yugioh.fandom.com/wiki/20th_Secret_Rare_Challenge_Pack',
 'https://yugioh.fandom.c

In [18]:
all_card_packs_url_list = booster_pack_url_list + other_card_packs_url_list
len(all_card_packs_url_list)

383

In [116]:
#booster_pack_url = 'https://yugioh.fandom.com/wiki/LINK_VRAINS_Duelist_Set#Japanese'
#booster_pack_url = 'https://yugioh.fandom.com/wiki/Legendary_Duelists:_Season_1'
#booster_pack_url = 'https://yugioh.fandom.com/wiki/Vol.1'
#booster_pack_url = 'https://yugioh.fandom.com/wiki/Secrets_of_Eternity'
#booster_pack_url = 'https://yugioh.fandom.com/wiki/Duelist_Pack:_Kite'
booster_pack_url = 'https://yugioh.fandom.com/wiki/Collection_Pack_2020'

In [121]:
# Function that scrapes the card pack URL and return all the urls of each card from the card
# pack in a list format
def get_card_urls(card_pack_url):
    card_pack_source = requests.get(card_pack_url)

    if card_pack_source.status_code == 200:
        card_pack_html = BeautifulSoup(card_pack_source.text.encode('utf-8'), 'html.parser')

    class_or_id = 'class'
    attribute_name = "wikitable"
    # Code hacks for 2 card pack URLs
    if card_pack_url == 'https://yugioh.fandom.com/wiki/Duelist_Pack:_Kite':
        attribute_name = "sortable"
    elif card_pack_url == 'https://yugioh.fandom.com/wiki/Collection_Pack_2020':
        class_or_id = 'id'
        attribute_name = 'Top_table'
        
    
    card_url_list = list()
    
    for card_table in card_pack_html.find_all('table', attrs = {class_or_id: attribute_name}):
        # Building a table of hyperlinks because pd.read_html does not read the hyperlinks, but only reads the unlinked
        # text of tables
        record = list()
        for tr in card_table.findAll("tr"):
            ths = tr.findAll("th")
            if ths != []:
                columns = [th.text.replace('\n', '').strip() for th in ths]
            else:
                row = list()
                for td in tr.find_all('td'):
                    try:
                        row.append(td.a['href'])
                    except KeyError:
                        row.append(td.a.text)
                    except TypeError:
                        row.append(td.text)
                record.append(row)

        card_pack_df = pd.DataFrame(data = record, columns = columns)
        try:
            card_pack_df.rename(columns = {'English name': 'Card Name'}, inplace = True)
            # Raise keyword is used because KeyError was handled previously above, thus raise is
            # needed to  reraise the KeyError if it comes up again
            raise KeyError() 
        except KeyError:
            card_pack_df.rename(columns = {'Name': 'Card Name'}, inplace = True)
        finally:
            card_pack_df['Card Name'] = card_pack_df['Card Name'].apply(lambda x: 'https://yugioh.fandom.com' + x)
            card_url_list.extend(list(card_pack_df['Card Name']))
           
    
    card_url_list = list(set(card_url_list))
    return card_url_list

In [122]:
get_card_urls(booster_pack_url)

['https://yugioh.fandom.com/wiki/Fossil_Warrior_Skull_King',
 'https://yugioh.fandom.com/wiki/Weathering_Soldier',
 'https://yugioh.fandom.com/wiki/Super_All_In!',
 'https://yugioh.fandom.com/wiki/Glacial_Beast_Polar_Penguin',
 'https://yugioh.fandom.com/wiki/Glacial_Beast_Blizzard_Wolf',
 'https://yugioh.fandom.com/wiki/Fire_Flint_Lady',
 'https://yugioh.fandom.com/wiki/Fossil_Dragon_Skullgar',
 'https://yugioh.fandom.com/wiki/Fossil_Warrior_Skull_Knight',
 'https://yugioh.fandom.com/wiki/Appliancer_Reuse',
 'https://yugioh.fandom.com/wiki/Number_C1:_Numeron_Chaos_Gate_Sunya',
 'https://yugioh.fandom.com/wiki/Number_4:_Numeron_Gate_Catvari',
 'https://yugioh.fandom.com/wiki/Appliancer_Socketroll',
 'https://yugioh.fandom.com/wiki/High_Rate_Draw',
 'https://yugioh.fandom.com/wiki/Number_2:_Numeron_Gate_Dve',
 'https://yugioh.fandom.com/wiki/Numeron_Wall',
 'https://yugioh.fandom.com/wiki/Appliancer_Celtopus',
 'https://yugioh.fandom.com/wiki/Psychic_Wave',
 'https://yugioh.fandom.com/w

In [70]:
get_card_urls(booster_pack_url)

['https://yugioh.fandom.com/wiki/Galaxy_Zero',
 'https://yugioh.fandom.com/wiki/Twin_Photon_Lizard',
 'https://yugioh.fandom.com/wiki/Number_20:_Giga-Brilliant',
 'https://yugioh.fandom.com/wiki/Galaxy_Knight',
 'https://yugioh.fandom.com/wiki/Photon_Circle',
 'https://yugioh.fandom.com/wiki/Dimension_Wanderer',
 'https://yugioh.fandom.com/wiki/Galaxy-Eyes_Photon_Dragon',
 'https://yugioh.fandom.com/wiki/Photon_Pirate',
 'https://yugioh.fandom.com/wiki/Photon_Thrasher',
 'https://yugioh.fandom.com/wiki/Reverse_Buster',
 'https://yugioh.fandom.com/wiki/Number_56:_Gold_Rat',
 'https://yugioh.fandom.com/wiki/Photon_Lizard',
 'https://yugioh.fandom.com/wiki/Message_in_a_Bottle',
 'https://yugioh.fandom.com/wiki/Galaxy_Storm',
 'https://yugioh.fandom.com/wiki/Lightserpent',
 'https://yugioh.fandom.com/wiki/Photon_Leo',
 'https://yugioh.fandom.com/wiki/Photon_Slasher',
 'https://yugioh.fandom.com/wiki/Photon_Lead',
 'https://yugioh.fandom.com/wiki/Photon_Cerberus',
 'https://yugioh.fandom.co

In [9]:
len(get_card_urls(booster_pack_url))

20

In [115]:
all_card_packs_url_list[328:329]

['https://yugioh.fandom.com/wiki/Collection_Pack_2020']

In [123]:
time1 = time.perf_counter()

if __name__ == '__main__':
    # Multi-threading is used here because webscraping is an I/O bound activity, and specify max_workers
    # argument because the default value will not render the multi-threading successful
    with concurrent.futures.ThreadPoolExecutor(max_workers = 5) as executor:
        card_url_iterable = executor.map(get_card_urls, all_card_packs_url_list)
    print(f'{time.perf_counter() - time1} seconds')

# Flattening the iterable because the iterable is a list of lists, each card pack has its own list of cards
# so to combine all card urls from different card packs, we need to flatten the list
card_url_list = [item for sublist in list(card_url_iterable) for item in sublist]

# To remove duplicate URLs, since the same card can exist in multiple different packs
card_url_list = list(set(card_url_list))
card_url_list

90.49897349999992 seconds


['https://yugioh.fandom.com/wiki/Gravekeeper%27s_Watcher',
 'https://yugioh.fandom.com/wiki/X-Saber_Pashuul',
 'https://yugioh.fandom.com/wiki/D.D._Jet_Iron',
 'https://yugioh.fandom.com/wiki/Overlay_Capture',
 'https://yugioh.fandom.com/wiki/Marauding_Captain',
 'https://yugioh.fandom.com/wiki/Fenghuang',
 'https://yugioh.fandom.com/wiki/Impcantation_Penciplume',
 'https://yugioh.fandom.com/wiki/Guard_Ghost',
 'https://yugioh.fandom.com/wiki/Tenyi_Spirit_-_Shthana',
 'https://yugioh.fandom.com/wiki/Orcust_Crescendo',
 'https://yugioh.fandom.com/wiki/Witch_Doctor_of_Sparta',
 'https://yugioh.fandom.com/wiki/Mermail_Abyssocea',
 'https://yugioh.fandom.com/wiki/Spiritual_Beast_Rampengu',
 'https://yugioh.fandom.com/wiki/Sargasso_the_D.D._Battlefield',
 'https://yugioh.fandom.com/wiki/Battlin%27_Boxer_Big_Bandage',
 'https://yugioh.fandom.com/wiki/Chronomaly_Crystal_Skull',
 'https://yugioh.fandom.com/wiki/Starship_Spy_Plane',
 'https://yugioh.fandom.com/wiki/The_Immortal_Bushi',
 'https:

In [124]:
len(card_url_list)

10176

In [44]:
next(card_url_iterable)

['https://yugioh.fandom.com/wiki/Kamionwizard',
 'https://yugioh.fandom.com/wiki/Stone_Armadiller',
 'https://yugioh.fandom.com/wiki/Machine_Conversion_Factory',
 'https://yugioh.fandom.com/wiki/Hard_Armor',
 'https://yugioh.fandom.com/wiki/The_Bewitching_Phantom_Thief',
 'https://yugioh.fandom.com/wiki/Curse_of_Dragon',
 'https://yugioh.fandom.com/wiki/Blue-Eyed_Silver_Zombie',
 'https://yugioh.fandom.com/wiki/De-Spell',
 'https://yugioh.fandom.com/wiki/Tyhone',
 'https://yugioh.fandom.com/wiki/Zombie_Warrior',
 'https://yugioh.fandom.com/wiki/Dragoness_the_Wicked_Knight',
 'https://yugioh.fandom.com/wiki/Final_Flame',
 'https://yugioh.fandom.com/wiki/M-Warrior_1',
 'https://yugioh.fandom.com/wiki/Dorover',
 'https://yugioh.fandom.com/wiki/Terra_the_Terrible',
 'https://yugioh.fandom.com/wiki/Mavelus',
 'https://yugioh.fandom.com/wiki/Karbonala_Warrior',
 'https://yugioh.fandom.com/wiki/Solitude',
 'https://yugioh.fandom.com/wiki/Kumootoko',
 'https://yugioh.fandom.com/wiki/Claw_Reach

In [27]:
set(card_url_iterable)

set()

In [29]:
len(card_url_iterable)

TypeError: object of type 'generator' has no len()