In [2]:
from selenium import webdriver
from pathlib import Path
from selenium.webdriver.common.keys import Keys

In [3]:
path = str(Path().absolute()) + '/chromedriver'

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
driver = webdriver.Chrome(path, options=options)

In [4]:
# obtain each pokemon's href in order to extract each pokemon's info
url = "https://pokemon.gameinfo.io/"

driver.get(url)

href_objs = []

for i in range(9):
    href_objs = href_objs + driver.find_elements_by_xpath('//a[@class="pokemon g' + str(i) + '"]') 
    
len(href_objs)

652

In [5]:
# obtain hrefs from each href object
hrefs = [(link.get_attribute("href"), link.get_attribute("data-name"), 
                      link.get_attribute("data-gen")) for link in href_objs]

assert len(hrefs) == len(href_objs)

In [6]:
# scrape pokemon info from each href
def parse_href_object(href, name, gen):
    indiv_data = {}

    indiv_data['ID'] = href.split('/')[-1].split('-')[0]
    indiv_data['Name'] = name
    indiv_data['Generation'] = gen
    
    print(href, name, gen, indiv_data['ID'])
    
    driver.get(href)
    
    types = driver.find_elements_by_css_selector('div.type.large')
    
    indiv_data['Type 1'] = types[0].text 
    indiv_data['Type 2'] = None if len(types) < 2 else types[1].text 
    
    tables = driver.find_element_by_id('stats').find_elements_by_class_name("table-stats")
    headers = driver.find_element_by_id('stats').find_elements_by_tag_name("h3")
    headers = ['Stats'] + [header.text for header in headers]
    stats = tables[headers.index('Stats')].text.split()
    
    indiv_data['Attack'] = int(stats[1])
    indiv_data['Defense'] = int(stats[3])
    indiv_data['Stamina'] = int(stats[-1])
    indiv_data['Max CP'] = int("".join(tables[headers.index('Max CP')].text.split('\n')[-1].split()[2].split(',')))
    indiv_data['Max HP'] = int(tables[headers.index('Max HP')].text.split()[-1])
    
    size = tables[headers.index('Size')].text.split()  
    other = tables[headers.index('Other')].text.split() 
    
    indiv_data['Height (m)'] = float(size[1])
    indiv_data['Weight (kg)'] = float(size[-2])
    indiv_data['Base Capture Rate (%)'] = float(other[3].split('%')[0])
    indiv_data['Base Flee Rate (%)'] = float(other[7].split('%')[0])
    indiv_data['Distance to Candy (km)'] = int(other[-2])
    
    img = driver.find_elements_by_css_selector("a.game-image")
    indiv_data['Image URL'] = None if len(img) < 1 else img[0].get_attribute("href")
    
    sum_abt = driver.find_elements_by_class_name("description")
    
    indiv_data['Summary'] = None if len(sum_abt) < 1 else sum_abt[0].text
    indiv_data['About'] = None if len(sum_abt) < 2 else sum_abt[1].text.replace('"', "")
    indiv_data['Is Legendary'] = None if not indiv_data['Summary'] else "legendary" in indiv_data['Summary'].lower()
    
    return indiv_data

In [7]:
pokemon_data = [parse_href_object(href, name, gen) for href,name,gen in hrefs]

https://pokemon.gameinfo.io/en/pokemon/1-bulbasaur bulbasaur 1 1
https://pokemon.gameinfo.io/en/pokemon/2-ivysaur ivysaur 1 2
https://pokemon.gameinfo.io/en/pokemon/3-venusaur venusaur 1 3
https://pokemon.gameinfo.io/en/pokemon/4-charmander charmander 1 4
https://pokemon.gameinfo.io/en/pokemon/5-charmeleon charmeleon 1 5
https://pokemon.gameinfo.io/en/pokemon/6-charizard charizard 1 6
https://pokemon.gameinfo.io/en/pokemon/7-squirtle squirtle 1 7
https://pokemon.gameinfo.io/en/pokemon/8-wartortle wartortle 1 8
https://pokemon.gameinfo.io/en/pokemon/9-blastoise blastoise 1 9
https://pokemon.gameinfo.io/en/pokemon/10-caterpie caterpie 1 10
https://pokemon.gameinfo.io/en/pokemon/11-metapod metapod 1 11
https://pokemon.gameinfo.io/en/pokemon/12-butterfree butterfree 1 12
https://pokemon.gameinfo.io/en/pokemon/13-weedle weedle 1 13
https://pokemon.gameinfo.io/en/pokemon/14-kakuna kakuna 1 14
https://pokemon.gameinfo.io/en/pokemon/15-beedrill beedrill 1 15
https://pokemon.gameinfo.io/en/poke

https://pokemon.gameinfo.io/en/pokemon/130-gyarados gyarados 1 130
https://pokemon.gameinfo.io/en/pokemon/131-lapras lapras 1 131
https://pokemon.gameinfo.io/en/pokemon/132-ditto ditto 1 132
https://pokemon.gameinfo.io/en/pokemon/133-eevee eevee 1 133
https://pokemon.gameinfo.io/en/pokemon/134-vaporeon vaporeon 1 134
https://pokemon.gameinfo.io/en/pokemon/135-jolteon jolteon 1 135
https://pokemon.gameinfo.io/en/pokemon/136-flareon flareon 1 136
https://pokemon.gameinfo.io/en/pokemon/137-porygon porygon 1 137
https://pokemon.gameinfo.io/en/pokemon/138-omanyte omanyte 1 138
https://pokemon.gameinfo.io/en/pokemon/139-omastar omastar 1 139
https://pokemon.gameinfo.io/en/pokemon/140-kabuto kabuto 1 140
https://pokemon.gameinfo.io/en/pokemon/141-kabutops kabutops 1 141
https://pokemon.gameinfo.io/en/pokemon/142-aerodactyl aerodactyl 1 142
https://pokemon.gameinfo.io/en/pokemon/143-snorlax snorlax 1 143
https://pokemon.gameinfo.io/en/pokemon/144-articuno articuno 1 144
https://pokemon.gameinf

https://pokemon.gameinfo.io/en/pokemon/254-sceptile sceptile 3 254
https://pokemon.gameinfo.io/en/pokemon/255-torchic torchic 3 255
https://pokemon.gameinfo.io/en/pokemon/256-combusken combusken 3 256
https://pokemon.gameinfo.io/en/pokemon/257-blaziken blaziken 3 257
https://pokemon.gameinfo.io/en/pokemon/258-mudkip mudkip 3 258
https://pokemon.gameinfo.io/en/pokemon/259-marshtomp marshtomp 3 259
https://pokemon.gameinfo.io/en/pokemon/260-swampert swampert 3 260
https://pokemon.gameinfo.io/en/pokemon/261-poochyena poochyena 3 261
https://pokemon.gameinfo.io/en/pokemon/262-mightyena mightyena 3 262
https://pokemon.gameinfo.io/en/pokemon/263-zigzagoon zigzagoon 3 263
https://pokemon.gameinfo.io/en/pokemon/264-linoone linoone 3 264
https://pokemon.gameinfo.io/en/pokemon/265-wurmple wurmple 3 265
https://pokemon.gameinfo.io/en/pokemon/266-silcoon silcoon 3 266
https://pokemon.gameinfo.io/en/pokemon/268-cascoon cascoon 3 268
https://pokemon.gameinfo.io/en/pokemon/267-beautifly beautifly 3 2

https://pokemon.gameinfo.io/en/pokemon/382-kyogre kyogre 3 382
https://pokemon.gameinfo.io/en/pokemon/383-groudon groudon 3 383
https://pokemon.gameinfo.io/en/pokemon/384-rayquaza rayquaza 3 384
https://pokemon.gameinfo.io/en/pokemon/385-jirachi jirachi 3 385
https://pokemon.gameinfo.io/en/pokemon/386-deoxys deoxys 3 386
https://pokemon.gameinfo.io/en/pokemon/462-magnezone magnezone 4 462
https://pokemon.gameinfo.io/en/pokemon/463-lickilicky lickilicky 4 463
https://pokemon.gameinfo.io/en/pokemon/464-rhyperior rhyperior 4 464
https://pokemon.gameinfo.io/en/pokemon/440-happiny happiny 4 440
https://pokemon.gameinfo.io/en/pokemon/465-tangrowth tangrowth 4 465
https://pokemon.gameinfo.io/en/pokemon/439-mime-jr mime jr. 4 439
https://pokemon.gameinfo.io/en/pokemon/466-electivire electivire 4 466
https://pokemon.gameinfo.io/en/pokemon/467-magmortar magmortar 4 467
https://pokemon.gameinfo.io/en/pokemon/470-leafeon leafeon 4 470
https://pokemon.gameinfo.io/en/pokemon/471-glaceon glaceon 4 47

https://pokemon.gameinfo.io/en/pokemon/506-lillipup lillipup 5 506
https://pokemon.gameinfo.io/en/pokemon/507-herdier herdier 5 507
https://pokemon.gameinfo.io/en/pokemon/508-stoutland stoutland 5 508
https://pokemon.gameinfo.io/en/pokemon/509-purrloin purrloin 5 509
https://pokemon.gameinfo.io/en/pokemon/510-liepard liepard 5 510
https://pokemon.gameinfo.io/en/pokemon/511-pansage pansage 5 511
https://pokemon.gameinfo.io/en/pokemon/512-simisage simisage 5 512
https://pokemon.gameinfo.io/en/pokemon/513-pansear pansear 5 513
https://pokemon.gameinfo.io/en/pokemon/514-simisear simisear 5 514
https://pokemon.gameinfo.io/en/pokemon/515-panpour panpour 5 515
https://pokemon.gameinfo.io/en/pokemon/516-simipour simipour 5 516
https://pokemon.gameinfo.io/en/pokemon/517-munna munna 5 517
https://pokemon.gameinfo.io/en/pokemon/518-musharna musharna 5 518
https://pokemon.gameinfo.io/en/pokemon/519-pidove pidove 5 519
https://pokemon.gameinfo.io/en/pokemon/520-tranquill tranquill 5 520
https://pok

https://pokemon.gameinfo.io/en/pokemon/630-mandibuzz mandibuzz 5 630
https://pokemon.gameinfo.io/en/pokemon/631-heatmor heatmor 5 631
https://pokemon.gameinfo.io/en/pokemon/632-durant durant 5 632
https://pokemon.gameinfo.io/en/pokemon/633-deino deino 5 633
https://pokemon.gameinfo.io/en/pokemon/634-zweilous zweilous 5 634
https://pokemon.gameinfo.io/en/pokemon/635-hydreigon hydreigon 5 635
https://pokemon.gameinfo.io/en/pokemon/636-larvesta larvesta 5 636
https://pokemon.gameinfo.io/en/pokemon/637-volcarona volcarona 5 637
https://pokemon.gameinfo.io/en/pokemon/638-cobalion cobalion 5 638
https://pokemon.gameinfo.io/en/pokemon/639-terrakion terrakion 5 639
https://pokemon.gameinfo.io/en/pokemon/640-virizion virizion 5 640
https://pokemon.gameinfo.io/en/pokemon/641-tornadus tornadus 5 641
https://pokemon.gameinfo.io/en/pokemon/642-thundurus thundurus 5 642
https://pokemon.gameinfo.io/en/pokemon/643-reshiram reshiram 5 643
https://pokemon.gameinfo.io/en/pokemon/644-zekrom zekrom 5 644
h

In [8]:
assert len(pokemon_data) == len(hrefs)

In [9]:
pokemon_df = pd.DataFrame(pokemon_data)
pokemon_df.head(3)

NameError: name 'pd' is not defined

In [None]:
# check which fields contain missing values
pokemon_df.isnull().sum()

In [None]:
# check which pokemon have missing img urls
rows_with_missing_img = pokemon_df[pokemon_df['Image URL'].isnull()][['ID','Name','Image URL']]
rows_with_missing_img.head()

In [None]:
# get IDs for pokemon with missing img urls
missing_img_IDs = rows_with_missing_img['ID'].values

In [None]:
url = 'https://www.pokemon.com/us/pokedex/'

In [None]:
# obtain missing img urls from different source
def get_IMG_urls(driver, ID_list):
    img_urls = []
    
    for ID in ID_list:
        driver.get(url)
        search_bar = driver.find_element_by_id('searchInput')
        search_bar.clear()
        time.sleep(3)
        search_bar.send_keys(ID, Keys.RETURN)
        time.sleep(5)
        try:
            img_url = driver.find_element_by_css_selector('ul.results').find_element_by_tag_name('img').get_attribute('src')
            print(img_url)
            img_urls.append(img_url)
        except:
            img_urls.append(None)
    
    driver.close()
        
    return img_urls

In [None]:
assert len(img_urls) == len(missing_img_IDs)

In [None]:
pokemon_df.at[rows_with_missing_img.index,'Image URL'] = img_urls

In [None]:
pokemon_df.isnull().sum()

In [None]:
pokemon_df.head()

In [None]:
pokemon_df.info()

In [None]:
# save data scraped data to csv file
data_path = 'data/pokemon_data_full.csv'

pokemon_df.to_csv(data_path)

In [None]:
# check that it saved properly
df = pd.read_csv(data_path, header=0, index_col=0)
df.head()

In [None]:
df.info()