# Imports & Constants
---

In [65]:
# Native
import os
import csv
from urllib.request import urlopen as uOpen, Request as uReq

# Third-parties
from bs4 import BeautifulSoup as soup

In [66]:
# Site root; every wiki path used below is appended to it.
BASE_URL = 'https://bulbapedia.bulbagarden.net'

# Path the main loop compares against to stop crawling
# ("%3F%3F%3F" is the URL-encoded form of "???").
END_URL = '/wiki/%3F%3F%3F_(Pok%C3%A9mon)'

# Output file that receives the scraped rows.
CSV_NAME = 'pokemon.csv'

# CSV column names, in the order they are written.
CURRENT_KEYS = [
    # identity
    'image_link', 'index', 'name', 'categories', 'jp_name', 'jp_rom_name',
    # battle-related data
    'types', 'abilities',
    # catching / breeding data
    'gender_ratio', 'catch_rate', 'egg_groups', 'hatch_time',
    # physical data
    'height', 'weight',
    # extras
    'mega_stones',
]

# Util Functions
---

- ### Formatar uma saída.

In [67]:
def get_formatted_message(poke_info):
    """Render one scraped record as a boxed, multi-line text report.

    Args:
        poke_info: dict with the keys produced by ``get_poke_info``.
            ``index`` must be usable with ``%d``; ``categories``, ``types``,
            ``abilities``, ``gender_ratio``, ``egg_groups``, ``height``,
            ``weight`` and ``mega_stones`` are sequences of strings.

    Returns:
        The report as a single string. List-valued fields are joined with
        `` / `` (abilities get one bullet line each), an empty gender ratio is
        shown as "Genderless" and an empty mega stone list as "---".
    """
    def label_for(key, singular, plural):
        # Plural wording only when the field holds more than one entry.
        return plural if len(poke_info[key]) > 1 else singular

    category_label   = label_for('categories',  'Category',   'Categories')
    type_label       = label_for('types',       'Type',       'Types')
    ability_label    = label_for('abilities',   'Ability',    'Abilities')
    egg_group_label  = label_for('egg_groups',  'Egg Group',  'Egg Groups')
    mega_stone_label = label_for('mega_stones', 'Mega Stone', 'Mega Stones')

    # The template is kept verbatim (including the trailing indented line)
    # so the rendered text stays byte-for-byte the same.
    template = """\
########################################################################################
# Image: %s
# Index: %d
# Name: %s / %s (%s)
# %s: %s
# %s: %s
# ============================================================
# %s: \n# - %s
# ============================================================
# Gender Ratio: %s
# Catch Rate: %s
# ============================================================
# %s: %s
# Hatch Time: %s
# ============================================================
# Height: %s
# Weight: %s
# ============================================================
# %s: %s
########################################################################################
    """

    values = (
        # header
        poke_info['image_link'],
        poke_info['index'],
        poke_info['name'],
        poke_info['jp_name'],
        poke_info['jp_rom_name'],
        # classification
        category_label,
        ' / '.join(poke_info['categories']),
        type_label,
        ' / '.join(poke_info['types']),
        # abilities: one "# - " bullet per entry
        ability_label,
        ' \n# - '.join(poke_info['abilities']),
        # catching
        "Genderless" if len(poke_info['gender_ratio']) == 0 else ' / '.join(poke_info['gender_ratio']),
        poke_info['catch_rate'],
        # breeding
        egg_group_label,
        ' / '.join(poke_info['egg_groups']),
        poke_info['hatch_time'],
        # size
        ' / '.join(poke_info['height']),
        ' / '.join(poke_info['weight']),
        # mega stones
        mega_stone_label,
        '---' if len(poke_info['mega_stones']) == 0 else ' / '.join(poke_info['mega_stones']),
    )

    return template % values


---
&nbsp;
- ### Recriar arquivos dado o nome.

In [68]:
def recreate_file(file_name):
    """Replace ``file_name`` with a brand-new, empty file.

    Any existing file at that path is removed first, then the file is
    created in exclusive mode.

    Args:
        file_name: path of the file to (re)create.

    Raises:
        OSError: if the existing file cannot be removed or the new one
            cannot be created.
    """
    if os.path.exists(file_name):
        os.remove(file_name)

    # 'x' creates the file and fails if it already exists. The context
    # manager closes the handle right away instead of leaking it.
    with open(file_name, 'x'):
        pass


- ### Escrever headers e colunas csv

In [101]:
def write_csv_headers(file_name=None):
    """Overwrite ``file_name`` with a single CSV header row.

    Args:
        file_name: path of the CSV file to write. When omitted, the
            module-level ``CSV_NAME`` is used, so a bare
            ``write_csv_headers()`` call works.

    The columns are taken from the module-level ``CURRENT_KEYS`` list.
    """
    if file_name is None:
        file_name = CSV_NAME

    # newline='' is required by the csv module so it controls line endings
    # itself (otherwise blank rows appear on Windows). UTF-8 keeps the file
    # encoding independent of the platform default.
    with open(file_name, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=CURRENT_KEYS)
        writer.writeheader()

- ### Format object to CSV

In [70]:
def format_to_csv(poke_info):
    """Convert a scraped record into a flat dict of CSV cell values.

    Args:
        poke_info: dict with the keys produced by ``get_poke_info``.

    Returns:
        A dict with the same keys. ``image_link`` is wrapped in an
        ``=IMAGE("...")`` spreadsheet formula, list-valued fields become
        "- item" bullet lines, and scalar fields are passed through
        unchanged.
    """
    def bullets(key):
        # One "- item" entry per element, separated by " \n".
        return ' \n'.join('- ' + entry for entry in poke_info[key])

    return {
        'image_link':   '=IMAGE("{}")'.format(poke_info['image_link']),
        'index':        poke_info['index'],
        'name':         poke_info['name'],
        'categories':   bullets('categories'),
        'jp_name':      poke_info['jp_name'],
        'jp_rom_name':  poke_info['jp_rom_name'],
        'types':        bullets('types'),
        'abilities':    bullets('abilities'),
        'gender_ratio': bullets('gender_ratio'),
        'catch_rate':   poke_info['catch_rate'],
        'egg_groups':   bullets('egg_groups'),
        'hatch_time':   poke_info['hatch_time'],
        'height':       bullets('height'),
        'weight':       bullets('weight'),
        'mega_stones':  bullets('mega_stones'),
    }
        

- ### Append to CSV File

In [71]:
def append_csv(poke_info):
    """Append one scraped record as a row at the end of ``CSV_NAME``.

    Args:
        poke_info: dict with the keys produced by ``get_poke_info``; it is
            converted with ``format_to_csv`` before being written.
    """
    formatted_poke = format_to_csv(poke_info)
    # Column order follows the formatted dict's own key order.
    keys           = list(formatted_poke.keys())

    # newline='' lets the csv module manage line endings (cells contain
    # embedded newlines). UTF-8 is explicit because rows carry Japanese
    # names, which a platform-default encoding may not be able to encode.
    with open(CSV_NAME, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=keys)
        writer.writerow(formatted_poke)
        

- ### Pegar link de um pokémon

In [72]:
def get_pokemon_link(poke_name):
    """Build the wiki path of a Pokémon's page from its name.

    The name is passed through ``str.capitalize`` (first character
    upper-cased, the rest lower-cased) and suffixed with the URL-encoded
    "_(Pokémon)" page suffix.

    Args:
        poke_name: the Pokémon's name, in any letter case.

    Returns:
        A site-relative path starting with ``/wiki/``.
    """
    page_title = poke_name.capitalize()
    return '/wiki/' + page_title + '_%28Pok%C3%A9mon%29'

# Core Functions
---

- ### Carregar a página destino e retornar o soup.

In [73]:
def get_poke_soup(link):
    """Download a page and return it parsed with BeautifulSoup.

    Args:
        link: absolute URL of the page to fetch.

    Returns:
        The parsed document (``html.parser`` backend).

    Raises:
        urllib.error.URLError: if the request fails (propagated from
            ``urlopen``).
    """
    # A custom User-Agent header is sent with the request.
    request = uReq(link, headers={'User-Agent': 'Magic Browser'})

    # The context manager closes the connection even if read() raises.
    with uOpen(request) as response:
        poke_page_html = response.read()

    return soup(poke_page_html, 'html.parser')

---
&nbsp;
- ### Checar se é a última página a ser verificada.
    - Em alguns casos, quando tá perto do lançamento de um novo jogo e só se tem informação de um ou mais pokémons, às vezes o Bulbapédia coloca informações não oficiais com um layout diferente.

In [74]:
def check_last_page(poke_soup):
    """Tell whether the crawl reached a placeholder (not yet official) page.

    Looks at the first link of the first table inside the
    ``mw-content-text`` element and checks whether it points to the
    ``BulbaShadow.png`` file page.

    Args:
        poke_soup: parsed page, as returned by ``get_poke_soup``.

    Returns:
        True if that link is "/wiki/File:BulbaShadow.png", else False.
    """
    content    = poke_soup.find(id='mw-content-text')
    first_link = content.table.a

    return first_link['href'] == "/wiki/File:BulbaShadow.png"

---
&nbsp;
- ### Pegar informações do Pokémon
    - Essa é a função core, todas as funções de extração de dados são chamadas aqui.

In [75]:
def get_poke_info(poke_soup):
    """Collect every scraped field of a Pokémon page into one dict.

    This is the core function: it locates the info table (the second
    top-level table inside ``mw-content-text``) and hands it to each
    extraction function.

    Args:
        poke_soup: parsed page, as returned by ``get_poke_soup``.

    Returns:
        A dict with the keys image_link, index, name, categories, jp_name,
        jp_rom_name, types, abilities, gender_ratio, catch_rate, egg_groups,
        hatch_time, height, weight and mega_stones.
    """
    content    = poke_soup.find(id='mw-content-text')
    info_table = content.find_all('table', recursive=False)[1]

    # The core extractor returns its six values as one tuple.
    (image_link, index, name,
     categories, jp_name, jp_rom_name) = get_core_poke_info(info_table)

    return {
        'image_link':   image_link,
        'index':        index,
        'name':         name,
        'categories':   categories,
        'jp_name':      jp_name,
        'jp_rom_name':  jp_rom_name,
        'types':        get_poke_types(info_table),
        'abilities':    get_poke_abilities(info_table),
        'gender_ratio': get_poke_gender_ratio(info_table),
        'catch_rate':   get_poke_catch_ratio(info_table),
        'egg_groups':   get_poke_egg_group(info_table),
        'hatch_time':   get_poke_hatch_time(info_table),
        'height':       get_poke_height(info_table),
        'weight':       get_poke_weight(info_table),
        'mega_stones':  get_poke_megastones(info_table),
    }

# Data Extraction Functions
---

- ### Pegar o link do próximo pokémon
    - Nessa parte da página, o Bulbapédia pode colocar `<tr>`s a mais pra informar algo (geralmente relacionado ao Pokémon Go), por isso esse tratamento.

In [76]:
def get_next_pokemon_link(poke_soup):
    """Return the href of the "next Pokémon" link in the navigation table.

    The link is read from the third cell of the table's second row. When
    that position does not exist, the third cell of the first row is used
    instead (the page may carry a different number of rows in this table).

    Args:
        poke_soup: parsed page, as returned by ``get_poke_soup``.

    Returns:
        The site-relative path stored in the link's ``href``.

    Raises:
        IndexError: if neither row has a third cell. The previous
            implementation returned from a ``finally`` clause, which
            swallowed this error and silently returned the first link of
            the whole table instead.
    """
    nav_table = poke_soup.find(id='mw-content-text').table
    rows      = nav_table.find_all('tr', recursive=False)

    try:
        next_cell = rows[1].find_all('td', recursive=False)[2]
    except IndexError:
        # Fall back to the first row; a failure here must propagate.
        next_cell = rows[0].find_all('td', recursive=False)[2]

    return next_cell.find('a')['href']

---
&nbsp;
- ### Pegar informações core do pokémon:
    - Index (número na NationalDex)
    - Nome
    - Categoria
    - Nome (JP)
    - Nome (JP/Romaji)

In [100]:
def get_core_poke_info(info_table):
    """Extract the identity fields from the top of the info table.

    Args:
        info_table: the info table element located by ``get_poke_info``.

    Returns:
        A 6-tuple ``(image_link, index, name, categories, jp_name,
        jp_rom_name)``. ``index`` is an int parsed from the "#NNN" text,
        ``categories`` is a list of strings, and ``image_link`` is the
        table's first image ``src`` prefixed with "https:".
    """
    header_row = info_table.tr.td.table.tr
    cells      = header_row.td.table.tr.find_all('td', recursive=False)

    categories = [span.text for span in cells[0].a.find_all('span')]

    # With several spans, drop those that already mention "Pokémon" and
    # append the word to each remaining one.
    if len(categories) > 1:
        categories = [
            f'{label} Pokémon'
            for label in categories
            if 'Pokémon' not in label
        ]

    image_link  = 'https:' + info_table.img['src']
    index       = int(header_row.th.find('a').text.replace('#', ''))
    name        = cells[0].big.text
    jp_name     = cells[1].span.text
    jp_rom_name = cells[1].i.text

    return (image_link, index, name, categories, jp_name, jp_rom_name)

---
&nbsp;
- ### Pegar tipos do pokémon.

In [93]:
def get_poke_types(info_table):
    """Extract the Pokémon's type names.

    Reads the links of a cell in the table nested in the info table's
    second row, skipping the "Unknown" entry.

    Args:
        info_table: the info table element located by ``get_poke_info``.

    Returns:
        List of type names, in page order.
    """
    second_row = info_table.find_all('tr', recursive=False)[1]
    # presumably the cell without a style attribute is the visible one
    type_cell  = second_row.table.find('td', attrs={'style': None})

    type_names = []
    for link in type_cell.find_all('a'):
        if link.text != 'Unknown':
            type_names.append(link.text)

    return type_names


---
&nbsp;
- ### Pegar habilidades do pokémon.

In [91]:
def get_poke_abilities(info_table):
    """Extract the Pokémon's abilities.

    Finds the "Ability" title link, walks up to its nearest enclosing
    ``td`` and reads every cell of the table inside it, skipping cells
    whose style contains "display: none". When a cell has a ``small``
    element, its text is appended in parentheses.

    Args:
        info_table: the info table element located by ``get_poke_info``.

    Returns:
        List of ability strings; empty if the link has no ``td`` ancestor.
    """
    ability_title = info_table.find('a', attrs={'title': 'Ability'})

    # Nearest enclosing <td>, if any.
    enclosing = next(
        (node for node in ability_title.parents if node.name == 'td'),
        None,
    )
    if enclosing is None:
        return []

    abilities = []
    for td in enclosing.table.find_all('td'):
        if td.has_attr('style') and 'display: none' in td['style']:
            continue

        note    = td.find('small')
        ability = td.find('a').text

        if note is not None:
            ability += f' ({note.text.strip()})'

        abilities.append(ability)

    return abilities

- ### Pegar gender ratios.

In [90]:
def get_poke_gender_ratio(info_table):
    """Extract the Pokémon's gender ratio entries.

    Finds the gender ratio title link, walks up to its nearest enclosing
    ``td`` and scans the cells of the table inside it. Cells with a style
    attribute or without a link are ignored; the link text of a qualifying
    cell is split on commas. If several cells qualify, the last one wins.

    Args:
        info_table: the info table element located by ``get_poke_info``.

    Returns:
        List of ratio strings; empty when no cell qualifies.
    """
    title_link = info_table.find('a', attrs={'title': 'List of Pokémon by gender ratio'})
    ratios     = []

    enclosing = next(
        (node for node in title_link.parents if node.name == 'td'),
        None,
    )

    if enclosing is not None:
        for td in enclosing.table.find_all('td'):
            link = td.find('a')

            if td.has_attr('style') or link is None:
                continue

            ratios = [part.strip() for part in link.text.split(',')]

    return ratios

- ### Pegar catch rate

In [81]:
def get_poke_catch_ratio(info_table):
    """Extract the Pokémon's catch rate text.

    Finds the "Catch rate" title link, walks up to its nearest enclosing
    ``td`` and returns the stripped text of the first cell of the table
    inside it.

    Args:
        info_table: the info table element located by ``get_poke_info``.

    Returns:
        The catch rate text, or "" if the link has no ``td`` ancestor.
    """
    title_link = info_table.find('a', attrs={'title': 'Catch rate'})

    for ancestor in title_link.parents:
        if ancestor.name != 'td':
            continue
        # Only the nearest enclosing <td> is considered.
        return ancestor.table.td.text.strip()

    return ""

- ### Pegar Egg Group.

In [82]:
def get_poke_egg_group(info_table):
    """Extract the Pokémon's egg groups.

    Finds the "Egg Group" title link, walks up to its nearest enclosing
    ``td`` and collects the text of every ``span`` in the table inside it.

    Args:
        info_table: the info table element located by ``get_poke_info``.

    Returns:
        List of egg group names; empty if the link has no ``td`` ancestor.
    """
    egg_group_link = info_table.find('a', attrs={'title': 'Egg Group'})

    enclosing = next(
        (node for node in egg_group_link.parents if node.name == 'td'),
        None,
    )
    if enclosing is None:
        return []

    return [span.text for span in enclosing.table.find_all('span')]

- ### Pegar Hatch Time

In [96]:
def get_poke_hatch_time(info_table):
    """Extract the Pokémon's hatch time text.

    Finds the "Egg cycle" title link, walks up to its nearest enclosing
    ``td`` and reads the stripped text of the first cell of the table
    inside it. If that text contains "Egg", only the part before it is
    kept and " (Egg not obtainable)" is appended.

    Args:
        info_table: the info table element located by ``get_poke_info``.

    Returns:
        The hatch time text, or None if the link has no ``td`` ancestor.
    """
    hatch_time_link = info_table.find('a', attrs={'title': 'Egg cycle'})

    enclosing = next(
        (node for node in hatch_time_link.parents if node.name == 'td'),
        None,
    )
    if enclosing is None:
        return None

    raw_text = enclosing.table.td.text.strip()
    before_egg, marker, _ = raw_text.partition('Egg')

    if marker:
        return f'{before_egg} (Egg not obtainable)'

    return raw_text

- ### Pegar Altura

In [84]:
def get_poke_height(info_table):
    """Extract the Pokémon's height values.

    Finds the height title link, walks up to its nearest enclosing ``td``
    and reads the stripped text of every cell of one row of the table
    inside it.

    Args:
        info_table: the info table element located by ``get_poke_info``.

    Returns:
        List of height strings; empty if the link has no ``td`` ancestor.
    """
    height_link = info_table.find('a', attrs={'title': 'List of Pokémon by height'})

    for ancestor in height_link.parents:
        if ancestor.name == 'td':
            # presumably the row without a style attribute is the visible one
            visible_row = ancestor.table.find('tr', attrs={'style': None})
            return [cell.text.strip() for cell in visible_row.find_all('td')]

    return []

- ### Pegar Peso

In [85]:
def get_poke_weight(info_table):
    """Extract the Pokémon's weight values.

    Finds the "Weight" title link, walks up to its nearest enclosing
    ``td`` and reads the stripped text of every cell of one row of the
    table inside it.

    Args:
        info_table: the info table element located by ``get_poke_info``.

    Returns:
        List of weight strings; empty if the link has no ``td`` ancestor.
    """
    weight_link = info_table.find('a', attrs={'title': 'Weight'})

    enclosing = next(
        (node for node in weight_link.parents if node.name == 'td'),
        None,
    )
    if enclosing is None:
        return []

    # presumably the row without a style attribute is the visible one
    visible_row = enclosing.table.find('tr', attrs={'style': None})

    return [cell.text.strip() for cell in visible_row.find_all('td')]

- ### MegaStone

In [86]:
def get_poke_megastones(info_table):
    """Extract the names of the Pokémon's Mega Stones.

    Collects the text of every link titled "Mega Stone", except links
    whose ``href`` is the generic "/wiki/Mega_Stone" page.

    Args:
        info_table: the info table element located by ``get_poke_info``.

    Returns:
        List of Mega Stone names; empty when there are none.
    """
    return [
        link.text
        for link in info_table.find_all('a', attrs={'title': 'Mega Stone'})
        if link['href'] != '/wiki/Mega_Stone'
    ]

# Main Cycle
---

In [88]:
# Start from a fresh CSV that contains only the header row.
recreate_file(CSV_NAME)
# The target file must be passed explicitly: write_csv_headers declares
# file_name as a parameter, so a bare call raised TypeError.
write_csv_headers(CSV_NAME)

# Crawl page by page, following each page's "next" link, starting at Bulbasaur.
next_pokemon_link = get_pokemon_link('Bulbasaur')
while next_pokemon_link != END_URL:
    poke_soup = get_poke_soup(f'{BASE_URL}{next_pokemon_link}')

    # Stop on a placeholder page with the alternative layout.
    if check_last_page(poke_soup):
        break

    poke_info = get_poke_info(poke_soup)
    message   = get_formatted_message(poke_info)

    # Persist the row first, then log what was scraped.
    append_csv(poke_info)
    print(next_pokemon_link)
    print(message)

    next_pokemon_link = get_next_pokemon_link(poke_soup)

print("\n\n########## FINISHED ##########\n\n")

/wiki/Bulbasaur_%28Pok%C3%A9mon%29
########################################################################################
# Image: https://cdn.bulbagarden.net/upload/thumb/2/21/001Bulbasaur.png/250px-001Bulbasaur.png
# Index: 1
# Name: Bulbasaur / フシギダネ (Fushigidane)
# Category: Seed Pokémon
# Types: Grass / Poison
# Abilities: 
# - Overgrow 
# - Chlorophyll (Hidden Ability)
# Gender Ratio: 87.5% male / 12.5% female
# Catch Rate: 45 (11.9%)
# Egg Groups: Monster / Grass
# Hatch Time: 5140 - 5396 steps
# Height: 2'04" / 0.7 m
# Weight: 15.2 lbs. / 6.9 kg
# Mega Stone: ---
########################################################################################
    
/wiki/Ivysaur_(Pok%C3%A9mon)
########################################################################################
# Image: https://cdn.bulbagarden.net/upload/thumb/7/73/002Ivysaur.png/250px-002Ivysaur.png
# Index: 2
# Name: Ivysaur / フシギソウ (Fushigisou)
# Category: Seed Pokémon
# Types: Grass / Poison
# Abilities: 
# - Ov

/wiki/Caterpie_(Pok%C3%A9mon)
########################################################################################
# Image: https://cdn.bulbagarden.net/upload/thumb/5/5d/010Caterpie.png/250px-010Caterpie.png
# Index: 10
# Name: Caterpie / キャタピー (Caterpie)
# Category: Worm Pokémon
# Type: Bug
# Abilities: 
# - Shield Dust 
# - Run Away (Hidden Ability)
# Gender Ratio: 50% male / 50% female
# Catch Rate: 255 (43.9%)
# Egg Group: Bug
# Hatch Time: 3855 - 4111 steps
# Height: 1'00" / 0.3 m
# Weight: 6.4 lbs. / 2.9 kg
# Mega Stone: ---
########################################################################################
    
/wiki/Metapod_(Pok%C3%A9mon)
########################################################################################
# Image: https://cdn.bulbagarden.net/upload/thumb/c/cd/011Metapod.png/250px-011Metapod.png
# Index: 11
# Name: Metapod / トランセル (Transel)
# Category: Cocoon Pokémon
# Type: Bug
# Ability: 
# - Shed Skin
# Gender Ratio: 50% male / 50% female
# Catch

KeyboardInterrupt: 

# Teste
---

In [47]:
def tst_pok(poke_name, backup=False):
    """Scrape a single Pokémon page and print its formatted report.

    Args:
        poke_name: the Pokémon's name, as accepted by ``get_pokemon_link``.
        backup: when True, also save the prettified HTML of the fetched
            page to ``a.html`` in the current directory.
    """
    next_pokemon_link = get_pokemon_link(poke_name)
    poke_soup         = get_poke_soup(f'{BASE_URL}{next_pokemon_link}')
    poke_info         = get_poke_info(poke_soup)

    if backup:
        # UTF-8 is explicit because the page contains non-ASCII text
        # (e.g. Japanese names), which a platform-default encoding may
        # not be able to encode.
        with open('a.html', 'w', encoding='utf-8') as f:
            f.write(poke_soup.prettify())

    print(get_formatted_message(poke_info))

In [98]:
# Manual check on a single page: scrape Ponyta and, because backup=True,
# also dump the fetched HTML to a.html.
tst_pok('ponyta', True)

########################################################################################
# Image: https://cdn.bulbagarden.net/upload/thumb/3/3b/077Ponyta.png/250px-077Ponyta.png
# Index: 77
# Name: Ponyta / ポニータ (Ponyta)
# Categories: Fire Horse Pokémon / Unique Horn Pokémon
# Type: Fire
# Abilities: 
# - Run Away (Ponyta) 
# - Run Away (Galarian Ponyta) 
# - Flame Body (Ponyta Hidden Ability)
# Gender Ratio: 50% male / 50% female
# Catch Rate: 190 (35.2%)
# Egg Group: Field
# Hatch Time: 5140 - 5396 steps
# Height: 3'03" / 1.0 m
# Weight: 66.1 lbs. / 30.0 kg
# Mega Stone: ---
########################################################################################
    
