In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

import json
import os

# Wiki root; page titles (generation lists and individual Pokemon) are appended to it
base_url = 'https://www.wikidex.net/wiki/'

# Generation list pages to scrape in this run.
# Other list pages that can be added here:
#   'Lista_de_Pokémon_de_la_primera_generación'
#   'Lista_de_Pokémon_de_la_segunda_generación'
#   'Lista_de_Pokémon_de_la_cuarta_generación'
#   'Lista_de_Pokémon_de_la_quinta_generación'
#   'Lista_de_Pokémon_de_la_sexta_generación'
gens = ['Lista_de_Pokémon_de_la_tercera_generación']

# Path prefix of the output files; the generation name and '.json' are appended to it
pokedex_url = './pokedex/'

# Chrome options and driver service used to create the web driver
options = webdriver.ChromeOptions()
for chrome_flag in (
    '--headless',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    '--disable-blink-features=AutomationControlled',
):
    options.add_argument(chrome_flag)

# ChromeDriverManager().install() returns the driver path handed to the Service
service = Service(ChromeDriverManager().install())

In [None]:
def scrape_pokemon(driver, pkmn_name):
    """Scrape weight, abilities and base stats from one Pokemon's wiki page.

    Args:
        driver: Selenium web driver used to load ``base_url + pkmn_name``.
        pkmn_name: Page title of the Pokemon, appended to ``base_url``.

    Returns:
        dict with three keys:
            'weight'    -- float parsed from the "Peso del Pokémon" row after
                           removing "kg" and turning the decimal comma into a dot.
            'abilities' -- dict mapping each ability link's position in the
                           "Habilidades que puede conocer" row to its text
                           (links whose text is one character or shorter are
                           skipped, so keys may be non-contiguous), plus the
                           key 'ha' when a "Habilidad oculta" row exists.
            'baseStats' -- dict with int values for 'hp', 'at', 'df', 'sa',
                           'sd' and 'sp'.

    Raises:
        ValueError: if a required page element is missing, or if the weight
            or a stat value is not a valid number.
        IndexError: if a stats row contains no ``<td>`` cell.
    """
    def require(node, description):
        # Turn a missing element into a readable error instead of the opaque
        # "'NoneType' object has no attribute ..." AttributeError.
        if node is None:
            raise ValueError(f'{description} not found on the page of {pkmn_name!r}')
        return node

    # Load the page of the requested Pokemon and parse its HTML
    driver.get(base_url + pkmn_name)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Weight: text of the first cell in the row titled "Peso del Pokémon"
    weight_row = require(soup.find("tr", attrs={"title": "Peso del Pokémon"}), 'weight row')
    weight_cell = require(weight_row.td, 'weight cell')
    weight_text = weight_cell.text.replace("kg", "").replace(",", ".").strip()

    # Regular abilities: every link inside the first cell of the abilities row
    abilities_row = require(
        soup.find("tr", attrs={"title": "Habilidades que puede conocer"}), 'abilities row')
    abilities_cell = require(abilities_row.td, 'abilities cell')
    map_abilities = {
        idx: link.text
        for idx, link in enumerate(abilities_cell.find_all("a"))
        if len(link.text) > 1
    }

    # Hidden ability: the row is optional, so its absence is not an error
    hidden_row = soup.find("tr", attrs={"title": "Habilidad oculta"})
    if hidden_row is not None:
        hidden_cell = require(hidden_row.td, 'hidden ability cell')
        hidden_link = require(hidden_cell.a, 'hidden ability link')
        map_abilities['ha'] = hidden_link.text

    # Stats table: the first <table> following the parent of the heading span
    stats_heading = require(
        soup.find("span", attrs={"id": "Caracter.C3.ADsticas_de_combate"}), 'combat stats heading')
    stats_table = require(stats_heading.parent.find_next('table'), 'combat stats table')
    stat_rows = stats_table.find_all("tr")

    stat_keys = ['hp', 'at', 'df', 'sa', 'sd', 'sp']
    # Row 0 is skipped (presumably the table header -- confirm against the page),
    # so one more row than there are stats is needed.
    if len(stat_rows) < len(stat_keys) + 1:
        raise ValueError(
            f'combat stats table of {pkmn_name!r} has {len(stat_rows)} rows, '
            f'expected at least {len(stat_keys) + 1}')

    stats = {}
    for idx, stat in enumerate(stat_keys):
        # The value is read from the first <td> of each row
        stats[stat] = int(stat_rows[idx + 1].find_all("td")[0].text.strip())

    # Return a dictionary with the Pokemon information
    return {'weight': float(weight_text), 'abilities': map_abilities, 'baseStats': stats}

In [None]:
def save_file(pokedex_url, gen, pkmn_list):
    """Write ``pkmn_list`` as indented UTF-8 JSON to ``pokedex_url + gen + '.json'``.

    Args:
        pokedex_url: Path prefix of the output file; it is concatenated as-is,
            so a directory must end with a path separator.
        gen: Name used as the file stem.
        pkmn_list: JSON-serialisable data to dump.

    The directory part of the resulting path is created when it does not
    exist yet, so a fresh checkout does not fail with FileNotFoundError.
    Non-ASCII characters are written verbatim (``ensure_ascii=False``).
    """
    path = pokedex_url + gen + '.json'

    # dirname is '' when the path has no directory part; makedirs('') would fail
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(path, 'w', encoding="UTF-8") as fp:
        json.dump(pkmn_list, fp, indent=4, ensure_ascii=False)
        
        
def searchSprite(driver, soup, num_dex):
    """Return the sprite URL for a dex number and print that number.

    ``num_dex`` is converted with ``int`` before being placed in the URL, so
    a zero-padded string such as '025' yields 25. ``driver`` and ``soup`` are
    accepted but not used.
    """
    sprite_id = int(num_dex)
    print(f"Sprite number: {sprite_id}")
    return f'https://cdn.toast-studio.com/typedex/home/_{sprite_id}_regular.png'


# Set up the web driver and scrape data for each generation and Pokemon
with webdriver.Chrome(service=service, options=options) as driver:
    for gen in gens:
        # State carried from one table row to the next: a cell that spans
        # several rows (rowspan) is only present in the first of those rows.
        # It is reset per generation so nothing leaks in from a previous list page.
        numDexTemp = 0
        nameRow = None
        typesRow = None
        failed_dex_nums = []
        try:
            driver.get(base_url + gen)
            pkmn_soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Remove <sup> elements so their text is not mixed into the cell text
            for s in pkmn_soup.select('sup'):
                s.extract()

            results = pkmn_soup.select('table.tabpokemon tbody tr')
            pkmn_list = []
            for result in results:
                fields = result.select('td')

                # Rows without data cells (e.g. header rows made only of <th>)
                # or with a single cell cannot be mapped to a Pokemon; indexing
                # them would raise IndexError and abort the whole generation.
                if len(fields) < 2:
                    continue

                # Column mapping by number of cells. Full rows are presumably
                # number, image, name, types, ... -- confirm against the page.
                if len(fields) == 4:
                    # Continuation row: dex number carried over
                    nameRow = fields[1]
                    typesRow = fields[2]
                elif len(fields) == 3:
                    # Continuation row: dex number and types carried over
                    nameRow = fields[1]
                elif len(fields) == 2:
                    # Continuation row: dex number and name carried over
                    typesRow = fields[1]
                else:
                    # Full row: only here is fields[0] the dex number cell, so
                    # only here may the carried number be updated.
                    numDexTemp = fields[0].text.strip()
                    nameRow = fields[2]
                    typesRow = fields[3]
                pkmn_dex_num = numDexTemp

                if nameRow is None or typesRow is None:
                    # A continuation row appeared before any full row
                    print(f'Skipping row without name/types cell (dex number {pkmn_dex_num}, {gen})')
                    continue

                # Isolate failures per Pokemon so one bad page does not stop
                # the rest of the generation; every failure is reported below.
                try:
                    pkmn_name = nameRow.select_one('a').get('title')
                    print(f'{pkmn_dex_num}. {pkmn_name} ({gen})')

                    pkmn_types = [a.get('title').replace('Tipo ', '') for a in typesRow.select('a')]
                    tipo1 = pkmn_types[0]
                    tipo2 = pkmn_types[1] if len(pkmn_types) > 1 else 'undefined'

                    pkmn_sprite = searchSprite(driver, pkmn_soup, pkmn_dex_num)

                    # Scrape the Pokemon data
                    pkmn_data = scrape_pokemon(driver, pkmn_name)
                except Exception as err:
                    failed_dex_nums.append(pkmn_dex_num)
                    print(f'Error while scraping dex number {pkmn_dex_num} ({gen}): {err}')
                    continue

                # Create a new dictionary with the Pokemon data. 'ivs' and 'evs'
                # get separate lists so changing one never changes the other.
                pkmn_dict = {'numDex': pkmn_dex_num, 'species': pkmn_name, 'type1': tipo1,
                             'type2': tipo2, 'ivs': [0] * 6, 'evs': [0] * 6, 'level': 50,
                             'sprite': pkmn_sprite}
                pkmn_dict.update(pkmn_data)

                # Add the Pokemon dictionary to the pkmn_list
                pkmn_list.append(pkmn_dict)

                # Saved after every Pokemon so an interrupted run keeps its progress
                save_file(pokedex_url, gen, pkmn_list)

            if pkmn_list:
                print(f'Successfully saved data for {gen}.')
            else:
                print(f'No Pokemon were scraped for {gen}; nothing was saved.')
            if failed_dex_nums:
                print(f'Dex numbers that failed for {gen}: {failed_dex_nums}')
        except Exception as err:
            print(f'Error: {err}')

# Print a message indicating that the script has finished
print('Script finished.')