In [179]:
import pandas as pd
import re
import requests
from time import sleep
from bs4 import BeautifulSoup
import json

# LimitlessVGC URLs

In [3]:
# Follow with number e.g. .../events/23. Latest event is number 296, or just .../events/
events_url = "https://limitlessvgc.com/events/"

# Follow by a number for specific player; unsure of order, maybe alphabetic?
# Can filter by points, money, or top-8s:
#   - .../players/?rank=points
#   - .../players/?rank=money
#   - .../players/?rank=cuts
players_url = "https://limitlessvgc.com/players/"

# Follow by a Pokemon name for specific Pokemon
# Can filter by points, money, or top-8s:
#   - .../pokemon/?rank=points
#   - .../pokemon/?rank=money
#   - .../pokemon/?rank=cuts
# Other filters/url to list every Pokemon ever used in VGC:
#   https://limitlessvgc.com/pokemon/?time=all&type=all&region=all&format=all&rank=pts
# Individual pages list most common partners, moves, natures, abilities, held items, latest results,
# and most successful players
pokemon_url = "https://limitlessvgc.com/pokemon/"

# In reverse chronological order
# Same filters as Pokemon
# This is what we'll be using most
teams_url = "https://limitlessvgc.com/teams/"

# Extra parameters:
# ?pg=# where # is page number
# ?show=# where # is number of items per page or 'all' to show everything

# Pikalytics URLs

This one is more complicated because Pikalytics doesn't have neat pages for its data, we'll have to scrape HTML elements instead.

# PokemonDB URLs

In [4]:
# Followed by name OR "all" to get a list of Pokemon names with stats
pokemondb_pokedex_url = "https://pokemondb.net/pokedex/"

# Followed by name OR "all" to get a list of move names with details
# Additional arguments to ".../move/all":
# - #cat=physical/special/status
# - #type=type_name
pokemondb_move_url = "https://pokemondb.net/move/"

# Followed by name OR "all" to get a list of item names with details
# Additional arguments to ".../item/all":
# - #cat=battle/berries/general/hold/machines/medicine/pokeballs/unknown
pokemondb_item_url = "https://pokemondb.net/item/"

# PokeAPI endpoints

In [12]:
pokeapi_pokemon_species_endpoint = "https://pokeapi.co/api/v2/pokemon-species/"
pokeapi_pokemon_endpoint = "https://pokeapi.co/api/v2/pokemon/"
pokeapi_berry_endpoint = "https://pokeapi.co/api/v2/berry/"
pokeapi_item_endpoint = "https://pokeapi.co/api/v2/item/"
pokeapi_move_endpoint = "https://pokeapi.co/api/v2/move/"

# Useful functions

In [5]:
def save_html (html, path):
    with open(path, 'wb') as f:
        f.write(html)

In [6]:
def open_html (path):
    with open(path, 'rb') as f:
        return f.read()

# Scraping

## Teams data

What teams were used in VGC 2023 Series 1 events? Teams data is split into events, and each event has its own ranking system; we should incorporate that information into out data somehow.

In [140]:
teams_req = requests.get(teams_url + "?format=vgc23s1&show=all")
teams_soup = BeautifulSoup(teams_req.content, "html.parser")
rows = teams_soup.select("tr")
rows

[<tr><th>Place</th><th>Team</th><th>Player</th></tr>,
 <tr><th class="table-sub-heading" colspan="3"><a href="/events/297">20th January 2023 - Regional Liverpool, Great Britain</a></th></tr>,
 <tr><td><a href="/teams/284">1st</a></td><td><a class="team-link" href="/teams/284"><div class="vgc-team"><span class="tt" data-placement="top" data-toggle="tooltip" title="Pawmot"><img class="pokemon-icon-gen9" src="/wp-content/media/icons/gen9/pawmot.png"/></span><span class="tt" data-placement="top" data-toggle="tooltip" title="Gholdengo"><img class="pokemon-icon-gen9" src="/wp-content/media/icons/gen9/gholdengo.png"/></span><span class="tt" data-placement="top" data-toggle="tooltip" title="Dragonite"><img class="pokemon-icon-gen9" src="/wp-content/media/icons/gen9/dragonite.png"/></span><span class="tt" data-placement="top" data-toggle="tooltip" title="Tatsugiri"><img class="pokemon-icon-gen9" src="/wp-content/media/icons/gen9/tatsugiri.png"/></span><span class="tt" data-placement="top" data-

Looks like the column labels are part of the `tbody` tag in this table, as shown by inspecting page HTML, so we'll drop the first row.

In [209]:
rows = rows[1:]
rows

[<tr><td><a href="/teams/285">2nd</a></td><td><a class="team-link" href="/teams/285"><div class="vgc-team"><span class="tt" data-placement="top" data-toggle="tooltip" title="Annihilape"><img class="pokemon-icon-gen9" src="/wp-content/media/icons/gen9/annihilape.png"/></span><span class="tt" data-placement="top" data-toggle="tooltip" title="Torkoal"><img class="pokemon-icon-gen9" src="/wp-content/media/icons/gen9/torkoal.png"/></span><span class="tt" data-placement="top" data-toggle="tooltip" title="Armarouge"><img class="pokemon-icon-gen9" src="/wp-content/media/icons/gen9/armarouge.png"/></span><span class="tt" data-placement="top" data-toggle="tooltip" title="Female Indeedee"><img class="pokemon-icon-gen9" src="/wp-content/media/icons/gen9/indeedee-f.png"/></span><span class="tt" data-placement="top" data-toggle="tooltip" title="Hatterene"><img class="pokemon-icon-gen9" src="/wp-content/media/icons/gen9/hatterene.png"/></span><span class="tt" data-placement="top" data-toggle="tooltip

Now we can begin extract team information from these rows. Note that we are not going to extract the Pokemon names from this page, because we will extract more detailed information about each Pokemon from the team's individual page soon.

In [142]:
teams_data = []
event = '0'

for row in rows:
    if row.find("th", attrs={"class": "table-sub-heading"}):
        event = row.select_one("tr th.table-sub-heading a")["href"][-3:]
        continue
    d = dict()
    d["event"] = event
    d["team_id"] = row.select_one("tr td a:nth-of-type(1)")["href"].split("/")[2]
    d["rank"] = int(row.select_one("tr td a:nth-of-type(1)").text.strip()[:-2])
    teams_data.append(d)

In [211]:
teams_data

[{'event': '297', 'team_id': '284', 'rank': 1},
 {'event': '297', 'team_id': '285', 'rank': 2},
 {'event': '297', 'team_id': '286', 'rank': 3},
 {'event': '297', 'team_id': '287', 'rank': 4},
 {'event': '297', 'team_id': '288', 'rank': 5},
 {'event': '297', 'team_id': '289', 'rank': 6},
 {'event': '297', 'team_id': '290', 'rank': 7},
 {'event': '297', 'team_id': '291', 'rank': 8},
 {'event': '297', 'team_id': '292', 'rank': 9},
 {'event': '297', 'team_id': '293', 'rank': 10},
 {'event': '297', 'team_id': '294', 'rank': 11},
 {'event': '297', 'team_id': '295', 'rank': 12},
 {'event': '297', 'team_id': '296', 'rank': 13},
 {'event': '297', 'team_id': '297', 'rank': 14},
 {'event': '297', 'team_id': '298', 'rank': 15},
 {'event': '297', 'team_id': '299', 'rank': 16},
 {'event': '297', 'team_id': '300', 'rank': 17},
 {'event': '297', 'team_id': '301', 'rank': 18},
 {'event': '297', 'team_id': '302', 'rank': 19},
 {'event': '297', 'team_id': '303', 'rank': 20},
 {'event': '297', 'team_id': 

In [212]:
with open("./data/vgc2023s1/teams-data.json", "w") as f:
    json.dump(teams_data, f)

Now we just add Pokemon info. We really want to know more than just the names, though, which is why we didn't include Pokemon information in the original teams data scrape. Specifically, we want to know their
- type(s),
- Tera type,
- held item,
- nature,
- base stats, and
- moves.

To integrate some of those properties into our dataframe, we're going to have to go to each team's individual page and scrape it. A team's individual page can be found at `https://limitlessvgc.com/teams/{team_id}`.

Normally we would have to scrape the individual team page, but luckily there is a hidden `textarea` with `id` "export0" that contains the information for every Pokemon on the team in Pokemon Showdown export format, which is considered the standard format for competitive Pokemon infomration. For this reason, we will use the Showdown Pokedex JSON file we downloaded to find each Pokemon's types and base stats easily.

To make things easier we'll start by extracting information just for team 284.

In [119]:
team_284_req = requests.get(teams_url + "284/")

In [120]:
team_284_soup = BeautifulSoup(team_284_req.content, "html.parser")
team_284_export = team_284_soup.select_one("textarea#export0").text.strip()
team_284_export

'Dondozo @ Leftovers\r\nAbility: Unaware\r\nTera Type: Dragon\r\nEVs: 4 HP / 252 Atk / 4 Def / 36 SpD / 212 Spe\r\nJolly Nature\r\n- Wave Crash\r\n- Substitute\r\n- Order Up\r\n- Protect\r\n\r\nTatsugiri @ Choice Scarf\r\nAbility: Commander\r\nTera Type: Water\r\nEVs: 100 Def / 172 SpA / 236 Spe\r\nIVs: 0 Atk\r\nModest Nature\r\n- Draco Meteor\r\n- Muddy Water\r\n- Icy Wind\r\n- Sleep Talk\r\n\r\nGholdengo @ Choice Specs\r\nAbility: Good as Gold\r\nTera Type: Steel\r\nEVs: 68 HP / 52 Def / 132 SpA / 4 SpD / 252 Spe\r\nIVs: 0 Atk\r\nModest Nature\r\n- Make It Rain\r\n- Shadow Ball\r\n- Thunderbolt\r\n- Power Gem\r\n\r\nMurkrow @ Eviolite\r\nAbility: Prankster\r\nTera Type: Ghost\r\nEVs: 252 HP / 44 Def / 212 SpD\r\nIVs: 0 Atk\r\nCalm Nature\r\n- Tailwind\r\n- Foul Play\r\n- Quash\r\n- Haze\r\n\r\nDragonite @ Assault Vest\r\nAbility: Inner Focus\r\nTera Type: Flying\r\nEVs: 196 HP / 252 Atk / 4 Def / 4 SpD / 52 Spe\r\nAdamant Nature\r\n- Extreme Speed\r\n- Stomping Tantrum\r\n- Tera Blas

In [121]:
team_284_info = team_284_export.split("\r\n\r\n")
team_284_info

['Dondozo @ Leftovers\r\nAbility: Unaware\r\nTera Type: Dragon\r\nEVs: 4 HP / 252 Atk / 4 Def / 36 SpD / 212 Spe\r\nJolly Nature\r\n- Wave Crash\r\n- Substitute\r\n- Order Up\r\n- Protect',
 'Tatsugiri @ Choice Scarf\r\nAbility: Commander\r\nTera Type: Water\r\nEVs: 100 Def / 172 SpA / 236 Spe\r\nIVs: 0 Atk\r\nModest Nature\r\n- Draco Meteor\r\n- Muddy Water\r\n- Icy Wind\r\n- Sleep Talk',
 'Gholdengo @ Choice Specs\r\nAbility: Good as Gold\r\nTera Type: Steel\r\nEVs: 68 HP / 52 Def / 132 SpA / 4 SpD / 252 Spe\r\nIVs: 0 Atk\r\nModest Nature\r\n- Make It Rain\r\n- Shadow Ball\r\n- Thunderbolt\r\n- Power Gem',
 'Murkrow @ Eviolite\r\nAbility: Prankster\r\nTera Type: Ghost\r\nEVs: 252 HP / 44 Def / 212 SpD\r\nIVs: 0 Atk\r\nCalm Nature\r\n- Tailwind\r\n- Foul Play\r\n- Quash\r\n- Haze',
 'Dragonite @ Assault Vest\r\nAbility: Inner Focus\r\nTera Type: Flying\r\nEVs: 196 HP / 252 Atk / 4 Def / 4 SpD / 52 Spe\r\nAdamant Nature\r\n- Extreme Speed\r\n- Stomping Tantrum\r\n- Tera Blast\r\n- Low 

First, however, we need to account for a potential problems: cosmetic forms. Some Pokemon, like Gastrodon, come in variants (Gastrodon-East and Gastrodon-West) which have exactly the same stats, and are therefore lumped under the same entry in the Showdown Pokedex. These Pokemon do, however, appear in the `cosmeticFormes` list in their base entries, so we will create a dictionary that uses the cosmetic form name as the key and the base entry name as the corresponding value. Every form name is listed the base entry's `formeOrder` list, which makes the job easy.

Note that the use of "Forme" is not a typo; Pokemon uses the word "form" as an improper noun to refer to variants and "Forme" as the proper noun that appears in the form name, e.g. "Giratina Origin Forme."

In [241]:
cosmetic_form_dict = dict()

with open("../showdown-data/pokedex.json", "r") as f:
   showdown_pokedex = json.load(f)

for (name, entry) in showdown_pokedex.items():
    try:
        if "cosmeticFormes" in entry:
          for form in entry["cosmeticFormes"]:
              cosmetic_form_dict[form] = name
    except:
        continue

In [242]:
cosmetic_form_dict

{'Unown-B': 'unown',
 'Unown-C': 'unown',
 'Unown-D': 'unown',
 'Unown-E': 'unown',
 'Unown-F': 'unown',
 'Unown-G': 'unown',
 'Unown-H': 'unown',
 'Unown-I': 'unown',
 'Unown-J': 'unown',
 'Unown-K': 'unown',
 'Unown-L': 'unown',
 'Unown-M': 'unown',
 'Unown-N': 'unown',
 'Unown-O': 'unown',
 'Unown-P': 'unown',
 'Unown-Q': 'unown',
 'Unown-R': 'unown',
 'Unown-S': 'unown',
 'Unown-T': 'unown',
 'Unown-U': 'unown',
 'Unown-V': 'unown',
 'Unown-W': 'unown',
 'Unown-X': 'unown',
 'Unown-Y': 'unown',
 'Unown-Z': 'unown',
 'Unown-Exclamation': 'unown',
 'Unown-Question': 'unown',
 'Burmy-Sandy': 'burmy',
 'Burmy-Trash': 'burmy',
 'Shellos-East': 'shellos',
 'Gastrodon-East': 'gastrodon',
 'Deerling-Summer': 'deerling',
 'Deerling-Autumn': 'deerling',
 'Deerling-Winter': 'deerling',
 'Sawsbuck-Summer': 'sawsbuck',
 'Sawsbuck-Autumn': 'sawsbuck',
 'Sawsbuck-Winter': 'sawsbuck',
 'Vivillon-Archipelago': 'vivillon',
 'Vivillon-Continental': 'vivillon',
 'Vivillon-Elegant': 'vivillon',
 'Vivil

Okay, now, in theory, every Pokemon name we get from the LimitlessVGC's individual team page's Showdown format export should be accounted for in the Showdown Pokedex JSON, either by converting the name directly to its likely entry (the preferred way since we don't want to iterate over the Pokedex twi for every Pokemon) or, if that fails, by querying the `cosmetic_forms_dict`.

Now, let's create team sheet containing the details of every Pokemon in team 284.

In [263]:
team_sheet = []

pattern = re.compile('[\W_]+')

for pokemon in team_284_info:

    d = dict()
    
    pokemon_info = pokemon.split("\r\n")

    d["species"] = pokemon_info[0].split("@")[0].strip()
    d["showdown_pokedex_name"] = pattern.sub("", d["species"].lower())
    d["item"] = "-".join(pokemon_info[0].split("@")[1].strip().lower().split()) if len(pokemon_info[0].split("@")) == 2 else None
    d["ability"] = "-".join(pokemon_info[1].split(":")[1].strip().lower().split())
    d["tera_type"] = pokemon_info[2].split(":")[1].strip().lower()
    d["nature"] = pokemon_info[-5].split()[0].strip().lower()
    d["move_1"] = "-".join(pokemon_info[-4][2:].strip().lower().split())
    d["move_2"] = "-".join(pokemon_info[-3][2:].strip().lower().split())
    d["move_3"] = "-".join(pokemon_info[-2][2:].strip().lower().split())
    d["move_4"] = "-".join(pokemon_info[-1][2:].strip().lower().split())

    team_sheet.append(d)

In [264]:
team_sheet

[{'species': 'Dondozo',
  'showdown_pokedex_name': 'dondozo',
  'item': 'leftovers',
  'ability': 'unaware',
  'tera_type': 'dragon',
  'nature': 'jolly',
  'move_1': 'wave-crash',
  'move_2': 'substitute',
  'move_3': 'order-up',
  'move_4': 'protect'},
 {'species': 'Tatsugiri',
  'showdown_pokedex_name': 'tatsugiri',
  'item': 'choice-scarf',
  'ability': 'commander',
  'tera_type': 'water',
  'nature': 'modest',
  'move_1': 'draco-meteor',
  'move_2': 'muddy-water',
  'move_3': 'icy-wind',
  'move_4': 'sleep-talk'},
 {'species': 'Gholdengo',
  'showdown_pokedex_name': 'gholdengo',
  'item': 'choice-specs',
  'ability': 'good-as-gold',
  'tera_type': 'steel',
  'nature': 'modest',
  'move_1': 'make-it-rain',
  'move_2': 'shadow-ball',
  'move_3': 'thunderbolt',
  'move_4': 'power-gem'},
 {'species': 'Murkrow',
  'showdown_pokedex_name': 'murkrow',
  'item': 'eviolite',
  'ability': 'prankster',
  'tera_type': 'ghost',
  'nature': 'calm',
  'move_1': 'tailwind',
  'move_2': 'foul-play

Now we query the Showdown Pokedex for each Pokemon, using the entry name we generated earlier.

In [254]:
with open("../showdown-data/pokedex.json", "r") as f:
   showdown_pokedex = json.load(f)

for team_member in team_sheet:
      
    showdown_pokedex_entry = showdown_pokedex[team_member["showdown_pokedex_name"]]

    team_member["type_1"] = showdown_pokedex_entry["types"][0].lower()
    team_member["type_2"] = showdown_pokedex_entry["types"][1].lower() if len(showdown_pokedex_entry["types"]) > 1 else None
    team_member["base_hp"] = showdown_pokedex_entry["baseStats"]["hp"]
    team_member["base_atk"] = showdown_pokedex_entry["baseStats"]["atk"]
    team_member["base_def"] = showdown_pokedex_entry["baseStats"]["def"]
    team_member["base_spa"] = showdown_pokedex_entry["baseStats"]["spa"]
    team_member["base_spd"] = showdown_pokedex_entry["baseStats"]["spd"]
    team_member["base_spe"] = showdown_pokedex_entry["baseStats"]["spe"]

In [255]:
team_sheet

[{'species': 'Dondozo',
  'showdown_pokedex_name': 'dondozo',
  'item': 'leftovers',
  'ability': 'unaware',
  'tera_type': 'dragon',
  'nature': 'jolly',
  'move_1': 'wave-crash',
  'move_2': 'substitute',
  'move_3': 'order-up',
  'move_4': 'protect',
  'type_1': 'water',
  'type_2': None,
  'base_hp': 150,
  'base_atk': 100,
  'base_def': 115,
  'base_spa': 65,
  'base_spd': 65,
  'base_spe': 35},
 {'species': 'Tatsugiri',
  'showdown_pokedex_name': 'tatsugiri',
  'item': 'choice-scarf',
  'ability': 'commander',
  'tera_type': 'water',
  'nature': 'modest',
  'move_1': 'draco-meteor',
  'move_2': 'muddy-water',
  'move_3': 'icy-wind',
  'move_4': 'sleep-talk',
  'type_1': 'dragon',
  'type_2': 'water',
  'base_hp': 68,
  'base_atk': 50,
  'base_def': 60,
  'base_spa': 120,
  'base_spd': 95,
  'base_spe': 82},
 {'species': 'Gholdengo',
  'showdown_pokedex_name': 'gholdengo',
  'item': 'choice-specs',
  'ability': 'good-as-gold',
  'tera_type': 'steel',
  'nature': 'modest',
  'move_

Exactly what we wanted! Now we need to do the same thing for every Pokemon in every team! This time we'll also add the ID and rank of the team the Pokemon is in.

In [265]:
team_pokemon = []

pattern = re.compile("[\W_]+")

for team in teams_data:

    team_req = requests.get(teams_url + team["team_id"])
    team_soup = BeautifulSoup(team_req.content, "html.parser")
    team_export = team_soup.select_one("textarea#export0").text.strip()
    team_info = team_export.split("\r\n\r\n")

    for pokemon in team_info:

        d = dict()
    
        pokemon_info = pokemon.split("\r\n")

        d["team_id"] = team["team_id"]
        d["rank"] = team["rank"]
        
        d["species"] = pokemon_info[0].split("@")[0].strip()
        d["showdown_pokedex_name"] = pattern.sub("", d["species"].lower()) if d["species"] not in cosmetic_form_dict else cosmetic_form_dict[d["species"]]
        d["item"] = "-".join(pokemon_info[0].split("@")[1].strip().lower().split()) if len(pokemon_info[0].split("@")) == 2 else None
        d["ability"] = "-".join(pokemon_info[1].split(":")[1].strip().lower().split())
        d["tera_type"] = pokemon_info[2].split(":")[1].lower().strip()
        d["nature"] = pokemon_info[-5].split()[0].strip().lower()
        d["move_1"] = "-".join(pokemon_info[-4][2:].strip().lower().split())
        d["move_2"] = "-".join(pokemon_info[-3][2:].strip().lower().split())
        d["move_3"] = "-".join(pokemon_info[-2][2:].strip().lower().split())
        d["move_4"] = "-".join(pokemon_info[-1][2:].strip().lower().split())

        showdown_pokedex_entry = showdown_pokedex[d["showdown_pokedex_name"]]

        d["type_1"] = showdown_pokedex_entry["types"][0].lower()
        d["type_2"] = showdown_pokedex_entry["types"][1].lower() if len(showdown_pokedex_entry["types"]) > 1 else None
        d["base_hp"] = showdown_pokedex_entry["baseStats"]["hp"]
        d["base_atk"] = showdown_pokedex_entry["baseStats"]["atk"]
        d["base_def"] = showdown_pokedex_entry["baseStats"]["def"]
        d["base_spa"] = showdown_pokedex_entry["baseStats"]["spa"]
        d["base_spd"] = showdown_pokedex_entry["baseStats"]["spd"]
        d["base_spe"] = showdown_pokedex_entry["baseStats"]["spe"]

        team_pokemon.append(d)

    sleep(1)

In [266]:
team_pokemon

[{'team_id': '284',
  'rank': 1,
  'species': 'Dondozo',
  'showdown_pokedex_name': 'dondozo',
  'item': 'leftovers',
  'ability': 'unaware',
  'tera_type': 'dragon',
  'nature': 'jolly',
  'move_1': 'wave-crash',
  'move_2': 'substitute',
  'move_3': 'order-up',
  'move_4': 'protect',
  'type_1': 'water',
  'type_2': None,
  'base_hp': 150,
  'base_atk': 100,
  'base_def': 115,
  'base_spa': 65,
  'base_spd': 65,
  'base_spe': 35},
 {'team_id': '284',
  'rank': 1,
  'species': 'Tatsugiri',
  'showdown_pokedex_name': 'tatsugiri',
  'item': 'choice-scarf',
  'ability': 'commander',
  'tera_type': 'water',
  'nature': 'modest',
  'move_1': 'draco-meteor',
  'move_2': 'muddy-water',
  'move_3': 'icy-wind',
  'move_4': 'sleep-talk',
  'type_1': 'dragon',
  'type_2': 'water',
  'base_hp': 68,
  'base_atk': 50,
  'base_def': 60,
  'base_spa': 120,
  'base_spd': 95,
  'base_spe': 82},
 {'team_id': '284',
  'rank': 1,
  'species': 'Gholdengo',
  'showdown_pokedex_name': 'gholdengo',
  'item': 

Pretty good. Now we need to figure out each Pokemon's type(s) and base stats given only its `showdown_pokedex_name`.

Lovely.

In [267]:
with open("./data/vgc2023s1/team-pokemon.json", "w") as f:
    json.dump(team_pokemon, f)