In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
from time import sleep
from random import randint

In [2]:
# Constants
BASE_URL = "https://www.vgchartz.com"
SEARCH_URL = "https://www.vgchartz.com/games/games.php"
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

In [3]:
# Scrapes the main Pokemon games listing page, excluding the largely N/A vgchartz_score and user_score columns and sales columns (sales scraped from individual game urls)
def get_game_list(query='pokemon', limit=200):
    params = {
        'name': query,
        'results': limit,
        'order': 1,
        'showpublisher': 1,
        'showcriticscore': 1,
        'showreleasedate': 1,
        'showlastupdate': 1,
        'showconsole': 1}

    try:
        response = requests.get(SEARCH_URL, params=params, headers=HEADERS)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        body_div = soup.find('div', {'id': 'generalBody'})
        table = body_div.find('table') if body_div else None
        if not table:
            return pd.DataFrame()

        games = []
        for row in table.find_all('tr')[1:]:
            cells = row.find_all('td')
            if len(cells) < 11:
                continue

            game_link = cells[2].find('a')
            href = game_link['href'] if game_link else ''
            full_url = href if href.startswith('http') else BASE_URL + href

            game_id = re.search(r'/game/(\d+)/', full_url)
            console_img = cells[3].find('img')
            console = console_img['src'].split('/')[-1].replace('_b.png', '')

            games.append({
                'game_id': game_id.group(1) if game_id else None,
                'title': cells[2].text.strip(),
                'console': console,
                'publisher': cells[4].text.strip(),
                'critic_score': cells[6].text.strip(),
                'total_shipped': cells[8].text.strip(),
                'release_date': cells[9].text.strip(),
                'last_update': cells[10].text.strip(),
                'game_url': full_url})

        return pd.DataFrame(games)

    except Exception as e:
        print(f"Error scraping main listing: {e}")
        return pd.DataFrame()

In [4]:
# Run and inspect
pkmn_games_df = get_game_list()
pkmn_games_df.head()

Unnamed: 0,game_id,title,console,publisher,critic_score,total_shipped,release_date,last_update,game_url
0,226034,Pokemon,Series,Nintendo,,485.31m,28th Sep 98,03rd Feb 20,https://www.vgchartz.com/game/226034/pokemon/?...
1,181292,Pokémon: Ultra Sun and Ultra Moon,3DS,Nintendo,8.1,9.23m,17th Nov 17,03rd Aug 18,https://www.vgchartz.com/game/181292/pokemon-u...
2,4030,Pokémon Red / Green / Blue Version,GB,Nintendo,9.4,31.38m,30th Sep 98,,https://www.vgchartz.com/game/4030/pokemon-red...
3,71449,Pokémon X/Y Read the review,3DS,Nintendo,8.9,16.76m,12th Oct 13,06th Jan 18,https://www.vgchartz.com/game/71449/pokemon-xy...
4,81906,Pokémon Omega Ruby/Pokémon Alpha Sapphire,3DS,Nintendo,,14.63m,21st Nov 14,03rd Aug 18,https://www.vgchartz.com/game/81906/pokemon-om...


In [5]:
# Scrapes detailed sales data from individual game page
def fetch_sales_data(game_url):
    try:
        sales_url = game_url.rstrip('/') + "/sales"
        res = requests.get(sales_url, headers=HEADERS)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')
    