In [1]:
import math
import re
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup as BS

In [2]:
# Scraper configuration: the URL template, the genres to request, the output
# column labels, and the (initially empty) frame that receives the scraped rows.

# URL template for one listing page. The three "{}" slots are filled, in order,
# with the page number, the number of results per page, and the genre name.
baseURL = (
    "https://www.vgchartz.com/games/games.php"
    "?page={}&results={}&genre={}"
    "&order=ReleaseDate&ownership=Both&direction=DESC"
    "&showtotalsales=1&shownasales=1&showpalsales=1&showjapansales=1&showothersales=1"
    "&showpublisher=1&showdeveloper=1&showreleasedate=1"
    "&showlastupdate=0&showvgchartzscore=0&showcriticscore=0&showuserscore=0"
    "&showshipped=1"
)

# Genre names substituted into the URL template, one scrape per genre
genres = [
    "Action",
    "Action-Adventure",
    "Adventure",
    "Board Game",
    "Education",
    "Fighting",
    "Misc",
    "MMO",
    "Music",
    "Party",
    "Platform",
    "Puzzle",
    "Racing",
    "Role-Playing",
    "Sandbox",
    "Shooter",
    "Simulation",
    "Sports",
    "Strategy",
    "Visual Novel",
]

# Labels applied to the columns that are kept from each scraped table
column_names = [
    "Game",
    "Publisher",
    "Developer",
    "Total Shipped",
    "Total Sales",
    "NA Sales",
    "PAL Sales",
    "Japan Sales",
    "Other Sales",
    "Release Date",
    "Genre",
]

# Empty frame that the scraping cell fills with one row per game
game_data = pd.DataFrame()

In [3]:
# Scrape every listing page of every genre and assemble the rows into game_data.
RESULTS_PER_PAGE = 200     # rows requested per listing page
RESULTS_TABLE_INDEX = 6    # index of the results table in the list pd.read_html returns
REQUEST_TIMEOUT = 30       # seconds to wait before giving up on a request
REQUEST_DELAY = 1          # seconds to pause before each page request


def _fetch_results_table(url):
    """Download ``url`` and return its results table as a DataFrame.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page contains no <table> element.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail here with a clear message instead of later while parsing an error page
    response.raise_for_status()

    # NOTE(review): no parser is named, so BeautifulSoup chooses one itself;
    # RESULTS_TABLE_INDEX may depend on that choice - confirm before pinning a parser.
    soup = BS(response.text)
    outer_table = soup.find("table")
    if outer_table is None:
        raise ValueError(f"No <table> element found at {url}")

    # Passing a literal HTML string to read_html is deprecated since pandas 2.1;
    # wrapping it in StringIO works on old and new versions alike.
    return pd.read_html(StringIO(str(outer_table)))[RESULTS_TABLE_INDEX]


# Frames are collected in a list and concatenated once at the end: this avoids
# re-copying all rows on every page and makes re-running the cell idempotent.
page_frames = []

for genre in genres:
    # Encode spaces as "+" for the query string. The encoded form lives in its
    # own variable so that every page request uses it while the Genre column
    # keeps the readable name.
    genre_param = genre.replace(" ", "+")

    # A one-result request is enough to read the genre's total entry count.
    # Assumes the first column's top-level header holds that count - TODO confirm.
    count_table = _fetch_results_table(baseURL.format(1, 1, genre_param))
    header_text = count_table.columns[0][0].replace(",", "")
    genre_entries = int(re.findall(r'\d+', header_text)[0])
    number_of_pages = math.ceil(genre_entries / RESULTS_PER_PAGE)

    for page_number in range(1, number_of_pages + 1):
        # Short pause so the site is not hit with back-to-back requests
        time.sleep(REQUEST_DELAY)

        page_df = _fetch_results_table(
            baseURL.format(page_number, RESULTS_PER_PAGE, genre_param)
        )
        page_df['Genre'] = genre
        # Discard the columns at these positions, then label what remains
        page_df = page_df.drop(page_df.columns[[0, 1, 3, 13]], axis=1)
        page_df.columns = column_names
        page_frames.append(page_df)

# pd.concat raises on an empty list, so fall back to an empty, labelled frame
if page_frames:
    game_data = pd.concat(page_frames, ignore_index=True)
else:
    game_data = pd.DataFrame(columns=column_names)

In [4]:
# Save the scraped game_data table as a CSV file, omitting the row index
game_data.to_csv(
    path_or_buf='../data/game_data.csv',
    index=False,
)