In [1]:
# Imports used for the code
import json
import time as t

import pandas as pd
import requests as req
from bs4 import BeautifulSoup as bs


In [3]:
# Genres to scrape. The scraping goes genre by genre, which is faster than
# extracting the genre for each individual game.
list_of_genres = [
    "Music", "Action-Adventure", "Adventure", "Education", "Fighting", "Misc",
    "MMO", "Party", "Platform", "Puzzle", "Racing", "Shooter",
    "Sandbox", "Sports", "Strategy", "Simulation", "Role-Playing", "Action",
]

In [4]:
def save_data(data, json_name):
    """
    Append one record to the database json file and rewrite the file.

    The file is expected to hold a single JSON list. When the file does not
    exist yet, or exists but is blank, the database starts out as an empty
    list instead of raising, so the very first call of a fresh run works.

    gets:
        data - the data which will be added to the json file
        json_name - the name of the file to which the data will be saved.

    returns:
        the len of the database after the append, for viewing the progress
        of the script.

    raises:
        json.JSONDecodeError - when the file has content that is not valid
        JSON; the file is left untouched in that case.
    """
    try:
        with open(json_name, 'r', encoding='utf-8') as data_json:
            raw_text = data_json.read()
    except FileNotFoundError:
        # Fresh run: there is nothing to load yet.
        raw_text = ''
    # A blank file is treated like a missing one; anything else must parse.
    json_data = json.loads(raw_text) if raw_text.strip() else []
    # Appending to the existing data
    json_data.append(data)
    # Writing the whole list back to the file
    with open(json_name, 'w', encoding='utf-8') as new_update:
        json.dump(json_data, new_update, indent=4)
    return len(json_data)


In [5]:
def load_page(genre, page_n, retry_delay=15, timeout=30):
    """ This function will try to load the page.
    If the page loads, it returns the raw HTML content; otherwise it waits
    and retries until it gets a good response.

    gets:
        genre - the genre which we scrape
        page_n - the page number
        (Both of these are sent as query parameters of the request below).
        retry_delay - seconds to wait between attempts (default 15)
        timeout - seconds to wait for the server before giving up on one
                  attempt (default 30)
    returns:
        the content (bytes) of the first response that has status 200 and
        does not contain the text "503 Service Unavailable".

    Note: there is no upper bound on the number of attempts, so this call
    blocks for as long as the site keeps failing.
    """
    url = "https://www.vgchartz.com/games/games.php"
    # The query is passed as a dict instead of being spliced into the URL:
    # a line-continued string literal keeps the whitespace before each
    # backslash, which put stray spaces into several parameters.
    params = {
        'page': page_n,
        'results': 200,
        'genre': genre,
        'order': 'Sales',
        'ownership': 'Both',
        'direction': 'DESC',
        'showtotalsales': 1,
        'shownasales': 1,
        'showpalsales': 1,
        'showjapansales': 1,
        'showothersales': 1,
        'showpublisher': 0,
        'showdeveloper': 1,
        'showreleasedate': 1,
        'showlastupdate': 0,
        'showvgchartzscore': 0,
        'showcriticscore': 0,
        'showuserscore': 0,
        'showshipped': 1,
    }
    # Retry in a loop rather than by recursion so a long outage cannot
    # exhaust the call stack.
    while True:
        try:
            response = req.get(url, params=params, timeout=timeout)
        except req.exceptions.RequestException as error:
            # Network-level failures (timeouts, dropped connections) are
            # retried the same way as bad HTTP responses.
            print(
                f'{error!r} - Could not connect...Reconnecting in {retry_delay} seconds...')
        else:
            # The site can answer 200 while the body is a 503 error page.
            if response.status_code == 200 and b"503 Service Unavailable" not in response.content:
                print(f'{response.status_code} | Connected!')
                return response.content
            print(
                f'{response.status_code} - Could not connect...Reconnecting in {retry_delay} seconds...')
        t.sleep(retry_delay)

In [7]:
def extract_items(genre, html_object):
    """Extract the games listed on one results page and save each of them.

    The page content comes from the page-loading function. Every table row
    found inside the 'generalBody' div is turned into a record and appended
    to "database.json" through save_data, one record at a time.

    gets:
        genre - stored in the 'genre' field of every record.
        html_object - the HTML content of the page.

    returns:
        True when the page had more than 5 table rows and all of them were
        saved; False when it had 5 rows or fewer, or when anything went
        wrong while parsing or saving (the error is printed).

    The threshold of 5 accounts for the header and a few irrelevant rows on
    the page: a page with that few rows holds no relevant data for the
    genre, which signals that the genre is finished.
    """
    try:
        amount = 0
        soup = bs(html_object, "html.parser")
        table_of_games = soup.find('div', id='generalBody').find_all('tr')
        if len(table_of_games) <= 5:
            return False
        # The first three rows and the last one are skipped (presumably
        # header/footer rows - verify against the live page layout).
        for game in table_of_games[3:-1]:
            row = game.find_all('td')
            game_record = {'name': row[2].text,
                           'developer': row[4].text,
                           # the platform is only available as the icon's alt text
                           'platform': row[3].find('img')['alt'],
                           'genre': genre,
                           'total_shipments': row[5].text,
                           'total_sales': row[6].text,
                           'na_sales': row[7].text,
                           'pal_sales': row[8].text,
                           'japan_sales': row[9].text,
                           'other_sales': row[10].text,
                           'release': row[11].text
                           }
            amount = save_data(game_record, "database.json")
        print(f'Collected: {amount}')
        return True
    except Exception as e:
        # Still reported as "no data" so the caller moves on, but the cause
        # is shown instead of being discarded. Records saved before the
        # failure stay in the database.
        print(f'Could not extract items for genre {genre}: {e!r}')
        return False

In [None]:
# After the script is done, we can save the data using this function:
def convert_from_json_to_csv(json_file, output_name):
    """Convert the JSON database to CSV and write a cleaned copy.

    Two files are written: "<output_name>.csv" with all records, and
    "<output_name>_clean.csv" without the rows that have fewer than two
    non-missing values.

    gets:
        json_file - path of the JSON file to read.
        output_name - base name of the output files, WITHOUT the ".csv"
                      extension (it is appended here).
    returns:
        None
    """
    raw_csv_name = f"{output_name}.csv"
    pd.read_json(json_file).to_csv(raw_csv_name)
    # to_csv has finished writing when it returns, so no pause is needed
    # before reading the file back.
    # The first file is written with its index, which comes back as an
    # ordinary column and so counts as one of the two required values.
    # NOTE(review): the CSV round trip presumably exists so that placeholder
    # strings are parsed as missing values - confirm.
    # dropna returns a new frame; its result has to be kept to take effect.
    cleaned = pd.read_csv(raw_csv_name).dropna(thresh=2, axis=0)
    cleaned.to_csv(f'{output_name}_clean.csv')

In [None]:
# Main code: goes through every genre page by page, loading each page and
# saving its games, and moves on to the next genre as soon as a page yields
# nothing. When all genres are done the JSON database is converted to CSV.

# The saving step reads the database before appending to it, so make sure the
# file exists. Mode 'x' only creates it when missing; existing data is kept.
try:
    with open('database.json', 'x') as fresh_database:
        json.dump([], fresh_database)
except FileExistsError:
    pass

data = []
counter = 0
games_counter = 0
for genre in list_of_genres:
    # NOTE(review): pages are requested starting from 0 - confirm the site
    # does not serve the same results for page 0 and page 1.
    for page in range(45):
        counter += 1
        data = load_page(genre, page)
        status = extract_items(genre, data)
        if not status:
            break
        else:
            games_counter += 200
            # NOTE(review): the total shown assumes 14 pages per genre while
            # up to 45 are requested above - confirm which is intended.
            print(
                f'TOTAL: {counter}/{14*len(list_of_genres)}/ Genre:{genre}')
# The converter appends ".csv" itself, so it gets the base name only
# (passing 'database.csv' produced 'database.csv.csv').
convert_from_json_to_csv('database.json', 'database')

200 | Connected!
