In [1]:
import sqlite3
import urllib
import urllib.request
from http.client import IncompleteRead

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup, element

In [2]:
class VideoGameScraper:
    """Scrape the VGChartz game listing plus each game's genre.

    The URL of a listing page is built as ``url1 + str(page) + url2``. Every
    game link found on a listing page produces one record; the genre is read
    from the game's own detail page, so one extra HTTP request is made per game.

    Parameters
    ----------
    url1 : str
        Part of the listing URL placed before the page number.
    url2 : str
        Part of the listing URL placed after the page number.
    start_page, end_page : int
        First and last listing page to scrape (both inclusive).

    Attributes
    ----------
    session : requests.Session
        Session used for the listing-page requests.
    failed_game_links : list of str
        Detail-page URLs whose genre could not be retrieved. The list lives on
        the instance, so it keeps growing across repeated ``scrape()`` calls.
    """

    # Seconds to wait on any single HTTP request before giving up.
    REQUEST_TIMEOUT = 30
    # Number of times a game's detail page is requested before giving up.
    MAX_GENRE_ATTEMPTS = 3

    _GAME_LINK_PREFIX = 'https://www.vgchartz.com/game/'
    # Output column order of the DataFrame returned by scrape().
    _COLUMNS = (
        "videogame", "platform", "editor", "developer",
        "sales_na", "sales_eu", "sales_jp", "sales_otras", "sales_tot",
        "release", "genre",
    )

    def __init__(self, url1, url2, start_page, end_page):
        self.url1 = url1
        self.url2 = url2
        self.start_page = start_page
        self.end_page = end_page
        self.session = requests.Session()
        self.failed_game_links = []  # detail-page URLs whose genre lookup failed

    @staticmethod
    def _text(node):
        """Return ``node.string`` as a plain ``str`` (or None when absent).

        Converting to ``str`` avoids storing NavigableString objects, which
        keep a reference to the whole parse tree they came from.
        """
        value = node.string
        return str(value) if value is not None else None

    @staticmethod
    def _parse_sales(cell_text):
        """Convert a sales cell to float; missing, empty or "N/A" gives NaN."""
        text = (cell_text or "").strip()
        if not text or text.startswith("N/A"):
            return np.nan
        # The last character is a suffix, not part of the number
        # (presumably a unit such as "m" -- TODO confirm against the site).
        return float(text[:-1])

    def _parse_row(self, tag):
        """Build one record (every column except ``genre``) from a game link."""
        # The link text carries four trailing characters that are not part of
        # the title (presumably padding -- TODO confirm against the site).
        videogame_name = tag.contents[0][:-4]
        # Cells of the table row that contains the link.
        data = tag.parent.parent.find_all("td")
        release_text = self._text(data[11])
        return {
            "videogame": videogame_name,
            "platform": data[3].find("img").attrs["alt"],
            "editor": self._text(data[4]),
            "developer": self._text(data[5]),
            "sales_na": self._parse_sales(data[7].string),
            "sales_eu": self._parse_sales(data[8].string),
            "sales_jp": self._parse_sales(data[9].string),
            "sales_otras": self._parse_sales(data[10].string),
            "sales_tot": self._parse_sales(data[6].string),
            "release": release_text.strip() if release_text else None,
        }

    @staticmethod
    def _extract_genre(genre_soup):
        """Return the text following the "Genre" heading, or None if absent."""
        info_box = genre_soup.find("div", {"id": "gameGenInfoBox"})
        if info_box is None:
            return None
        genre_heading = None
        for h2 in info_box.find_all('h2'):
            if h2.string == 'Genre':
                genre_heading = h2  # the last matching heading wins
        if genre_heading is None or genre_heading.next_sibling is None:
            return None
        value = genre_heading.next_sibling.string
        return str(value) if value is not None else None

    def _fetch_genre(self, genre_url, videogame_name):
        """Return the genre shown on a game's detail page, or None on failure.

        Read errors are retried up to ``MAX_GENRE_ATTEMPTS`` times. When the
        genre cannot be obtained, ``genre_url`` is appended to
        ``self.failed_game_links`` and None is returned, so the caller can
        still emit a complete record.
        """
        for attempts in range(1, self.MAX_GENRE_ATTEMPTS + 1):
            try:
                # The context manager closes the HTTP response in every case.
                with urllib.request.urlopen(genre_url, timeout=self.REQUEST_TIMEOUT) as response:
                    genre_page = response.read()
                genre = self._extract_genre(BeautifulSoup(genre_page, "html.parser"))
            except (IncompleteRead, ValueError, OSError) as e:
                # OSError also covers urllib's URLError/HTTPError and timeouts.
                print(f"Errore durante la lettura della risposta HTTP: {e}. Riprovo ({attempts}/{self.MAX_GENRE_ATTEMPTS})")
                continue
            if genre is not None:
                return genre
            break  # page was read but has no genre: retrying cannot help

        print(f"Impossibile recuperare i dati per il gioco: {videogame_name}")
        self.failed_game_links.append(genre_url)
        return None

    def scrape(self):
        """Scrape every configured listing page.

        Returns
        -------
        tuple
            ``(df, failed_game_links)``: ``df`` is a DataFrame with one row
            per game and the columns videogame, platform, editor, developer,
            sales_na, sales_eu, sales_jp, sales_otras, sales_tot, release and
            genre; ``failed_game_links`` is ``self.failed_game_links``.

        Notes
        -----
        Scraping is best-effort: a listing page that cannot be loaded, or a
        row that cannot be parsed, is reported with ``print`` and skipped.
        Rows are collected as whole records, so the columns always have equal
        length; a game whose genre lookup fails is kept with ``genre`` None.
        """
        records = []

        for page in range(self.start_page, self.end_page + 1):
            try:
                surl = self.url1 + str(page) + self.url2
                response = self.session.get(surl, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, "html.parser")
                videogame_tag = [
                    link for link in soup.find_all("a")
                    if link.attrs.get('href', '').startswith(self._GAME_LINK_PREFIX)
                ]
            except Exception as e:  # deliberate best-effort: skip this page
                print(f"Errore durante lo scraping: {e}")
                continue

            for tag in videogame_tag:
                try:
                    record = self._parse_row(tag)
                    print("Videogame Name:", record["videogame"])
                    record["genre"] = self._fetch_genre(tag.attrs['href'], record["videogame"])
                except Exception as e:  # deliberate best-effort: skip this row
                    print(f"Errore durante lo scraping: {e}")
                    continue
                records.append(record)

        # Passing the column list keeps the layout stable even with no rows.
        df = pd.DataFrame(records, columns=list(self._COLUMNS))

        return df, self.failed_game_links

# Listing URL pieces: the scraper builds each page URL as url1 + <page> + url2.
# They are defined at top level (no ``__main__`` guard) because the
# ``scraper`` object below is created unconditionally and needs them.
url1 = "https://www.vgchartz.com/games/games.php?page="
url2 = (
    '&results=1000&order=Sales&ownership=Both&direction=DESC&showtotalsales=1'
    '&shownasales=1&showpalsales=1&showjapansales=1&showothersales=1&showpublisher=1&showdeveloper=1'
    '&showreleasedate=1&showlastupdate=0&showvgchartzscore=0&showcriticscore=0&showuserscore=0&showshipped=0'
)

# 65 pages if 1000 entries are displayed in each page.
scraper = VideoGameScraper(url1, url2, start_page=1, end_page=65)

In [None]:
# Run the full scrape (one request per listing page plus one per game).
# NOTE: scrape() returns a tuple (DataFrame, failed_game_links), so despite
# its name `df` holds that tuple, not a DataFrame.
df = scraper.scrape()

In [None]:
# `df` is the (table, failed_links) pair returned by scraper.scrape();
# its first item is the scraped table.
df_data = df[0]

# Wrap the table in a DataFrame of its own and show a plain-text preview.
new_df = pd.DataFrame(data=df_data)
print(new_df)

In [None]:
# Bare last expression of the cell: the notebook shows the resulting
# DataFrame as the cell output.
pd.DataFrame(new_df)

In [None]:
# Save the scraped table to 'vgchartz_sales.csv' (relative path), leaving the
# DataFrame index out of the file (index=False).
new_df.to_csv('vgchartz_sales.csv', index=False)