##### Notes :

- Script dispo pour scraper sans cibler : https://github.com/patiegm/vgchartzPythonScraper
- Objectif : récupérer les jeux présents dans steam-dataset pour enrichir avec données de vente

In [67]:
url = "https://www.vgchartz.com/games/games.php?name={}&"

filters = {
    'keyword': '',
    'console': '',
    'region': 'All',
    'developer': '',
    'publisher': '',
    'goty_year': '',
    'genre': '',
    'boxart': 'Both',
    'banner': 'Both',
    'ownership': 'Both',
    'showmultiplat': 'No',
    'results': '50',
    'order': 'Sales',
    'showtotalsales': '1',
    'showpublisher': '1',
    'showvgchartzscore': '1',
    'shownasales': '1',
    'showdeveloper': '1',
    'showcriticscore': '1',
    'showpalsales': '1',
    'showreleasedate': '1',
    'showuserscore': '1',
    'showjapansales': '1',
    'showlastupdate': '1',
    'showothersales': '1',
    'showshipped': '1'
}

filters = "&".join(['='.join(x) for x in filters.items()])

In [92]:
import requests

name = "World of Warcraft"
r = requests.get(url.format(name) + filters)
r

<Response [200]>

In [131]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(r.text)

# rows = soup.find_all(lambda tag: tag.name == 'tr' and 'style' in tag.attrs)
tables = soup.find_all('table')

# Keep only data rows
rows = tables[-2].find_all('tr')[3:]

In [198]:
columns = [
    "Pos",
    "Game",
    "Console",
    "Publisher",
    "Developer",
    "VGChartzScore",
    "CriticScore",
    "UserScore",
    "TotalShipped",
    "TotalSales",
    "NASales",
    "PALSales",
    "JapanSales",
    "OtherSales",
    "ReleaseDate",
    "LastUpdate"
]

print('N cols :', len(columns))
print()

game_data = rows[1]
game_data = game_data.find_all(lambda tag : tag.name == 'td')

# Ignore game picture at index 1
game_data.pop(1)

res = {}
for i, e in enumerate(game_data):    
    if not 'img' in str(game_data[i]):
        res[columns[i]] = game_data[i].text.strip()
    else:
        res[columns[i]] = game_data[i].find('img').attrs['alt']

res

N cols : 16



{'Pos': '2',
 'Game': 'World of Warcraft',
 'Console': 'PC',
 'Publisher': 'Blizzard Entertainment',
 'Developer': 'Blizzard Entertainment',
 'VGChartzScore': 'N/A',
 'CriticScore': '9.2',
 'UserScore': '8.0',
 'TotalShipped': '12.00m',
 'TotalSales': 'N/A',
 'NASales': 'N/A',
 'PALSales': 'N/A',
 'JapanSales': 'N/A',
 'OtherSales': 'N/A',
 'ReleaseDate': '23rd Nov 04',
 'LastUpdate': '22nd Apr 18'}

- Format rules:
    - "N/A" => None
    - "12.00m" => float(12.00)
    - CriticScore / UserScore => cast(float)
    - Pos => cast(int)
    - ReleaseDate / LastUpdate => datetime.parse

In [203]:
import dateutil

def parse_value(column, value):
    
    # Cast missing values
    if value == "N/A":
        return None
    
    # Filter by column
    if column == 'ReleaseDate' or column == 'LastUpdate':
        return dateutil.parser.parse(value)
    elif column == "Pos":
        return int(value)
    elif column == "CriticScore" or column == "UserScore":
        return float(value)
    elif "Sales" in column or 'Shipped' in column:
        return float(value.replace('m', ''))
    
    # If neither missing or column doesn't need processing
    else:
        return value

for k, v in res.items():
    print(parse_value(k, v))

2
World of Warcraft
PC
Blizzard Entertainment
Blizzard Entertainment
None
9.2
8.0
12.0
None
None
None
None
None
2004-11-23 00:00:00
2018-04-22 00:00:00


In [11]:
import requests
import dateutil
from bs4 import BeautifulSoup

class VGChartzScraper:
    
    def __init__(self):
        
        self.url = "https://www.vgchartz.com/games/games.php?name={}&"

        self.filters = {
            'keyword': '',
            'console': '',
            'region': 'All',
            'developer': '',
            'publisher': '',
            'goty_year': '',
            'genre': '',
            'boxart': 'Both',
            'banner': 'Both',
            'ownership': 'Both',
            'showmultiplat': 'No',
            'results': '200',
            'order': 'Sales',
            'showtotalsales': '1',
            'showpublisher': '1',
            'showvgchartzscore': '1',
            'shownasales': '1',
            'showdeveloper': '1',
            'showcriticscore': '1',
            'showpalsales': '1',
            'showreleasedate': '1',
            'showuserscore': '1',
            'showjapansales': '1',
            'showlastupdate': '1',
            'showothersales': '1',
            'showshipped': '1'
        }

        self.filters = "&".join(['='.join(x) for x in self.filters.items()])
        
        self.columns = [
            "Pos",
            "Game",
            "Console",
            "Publisher",
            "Developer",
            "VGChartzScore",
            "CriticScore",
            "UserScore",
            "TotalShipped",
            "TotalSales",
            "NASales",
            "PALSales",
            "JapanSales",
            "OtherSales",
            "ReleaseDate",
            "LastUpdate"
        ]
        
        
    def search_game(self, name):
        r = requests.get(self.url.format(name) + self.filters)
        soup = BeautifulSoup(r.text)

        # rows = soup.find_all(lambda tag: tag.name == 'tr' and 'style' in tag.attrs)
        tables = soup.find_all('table')

        # Keep only data rows
        rows = tables[-2].find_all('tr')[3:]
        
        return rows
    
    def parse_row(self, row):
        game_data = row.find_all(lambda tag : tag.name == 'td')

        # Ignore game picture at index 1
        game_data.pop(1)

        res = {}
        for i, e in enumerate(game_data):    
            if not 'img' in str(game_data[i]):
                res[self.columns[i]] = game_data[i].text.strip()
            else:
                res[self.columns[i]] = game_data[i].find('img').attrs['alt']

        return res
    
    def parse_value(self, column, value):
            
        # Cast missing values
        if value == "N/A":
            return None

        # Filter by column
        if column == 'ReleaseDate' or column == 'LastUpdate':
            return dateutil.parser.parse(value)
        elif column == "Pos":
            return int(value)
        elif column == "CriticScore" or column == "UserScore":
            return float(value)
        elif "Sales" in column or 'Shipped' in column:
            return float(value.replace('m', ''))
        elif column == 'Game':
            return value.replace('    Read the review', '')

        # If neither missing or column doesn't need processing
        else:
            return value
        
    def extract_game_data(self, name):
        
        game_data = []
        rows = self.search_game(name)
        for row in rows:
            res = self.parse_row(row)
            for k, v in res.items():
                res[k] = self.parse_value(k, v)
            game_data.append(res)
        return game_data
        
        
scraper = VGChartzScraper()
game_data = scraper.extract_game_data('Animal Crossing')
game_data

[{'Pos': 1,
  'Game': 'Animal Crossing',
  'Console': 'Series',
  'Publisher': 'Nintendo',
  'Developer': 'Nintendo',
  'VGChartzScore': None,
  'CriticScore': None,
  'UserScore': None,
  'TotalShipped': 60.67,
  'TotalSales': None,
  'NASales': None,
  'PALSales': None,
  'JapanSales': None,
  'OtherSales': None,
  'ReleaseDate': datetime.datetime(2002, 9, 16, 0, 0),
  'LastUpdate': datetime.datetime(2020, 2, 20, 0, 0)},
 {'Pos': 2,
  'Game': 'Animal Crossing: New Horizons',
  'Console': 'NS',
  'Publisher': 'Nintendo',
  'Developer': 'Nintendo',
  'VGChartzScore': '8.0',
  'CriticScore': None,
  'UserScore': None,
  'TotalShipped': 26.04,
  'TotalSales': None,
  'NASales': None,
  'PALSales': None,
  'JapanSales': None,
  'OtherSales': None,
  'ReleaseDate': datetime.datetime(2020, 3, 20, 0, 0),
  'LastUpdate': datetime.datetime(2020, 4, 11, 0, 0)},
 {'Pos': 3,
  'Game': 'Animal Crossing: New Leaf',
  'Console': '3DS',
  'Publisher': 'Nintendo',
  'Developer': 'Nintendo EAD',
  'VGC