In [None]:
from bs4 import BeautifulSoup, SoupStrainer
import cchardet
import lxml
import numpy as np
import pandas as pd
import requests
from requests.adapters import HTTPAdapter, Retry

In [None]:
def scrape_game_info(session, data_dict: dict[str, list], pg_num: int = 0,
                     timeout: float = 30.0) -> list[str]:
    """
    Scrape one page of the Metacritic "games by Metascore" browse listing.

    For every game cell on the page, the title, platform, Metascore and
    user score texts are appended to the matching lists in ``data_dict``.

    Parameters
    ----------
    session
        Object exposing ``get(url, timeout=...)`` whose response has a
        ``content`` attribute (e.g. a ``requests.Session``).
    data_dict
        Mapping with list values under the keys "Title", "Platform",
        "Metascore" and "Userscore"; it is mutated in place.
    pg_num
        Zero-based index of the listing page to fetch.
    timeout
        Seconds to wait for the server before giving up on the request.

    Returns
    -------
    list[str]
        Absolute URL of each game recorded from this page, in page order.
        The list is empty when the page contains no game cells.
    """
    ROOT = "https://www.metacritic.com"
    # Column name -> CSS selector, evaluated inside a single game cell.
    FIELD_SELECTORS = {
        "Title": "a.title",
        "Platform": ".platform span.data",
        "Metascore": ".clamp-metascore div",
        "Userscore": ".clamp-userscore div",
    }
    pg_url = f"{ROOT}/browse/games/score/metascore/all?page={pg_num}"
    all_pg = session.get(pg_url, timeout=timeout)
    # Only build the parse tree for the game cells; the rest of the page is
    # not needed.
    game_cells = SoupStrainer("td", class_="clamp-summary-wrap")
    games_html = BeautifulSoup(all_pg.content, "lxml", parse_only=game_cells)

    game_urls = []
    # Work cell by cell instead of bucketing one flat match list by position:
    # a cell that lacks one element can then no longer shift every later
    # value into the wrong column.
    for cell in games_html.find_all("td", class_="clamp-summary-wrap"):
        fields = {
            column: cell.select_one(selector)
            for column, selector in FIELD_SELECTORS.items()
        }
        title_link = fields["Title"]
        href = title_link.get("href") if title_link is not None else None
        if href is None:
            # Without a link there is no detail page to scrape; skip the cell
            # so the returned URLs stay one-to-one with the appended rows.
            continue
        for column, tag in fields.items():
            value = tag.get_text(strip=True) if tag is not None else None
            data_dict[column].append(value)
        game_urls.append(ROOT + href)
    return game_urls

In [None]:
def scrape_genres_and_date(session, data_dict: dict[str, list], game_url: str,
                           timeout: float = 30.0) -> None:
    """
    Scrape the genres and release date from a single game's page.

    One entry is appended to ``data_dict["Genres"]`` (a list of genre
    strings, possibly empty) and one to ``data_dict["Release Date"]``
    (the release text, or ``None`` when the page has no release element).

    Parameters
    ----------
    session
        Object exposing ``get(url, timeout=...)`` whose response has a
        ``content`` attribute (e.g. a ``requests.Session``).
    data_dict
        Mapping with list values under the keys "Genres" and
        "Release Date"; it is mutated in place.
    game_url
        Absolute URL of the game's page.
    timeout
        Seconds to wait for the server before giving up on the request.
    """
    game_pg = session.get(game_url, timeout=timeout)
    # Restrict parsing to the <div class="left"> part of the page.
    body = SoupStrainer("div", class_="left")
    game_html = BeautifulSoup(game_pg.content, "lxml", parse_only=body)
    genre_info = game_html.select("li.summary_detail.product_genre .data")
    release = game_html.select_one("li.summary_detail.release_data .data")

    genre_lst = [genre.get_text() for genre in genre_info]
    # select_one returns None when nothing matches; record a missing value
    # instead of raising AttributeError.
    release_date = release.get_text() if release is not None else None

    # Append both columns only after both values are known, so the two lists
    # always grow together.
    data_dict["Genres"].append(genre_lst)
    data_dict["Release Date"].append(release_date)

In [None]:
def scrape_critic_scores(session, data_dict: dict[str, list], game_url: str,
                         timeout: float = 30.0) -> None:
    """
    Scrape the per-critic scores from a game's "/critic-reviews" page.

    Exactly one dict mapping critic (source) name to integer score is
    appended to ``data_dict["Critics"]``. The dict is empty when the page
    has no critic review list. A review whose grade is not an integer is
    left out, and when the same source name occurs more than once the last
    score wins.

    Parameters
    ----------
    session
        Object exposing ``get(url, timeout=...)`` whose response has a
        ``content`` attribute (e.g. a ``requests.Session``).
    data_dict
        Mapping with a list value under the key "Critics"; it is mutated
        in place.
    game_url
        Absolute URL of the game's page; the review suffix is appended here.
    timeout
        Seconds to wait for the server before giving up on the request.
    """
    SUFFIX = "/critic-reviews"
    SELECT_CRITERIA = "div.review_critic div.source, div.review_grade"
    critic_pg = session.get(game_url + SUFFIX, timeout=timeout)
    body = SoupStrainer("div", class_="body product_reviews")
    critic_html = BeautifulSoup(critic_pg.content, "lxml", parse_only=body)
    review_list = critic_html.select_one("ol.reviews.critic_reviews")

    score_dict = {}
    if review_list is not None:
        # The selector yields source and grade elements interleaved in
        # document order. Remember the most recent source and attach the next
        # grade to it, so an unmatched element cannot shift later pairs or
        # exhaust the iterator mid-pair.
        pending_critic = None
        for tag in review_list.select(SELECT_CRITERIA):
            if "review_grade" in tag.get("class", []):
                if pending_critic is None:
                    continue  # grade with no preceding source
                try:
                    score_dict[pending_critic] = int(tag.get_text())
                except ValueError:
                    pass  # grade text is not an integer; skip this review
                pending_critic = None
            else:
                pending_critic = tag.get_text()
    # Always append, even when empty, so "Critics" keeps one entry per game.
    data_dict["Critics"].append(score_dict)

In [None]:
# Column-oriented accumulator: each scraper function appends to these lists.
pg_num = 0
data_dict = {
    "Title": [],
    "Platform": [],
    "Release Date": [],
    "Metascore": [],
    "Userscore": [],
    "Genres": [],
    "Critics": [],
}
session = requests.Session()
# update() overrides only the User-Agent and keeps the session's other
# default headers, which plain assignment would discard.
session.headers.update({"User-Agent": "Edge"})
retries = Retry(total=5, backoff_factor=1)
adapter = HTTPAdapter(max_retries=retries)
# Every URL the scrapers request is https://, so the retry adapter has to be
# mounted for that scheme; mounting it for http:// alone never applied it.
session.mount("https://", adapter)
session.mount("http://", adapter)
# Walk the listing pages in order until one yields no games.
while True:
    game_urls = scrape_game_info(session, data_dict, pg_num)
    if not game_urls:
        break
    for url in game_urls:
        scrape_genres_and_date(session, data_dict, url)
        scrape_critic_scores(session, data_dict, url)
    pg_num += 1

In [None]:
# Assemble the scraped columns into one table (one row per game).
df = pd.DataFrame(data_dict)
# Drop repeated genres while keeping first-seen order. dict.fromkeys does this
# with the standard library and avoids calling pd.unique on a plain Python
# list, which recent pandas versions deprecate.
df["Genres"] = df["Genres"].apply(lambda genres: list(dict.fromkeys(genres)))
df

In [None]:
# Persist the scraped table as pipe-delimited text, without the index column.
df.to_csv(
    path_or_buf="raw.csv",
    index=False,
    sep="|",
)

In [None]:
# Load the saved pipe-delimited file back and display it as the cell output.
pd.read_csv(
    filepath_or_buffer="raw.csv",
    sep="|",
)