In [None]:
from __future__ import annotations

import cchardet  # Improve parsing speed
import lxml  # Replace bs4 default parser
import pickle
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter, Retry

In [None]:
# Base URL of the Metacritic site; the sitemap request URLs in this notebook are built from it.
ROOT = "https://www.metacritic.com"


class Game:
    """A game record holding an identifier, a title and two optional scores.

    Instances convert to and from a plain 4-tuple through `dump` / `build`,
    which lets a collection be serialised without pickling the class itself.
    """

    def __init__(
        self,
        url: str,
        title: str = "",
        metascore: int | None = None,
        userscore: int | None = None,
    ) -> None:
        """Store the given values unchanged.

        Args:
            url: Identifier of the game's page.
            title: Display title; defaults to an empty string.
            metascore: Critic score, or None when not known.
            userscore: User score, or None when not known.
        """
        self.url = url
        self.title = title
        self.metascore = metascore
        self.userscore = userscore

    def dump(self) -> tuple[str, str, int | None, int | None]:
        """Return the attributes as a ``(url, title, metascore, userscore)`` tuple."""
        return (self.url, self.title, self.metascore, self.userscore)

    @classmethod
    def build(cls, attributes: tuple[str, str, int | None, int | None]) -> Game:
        """Recreate an instance from a tuple produced by `dump`.

        Implemented as a classmethod so that calling it on a subclass yields
        an instance of that subclass; ``Game.build(...)`` behaves as before.
        """
        return cls(*attributes)

    def __repr__(self) -> str:
        # Every field goes through !r so the output is uniform across fields.
        return (
            f"{type(self).__name__}(url={self.url!r}, title={self.title!r}, "
            f"metascore={self.metascore!r}, userscore={self.userscore!r})"
        )

    def __str__(self) -> str:
        """Return ``"<title> | <metascore> | <userscore>"`` with a bounded title.

        Titles longer than TRUNCATE_LIMIT characters are shortened to exactly
        TRUNCATE_LIMIT characters, the last three of which are "...".
        """
        TRUNCATE_LIMIT = 13
        # `<=` rather than `<`: a title of exactly TRUNCATE_LIMIT characters
        # already fits, so shortening it would only lose information.
        if len(self.title) <= TRUNCATE_LIMIT:
            title_str = self.title
        else:
            title_str = f"{self.title[:TRUNCATE_LIMIT - 3]}..."
        return f"{title_str} | {self.metascore} | {self.userscore}"

In [None]:
# Shared HTTP session used for every sitemap request in this notebook.
s = requests.Session()
s.headers = {"User-Agent": "Edge"}
# Retry policy: up to 5 retries, only for HTTP 429 responses, with exponential
# backoff plus jitter; the server's Retry-After header is ignored.
retries = Retry(
    total=5,
    backoff_factor=0.5,
    backoff_jitter=0.5,
    status_forcelist=[429],
    respect_retry_after_header=False,
)
# ROOT is an https:// URL, so the retrying adapter has to be mounted for that
# scheme too; mounted on "http://" alone it is never selected for these
# requests and the retry policy above would have no effect.
for scheme_prefix in ("http://", "https://"):
    s.mount(scheme_prefix, HTTPAdapter(max_retries=retries))
games_sitemapindex = s.get(f"{ROOT}/games.xml")

In [None]:
# The sitemap index lists the numbered game sitemaps; take its last <sitemap>
# entry to learn how many there are.
index_soup = BeautifulSoup(games_sitemapindex.content, "xml")
last_sitemap = index_soup.select_one("sitemapindex sitemap:last-child")
if last_sitemap is None:
    raise Exception("No game sitemaps found.")

# Drop the final four characters of the entry's text and keep what follows the
# last "/"; that remainder is parsed as the highest sitemap number.
last_sitemap_number = last_sitemap.get_text()[:-4].rpartition("/")[-1]
sitemap_lim = 1 + int(last_sitemap_number)

# Fetch sitemaps 1 .. sitemap_lim - 1 and collect the text of every <loc> tag.
url_lst: list[str] = []
for i in range(1, sitemap_lim):
    sitemap = s.get(f"{ROOT}/games/{i}.xml")
    urls = BeautifulSoup(sitemap.content, "xml").find_all("loc")
    url_lst.extend([loc.get_text() for loc in urls])

# All requests are done; release the session's connections.
s.close()

In [None]:
# Build one Game per distinct sitemap URL, identified by the last path segment.
# - rstrip("/") removes a trailing slash only when one is present, so a URL
#   without it no longer loses the final character of its last segment.
# - sorted() fixes the iteration order; a bare set of strings is ordered
#   differently on each kernel start, which made the collection (and the pickle
#   written from it) differ between otherwise identical runs.
GameCollection: list[Game] = [
    Game(url.rstrip("/").rpartition("/")[-1]) for url in sorted(set(url_lst))
]

In [None]:
# Persist the collection as a list of plain tuples (one Game.dump() per game)
# rather than pickling Game instances directly.
game_rows = [game.dump() for game in GameCollection]
with open("games.pickle", "wb") as file:
    pickle.dump(game_rows, file)

In [None]:
# Reload the stored tuples and turn each one back into a Game.
# NOTE: pickle.load can execute arbitrary code; only read a games.pickle that
# was written by this notebook or comes from a trusted source.
with open("games.pickle", "rb") as file:
    stored_rows = pickle.load(file)
GameCollection = [Game.build(attributes) for attributes in stored_rows]

In [None]:
# Show the collection as the cell output; each entry is rendered via Game.__repr__.
GameCollection