In [1]:
from __future__ import annotations

import cchardet
import lxml
import os
import polars as pl
import requests

from bs4 import BeautifulSoup, SoupStrainer
from lxml import etree as et
from requests.adapters import HTTPAdapter, Retry

# Base URL of Metacritic game pages. Sitemap URLs are derived from it by
# appending "s/<page>.xml", and mock game URLs by appending "/<slug>/".
ROOT = "https://www.metacritic.com/game"

In [2]:
def extract_url_slugs(loc_elements: list[et._Element], url_slugs: set[str]) -> None:
    """Process a single sitemap's loc tags and update the set of URL slugs.

    The slug is the last path segment of each url, ignoring trailing
    slashes. Whitespace surrounding the url text (as found in pretty-printed
    sitemaps) is discarded, and loc tags that yield no slug (no text, blank
    text, or nothing but slashes) are skipped.

    Args:
        loc_elements: A list of loc tags containing url text.
        url_slugs: The set in which game url slugs are to be stored to.
            It is updated in place.
    """
    for loc in loc_elements:
        if (url := loc.text) is None:
            continue
        # Strip whitespace first; otherwise a trailing newline or indent
        # would shield the trailing slash from rstrip("/").
        url_slug = url.strip().rstrip("/").rpartition("/")[-1]
        if url_slug:
            url_slugs.add(url_slug)

In [3]:
def scrape_sitemaps(
    s: requests.Session,
    *,
    max_consecutive_failures: int = 10,
    timeout: float = 30.0,
) -> list[str]:
    """Retrieve URL slugs associated with all games from sitemaps.

    Colloquially, a slug is the unique identifying part of a web address,
    typically at the end of the URL. Each sitemap contains at most 100 urls.

    Sitemap pages are requested in ascending order starting from page 1.
    A page that does not respond with an OK status is reported and skipped.
    Scraping ends at the first page that parses but contains no loc tags, or
    once ``max_consecutive_failures`` pages in a row have failed, so that a
    site answering with errors past its last page cannot stall the loop
    forever.

    Any exception raised by ``s.get`` or by the XML parser is propagated.

    Args:
        s: Session object to fetch sitemaps over a persistent http connection.
        max_consecutive_failures: Number of failed pages in a row after which
            scraping stops. Values below 1 behave like 1.
        timeout: Seconds to wait for each sitemap request before giving up.

    Returns:
        A sorted list of url slugs corresponding to every game indexed in
        metacritic's sitemaps.
    """
    url_slugs: set[str] = set()
    consecutive_failures = 0
    page = 1
    while True:
        sitemap_response = s.get(f"{ROOT}s/{page}.xml", timeout=timeout)
        if not sitemap_response.ok:
            print(f"Sitemap page {page} failed to respond, skipped.")
            consecutive_failures += 1
            if consecutive_failures >= max_consecutive_failures:
                print(
                    f"Stopped after {consecutive_failures} consecutive "
                    f"failed sitemap pages."
                )
                break
            page += 1
            continue
        consecutive_failures = 0
        # Parse the raw bytes: lxml refuses str input that carries an XML
        # encoding declaration, which real sitemaps usually have.
        sitemap_tree = et.XML(sitemap_response.content)
        loc_elements = sitemap_tree.findall(".//{*}loc")
        if not loc_elements:
            # A sitemap without any urls marks the end of the index.
            break
        extract_url_slugs(loc_elements, url_slugs)
        page += 1

    # Sorted so the result does not depend on set iteration order.
    return sorted(url_slugs)

In [4]:
def scrape_games(s: requests.Session, url_slugs: list[str]) -> None:
    """Placeholder for the per-game scraping step; currently does nothing.

    Args:
        s: Session object; not used yet.
        url_slugs: Game url slugs; not used yet.
    """
    return None

In [7]:
def scraper(
    root_url: str,
    user_agent: str = "Edge",
    max_retries: int = 5,
    cache_path: str = "../data/games.tsv",
) -> None:
    """Collect game url slugs (scraped or cached) and hand them to scrape_games.

    A session is configured with the given user agent and a retry policy for
    HTTP 429 responses on https urls. If ``cache_path`` already exists the
    slugs are read from it; otherwise they are scraped from the sitemaps and
    written to ``cache_path`` as a single-column, tab-separated file.

    Args:
        root_url: Root url of the site to scrape. Currently unused:
            scrape_sitemaps builds its urls from the module-level ROOT.
        user_agent: Value sent in the User-Agent header of every request.
        max_retries: Total number of retries allowed per request.
        cache_path: Location of the tab-separated file caching the url slugs.
    """
    with requests.Session() as s:
        # Update instead of replacing the mapping, so the session keeps
        # requests' default headers (Accept-Encoding, Connection, ...).
        s.headers.update({"User-Agent": user_agent})
        retry_config = Retry(
            total=max_retries,
            backoff_factor=0.5,
            backoff_jitter=0.5,
            status_forcelist=[429],
            respect_retry_after_header=False,
        )
        s.mount("https://", HTTPAdapter(max_retries=retry_config))
        if os.path.isfile(cache_path):
            # Read with the same separator the cache was written with.
            url_slugs = (
                pl.read_csv(cache_path, separator="\t").to_series().to_list()
            )
        else:
            url_slugs = scrape_sitemaps(s)
            # Make sure the target directory exists so the scrape result is
            # not lost to a failed write.
            cache_dir = os.path.dirname(cache_path)
            if cache_dir:
                os.makedirs(cache_dir, exist_ok=True)
            pl.DataFrame({"url_slugs": url_slugs}).write_csv(
                file=cache_path,
                separator="\t",
            )
        scrape_games(s, url_slugs)


# Run the pipeline when this cell executes. This issues live HTTP requests
# whenever the slug file checked by scraper does not exist yet.
scraper(ROOT)

In [8]:
def create_template_sitemap(url_slugs: list[str] | None = None) -> str:
    """Return a string adhering to the XML schema for the Sitemap protocol
    containing user defined dummy urls.

    Utility function to construct body content for mock http responses.

    Args:
        url_slugs: List of game url slugs used in constructing each
            url within the sitemap urlset.
            Uses slugs instead of full urls because every game should
            already share the same url root.
            None (the default) or an empty list produces an empty urlset.

    Returns:
        The sitemap document as a single-line string without an XML
        declaration.
    """
    start_tags = "<url><loc>"
    end_tags = "</loc><changefreq>weekly</changefreq></url>"
    # The empty string element is a special case where the url root is also omitted.
    url_entries = (
        f"{start_tags}{end_tags}"
        if slug == ""
        else f"{start_tags}{ROOT}/{slug}/{end_tags}"
        for slug in (url_slugs or [])
    )
    return (
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
        + "".join(url_entries)
        + "</urlset>"
    )

In [None]:
import responses
import unittest
from unittest.mock import patch, call


class TestPrototype(unittest.TestCase):
    """Tests for scrape_sitemaps, run against mocked sitemap responses.

    Each scenario registers its own mocks and runs in its own test method,
    so one failing scenario neither hides nor influences the others.
    """

    # Slugs served by the second mocked sitemap page in every scenario.
    SECOND_PAGE_SLUGS = ["warhawk", "metal-slug-2"]

    def _mock_sitemaps(
        self, first_page_slugs: list[str], first_page_status: int = 200
    ) -> None:
        """Register three mocked sitemap pages.

        Page 1 is defined by the arguments, page 2 always serves
        SECOND_PAGE_SLUGS, and page 3 is an empty urlset, which is what makes
        scrape_sitemaps stop.

        Args:
            first_page_slugs: Slugs used to build the body of page 1.
            first_page_status: HTTP status returned for page 1.
        """
        responses.get(
            url=f"{ROOT}s/1.xml",
            status=first_page_status,
            body=create_template_sitemap(first_page_slugs),
        )
        responses.get(
            url=f"{ROOT}s/2.xml",
            status=200,
            body=create_template_sitemap(self.SECOND_PAGE_SLUGS),
        )
        responses.get(
            url=f"{ROOT}s/3.xml",
            status=200,
            body=create_template_sitemap(),
        )

    @responses.activate
    def test_scrape_sitemaps_deduplicates_slugs(self):
        """Slugs repeated within a page or across pages are returned once."""
        self._mock_sitemaps(["dark-summit", "warhawk", "dark-summit"])
        with requests.Session() as s:
            self.assertCountEqual(
                scrape_sitemaps(s), ["dark-summit", "warhawk", "metal-slug-2"]
            )

    @responses.activate
    def test_scrape_sitemaps_ignores_empty_loc(self):
        """A loc tag without text contributes no slug but does not end the scrape."""
        self._mock_sitemaps([""])
        with requests.Session() as s:
            self.assertCountEqual(scrape_sitemaps(s), ["warhawk", "metal-slug-2"])

    @responses.activate
    def test_scrape_sitemaps_skips_failed_page(self):
        """A page answering 404 is reported, its body ignored, and later pages are read."""
        self._mock_sitemaps(
            ["dark-summit", "warhawk", "dark-summit"], first_page_status=404
        )
        with requests.Session() as s:
            with patch("builtins.print") as mocked_print:
                self.assertCountEqual(scrape_sitemaps(s), ["warhawk", "metal-slug-2"])
                self.assertEqual(
                    mocked_print.mock_calls,
                    [call("Sitemap page 1 failed to respond, skipped.")],
                )


# argv=[""] keeps unittest from parsing the process's real sys.argv, and
# exit=False keeps it from calling sys.exit() once the tests have run.
unittest.main(argv=[""], verbosity=2, exit=False)