In [None]:
from __future__ import annotations

import cchardet
import lxml
import os
import polars as pl
import requests

from bs4 import BeautifulSoup, SoupStrainer
from lxml import etree as et
from requests.adapters import HTTPAdapter, Retry
from typing import Iterator

# Base URL shared by every request. The sitemap helpers build their URLs by
# appending directly to this string (f"{ROOT}s.xml", f"{ROOT}s/{i}.xml"), so it
# must not end with a trailing slash.
ROOT = "https://www.metacritic.com/game"

In [None]:
def get_last_sitemap_index(s: requests.Session) -> int:
    """Return index value of the last sitemap in the sitemap catalog.

    The catalog is fetched from ``f"{ROOT}s.xml"``. The index is taken from the
    file name of the final ``<loc>`` entry in document order, i.e. the text
    between the last ``/`` and the first following ``.``.

    Args:
        s: Session object to fetch sitemap catalog over a persistent http connection.

    Returns:
        The integer parsed from the last listed sitemap's file name.

    Raises:
        requests.HTTPError: If the catalog request returns an error status.
        KeyError: If the catalog's root element declares no default namespace.
        IndexError: If the catalog contains no ``<loc>`` entries.
        ValueError: If the last entry's file name is not an integer.
    """
    sitemap_catalog = s.get(f"{ROOT}s.xml")
    sitemap_catalog.raise_for_status()
    # Parse the raw bytes rather than the decoded text: lxml refuses str input
    # that carries an XML encoding declaration, and it can honour the declared
    # encoding itself when given bytes.
    catalog_etree = et.XML(sitemap_catalog.content)
    # XPath needs an explicit prefix for the document's default namespace.
    nsmap = {"ns": catalog_etree.nsmap[None]}
    # XPath query tested faster compared to ElementPath methods
    last_url = catalog_etree.xpath(
        "(//ns:loc/text())[last()]",
        namespaces=nsmap,
        smart_strings=False,
    )
    if not last_url:
        raise IndexError("No sitemaps found.")
    # ".../<index>.xml" -> "<index>.xml" -> "<index>"
    file_name = last_url[0].rpartition("/")[-1]
    return int(file_name.partition(".")[0])

In [None]:
def extract_url_slugs(loc_elements: Iterator[et._Element], url_slugs: set[str]) -> None:
    """Process a single sitemap's loc tags and update the set of URL slugs.

    The slug is the last path segment of each URL, ignoring trailing slashes.
    Elements with no text, or with only whitespace, are skipped, as are URLs
    whose last segment is empty.

    Args:
        loc_elements: An iterator of loc tags containing URL text.
        url_slugs: The set in which game URL slugs are to be stored. It is
            updated in place.
    """
    for loc in loc_elements:
        # Pretty-printed sitemaps may pad the URL with newlines/indentation;
        # without stripping, the "slug" would be that trailing whitespace.
        url = (loc.text or "").strip()
        if not url:
            continue
        url_slug = url.rstrip("/").rpartition("/")[-1]
        if url_slug:
            url_slugs.add(url_slug)

In [None]:
def scrape_sitemaps(s: requests.Session, last_index: int) -> list[str]:
    """Retrieve URL slugs associated with all games from sitemaps.

    Colloquially, a slug is the unique identifying part of a web address,
    typically at the end of the URL. Each sitemap contains approximately 1000 URLs.

    Sitemaps ``1`` through ``last_index`` (inclusive) are fetched in order from
    ``f"{ROOT}s/{i}.xml"``; the first failing request aborts the whole run.

    Args:
        s: Session object to fetch sitemaps over a persistent http connection.
        last_index: End value for iteration of all sitemaps. (There appear to
            be more unlisted sitemaps of higher indices in the catalog,
            but these appear to be duplicates or vestigial in nature).

    Returns:
        A list of URL slugs corresponding to every game indexed in metacritic's
        sitemaps. Duplicates are removed; the order is unspecified.

    Raises:
        requests.HTTPError: If any sitemap request returns an error status.
    """
    url_slugs: set[str] = set()
    for i in range(1, last_index + 1):
        sitemap_response = s.get(f"{ROOT}s/{i}.xml")
        sitemap_response.raise_for_status()
        # Parse the raw bytes rather than the decoded text: lxml refuses str
        # input that carries an XML encoding declaration.
        sitemap_etree = et.XML(sitemap_response.content)
        # Passing the element's own nsmap lets the unprefixed "loc" resolve
        # against the document's default namespace.
        loc_elements = sitemap_etree.iterfind(".//loc", namespaces=sitemap_etree.nsmap)
        extract_url_slugs(loc_elements, url_slugs)

    return list(url_slugs)

In [None]:
def scrape_games(s: requests.Session, url_slugs: list[str]) -> None:
    """Placeholder for the per-game scraping step; not implemented yet.

    Currently a no-op: both arguments are accepted and ignored.

    Args:
        s: Session object (unused for now; presumably meant for fetching game
            pages - TODO confirm once implemented).
        url_slugs: Game URL slugs (unused for now).
    """
    return None

In [None]:
def scraper(
    root_url: str,
    user_agent: str = "Edge",
    max_retries: int = 5,
    cache_path: str = "../data/games.tsv",
) -> None:
    """Collect game URL slugs (from cache or sitemaps) and hand them to scrape_games.

    A single retrying HTTPS session is used throughout. If ``cache_path``
    already exists the slugs are read from it; otherwise they are scraped from
    the sitemaps and written to ``cache_path`` as a one-column, tab-separated
    file before continuing.

    Args:
        root_url: Currently unused. NOTE(review): the sitemap helpers read the
            module-level ROOT instead of this argument.
        user_agent: Value sent as the User-Agent header on every request.
        max_retries: Maximum number of retries per request.
        cache_path: Location of the tab-separated slug cache.
    """
    with requests.Session() as s:
        # Merge into the session's default headers instead of replacing them,
        # so the defaults requests ships with are kept.
        s.headers.update({"User-Agent": user_agent})
        retry_config = Retry(
            total=max_retries,
            backoff_factor=0.5,
            backoff_jitter=0.5,
            # Only rate-limit responses are retried.
            status_forcelist=[429],
            # Use the backoff settings above rather than the server's Retry-After.
            respect_retry_after_header=False,
        )
        s.mount("https://", HTTPAdapter(max_retries=retry_config))
        if os.path.isfile(cache_path):
            # Read with the same separator the cache was written with.
            url_slugs = pl.read_csv(cache_path, separator="\t").to_series().to_list()
        else:
            # Make sure the target directory exists before the long scrape, so
            # a missing folder fails fast instead of after all the requests.
            cache_dir = os.path.dirname(cache_path)
            if cache_dir:
                os.makedirs(cache_dir, exist_ok=True)
            last_index = get_last_sitemap_index(s)
            url_slugs = scrape_sitemaps(s, last_index)
            pl.DataFrame({"url_slugs": url_slugs}).write_csv(
                file=cache_path,
                separator="\t",
            )
        scrape_games(s, url_slugs)


# Run the pipeline. Unless ../data/games.tsv already exists, executing this
# cell issues live HTTP requests.
scraper(root_url=ROOT)

In [None]:
def create_template(
    url_slugs: list[str | int] | None = None, *, isIndex: bool = False
) -> str:
    """Return string adhering to XML schema for the Sitemap protocol
    containing user defined dummy urls.

    Utility function to construct body content for mock http responses.
    Any type of string is accepted, but only ints or int-like strings are
    expected within the list if isIndex is true (constructing a sitemap index).

    Args:
        url_slugs: Corresponds to game titles when constructing a sitemap, while
            represents ordinal values when constructing a sitemap index.
            ``None`` (the default) is treated as an empty list.
        isIndex: Determines whether to construct a template for a sitemap
            or a sitemap index.

    Returns:
        The XML document as a single string, with items separated by one space.
    """
    ns = 'xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"'
    if isIndex:
        container, item, ext = "sitemapindex", "sitemap", ".xml"
    else:
        container, item, ext = "urlset", "url", "/"
    item_start_tag = f"<{item}><loc>"
    item_end_tag = f"</loc></{item}>"
    # The empty string element is a special case where the url root is also omitted.
    items = (
        (
            f"{item_start_tag}{item_end_tag}"
            if slug == ""
            else f"{item_start_tag}{ROOT}/{slug}{ext}{item_end_tag}"
        )
        for slug in (url_slugs or [])
    )
    # Joined outside the f-string: reusing the same quote character inside a
    # replacement field is only valid syntax from Python 3.12 onwards.
    body = " ".join(items)
    return f"<{container} {ns}>{body}</{container}>"

In [None]:
import responses
import unittest


class TestPrototype(unittest.TestCase):
    """Tests for the sitemap helpers, run against mocked HTTP responses."""

    @classmethod
    def setUpClass(cls) -> None:
        # One Session is shared by every test and closed when the class finishes.
        cls.s = cls.enterClassContext(requests.Session())

    def setUp(self) -> None:
        # A fresh mock per test keeps responses registered by one test from
        # leaking into another; it is torn down automatically after each test.
        self.r = self.enterContext(responses.RequestsMock())

    def test_get_last_sitemap_index(self) -> None:
        """The last listed index is returned; bad catalogs raise the right error."""
        catalog_url = f"{ROOT}s.xml"
        # All responses must be registered up front: responses queued for the
        # same URL are served one per request, in registration order.
        mocked_catalogs = [
            (200, create_template([7], isIndex=True)),
            (200, create_template(["a", 20], isIndex=True)),
            (200, create_template([11, 3, 94], isIndex=True)),
            (404, create_template([0, 0], isIndex=True)),
            (200, create_template(isIndex=True)),
            (200, create_template([20, "a"], isIndex=True)),
        ]
        for status, body in mocked_catalogs:
            self.r.get(url=catalog_url, status=status, body=body)

        self.assertEqual(get_last_sitemap_index(self.s), 7)
        # Only the final entry matters, so an earlier non-numeric one is fine.
        self.assertEqual(get_last_sitemap_index(self.s), 20)
        self.assertEqual(get_last_sitemap_index(self.s), 94)
        with self.assertRaises(requests.HTTPError):
            get_last_sitemap_index(self.s)
        # Empty catalog.
        with self.assertRaisesRegex(IndexError, r"No sitemaps found\."):
            get_last_sitemap_index(self.s)
        # Final entry is not an integer.
        with self.assertRaises(ValueError):
            get_last_sitemap_index(self.s)

    def test_scrape_sitemaps(self) -> None:
        """Slugs are de-duplicated across sitemaps; an HTTP error propagates."""
        first_sitemap_url = f"{ROOT}s/1.xml"
        # Sitemap 1 answers differently on each of the three runs below.
        self.r.get(
            url=first_sitemap_url,
            status=200,
            body=create_template(["dark-summit", "warhawk", "dark-summit"]),
        )
        self.r.get(
            url=first_sitemap_url,
            status=200,
            body=create_template([""]),
        )
        self.r.get(
            url=first_sitemap_url,
            status=404,
            body=create_template(["dark-summit", "warhawk", "dark-summit"]),
        )
        # Sitemap 2 has a single registration, reused by every run that reaches it.
        self.r.get(
            url=f"{ROOT}s/2.xml",
            status=200,
            body=create_template(["warhawk", "metal-slug-2"]),
        )
        test_index = 2
        self.assertCountEqual(
            scrape_sitemaps(self.s, test_index),
            ["dark-summit", "warhawk", "metal-slug-2"],
        )
        # An empty <loc> contributes no slug.
        self.assertCountEqual(
            scrape_sitemaps(self.s, test_index),
            ["warhawk", "metal-slug-2"],
        )
        with self.assertRaises(requests.HTTPError):
            scrape_sitemaps(self.s, test_index)


# argv=[""] makes unittest ignore sys.argv (the host process's own arguments),
# and exit=False stops it from calling sys.exit() once the tests have run.
unittest.main(
    argv=[""],
    exit=False,
    verbosity=2,
)