In [None]:
from __future__ import annotations

import csv
from typing import Final, Iterator, Sequence

import _csv
import polars as pl
import requests
from bs4 import BeautifulSoup, SoupStrainer
from lxml import etree as et
from requests.adapters import HTTPAdapter, Retry

# Base URL shared by every request in this notebook: game pages are requested
# as f"{ROOT}/<slug>", while the sitemap catalog and the numbered sitemaps are
# requested as f"{ROOT}s.xml" and f"{ROOT}s/<i>.xml" respectively.
ROOT: Final[str] = "https://www.metacritic.com/game"
# Separator used to flatten multi-valued fields (platforms, developers, genres)
# into one TSV cell when scraping, and to split them back into lists on load.
STR_LIST_DELIM: Final[str] = "|"

In [None]:
class SitemapsNotFoundError(Exception):
    """Raised when the sitemap catalog does not yield a last sitemap URL."""


class IndexNotIntegerError(Exception):
    """Raised when the index parsed from a sitemap URL is not an integer.

    Args:
        msg: Human-readable description of the offending value.
    """

    def __init__(self, msg: str = "Default Message") -> None:
        # Forward the message unchanged so str(exc) and exc.args expose it.
        super().__init__(msg)

In [None]:
def get_last_sitemap_index(s: requests.Session) -> int:
    """Return index value of the last sitemap in the sitemap catalog.

    Args:
        s: Session object to fetch sitemap catalog over a persistent http connection.

    Returns:
        The integer parsed from the file name of the last ``<loc>`` URL listed
        in the catalog (e.g. a URL ending in ``/94.xml`` yields ``94``).

    Raises:
        requests.HTTPError: If the catalog request returns an error status.
        KeyError: If the catalog root element declares no default namespace.
        SitemapsNotFoundError: If the catalog contains no ``<loc>`` text.
        IndexNotIntegerError: If the last URL's file name is not an integer.
    """
    sitemap_catalog = s.get(f"{ROOT}s.xml")
    sitemap_catalog.raise_for_status()
    # Parse the raw bytes rather than the decoded text: lxml rejects str input
    # that carries an XML encoding declaration, and bytes let the parser honor
    # whatever encoding the document itself declares.
    catalog_etree = et.XML(sitemap_catalog.content)
    # XPath has no notion of a default namespace, so bind it to an explicit prefix.
    nsmap = {"ns": catalog_etree.nsmap[None]}
    try:
        # XPath query tested faster compared to ElementPath methods
        # The query returns at most one item; unpacking an empty result raises
        # ValueError, which is how a catalog without entries is detected.
        (last_url,) = catalog_etree.xpath(
            "(//ns:loc/text())[last()]",
            namespaces=nsmap,
            smart_strings=False,
        )
    except ValueError as e:
        raise SitemapsNotFoundError from e
    # Drop the extension (text after the last "."), then keep the last path segment.
    last_index_str = last_url.rpartition(".")[0].rpartition("/")[-1]
    try:
        last_index = int(last_index_str)
    except ValueError as e:
        raise IndexNotIntegerError(f"Last Index found to be: {last_index_str}") from e
    return last_index

In [None]:
def extract_url_slugs(loc_elements: Iterator[et._Element], url_slugs: set[str]) -> None:
    """Add the slug of every loc tag's URL to ``url_slugs`` (mutated in place).

    The slug is the last path segment of the URL once any trailing slashes
    have been removed. Elements that carry no usable text are skipped.

    Args:
        loc_elements: A generator of loc tags containing URL text.
        url_slugs: The set in which game URL slugs are to be stored.
    """
    for element in loc_elements:
        try:
            url = element.text
            slug = url.rstrip("/").rpartition("/")[-1]  # pyright: ignore[reportOptionalMemberAccess]
        except AttributeError:
            # A tag without text exposes None, which has no string methods;
            # there is nothing to record for it.
            continue
        else:
            url_slugs.add(slug)

In [None]:
def scrape_sitemaps(s: requests.Session, last_index: int) -> list[str]:
    """Retrieve URL slugs associated with all games from sitemaps.

    Colloquially, a slug is the unique identifying part of a web address,
    typically at the end of the URL. Each sitemap contains approximately 1000 URLs.

    Args:
        s: Session object to fetch sitemaps over a persistent http connection.
        last_index: End value for iteration of all sitemaps. (There appear to
            be more unlisted sitemaps of higher indices in the catalog,
            but these appear to be duplicates or vestigial in nature).

    Returns:
        A list of URL slugs corresponding to every game indexed in metacritic's sitemaps.
        Duplicates across sitemaps are collapsed; the order is unspecified.

    Raises:
        requests.HTTPError: If any sitemap request returns an error status.
    """
    url_slugs: set[str] = set()
    for i in range(1, last_index + 1):
        sitemap_response = s.get(f"{ROOT}s/{i}.xml")
        sitemap_response.raise_for_status()
        # Parse the raw bytes rather than the decoded text: lxml rejects str
        # input that carries an XML encoding declaration, and bytes let the
        # parser honor whatever encoding the document itself declares.
        sitemap_etree = et.XML(sitemap_response.content)
        loc_elements = sitemap_etree.iterfind(".//loc", namespaces=sitemap_etree.nsmap)
        extract_url_slugs(loc_elements, url_slugs)
        # The carriage return moves the cursor back to the start of the line so
        # the next progress message overwrites this one.
        print(f"Scraping of sitemap {i} complete", end="\r")

    return list(url_slugs)

In [None]:
def scrape_game_details(
    game_pg: requests.Response,
    game_writer: _csv._writer,
    game_dict: dict[str, str],
    game_url: str,
) -> None:
    """Extract one game's details from its page and write them as a single row.

    The page body is parsed once, then every selector in ``game_dict`` is
    applied in iteration order and turned into one text cell. Any field whose
    tag (or attribute) is missing is written as an empty string, so each row
    has one cell per key plus the URL cell added by the "Summary" key.

    Args:
        game_pg: Response whose ``content`` holds the game page HTML.
        game_writer: CSV writer that receives the finished row.
        game_dict: Mapping of field name to the CSS selector locating it. The
            keys "Summary", "Platforms", "Developers", "Genres", "Userscore"
            and "ESRB" get special handling; all other keys are stored as the
            tag's stripped text.
        game_url: URL of the game page; written immediately before the summary.
    """
    soup = BeautifulSoup(game_pg.content, features="lxml")
    processed_details: list[str | list[str]] = []
    for key, selection_str in game_dict.items():
        # select_one gives None when nothing matches; the branches below rely
        # on the resulting AttributeError to fall back to an empty cell.
        tag = soup.select_one(selection_str)
        try:
            if key == "Summary":
                # Appending urls here means summary is last in csv, improving readability
                processed_details.append(game_url)
                # Special case as game summary text is only found in full as an attribute value
                # Tabs are dropped and newlines become spaces, keeping the text on one line.
                summary = (             # pyright: ignore[reportUnknownVariableType]
                    tag                 # pyright: ignore[reportUnknownMemberType]
                    .get("content")     # pyright: ignore[reportOptionalMemberAccess]
                    .replace("\t", "")  # pyright: ignore[reportOptionalMemberAccess, reportAttributeAccessIssue]
                    .replace("\n", " ")
                    .strip()
                )  # fmt:skip
                processed_details.append(summary)  # pyright: ignore[reportUnknownArgumentType] # fmt: skip
            elif key in ["Platforms", "Developers", "Genres"]:
                # Multi-valued fields: join the text of each child of the
                # selected tag; a missing tag yields an empty string.
                processed_details.append(
                    STR_LIST_DELIM.join((li.get_text(strip=True) for li in (tag or [])))
                )
            else:
                stripped = tag.get_text(strip=True)  # pyright: ignore[reportOptionalMemberAccess] # fmt: skip
                if key == "Userscore":
                    try:
                        # Scale Userscore to int range for later memory optimization
                        processed_details.append(str(int(10 * float(stripped))))
                    except ValueError:
                        # Non-numeric score text (e.g. "tbd") is stored verbatim.
                        processed_details.append(stripped)
                elif key == "ESRB":
                    # Remove unnecessary "Rated " text preceding esrb label
                    processed_details.append(stripped.partition(" ")[-1])
                else:
                    processed_details.append(stripped)
        except AttributeError:
            # Missing tag or attribute: keep the column position with an empty cell.
            processed_details.append("")
            continue
    game_writer.writerow(processed_details)

In [None]:
def scrape_critic_info(critic_pg: requests.Response) -> None:
    """Placeholder for critic-review scraping; currently performs no work.

    Args:
        critic_pg: Response for a game's critic review page (unused for now).
    """
    return None

In [None]:
def scrape_games(
    s: requests.Session,
    url_slugs: list[str],
    games_backup_path: str = "../data/games_backup.tsv",
) -> None:
    """Scrape every game page in ``url_slugs`` into a tab-separated file.

    The output file is overwritten. Its header row consists of the selector
    names below with "Game_Url" inserted before the final "Summary" column,
    matching the cell order produced by ``scrape_game_details``.

    Args:
        s: Session object used to fetch the game pages.
        url_slugs: Slugs identifying the game pages, requested as ``ROOT/<slug>``.
        games_backup_path: Destination of the tab-separated output file.
    """
    CSS_SELECTORS: Final[dict[str, str]] = {
        "Title":        ".c-productHero_title > div",
        "Metascore":    ".c-productHero_scoreInfo > div:first-child .c-siteReviewScore > span",
        "Userscore":    ".c-productHero_scoreInfo > div:last-child .c-siteReviewScore > span",
        "ESRB":         ".c-productionDetailsGame_esrb_title > span",
        "Platforms":    ".c-gameDetails_Platforms > ul",
        "Release_Date": ".c-gameDetails_ReleaseDate > span:last-child",
        "Developers":   ".c-gameDetails_Developer > ul",
        "Publisher":    ".c-gameDetails_Distributor > span:last-child",
        "Genres":       "ul.c-genreList",
        "Summary":      "meta[name='description']",
    }  # fmt: skip

    # An explicit encoding keeps the output independent of the platform locale,
    # so non-ASCII titles and summaries can always be written and read back.
    with open(games_backup_path, "w", newline="", encoding="utf-8") as game_file:
        game_writer = csv.writer(game_file, delimiter="\t")
        game_headers = list(CSS_SELECTORS.keys())
        # The URL cell is emitted right before the summary, so its header goes
        # just before the last column.
        game_headers.insert(-1, "Game_Url")
        game_writer.writerow(game_headers)
        for url_slug in url_slugs:
            game_url = f"{ROOT}/{url_slug}"
            game_pg = s.get(game_url)
            # NOTE(review): a 429 response is deliberately not skipped here and
            # is still passed to the parser; confirm whether that is intended.
            if not game_pg.ok and game_pg.status_code != 429:
                print("Error fetching game page")
                continue
            # TODO: fetch f"{game_url}/critic-reviews" and hand it to
            # scrape_critic_info once that function is implemented.
            scrape_game_details(game_pg, game_writer, CSS_SELECTORS, game_url)

In [None]:
# NOTE: this cell reads the TSV written by scrape_games, whose only call site
# (via scraper) sits in a later cell, so on a fresh kernel the file must
# already exist on disk.

# Summary text that is treated as missing when loading.
# NOTE(review): presumably the site's generic description served for games
# without a summary of their own — confirm.
DEFAULT_SUMMARY = (
    "Metacritic aggregates music, game, tv, and movie reviews from the leading critics. "
    "Only Metacritic.com uses METASCORES, which let you know at a glance how each item was reviewed."
)
# Compact column types. Userscore was multiplied by 10 and stored as an integer
# during scraping; UInt8 assumes both scores stay within 0-255 — TODO confirm.
dtype_dict = {
    "Metascore": pl.UInt8,
    "Userscore": pl.UInt8,
    "ESRB": pl.Categorical,
    "Publisher": pl.Categorical,
}
# "tbd" cells and the default summary are read as nulls.
x = pl.read_csv(
    "../data/games_backup.tsv",
    separator="\t",
    null_values=["tbd", DEFAULT_SUMMARY],
    dtypes=dtype_dict,
)
# Parse dates such as "Aug 21, 2007" and turn the delimiter-joined fields back
# into list columns.
x = x.with_columns(
    pl.col("Release_Date").str.to_date("%b %d, %Y"),
    pl.col("Platforms").str.split(STR_LIST_DELIM),
    pl.col("Developers").str.split(STR_LIST_DELIM),
    pl.col("Genres").str.split(STR_LIST_DELIM),
)
# Displayed only (not stored): one row per genre/developer/platform combination.
x.explode("Genres").explode("Developers").explode("Platforms")

In [None]:
# Small hand-picked set of game slugs; scraper() currently passes these to
# scrape_games in place of the full slug list.
TEST_SLUGS = [
    "bioshock",
    "tilescape",
    "cieb-the-backrooms-project",
    "elden-ring",
]

In [None]:
def scraper(
    *,
    user_agent: str = "Edge",
    num_retries: int = 5,
    backoff_factor: float = 0.5,
    backoff_jitter: float = 0.5,
    status_forcelist: Sequence[int] = (429,),
    respect_retry_after_header: bool = False,
    games_backup_path: str = "../data/games_backup.tsv",
    slugs_backup_path: str = "../data/raw_url_slugs.tsv",
) -> None:
    """Run the scraping pipeline inside a single retrying HTTP session.

    If the games backup can be read, nothing is scraped. Otherwise the URL
    slugs are loaded from their backup file, or rebuilt from the sitemaps and
    saved when that file is missing, and the game pages are scraped.

    NOTE: while this is a prototype, the game scrape runs over ``TEST_SLUGS``
    rather than the loaded or rebuilt slug list.

    Args:
        user_agent: Value of the User-Agent header sent with every request.
        num_retries: Total number of retries allowed per request.
        backoff_factor: Backoff factor applied between retry attempts.
        backoff_jitter: Random jitter added to the backoff delay.
        status_forcelist: HTTP status codes that trigger a retry.
        respect_retry_after_header: Whether to honor a Retry-After header.
        games_backup_path: Tab-separated games file; its presence skips scraping.
        slugs_backup_path: Tab-separated file caching the URL slugs.
    """
    with requests.Session() as s:
        # Update rather than replace the header mapping so the session keeps
        # its default headers (e.g. Accept-Encoding) alongside the User-Agent.
        s.headers.update({"User-Agent": user_agent})
        retry_config = Retry(
            total=num_retries,
            backoff_factor=backoff_factor,
            backoff_jitter=backoff_jitter,
            status_forcelist=status_forcelist,
            respect_retry_after_header=respect_retry_after_header,
        )
        s.mount("https://", HTTPAdapter(max_retries=retry_config))
        try:
            # Only used as an existence/readability probe; both backups are
            # tab-separated, so they are read with the separator they were
            # written with.
            pl.read_csv(games_backup_path, separator="\t")
        except FileNotFoundError:
            try:
                url_slugs = (
                    pl.read_csv(slugs_backup_path, separator="\t")
                    .to_series()
                    .to_list()
                )
            except FileNotFoundError:
                print("Missing url slugs backup file...\nPreparing to rebuild")
                last_index = get_last_sitemap_index(s)
                print(
                    "Last sitemap index found, preparing to iterate over all catalogued sitemaps"
                )
                url_slugs = scrape_sitemaps(s, last_index)
                pl.DataFrame({"url_slugs": url_slugs}).write_csv(
                    file=slugs_backup_path,
                    separator="\t",
                )
                print("Url slugs backup rebuilt successfully")
            scrape_games(s, TEST_SLUGS)


# NOTE(review): the empty games_backup_path presumably makes the backup read
# fail so that the scraping branch always runs — confirm this is intended.
scraper(games_backup_path="")

In [None]:
def create_template(
    url_slugs: Sequence[str | int] = (), *, is_index: bool = False
) -> str:
    """Return string adhering to XML schema for the Sitemap protocol
    containing user defined dummy urls.

    Utility function to construct body content for mock http responses.
    Any type of string is accepted, but only ints or int-like strings are
    expected within the list if is_index is true (constructing a sitemap index).

    Args:
        url_slugs: Corresponds to game titles when constructing a sitemap, while
            represents ordinal values when constructing a sitemap index.
        is_index: Determines whether to construct a template for a sitemap
            or a sitemap index.

    Returns:
        A single-line XML document whose items are separated by one space.
        Sitemap URLs end in "/", sitemap index URLs end in ".xml".
    """
    ns = 'xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"'
    if is_index:
        container_tag, item_tag, ext = "sitemapindex", "sitemap", ".xml"
    else:
        container_tag, item_tag, ext = "urlset", "url", "/"
    item_start_tag = f"<{item_tag}><loc>"
    item_end_tag = f"</loc></{item_tag}>"
    # The empty string element is a special case where the url root is also omitted.
    # The items are joined before the final f-string because reusing the
    # enclosing quote character inside a replacement field is a syntax error
    # on Python versions older than 3.12.
    items = " ".join(
        f"{item_start_tag}{item_end_tag}"
        if slug == ""
        else f"{item_start_tag}{ROOT}/{slug}{ext}{item_end_tag}"
        for slug in url_slugs
    )
    return f"<{container_tag} {ns}>{items}</{container_tag}>"

In [None]:
import unittest

import responses


class TestPrototype(unittest.TestCase):
    """Tests for the sitemap helpers, run against mocked HTTP responses.

    One session and one response mock are shared by all test methods. Several
    responses are registered for the same URL in each test, and the assertions
    are written in the same order as those registrations.
    """

    @classmethod
    def setUpClass(cls) -> None:
        """Open the shared session and response mock for the whole class."""
        cls.s = cls.enterClassContext(requests.Session())
        cls.r = cls.enterClassContext(responses.RequestsMock())

    def test_get_last_sitemap_index(self) -> None:
        """Check valid indices, an empty catalog, a bad index and an HTTP error."""
        statuses = [200] * 5 + [404]
        # One catalog body per call made below, in call order.
        slugs_to_test = [
            [7],
            ["a", 20],
            [11, 3, 94],
            [],
            [20, "a"],
            [0, 0],
        ]
        for status, slugs in zip(statuses, slugs_to_test, strict=True):
            self.r.get(
                url=f"{ROOT}s.xml",
                status=status,
                body=create_template(slugs, is_index=True),
            )

        # The first three catalogs end in an integer entry, which is the expected index.
        for i in range(3):
            self.assertEqual(slugs_to_test[i][-1], get_last_sitemap_index(self.s))
        # Empty catalog, then a non-integer last entry, then the 404 response.
        self.assertRaises(SitemapsNotFoundError, get_last_sitemap_index, self.s)
        self.assertRaises(IndexNotIntegerError, get_last_sitemap_index, self.s)
        self.assertRaises(requests.HTTPError, get_last_sitemap_index, self.s)

    def test_scrape_sitemaps(self) -> None:
        """Check de-duplication, an empty loc tag and HTTP error propagation."""
        statuses = [200, 200, 404]
        # Successive bodies for sitemap 1: duplicates, an empty loc tag, then
        # a body served with a 404 status.
        slugs_to_test = [
            ["dark-summit", "warhawk", "dark-summit"],
            [""],
            ["bioshock"],
        ]
        for status, slugs in zip(statuses, slugs_to_test, strict=True):
            self.r.get(
                url=f"{ROOT}s/1.xml",
                status=status,
                body=create_template(slugs),
            )
        # Sitemap 2 has a single registered response.
        self.r.get(
            url=f"{ROOT}s/2.xml",
            status=200,
            body=create_template(["warhawk", "metal-slug-2"]),
        )

        TEST_INDEX = 2
        # Duplicates within and across the two sitemaps collapse to one entry each.
        self.assertCountEqual(
            ["dark-summit", "warhawk", "metal-slug-2"],
            scrape_sitemaps(self.s, TEST_INDEX),
        )
        # The empty loc tag in sitemap 1 contributes nothing.
        self.assertCountEqual(
            ["warhawk", "metal-slug-2"],
            scrape_sitemaps(self.s, TEST_INDEX),
        )
        self.assertRaises(requests.HTTPError, scrape_sitemaps, self.s, TEST_INDEX)


# Run the tests inside the notebook: a placeholder argv keeps unittest from
# parsing the kernel's own command-line arguments, and exit=False stops it
# from calling sys.exit() when the run finishes.
unittest.main(argv=[""], verbosity=2, exit=False)