In [150]:
from __future__ import annotations

import cchardet
import lxml
import pandas as pd
import requests

from bs4 import BeautifulSoup, SoupStrainer
from lxml import etree
from requests.adapters import HTTPAdapter, Retry

ROOT = "https://www.metacritic.com/game"

In [151]:
def scrape_sitemaps(
    root_url: str,
    user_agent: str = "Edge",
    max_retries: int = 5,
) -> set[str]:
    """ """
    with requests.Session() as s:
        s.headers = {"User-Agent": user_agent}
        retry_config = Retry(
            total=max_retries,
            backoff_factor=0.5,
            backoff_jitter=0.5,
            status_forcelist=[429],
            respect_retry_after_header=False,
        )
        s.mount("https://", HTTPAdapter(max_retries=retry_config))

        i = 1
        url_slugs: set[str] = set()
        while True:
            sitemap = s.get(f"{ROOT}s/{i}.xml")
            sitemap_xml = etree.XML(sitemap.text)
            loc_tags = sitemap_xml.findall(".//{*}loc")
            if not loc_tags:
                break

            for loc in loc_tags:
                if (url := loc.text) is not None:
                    url_slug = url.rstrip("/").rpartition("/")[-1]
                    url_slugs.add(url_slug)
            i += 1
    return url_slugs


url_slugs = scrape_sitemaps(ROOT)

In [152]:
pd.DataFrame(url_slugs, columns=["url_slugs"]).to_csv(
    path_or_buf="games.tsv",
    sep="\t",
    index=False,
)

In [153]:
pd.read_csv("games.tsv")

Unnamed: 0,url_slugs
0,icopter-hd
1,cryptogrammer
2,damned-nation
3,exizency
4,cheating-tom-4-hair-stylist-wannabe
...,...
156582,towing-race
156583,plows-in-hell
156584,a-bird-story
156585,the-cursed-crusade
