In [1]:
import warnings
import zipfile
import random
import os
import requests
import shutil
from justhtml import JustHTML
from shutil import unpack_archive
from tempfile import TemporaryDirectory

def load_page(url, timeout: float = 60):
    """Fetch *url* with a GET request and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole crawl.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.text

def download_file(url, dir: str | os.PathLike, ext_blacklist=None, dl_dir: str | os.PathLike | None = None):
    r = requests.get(url, stream=True, timeout=60)
    r.raise_for_status()

    fname = r.headers.get("Content-Disposition")
    if fname is None: fname = str(random.random())

    if ext_blacklist is not None:
        if isinstance(ext_blacklist, str): ext_blacklist = (ext_blacklist, )
        if fname.strip().lower().endswith(tuple(ext_blacklist)):
            return None

    if fname in os.listdir(dl_dir):
        return None

    with open(os.path.join(dir, fname), 'wb') as f:
        shutil.copyfileobj(r.raw, f) # type:ignore

    return os.path.join(dir, fname)

def get_topics(html: JustHTML) -> set[str]:
    """Collect absolute topic URLs from the ``div > a`` anchors of a parsed page.

    Only anchors whose ``href`` starts with ``./viewtopic.php?t=`` are kept;
    each one is rewritten against the forum root URL.
    """
    forum_root = "https://forum.generally-racers.com/"
    topic_prefix = "./viewtopic.php?t="

    relative_links = (
        anchor.attrs["href"]
        for anchor in html.query("div > a")
        if "href" in anchor.attrs
    )
    # link[2:] drops the leading "./" so the rest can be appended to the root.
    return {
        f"{forum_root}{link[2:]}"
        for link in relative_links
        if link.startswith(topic_prefix)
    }

def get_download_links(html: JustHTML):
    """Return the set of absolute download URLs found in ``dt > a`` anchors.

    An anchor counts as a download link when its ``href`` starts with
    ``./download``; the leading ``./`` is replaced by the forum root URL.
    """
    found = set()
    for anchor in html.query("dt > a"):
        attributes = anchor.attrs
        if "href" not in attributes:
            continue
        target = attributes["href"]
        if not target.startswith("./download"):
            continue
        found.add("https://forum.generally-racers.com/" + target[2:])
    return found

def get_current_page_number(html: JustHTML):
    """Return the page number shown by the first ``ul > li`` with class ``active``.

    Falls back to ``1`` when no such list item exists on the page.
    """
    active_items = (
        item
        for item in html.query("ul > li")
        if item.attrs.get("class", None) == "active"
    )
    current = next(active_items, None)
    if current is None:
        return 1
    # The number is read from the text of the item's first child node.
    return int(current.children[0].to_text())

def get_next_page_url(html: JustHTML):
    """Return the absolute URL of the page following the current one, or ``None``.

    Scans ``ul > li > a`` anchors with class ``button`` and an ``href`` for the
    one whose text is the current page number plus one.
    """
    wanted = get_current_page_number(html) + 1

    for link in html.query("ul > li > a"):
        attributes = link.attrs
        if attributes.get("class", None) != "button" or "href" not in attributes:
            continue

        try:
            number = int(link.to_text())
        except (ValueError, TypeError):
            # Buttons whose text is not a number are skipped.
            continue

        if number == wanted:
            relative = attributes["href"]
            return f"https://forum.generally-racers.com/{relative[2:]}"

    return None


def unpack(file, dir):
    """Extract archive *file* into directory *dir*.

    ``shutil.unpack_archive`` is tried first; if it fails for any reason the
    file is opened as a zip archive and extracted that way instead.
    """
    try:
        shutil.unpack_archive(file, dir)
    except Exception:
        # Fallback: treat the file as a zip archive.
        with zipfile.ZipFile(file) as archive:
            archive.extractall(dir)

def get_car_trk_files(file: str | os.PathLike, download_path: str | os.PathLike) -> None:
    if str(file).lower().strip().endswith((".car", ".trk")):
        if os.path.basename(file) in os.listdir(download_path):
            print(f"info: {os.path.basename(file)} already exists, skipping")
            return
        print(f"saved {os.path.basename(file)}")
        shutil.move(file, os.path.join(download_path, os.path.basename(file)))
    else:
        try:
            with TemporaryDirectory() as tmpdir:
                unpack(file, tmpdir)
                for root, dirs, files in os.walk(tmpdir):
                    for f in files:
                        get_car_trk_files(os.path.join(root, f), download_path)
        except Exception:
            pass

# File-name suffixes of attachments that are never downloaded (images, video, audio).
_BLACKLIST = (".jpeg", ".jpg", ".png", ".gif", ".mp4", ".mov", ".mp3", ".wav", ".tiff")


class GeneRallyCarDownloader:
    """Crawl forum sections and collect ``.car``/``.trk`` files from attachments.

    Every topic of a forum section is visited page by page; each attachment is
    downloaded to a temporary directory and handed to ``get_car_trk_files``,
    which moves the matching files into ``download_path``.
    """

    def __init__(self, download_path: str | os.PathLike):
        # NOTE(review): download_urls and visited_pages are initialised here
        # but not read or written anywhere in this class.
        self.download_urls: set[str] = set()
        self.visited_pages: set[tuple[str, int]] = set()
        # Directory that receives the collected .car/.trk files.
        self.download_path = download_path

    def scrap_page(self, html: JustHTML):
        """Download every attachment linked from the parsed page *html*.

        Links containing ``&mode=view`` are skipped, as are files whose name
        ends with a ``_BLACKLIST`` suffix or already exists in
        ``download_path``. A failing download is reported on stdout and the
        remaining links are still processed. Returns nothing; pagination is
        handled by the caller.
        """
        dl_links = get_download_links(html)
        for dl_link in dl_links:
            if "&mode=view" in dl_link:
                continue
            # A fresh temporary directory per link keeps downloads isolated
            # and is removed again when the link has been handled.
            with TemporaryDirectory() as tmpdir:
                try:
                    path = download_file(dl_link, tmpdir, _BLACKLIST, self.download_path)
                except Exception as e:
                    print(f"EXCEPTION WHILE DOWNLOADING {dl_link}:\n{e.__class__.__name__}: {e}")
                    continue

                if path is not None:
                    get_car_trk_files(path, self.download_path)

    def _scrap_topic(self, topic_url: str) -> int:
        """Process every page of the topic at *topic_url*; return the page count."""
        topic_html = JustHTML(load_page(topic_url))
        topic_page_idx = 1
        while True:
            self.scrap_page(topic_html)
            next_url = get_next_page_url(topic_html)
            if next_url is None:
                return topic_page_idx
            topic_html = JustHTML(load_page(next_url))
            topic_page_idx += 1

    def run(self, root = "https://forum.generally-racers.com/viewforum.php?f=7"):
        """Crawl the forum section starting at *root*, following its pagination.

        Each topic listed on a section page is processed in turn. A topic that
        fails (for example because one of its pages cannot be loaded) is
        reported on stdout and skipped so the rest of the crawl continues.
        Errors while loading the section pages themselves still propagate.
        """
        current_page_url = root
        current_html = JustHTML(load_page(current_page_url))
        page_idx = 1
        while True:
            print(f"# ------------------------ PROCESSING PAGE {page_idx} ------------------------ #")
            topics = get_topics(current_html)
            for topic_url in topics:
                try:
                    topic_pages = self._scrap_topic(topic_url)
                except Exception as e:
                    # Same print-and-continue policy scrap_page applies to downloads.
                    print(f"EXCEPTION WHILE PROCESSING {topic_url}:\n{e.__class__.__name__}: {e}")
                    continue
                print(f'Processed {topic_pages} pages from {topic_url}')

            current_page_url = get_next_page_url(current_html)
            if current_page_url is None:
                break
            current_html = JustHTML(load_page(current_page_url))
            page_idx += 1

# Create a downloader that collects files in the given directory, then crawl
# the forum section that `run` uses as its default `root`.
scrapper = GeneRallyCarDownloader("/var/mnt/issd/files 2/programming/experiments/generally scrapper/downloads")
scrapper.run()

# ------------------------ PROCESSING PAGE 1 ------------------------ #
info: Herbal P27 'Buzzard'.car already exists, skipping
info: Herbal P17 'Hawk'.car already exists, skipping
Processed 1 pages from https://forum.generally-racers.com/viewtopic.php?t=5024&sid=669f8b07e8336b3c8772102f705e8d7f
Processed 1 pages from https://forum.generally-racers.com/viewtopic.php?t=5464&sid=669f8b07e8336b3c8772102f705e8d7f
info: GRASCAR Gen 7.car already exists, skipping
Processed 1 pages from https://forum.generally-racers.com/viewtopic.php?t=5919&sid=669f8b07e8336b3c8772102f705e8d7f
info: Nike One2022.car already exists, skipping
Processed 1 pages from https://forum.generally-racers.com/viewtopic.php?t=5053&sid=669f8b07e8336b3c8772102f705e8d7f
Processed 1 pages from https://forum.generally-racers.com/viewtopic.php?t=5418&sid=669f8b07e8336b3c8772102f705e8d7f
info: GP67_V2.car already exists, skipping
Processed 1 pages from https://forum.generally-racers.com/viewtopic.php?t=5722&sid=669f8b07e8336b3c

In [2]:
# Crawl forum section f=22 with the same downloader (same download directory).
scrapper.run("https://forum.generally-racers.com/viewforum.php?f=22")

# ------------------------ PROCESSING PAGE 1 ------------------------ #
saved Charger.car
Processed 1 pages from https://forum.generally-racers.com/viewtopic.php?t=579&sid=daa5c63db0edbe973f5dcba2639f321f
saved Knauerhase Lynx WJ.car
Processed 1 pages from https://forum.generally-racers.com/viewtopic.php?t=712&sid=daa5c63db0edbe973f5dcba2639f321f
saved Yukon Improved.car
saved WA 24th scale TAFCs.car
saved Volvo 244DL.car
saved VK Commodore Berlina.car
saved Toyota Sprinter tuning.car
saved Toyota Prius.car
saved Topolino.car
saved The Hitman.car
info: Subaru Impreza.car already exists, skipping
saved Studebaker ATD.car
saved Sauber C9.car
saved Renault Espace F1.car
saved Ranger.car
saved Race Trailer.car
saved Pontiac Funny Car.car
saved Peugeot 905LM Zonnebloem.car
saved Peugeot 905LM.car
saved Pagani Zonda.car
saved Mustang Doorslammers.car
saved Monza Funny Car.car
saved MD's Road Cars.car
saved McLaren Prototype.car
info: Mazda RX-7.car already exists, skipping
saved Mach 5 and 6

In [3]:
# Crawl forum section f=40 with the same downloader (same download directory).
scrapper.run("https://forum.generally-racers.com/viewforum.php?f=40")

# ------------------------ PROCESSING PAGE 1 ------------------------ #
Processed 2 pages from https://forum.generally-racers.com/viewtopic.php?t=839&sid=dabd41fc62d532b165d5f7f234e33ba4
Processed 1 pages from https://forum.generally-racers.com/viewtopic.php?t=5402&sid=dabd41fc62d532b165d5f7f234e33ba4
Processed 2 pages from https://forum.generally-racers.com/viewtopic.php?t=527&sid=dabd41fc62d532b165d5f7f234e33ba4
Processed 1 pages from https://forum.generally-racers.com/viewtopic.php?t=1361&sid=dabd41fc62d532b165d5f7f234e33ba4
Processed 1 pages from https://forum.generally-racers.com/viewtopic.php?t=2824&sid=dabd41fc62d532b165d5f7f234e33ba4
Processed 1 pages from https://forum.generally-racers.com/viewtopic.php?t=4053&sid=dabd41fc62d532b165d5f7f234e33ba4
Processed 1 pages from https://forum.generally-racers.com/viewtopic.php?t=5785&sid=dabd41fc62d532b165d5f7f234e33ba4
Processed 1 pages from https://forum.generally-racers.com/viewtopic.php?t=4890&sid=dabd41fc62d532b165d5f7f234e33ba4
Pr

In [4]:
# Crawl forum section f=8 with the same downloader (same download directory).
scrapper.run("https://forum.generally-racers.com/viewforum.php?f=8")

# ------------------------ PROCESSING PAGE 1 ------------------------ #
Processed 1 pages from https://forum.generally-racers.com/viewtopic.php?t=5785&sid=e58f16032ca23afc66c69f675c0017dd
Processed 1 pages from https://forum.generally-racers.com/viewtopic.php?t=3433&sid=e58f16032ca23afc66c69f675c0017dd
Processed 1 pages from https://forum.generally-racers.com/viewtopic.php?t=959&sid=e58f16032ca23afc66c69f675c0017dd
Processed 1 pages from https://forum.generally-racers.com/viewtopic.php?t=1752&sid=e58f16032ca23afc66c69f675c0017dd
Processed 2 pages from https://forum.generally-racers.com/viewtopic.php?t=318&sid=e58f16032ca23afc66c69f675c0017dd
Processed 1 pages from https://forum.generally-racers.com/viewtopic.php?t=736&sid=e58f16032ca23afc66c69f675c0017dd
Processed 1 pages from https://forum.generally-racers.com/viewtopic.php?t=312&sid=e58f16032ca23afc66c69f675c0017dd
Processed 1 pages from https://forum.generally-racers.com/viewtopic.php?t=717&sid=e58f16032ca23afc66c69f675c0017dd
Proce