In [1]:
import json
import re
from typing import TypedDict, cast

import utils
from concurrent_download import download_concurrent
from rich.progress import track


# One scraped entry as stored in records.json:
#   permalink -- the "<maker>/<maker>-<model>" key produced by parse_car_permalink
#   image     -- the "src" URL of the car's <img> element collected by get_cars
Record = TypedDict("Record", {"permalink": str, "image": str})

In [2]:
# Root directory for everything this notebook reads and writes.
# NOTE(review): utils.mkpath presumably creates the directory and returns a
# Path-like object (it is combined with "/" below) — confirm against utils.
data = utils.mkpath("data/ocp_car_images")
# JSON file with the scraped records; loaded by the next cell and rewritten by
# the final cell of the notebook.
record_file = data / "records.json"

----

In [3]:
# Load the previously scraped records from disk.
# NOTE(review): records.json is only written by the last cell of this notebook,
# so this cell requires that file to exist already (i.e. the scraping cells
# further down must have been run at least once).
with record_file.open("r") as f:
    records = json.load(f)

# Directory the downloaded images are stored in (used by get_img_path).
image_path = utils.mkpath(data / "images")

# Image URL layout expected by get_img_path; group 1 is the numeric id.
# Dots are escaped so that they only match a literal ".".
_IMG_URL_RE = re.compile(r"https://scdn\.autoteiledirekt\.de/groups/170x100/(\d+)\.png")


def get_img_path(url: str):
    """Map an image URL to the local file it should be saved as.

    Args:
        url: Image URL of the form
            ``https://scdn.autoteiledirekt.de/groups/170x100/<digits>.png``.

    Returns:
        ``image_path / "<digits>.png"``, where ``<digits>`` is the numeric id
        captured from the URL.

    Raises:
        ValueError: If ``url`` does not match the expected layout.
    """
    found = _IMG_URL_RE.match(url)
    if found is None:
        # Without this guard a bad URL surfaced as an opaque
        # "'NoneType' object has no attribute 'group'" AttributeError.
        raise ValueError(f"Unexpected image URL: {url!r}")
    return image_path / f"{found.group(1)}.png"

In [4]:
image_set = {r["image"] for r in records}
await download_concurrent(image_set, 20, get_img_path)

Output()

----

In [None]:
def get_maker_urls():
    """Collect one link per ``.top-auto-item`` element on the car-brands page.

    Returns:
        A list with the result of ``utils.get_link`` for every element matched
        by ``.top-auto .top-auto-item``, in document order.
    """
    page = utils.soup("https://www.onlinecarparts.co.uk/car-brands.html")
    maker_links = []
    for item in page.select(".top-auto .top-auto-item"):
        maker_links.append(utils.get_link(item))
    return maker_links


def parse_car_permalink(permalink: str) -> str:
    """Convert a car page URL into its ``"<maker>/<maker>-<model>"`` key.

    Args:
        permalink: URL of the form
            ``https://www.onlinecarparts.co.uk/car-brands/spare-parts-<maker>/<model>.html``.

    Returns:
        ``f"{maker}/{maker}-{model}"`` built from the two captured URL segments.

    Raises:
        ValueError: If ``permalink`` does not match the expected URL layout.
    """
    # Dots are escaped so they only match a literal "."; the maker group is
    # lazy so it stops at the first "/" after "spare-parts-".
    matched = re.match(
        r"https://www\.onlinecarparts\.co\.uk/car-brands/spare-parts-(.+?)/(.+)\.html",
        permalink,
    )
    if matched is None:
        # Without this guard a bad URL surfaced as an opaque AttributeError
        # on None.
        raise ValueError(f"Unrecognised car permalink: {permalink!r}")
    maker, model = matched.groups()
    return f"{maker}/{maker}-{model}"


def get_cars(maker_url: str):
    """List the cars shown on a maker page together with their image URLs.

    Args:
        maker_url: URL of a maker page, as returned by ``get_maker_urls``.

    Returns:
        A list of ``(link, image_src)`` tuples, one per element matched by
        ``.top-auto .top-auto-item``; ``image_src`` is the ``src`` attribute
        of the element's first ``<img>``.
    """
    page = utils.soup(maker_url)
    found = []
    for item in page.select(".top-auto .top-auto-item"):
        link = utils.get_link(item)
        # cast() only informs the type checker; it does not convert the value.
        image_src = cast(str, item.select_one("img")["src"])
        found.append((link, image_src))
    return found


def get_car_permalinks(car_url: str):
    """Return the parsed permalink keys for every vehicle listed on a car page.

    Args:
        car_url: URL of a car page, as returned in the tuples of ``get_cars``.

    Returns:
        A list of ``parse_car_permalink`` results, one per element matched by
        ``.vehicle-list .vehicle-list__link``.
    """
    page = utils.soup(car_url)
    anchors = page.select(".vehicle-list .vehicle-list__link")
    # Resolve every link first, then parse them, in two separate passes.
    vehicle_links = list(map(utils.get_link, anchors))
    return list(map(parse_car_permalink, vehicle_links))

In [None]:
maker_urls = get_maker_urls()

# One list of (car_url, image_url) tuples per maker page, merged afterwards
# into a single list by utils.flatten.
cars_by_maker = []
for maker_url in track(maker_urls, "Getting cars"):
    cars_by_maker.append(get_cars(maker_url))
cars = utils.flatten(cars_by_maker)

In [None]:
# For each car, build one Record per permalink found on its page; every
# Record of a car carries that car's image URL.
records_by_car = []
for car_url, car_image in track(cars, "Getting permalinks"):
    records_by_car.append(
        [
            Record(permalink=permalink, image=car_image)
            for permalink in get_car_permalinks(car_url)
        ]
    )
records = utils.flatten(records_by_car)

In [None]:
# Persist the scraped records as indented JSON; this is the file the loading
# cell near the top of the notebook reads back.
with record_file.open("w") as f:
    json.dump(records, f, indent=4)