In [None]:
import json
import re
from typing import TypedDict, cast

import utils
from rich.progress import track


def get_maker_urls():
    """Return the link of every ``.top-auto-item`` tile on the car-brands page.

    The page is loaded through ``utils.soup`` (presumably a fetch-and-parse
    helper returning a BeautifulSoup document -- confirm in ``utils``) and each
    matched element is resolved to a URL with ``utils.get_link``.
    """
    brand_page = utils.soup("https://www.onlinecarparts.co.uk/car-brands.html")
    maker_tiles = brand_page.select(".top-auto .top-auto-item")
    return list(map(utils.get_link, maker_tiles))


def parse_car_permalink(permalink: str) -> str:
    """Reduce a full car-page URL to its ``"maker/model"`` slug.

    Args:
        permalink: URL of the form
            ``https://www.onlinecarparts.co.uk/car-brands/spare-parts-<maker>/<model>.html``.
            ``<maker>`` is everything up to the first ``/`` after the
            ``spare-parts-`` prefix; ``<model>`` is the rest up to the last
            ``.html`` and may itself contain slashes.

    Returns:
        ``"<maker>/<model>"``.

    Raises:
        ValueError: if ``permalink`` does not have the expected shape.
    """
    # Dots are escaped so they only match literal "." characters; unescaped,
    # they would accept any character in the host name and the extension.
    r = re.match(
        r"https://www\.onlinecarparts\.co\.uk/car-brands/spare-parts-(.+?)/(.+)\.html",
        permalink,
    )
    if r is None:
        # Fail with the offending URL instead of an AttributeError on None.
        raise ValueError(f"Unexpected car permalink format: {permalink!r}")
    maker, model = r.groups()
    return f"{maker}/{model}"


def get_cars(maker_url: str):
    """Collect ``(link, image_src)`` pairs for every car tile on a maker page.

    Each ``.top-auto-item`` element contributes its link (via
    ``utils.get_link``) and the ``src`` attribute of its first ``<img>``.
    """
    maker_page = utils.soup(maker_url)
    pairs = []
    for tile in maker_page.select(".top-auto .top-auto-item"):
        link = utils.get_link(tile)
        # cast() only informs the type checker; the attribute value is returned as-is.
        image_src = cast(str, tile.select_one("img")["src"])
        pairs.append((link, image_src))
    return pairs


def get_car_permalinks(car_url: str):
    """Return the ``"maker/model"`` slug of every vehicle linked from a car page.

    All ``.vehicle-list__link`` elements are first resolved to URLs with
    ``utils.get_link``; the URLs are then normalised by ``parse_car_permalink``.
    """
    car_page = utils.soup(car_url)
    vehicle_links = car_page.select(".vehicle-list .vehicle-list__link")
    hrefs = list(map(utils.get_link, vehicle_links))
    return list(map(parse_car_permalink, hrefs))


# Crawl the brand index, then every maker page. get_cars() yields one list of
# (car_url, image_src) pairs per maker; utils.flatten joins them into `cars`.
maker_urls = get_maker_urls()
cars = utils.flatten(list(map(get_cars, track(maker_urls, "Getting cars"))))


class Record(TypedDict):
    """One scraped vehicle entry, as serialised to ``records.json``."""
    # "maker/model" slug produced by parse_car_permalink()
    permalink: str
    # <img src> URL taken from the car tile on the maker page (see get_cars())
    image: str


# Visit every car page and emit one Record per vehicle permalink found there.
# All vehicles listed on the same car page share that car's tile image.
records = utils.flatten(
    [
        [
            Record(permalink=vehicle_slug, image=image_src)
            for vehicle_slug in get_car_permalinks(car_url)
        ]
        for car_url, image_src in track(cars, "Getting permalinks")
    ]
)

# Persist the scraped records so later cells can reload them without re-crawling.
data = utils.mkpath("data/ocp_car_images")
record_file = data / "records.json"

with record_file.open("w") as f:
    f.write(json.dumps(records, indent=4))

----

In [None]:
import json
from typing import TypedDict, cast
import utils
import re


class Record(TypedDict):
    """Shape of one entry in ``records.json`` (redeclared so this cell runs on its own)."""
    # "maker/model" slug of the vehicle page
    permalink: str
    # URL of the car image associated with that vehicle
    image: str


# Reload the records written by the scraping cell instead of crawling again.
data = utils.mkpath("data/ocp_car_images")
record_file = data / "records.json"


with record_file.open("r") as f:
    records: list[Record] = json.loads(f.read())


def get_image_id(url: str) -> str:
    """Extract the numeric image id from a 170x100 group-image URL.

    Args:
        url: URL of the form
            ``https://scdn.autoteiledirekt.de/groups/170x100/<digits>.png``.

    Returns:
        The ``<digits>`` part, as a string.

    Raises:
        ValueError: if ``url`` does not have the expected shape.
    """
    # Dots are escaped so only literal "." matches in the host and extension.
    found = re.match(r"https://scdn\.autoteiledirekt\.de/groups/170x100/(\d+)\.png", url)
    if found is None:
        # Report the offending URL instead of an AttributeError on None.
        raise ValueError(f"Unexpected car image URL: {url!r}")
    return found.group(1)

In [None]:

import utils
from concurrent_download import download_concurrent

# Directory receiving the downloaded images (a sub-path of the `data` directory).
image_path = utils.mkpath(data / "images")


def get_img_path(url: str):
    """Map an image URL to its local target: ``image_path / "<image id>.png"``."""
    filename = f"{get_image_id(url)}.png"
    return image_path / filename


image_set = {r["image"] for r in records}
await download_concurrent(image_set, 20, get_img_path)

In [None]:
from db import Session, Image, Car
from sqlalchemy import column, insert, select, table
from rich.progress import track

# Relative path stored in the DB for each car image; "{}" receives the image id.
path_template = "images/cars/{}.png"
with Session() as session:
    for record in track(records):
        path = path_template.format(get_image_id(record["image"]))
        permalink = record["permalink"]
        # NOTE(review): contains() is a substring (LIKE) match, so one scraped
        # slug may attach the image to several Car rows -- confirm this is intended.
        cars = session.scalars(select(Car).where(Car.permalink.contains(permalink))).all()
        # One polymorphic image row per matched car.
        images = [dict(path=path, imageable_id=car.id, imageable_type=r"\App\Models\Car") for car in cars]
        if not images:
            # No Car matched this permalink. insert().values([]) is not a no-op:
            # an empty sequence is treated as an empty single-row value set, so
            # executing it would attempt a default-values INSERT. Skip instead.
            continue
        # INSERT IGNORE lets rows that violate a unique key be skipped on re-runs.
        session.execute(insert(Image).values(images).prefix_with("IGNORE"))
    # Single commit: all image rows are written in one transaction.
    session.commit()