libratom/scripts/get_media_type_list.py

#!/usr/bin/env python
# pylint: disable=missing-docstring

import csv
import json
import logging
from pathlib import Path
from tempfile import TemporaryDirectory

import click
import click_log

from libratom.cli import PATH_METAVAR
from libratom.cli.cli import set_log_level_from_verbose
from libratom.cli.utils import PathPath, validate_out_path
from libratom.lib.download import download_files

# Context settings handed to click.command() below: accept -h as well as --help
CONTEXT_SETTINGS = {"help_option_names": ["-h", "--help"]}


# Let click_log configure the root logger (getLogger() without a name returns it)
_root_logger = logging.getLogger()
click_log.basic_config(_root_logger)


def _media_types_from_csv(csv_path: Path) -> list:
    """Return the ``registry/name`` media types listed in one registry CSV file.

    The registry part is the stem of ``csv_path`` and the name part is the first
    whitespace-separated token of the first column of each row. The header row,
    blank lines and rows whose first column is empty are skipped.
    """

    registry = csv_path.stem
    media_types = []

    # Decode explicitly as UTF-8 instead of depending on the platform's locale encoding
    with csv_path.open(newline="", encoding="utf-8") as csvfile:
        for row in csv.reader(csvfile):
            # csv.reader yields an empty list for a blank line
            if not row:
                continue

            name = row[0]

            # The split is to strip DEPRECATED/OBSOLETED/... mentions appended to the name
            tokens = name.split(maxsplit=1)

            # Skip the header row and rows without a usable name
            if name == "Name" or not tokens:
                continue

            media_types.append(f"{registry}/{tokens[0]}")

    return media_types


@click.command(context_settings=CONTEXT_SETTINGS)
@click.option(
    "-v",
    "--verbose",
    count=True,
    callback=set_log_level_from_verbose,
    help="Increase verbosity (can be repeated).",
    expose_value=False,
)
@click.option(
    "-o",
    "--out",
    metavar=PATH_METAVAR,
    default=Path("media_types.json"),
    callback=validate_out_path,
    type=PathPath(resolve_path=True),
    help=f"Write the output to {PATH_METAVAR}.",
)
def download_media_type_files(out) -> None:
    """Download media type files from https://www.iana.org/ and write a JSON file of all media types.
    """
    # NOTE: click shows the docstring above as the command's --help text, so
    # implementation details are kept in comments instead.
    #
    # `out` is the value of the -o/--out option after the PathPath conversion and
    # the validate_out_path callback; it is only used through `.open()` here
    # (presumably a pathlib.Path -- confirm against libratom.cli.utils).

    media_types = []

    media_type_registries = [
        "application",
        "audio",
        "font",
        "image",
        "message",
        "model",
        "multipart",
        "text",
        "video",
    ]

    # CSV files to download, one per registry
    urls = [
        f"https://www.iana.org/assignments/media-types/{registry}.csv"
        for registry in media_type_registries
    ]

    # The downloads only need to live long enough to be parsed
    with TemporaryDirectory() as tmpdir:
        directory = Path(tmpdir)
        download_files(urls, directory, dry_run=False)

        # The glob order does not matter: the combined list is sorted before writing
        for file in directory.glob("*.csv"):
            media_types.extend(_media_types_from_csv(file))

    # json.dump escapes non-ASCII characters by default, so the explicit encoding
    # only pins down what is already plain ASCII output
    with out.open(mode="w", encoding="utf-8") as f:
        json.dump(sorted(media_types), f, indent=4)


if __name__ == "__main__":
    # Run the click command when this file is executed as a script; it is called
    # without arguments here even though the function signature takes `out`.
    download_media_type_files()