tools/auto_fill_inference_cache.py

# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Script to cache models for inference."""
import argparse
import json
import logging
import re
import subprocess
import tempfile
import time
from typing import Optional

import huggingface_hub
import requests
from huggingface_hub import get_token, login, logout

from optimum.exporters import TasksManager
from optimum.neuron import version as optimum_neuron_version
from optimum.neuron.utils.version_utils import get_neuronxcc_version


# Example usage:
# huggingface-cli login --token hf_xxx # access to cache repo
# python tools/auto_fill_inference_cache.py --hf_model_id "HuggingFaceH4/zephyr-7b-beta" --batch_size 1 --sequence_length 2048 --num_cores 2 --auto_cast_type fp16
# Alternative provide json config file as local file or remote file (https://) with the following formwat
# {
#    "meta-llama/Llama-2-7b-chat-hf": [
#        {  "batch_size": 1, "sequence_length": 2048, "num_cores": 2, "auto_cast_type": "fp16" },
#        {  "batch_size": 2, "sequence_length": 2048, "num_cores": 2, "auto_cast_type": "bf16" }
#    ]
# }
# Local file Example usage:
# python tools/auto_fill_inference_cache.py --config_file test.json
# Remote file Example usage:
# python tools/auto_fill_inference_cache.py --config_file https://huggingface.co/aws-neuron/optimum-neuron-cache/raw/main/inference-cache-config/gpt2.json

# Setup logging
logging.basicConfig(level=logging.INFO, force=True)
logger = logging.getLogger()


def get_aws_neuronx_tools_version():
    output = subprocess.check_output(["apt", "show", "aws-neuronx-tools"], text=True)
    version_match = re.search(r"Version: ([\d\.]+)", output)

    if version_match:
        # extract the version number and remove the last two characters (not tracked in optimum)
        return version_match.group(1)[:-2]
    else:
        raise ValueError("Version information not found in the output")


def build_decoder_command(hf_model_id, batch_size, sequence_length, num_cores, auto_cast_type, output_dir):
    if None in [batch_size, sequence_length, num_cores, auto_cast_type]:
        raise ValueError(
            "You must provide --batch_size, --sequence_length, --num_cores and --auto_cast_type for compiling decoder models."
        )
    compile_command = [
        "optimum-cli",
        "export",
        "neuron",
        "-m",
        hf_model_id,
        "--batch_size",
        str(batch_size),
        "--sequence_length",
        str(sequence_length),
        "--num_cores",
        str(num_cores),
        "--auto_cast_type",
        auto_cast_type,
        "--task",
        "text-generation",
        output_dir,
    ]
    return compile_command


def build_encoder_command(hf_model_id, task, batch_size, sequence_length, auto_cast, auto_cast_type, output_dir):
    if None in [task, batch_size, sequence_length, auto_cast, auto_cast_type]:
        raise ValueError(
            "You must provide --task, --batch_size, --sequence_length, --auto_cast and --auto_cast_type for compiling encoder models."
        )
    compile_command = [
        "optimum-cli",
        "export",
        "neuron",
        "-m",
        hf_model_id,
        "--task",
        task,
        "--batch_size",
        str(batch_size),
        "--sequence_length",
        str(sequence_length),
        "--auto_cast",
        auto_cast,
        "--auto_cast_type",
        auto_cast_type,
        output_dir,
    ]
    return compile_command


def build_stable_diffusion_command(
    hf_model_id, task, batch_size, height, width, num_images_per_prompt, auto_cast, auto_cast_type, output_dir
):
    if None in [task, batch_size, height, width, auto_cast, auto_cast_type]:
        raise ValueError(
            "You must provide --task, --batch_size, --height, --width, --auto_cast and --auto_cast_type for compiling stable diffusion models."
        )
    compile_command = [
        "optimum-cli",
        "export",
        "neuron",
        "-m",
        hf_model_id,
        "--task",
        task,
        "--batch_size",
        str(batch_size),
        "--height",
        str(height),
        "--width",
        str(width),
        "--num_images_per_prompt",
        str(num_images_per_prompt),
        "--auto_cast",
        auto_cast,
        "--auto_cast_type",
        auto_cast_type,
        output_dir,
    ]
    return compile_command


def compile_and_cache_model(
    hf_model_id: str,
    batch_size: int,
    sequence_length: Optional[int] = None,
    height: Optional[int] = None,
    width: Optional[int] = None,
    num_images_per_prompt: Optional[int] = None,
    num_cores: Optional[int] = None,
    task: Optional[str] = None,
    auto_cast: Optional[str] = None,
    auto_cast_type: Optional[str] = None,
):
    start = time.time()
    with tempfile.TemporaryDirectory() as temp_dir:
        if task is None:
            task = infer_task_from_model_path(hf_model_id)
        # Compile model with Optimum for specific configurations
        if task == "text-generation":
            compile_command = build_decoder_command(
                hf_model_id, batch_size, sequence_length, num_cores, auto_cast_type, temp_dir
            )
        elif "stable-diffusion" in task:
            compile_command = build_stable_diffusion_command(
                hf_model_id,
                task,
                batch_size,
                height,
                width,
                num_images_per_prompt,
                auto_cast,
                auto_cast_type,
                temp_dir,
            )
        else:
            compile_command = build_encoder_command(
                hf_model_id, task, batch_size, sequence_length, auto_cast, auto_cast_type, temp_dir
            )
        logger.info(f"Running compile command: {' '.join(compile_command)}")
        try:
            subprocess.run(compile_command, check=True)
        except subprocess.CalledProcessError as e:
            logger.error(f"Failed to compile model: {e}")
            return

        # Synchronize compiled model to Hugging Face Hub
        cache_sync_command = ["optimum-cli", "neuron", "cache", "synchronize"]
        logger.info(f"Running cache synchronize command: {' '.join(cache_sync_command)}")
        subprocess.run(cache_sync_command, check=True)

    # Log time taken
    logger.info(f"Compiled and cached model {hf_model_id} w{time.time() - start:.2f} seconds")


def infer_task_from_model_path(model_id: str):
    try:
        # Decoder: task=="text-generation"
        from transformers import AutoConfig

        config = AutoConfig.from_pretrained(model_id)
        model_type = config.model_type.replace("_", "-")
        model_tasks = TasksManager.get_supported_tasks_for_model_type(
            model_type, exporter="neuron", library_name="transformers"
        )
        if "text-generation" in model_tasks:
            task = "text-generation"
            return task
    except Exception:
        pass

    # TODO: Remove when https://github.com/huggingface/optimum/pull/1793/ is merged in Optimum
    try:
        task = TasksManager.infer_task_from_model(model_id)
    except KeyError:
        model_info = huggingface_hub.model_info(model_id)
        library_name = TasksManager.infer_library_from_model(model_id)
        if library_name == "diffusers":
            class_name = model_info.config["diffusers"].get("_class_name", None)
            task = "stable-diffusion-xl" if "StableDiffusionXL" in class_name else "stable-diffusion"
    return task


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Compile and cache a model to the Hugging Face Hub.")
    parser.add_argument("--hf_model_id", type=str, help="Hugging Face model ID to compile.")
    parser.add_argument("--task", type=str, help="Task for compilation (mandatory for encoders and stable diffusion).")
    parser.add_argument("--batch_size", type=int, help="Batch size for compilation.")
    parser.add_argument("--sequence_length", type=int, help="Sequence length for compilation.")
    parser.add_argument("--height", type=int, help="Image height for compilation.")
    parser.add_argument("--width", type=int, help="Image width for compilation.")
    parser.add_argument(
        "--num_images_per_prompt", type=int, default=1, help="Number of images to generate per prompt."
    )
    parser.add_argument("--num_cores", type=int, help="Number of cores for compilation.")
    parser.add_argument(
        "--auto_cast", type=str, choices=["none", "matmul", "all"], help="Operations to cast to lower precision."
    )
    parser.add_argument("--auto_cast_type", type=str, choices=["bf16", "fp16"], help="Auto cast type for compilation.")
    parser.add_argument("--hf_token", type=str, help="Hugging Face token for authentication if not logged in.")
    parser.add_argument("--config_file", type=str, help="Path to a json config file with model configurations.")
    args = parser.parse_args()

    # Ensure either HF token is provided or user is already logged in
    original_token = get_token()
    if args.hf_token:
        logger.info(f"Logging in to Hugging Face Hub with {args.hf_token[:10]}...")
        login(args.hf_token)
    else:
        logger.info("Trying to use existing Hugging Face Hub login or environment variable HF_TOKEN")

    # check and get neuronx-cc version
    neuronx_cc_version = get_neuronxcc_version()
    sdk_version = get_aws_neuronx_tools_version()
    logger.info(f"Compiler version: {neuronx_cc_version}")
    logger.info(f"Neuron SDK version: {sdk_version}")
    logger.info(f"Optimum Neuron version: {optimum_neuron_version.__version__}")
    logger.info(f"Compatible Optimum Neuron SDK version: {optimum_neuron_version.__sdk_version__} == {sdk_version}")

    # If a config file is provided, compile and cache all models in the file
    if args.config_file:
        logger.info(f"Compiling and caching models from config file: {args.config_file}")
        # check if config file starts with https://
        if args.config_file.startswith("https://"):
            response = requests.get(args.config_file)
            response.raise_for_status()
            config = response.json()
        else:
            with open(args.config_file, "r") as f:
                config = json.load(f)
        for model_id, configs in config.items():
            for model_config in configs:
                compile_and_cache_model(
                    hf_model_id=model_id,
                    batch_size=model_config["batch_size"],
                    sequence_length=model_config.get("sequence_length", None),
                    height=model_config.get("height", None),
                    width=model_config.get("width", None),
                    num_images_per_prompt=model_config.get("num_images_per_prompt", 1),
                    num_cores=model_config.get("num_cores", None),
                    task=model_config.get("task", None),
                    auto_cast=model_config.get("auto_cast", None),
                    auto_cast_type=model_config.get("auto_cast_type", None),
                )
    elif args.hf_model_id is None:
        raise ValueError("You must provide --hf_model_id to compile a model without a config file.")
    else:
        compile_and_cache_model(
            hf_model_id=args.hf_model_id,
            batch_size=args.batch_size,
            sequence_length=args.sequence_length,
            height=args.height,
            width=args.width,
            num_images_per_prompt=args.width,
            num_cores=args.num_cores,
            task=args.task,
            auto_cast=args.auto_cast,
            auto_cast_type=args.auto_cast_type,
        )

    # Restore hub login
    if original_token:
        login(original_token)
    else:
        logout()