# Deploying Hugging Face models on Amazon SageMaker Graviton instances

## 1. Import dependencies

In [None]:
%%sh
# Remove autogluon first (-q: quiet, -y: no confirmation prompt).
# NOTE(review): presumably this avoids a dependency conflict with the upgrade below - confirm.
pip uninstall -qy autogluon
# Upgrade boto3 and the SageMaker Python SDK. The SDK is kept below 3.x because this
# notebook imports the v2 interfaces (sagemaker.Session, sagemaker.model.Model), which an
# unpinned upgrade to a new major version is not guaranteed to provide.
pip install -qU boto3 "sagemaker>=2,<3"

In [None]:
import datetime
import json
import os
import pprint

import boto3
import sagemaker
from IPython.display import Markdown, display
from sagemaker import get_execution_role
from sagemaker.model import Model
from sagemaker_streaming import print_event_stream

In [None]:
# SageMaker SDK session created with default settings
sagemaker_session = sagemaker.Session()

# Execution role of the current environment; it is handed to the model at deployment time
role = get_execution_role()

# Low-level runtime client used for the invoke_endpoint* calls
runtime_sm_client = boto3.client("runtime.sagemaker")

## 2. Define the deployment parameters

In [None]:
# The URI of your ECR container (must be in the same region as SageMaker)
# NOTE: YOUR_IMAGE_URI is a placeholder that is not defined anywhere in this notebook.
# Replace it with a quoted URI string before running this cell, otherwise the cell
# stops with a NameError.
image_uri = YOUR_IMAGE_URI

# The prefix for the endpoint name (a timestamp will be added)
endpoint_name_prefix = "my-graviton-endpoint"

# The instance type for the endpoint (should be a Graviton3/4 instance)
real_time_inference_instance_type = "ml.c7g.8xlarge"

In [None]:
# A read-only Hugging Face token (only needed for private or gated models).
# It is read from the HF_TOKEN environment variable so that the secret is never
# saved in the notebook. When the variable is not set, the token is an empty string
# and this cell still runs, which is all that public models need.
hf_token = os.environ.get("HF_TOKEN", "")

# llama-server flags, see 'llama-server -h'
llama_cpp_args = "--ctx-size 16384"

## 3. Pick a deployment option

In [None]:
# option 1: deploy a safetensors model from the HF hub, converting it to GGUF and quantizing it on the fly

model_environment = {
    # The Hugging Face repository id
    "HF_MODEL_ID": "arcee-ai/AFM-4.5B",
    # The quantization recipe to apply
    # If left out, the model will be deployed as F16
    "QUANTIZATION": "Q8_0",
    # llama-server flags
    "LLAMA_CPP_ARGS": llama_cpp_args,
}

# A read-only Hugging Face token (required for private or gated models).
# It is only forwarded when one is actually set, so that the container environment
# never receives a blank (or non-string) HF_TOKEN value for public models.
if hf_token:
    model_environment["HF_TOKEN"] = hf_token

In [None]:
# option 2: serve a ready-made GGUF file straight from the HF hub

model_environment = {
    # Repository on the Hugging Face hub that hosts the GGUF file
    "HF_MODEL_ID": "arcee-ai/arcee-lite-GGUF",
    # GGUF file to pick inside that repository
    "MODEL_FILENAME": "arcee-lite-Q8_0.gguf",
    # Flags passed on to llama-server
    "LLAMA_CPP_ARGS": llama_cpp_args,
}

In [None]:
# option 3: start from safetensors weights stored in S3; they are converted to GGUF and quantized on the fly

model_environment = {
    # S3 location of the safetensors model (the bucket must be in the same region as SageMaker)
    # Example content: 'hf download arcee-ai/AFM-4.5B --local-dir afm-4.5-b', then synced to S3
    "HF_MODEL_URI": "s3://YOUR_S3_BUCKET/afm-4.5-b/",
    # Quantization recipe; when omitted the model is deployed as F16
    "QUANTIZATION": "Q4_0",
    # Flags passed on to llama-server
    "LLAMA_CPP_ARGS": llama_cpp_args,
}

In [None]:
# option 4: serve a ready-made GGUF file stored in S3 (the bucket must be in the same region as SageMaker)

model_environment = {
    # S3 location that holds the GGUF file (same region as SageMaker)
    "HF_MODEL_URI": "s3://YOUR_S3_BUCKET/",
    # GGUF file to pick at that location
    # Example content: 'hf download arcee-ai/AFM-4.5B-GGUF AFM-4.5B-Q4_0.gguf --local-dir .', then copied to S3
    "MODEL_FILENAME": "AFM-4.5B-Q4_0.gguf",
    # Flags passed on to llama-server
    "LLAMA_CPP_ARGS": llama_cpp_args,
}

## 4. Deploy the endpoint

In [None]:
# Bundle the container image, the execution role and the chosen environment into a model
model = Model(image_uri=image_uri, role=role, env=model_environment)

# Suffix the prefix with the current local time so each deployment gets its own endpoint name
timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
endpoint_name = f"{endpoint_name_prefix}-{timestamp}"
print(f"Deploying endpoint {endpoint_name}")

# Create the endpoint on a single instance of the selected type
response = model.deploy(
    endpoint_name=endpoint_name,
    instance_type=real_time_inference_instance_type,
    initial_instance_count=1,
    # Allowances for fetching the model and for the container to pass its health check
    model_data_download_timeout=900,
    container_startup_health_check_timeout=300,
)

Once the endpoint is in service, you will be able to perform real-time inference.

## 5. Run synchronous inference

In [None]:
# Chat request sent to the endpoint: a system prompt, a user prompt and a cap on generated tokens.
model_sample_input = {
    "messages": [
        {"role": "system", "content": "You are a friendly and helpful AI assistant."},
        {
            "role": "user",
            # Adjacent string literals are joined by Python. A backslash continuation inside
            # a single literal would embed the next line's indentation in the prompt.
            "content": (
                "Suggest 5 names for a new neighborhood pet food store. "
                "Names should be short, fun, easy to remember, and respectful of pets. "
                "Explain why customers would like them."
            ),
        },
    ],
    "max_tokens": 1024,
}

In [None]:
# Send the JSON-encoded chat request to the endpoint and wait for the complete reply
response = runtime_sm_client.invoke_endpoint(
    Body=json.dumps(model_sample_input),
    ContentType="application/json",
    EndpointName=endpoint_name,
)

# Read the reply body, decode it as UTF-8 and parse the JSON document it contains
output = json.loads(
    response["Body"].read().decode("utf8")
)

In [None]:
# Pretty-print the whole parsed response to inspect its structure
pprint.pprint(output)

We can also print the generated output with Markdown formatting.

In [None]:
# Render the text of the first choice as Markdown.
# The indexing expects a response shaped like {"choices": [{"message": {"content": ...}}]}.
display(Markdown(output["choices"][0]["message"]["content"]))

## 6. Run streaming inference

In [None]:
# Chat request for the streaming call: same payload layout as before, with the "stream" flag enabled
model_sample_input = {
    "messages": [
        {
            "role": "system",
            "content": "As a friendly technical assistant engineer, answer the question in detail.",
        },
        {
            "role": "user",
            "content": "Why are transformers better models than LSTM? Explain step by step.",
        },
    ],
    "max_tokens": 512,
    "stream": True,
}

# Invoke the endpoint through the response-streaming API
response = runtime_sm_client.invoke_endpoint_with_response_stream(
    EndpointName=endpoint_name,
    ContentType="application/json",
    Body=json.dumps(model_sample_input),
)

# Print the streamed events with the helper imported from sagemaker_streaming
print_event_stream(response["Body"])

Now that you have successfully performed a real-time inference, you do not need the endpoint any more. You can terminate the endpoint to avoid being charged.

## 7. Clean-up

Please don't forget to run the cells below to delete all resources and avoid unnecessary charges.

In [None]:
# Delete the endpoint, then the endpoint configuration.
# NOTE(review): this assumes the configuration was created under the same name as the
# endpoint - confirm against Model.deploy.
model.sagemaker_session.delete_endpoint(endpoint_name)
model.sagemaker_session.delete_endpoint_config(endpoint_name)

In [None]:
# Finally, delete the model that was created for the deployment
model.delete_model()