In [None]:
%%sh
pip install -qU sagemaker

In [None]:
import json
import pprint

import boto3
import sagemaker
from IPython.display import display, Markdown, Image
from sagemaker.base_deserializers import JSONDeserializer
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

In [None]:
from sagemaker_streaming import print_event_stream

In [None]:
# Look up the execution role; it is passed as `role=` when the HuggingFaceModel is created.
role = sagemaker.get_execution_role()

In [None]:
# Display the LLM container image URI resolved for the "huggingface" backend, version 2.2.0.
# No trailing comma here: it would turn the expression into a 1-tuple and the cell
# would render `('<uri>',)` instead of the plain URI string.
get_huggingface_llm_image_uri("huggingface", version="2.2.0")

# Model Deployment

In [None]:
# Model card: https://huggingface.co/arcee-ai/Arcee-Nova
# Based on Qwen2 72B, 32K context size

# Hub model id; the configuration cells pass it to the container as HF_MODEL_ID.
model_id = "arcee-ai/Arcee-Nova"

In [None]:
# Dev/test deployment on g5/g6: 4 GPUs, weights quantized with bitsandbytes NF4.
# Run either this cell or the production cell; whichever runs last defines
# `hub` and `instance_type` for the deployment.

hub = dict(
    HF_MODEL_ID=model_id,
    SM_NUM_GPUS="4",
    HF_MODEL_QUANTIZE="bitsandbytes-nf4",
    MESSAGES_API_ENABLED="true",
)

# Alternative instance: "ml.g5.12xlarge"
instance_type = "ml.g6.12xlarge"

In [None]:
# Production deployment on p4: 8 GPUs, no quantization, explicit token limits
# (up to 16384 input tokens, 32768 tokens in total).
# Run either this cell or the dev/test cell; whichever runs last defines
# `hub` and `instance_type` for the deployment.

hub = dict(
    HF_MODEL_ID=model_id,
    SM_NUM_GPUS="8",
    MAX_INPUT_LENGTH="16384",
    MAX_TOTAL_TOKENS="32768",
    MESSAGES_API_ENABLED="true",
)

instance_type = "ml.p4d.24xlarge"

In [None]:
# Describe the model: Hugging Face LLM container (version 2.2.0), configured through
# the `hub` environment variables and run under the notebook's execution role.
huggingface_model = HuggingFaceModel(
    role=role,
    env=hub,
    image_uri=get_huggingface_llm_image_uri("huggingface", version="2.2.0"),
)

# Create a single-instance endpoint on the selected instance type. The generous
# timeouts leave room for downloading and loading a large model.
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    container_startup_health_check_timeout=600,
    model_data_download_timeout=3600,
)

# Model Inference

In [None]:
# Chat-style request payload (system + user message) for a non-streaming call,
# capped at 1024 generated tokens.
body = dict(
    model="tgi",
    messages=[
        dict(role="system", content="As a friendly technical assistant engineer, answer the question in detail."),
        dict(role="user", content="Why are transformers better models than LSTM?"),
    ],
    max_tokens=1024,
)

In [None]:
%%time

# Default settings
# Only added to jump back and forth between batch and streaming
from sagemaker.base_deserializers import JSONDeserializer

predictor.deserializer = JSONDeserializer()

response = predictor.predict(body)

In [None]:
# Pretty-print the full response object to inspect its structure.
pprint.pprint(response)

In [None]:
# Render the text of the first choice's message as Markdown.
display(Markdown(response["choices"][0]["message"]["content"]))

# Model Inference with Streaming Response

In [None]:
# Same chat request as the batch example, with "stream" enabled so the answer
# is returned incrementally.
body = dict(
    model="tgi",
    messages=[
        dict(role="system", content="As a friendly technical assistant engineer, answer the question in detail."),
        dict(role="user", content="Why are transformers better models than LSTM?"),
    ],
    stream=True,
    max_tokens=1024,
)

In [None]:
# Invoke the endpoint through the boto3 "sagemaker-runtime" client, sending the
# JSON-encoded request and asking for a streamed response.
smr = boto3.client("sagemaker-runtime")
response = smr.invoke_endpoint_with_response_stream(
    ContentType="application/json",
    Body=json.dumps(body),
    EndpointName=predictor.endpoint_name,
)

# Hand the response's event stream to the helper imported from sagemaker_streaming.
print_event_stream(response["Body"])

In [None]:
# Tear down what the deployment created: first the model, then the endpoint.
# NOTE(review): the model is deleted before the endpoint; this ordering looks
# deliberate — confirm against the SDK docs before reordering.
predictor.delete_model()
predictor.delete_endpoint()