In [None]:
%%sh
pip install -qU pip sagemaker

In [None]:
import json
import pprint
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri
from IPython.display import display, Markdown, Image

In [None]:
from sagemaker_streaming import print_event_stream

In [None]:
# Look up the execution role through the SageMaker SDK; it is passed to
# HuggingFaceModel(role=...) in the deployment section below.
role = sagemaker.get_execution_role()

# Model Deployment

In [None]:
# Hugging Face Hub model to serve, and the instance type used by deploy().
model_id = "arcee-ai/Llama-Spark"
instance_type = "ml.g5.2xlarge"

# Settings passed as `env` to HuggingFaceModel in the next cell.
# All values are strings.
hub = dict(
    HF_MODEL_ID=model_id,
    SM_NUM_GPUS="1",
    MESSAGES_API_ENABLED="true",
)

In [None]:
# Resolve the serving image URI for the "huggingface" backend, pinned to version 2.2.0.
llm_image_uri = get_huggingface_llm_image_uri("huggingface", version="2.2.0")

# Describe the model: serving image, the `hub` settings as environment, and the execution role.
huggingface_model = HuggingFaceModel(image_uri=llm_image_uri, env=hub, role=role)

# Deploy on a single instance of `instance_type`. The health-check timeout is
# 300 (seconds, per the SageMaker SDK documentation). The returned predictor is
# used by every inference cell below.
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    container_startup_health_check_timeout=300,
)

# Model Inference

In [None]:
# Chat request for the non-streaming call: a system prompt, one user question,
# and a cap on the number of generated tokens.
body = dict(
    model="tgi",
    messages=[
        dict(role="system", content="As a friendly technical assistant engineer, answer the question in detail."),
        dict(role="user", content="Why are transformers better models than LSTM?"),
    ],
    max_tokens=1024,
)

In [None]:
%%time
# Non-streaming ("batch") request, timed with %%time.
# The JSON deserializer is set explicitly (the author notes these are the default
# settings) so that this cell can be re-run when jumping back and forth between
# the batch and streaming examples.
from sagemaker.base_deserializers import JSONDeserializer

predictor.deserializer = JSONDeserializer()

# `response` is inspected by the next two cells.
response = predictor.predict(body)

In [None]:
# Pretty-print the complete response returned by predictor.predict() above.
pprint.pprint(response)

In [None]:
# Extract the message content of the first choice and render it as Markdown.
first_choice = response["choices"][0]
reply_text = first_choice["message"]["content"]
display(Markdown(reply_text))

# Model Inference with Streaming Response

In [None]:
# Same conversation as the non-streaming example, with "stream" switched on.
streaming_messages = [
    {"role": "system", "content": "As a friendly technical assistant engineer, answer the question in detail."},
    {"role": "user", "content": "Why are transformers better models than LSTM?"},
]

# Key order is kept as model, messages, stream, max_tokens.
body = {"model": "tgi", "messages": streaming_messages, "stream": True, "max_tokens": 1024}

In [None]:
# Runtime client used for the streaming invocations; `smr` is reused by later cells.
smr = boto3.client("sagemaker-runtime")

# Serialize the request and invoke the deployed endpoint with a streamed response.
request_json = json.dumps(body)
response = smr.invoke_endpoint_with_response_stream(
    EndpointName=predictor.endpoint_name,
    Body=request_json,
    ContentType="application/json",
)

# Hand the response body to the print_event_stream helper from sagemaker_streaming.
print_event_stream(response["Body"])

# Model Inference with Streaming Response and Additional Context

In [None]:
# Display the local image file chatgpt.png as this cell's output.
Image("chatgpt.png")

In [None]:
# Replace only the conversation; the other keys already in `body` are reused.
# This question is asked without any supporting context.
body["messages"] = [
    {"role": "system", "content": "As a friendly technical assistant engineer, answer the question in detail."},
    {"role": "user", "content": "Is cybertron the ancestor of deep learning?"},
]

request_json = json.dumps(body)
response = smr.invoke_endpoint_with_response_stream(
    EndpointName=predictor.endpoint_name,
    Body=request_json,
    ContentType="application/json",
)
print_event_stream(response["Body"])

In [None]:
# Load the reference text that the next cell appends to the user question.
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding (the file is assumed to be UTF-8 encoded).
with open("machine-learning-wikipedia.txt", "r", encoding="utf-8") as file:
    context = file.read()

In [None]:
# Ask the same question again, this time instructing the model to use the
# loaded text, which is appended to the user message.
system_message = {
    "role": "system",
    "content": "As a friendly technical assistant engineer, use the provided context to answer the question in detail.",
}
user_message = {
    "role": "user",
    "content": f"Is cybertron the ancestor of deep learning? Context: {context}",
}
body["messages"] = [system_message, user_message]

request_json = json.dumps(body)
response = smr.invoke_endpoint_with_response_stream(
    EndpointName=predictor.endpoint_name,
    Body=request_json,
    ContentType="application/json",
)
print_event_stream(response["Body"])

In [None]:
# Clean up: delete the model, then the endpoint behind `predictor` (created by
# deploy() above).
# NOTE(review): keep this order. delete_model() presumably looks up the model
# through the endpoint configuration that delete_endpoint() removes; confirm
# against the SageMaker SDK documentation.
predictor.delete_model()
predictor.delete_endpoint()