In [None]:
%pip install sagemaker --upgrade  --quiet

## Import the relevant libraries and configure global variables

In [None]:
import boto3
import sagemaker
import json
import io
import numpy as np
from sagemaker import Model, image_uris, serializers, deserializers

# Global handles shared by every cell below.
role = sagemaker.get_execution_role()  # execution role for the endpoint
session = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
# Use the public accessor rather than the private `_region_name` attribute,
# which is an SDK implementation detail and may change between releases.
region = session.boto_region_name  # region name of the current SageMaker Studio environment

In [None]:
# Local staging directory for the serving configuration; later cells write
# serving.properties into it and build a tarball named after it.
local_code_dir = "LMI_qwen2_tp1_deploy"
# -p: do not fail if the directory already exists, so the cell can be re-run
!mkdir -p {local_code_dir}

In [None]:
%%writefile {local_code_dir}/serving.properties
engine=Python
# Model location. Alternative: load the model by its Hugging Face model id
# option.model_id=Qwen/Qwen2-0.5B-Instruct
# Currently active: load the model artifacts from an S3 URI.
# NOTE(review): this bucket and prefix are hard-coded and look specific to one
# AWS account/region; replace them with the location of your own model artifacts.
option.model_id=s3://sagemaker-us-east-1-633205212955/output-model/2408/
# NOTE(review): the options below are interpreted by the serving container;
# confirm their exact semantics against the DJL LMI configuration documentation.
option.rolling_batch=vllm
option.max_model_len=512
option.max_rolling_batch_size=32
option.enforce_eager=true
option.trust_remote_code=True

#### We package the serving.properties configuration file in the tar.gz format, so that it meets SageMaker hosting requirements

In [None]:
# Bundle the staging directory into <local_code_dir>.tar.gz
# (c = create, z = gzip, v = list files as they are added, f = output file name)
!tar czvf {local_code_dir}.tar.gz {local_code_dir}

#### Configure the Image URI for the inference container

We configure the DJL LMI container with vLLM as the backend engine (`option.rolling_batch=vllm` in `serving.properties`). Also note that we pin a specific version of the container (tag `0.30.0-lmi12.0.0-cu124`).

In [None]:
# DJL LMI inference image for the current region. The tag is kept in its own
# variable so the container version is easy to spot and bump; published tags:
# https://github.com/aws/deep-learning-containers/blob/master/available_images.md
djl_image_tag = "0.30.0-lmi12.0.0-cu124"
image_uri = f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:{djl_image_tag}"

#### Next we upload the local tarball (containing the serving.properties configuration file) to an S3 prefix 

In [None]:
# Stage the packaged configuration tarball in the session's default bucket.
bucket = session.default_bucket()  # bucket to house artifacts
s3_code_prefix = "large-model-lmi/code"
code_artifact = session.upload_data(
    path=f"{local_code_dir}.tar.gz",
    bucket=bucket,
    key_prefix=s3_code_prefix,
)
print(f"S3 Code or Model tar ball uploaded to --- > {code_artifact}")

## Create the SageMaker model object and Deploy the Model with the LMI container
 
We use the image URI for the DJL container and the S3 location to which the model serving artifacts tarball was uploaded, to create the SageMaker model object.

The container downloads the model into the `/tmp` space on the container because SageMaker maps the `/tmp` to the Amazon Elastic Block Store (Amazon EBS) volume that is mounted when we specify the endpoint creation parameter VolumeSizeInGB.

In [None]:
# Wrap the LMI container image and the uploaded serving artifacts in a SageMaker
# model. The notebook's session is passed explicitly so that the model, the
# predictor and the cleanup cell all go through the same session object.
model = Model(
    image_uri=image_uri,
    model_data=code_artifact,
    role=role,
    sagemaker_session=session,
)

instance_type = "ml.g5.xlarge"
# name_from_base() appends "-<timestamp>" on its own, so the base must not end
# with a hyphen (otherwise the endpoint name contains "--").
endpoint_name = sagemaker.utils.name_from_base("qwen2-lmi")

# Creates the endpoint configuration and the endpoint, both named `endpoint_name`.
model.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    endpoint_name=endpoint_name,
)

In [None]:
# Client handle for the deployed endpoint. Request payloads are serialized as
# JSON; no deserializer is passed, so the SDK's default response handling applies.
json_serializer = serializers.JSONSerializer()
predictor = sagemaker.Predictor(
    endpoint_name=endpoint_name,
    serializer=json_serializer,
    sagemaker_session=session,
)

In [None]:
# Build the request from named parts so the prompt and the generation settings
# can be tweaked independently; the cell displays the endpoint's response.
prompt = "tell me a story of the little red riding hood"
generation_params = {"max_new_tokens": 128, "do_sample": True}

predictor.predict({"inputs": prompt, "parameters": generation_params})

Note that the `max_new_tokens` parameter is set to 128 in the request above, so a long answer may be cut off before it is complete. The `finish_reason` detail attribute of the output can be used to chain multiple requests to the endpoint until the entire output is generated.

## Cleanup the environment

In [None]:
# Tear down everything the deploy cell created, in the same order as before:
# the endpoint, then its endpoint configuration (same name), then the model.
for delete_resource in (session.delete_endpoint, session.delete_endpoint_config):
    delete_resource(endpoint_name)
model.delete_model()