# Model deployment with HuggingFace Accelerate engine integrated in LMI (Large Model Inference Container)

In [None]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json
from pathlib import Path
from sagemaker.utils import name_from_base

# Execution role for the endpoint.
role = sagemaker.get_execution_role()
# SageMaker session used for interacting with the different AWS APIs.
sess = sagemaker.session.Session()
# Bucket that houses the uploaded artifacts.
default_bucket = sess.default_bucket()
# Read the region through the public attribute instead of the private
# `_region_name`, which is an implementation detail of the SDK.
region = sess.boto_region_name

<mark>**NOTE**: Copy the S3 path where the training job saved the model artifacts, and set it as the value of the `option.s3url` entry below.</mark>


In this sample notebook, we use the <mark>DeepSpeed</mark> engine.

In [None]:
%%writefile serving.properties
# Serving configuration; the %%writefile magic saves this cell as ./serving.properties.
# Alternative engine, left disabled:
# engine=Python
engine=DeepSpeed
# Tensor-parallel degree of 1 (presumably one GPU per model copy - confirm against the LMI docs).
option.tensor_parallel_degree=1
# Alternative model source (a model id instead of an S3 location), left disabled:
#option.model_id=TheBloke/Wizard-Vicuna-7B-Uncensored-HF
# Placeholder: replace with the S3 path where the training job saved the model artifacts.
option.s3url=s3://COPY_FROM_TRAINING_SCRIPT/

In [None]:
# Construct code artifacts tar
code_tarname = 'llama2-qlora-merged-ds'

!mkdir -p {code_tarname}
!rm -rf {code_tarname}.tar.gz
!rm -rf {code_tarname}/.ipynb_checkpoints

!mv model.py {code_tarname}/
!mv requirements.txt {code_tarname}/
!mv serving.properties {code_tarname}/
!tar czvf {code_tarname}.tar.gz {code_tarname}/

In [None]:
# Upload the packaged code tarball to the default bucket under a prefix
# generated from "tmp/v0"; the returned value is kept as the artifact location.
artifact_prefix = name_from_base("tmp/v0")
s3_code_artifact = sess.upload_data(
    path=f"{code_tarname}.tar.gz",
    bucket=default_bucket,
    key_prefix=artifact_prefix,
)

In [None]:
# Specify an inference container version.
# - https://github.com/aws/deep-learning-containers/blob/master/available_images.md#large-model-inference-containers
# Resolve the image through the SDK instead of hard-coding the ECR registry
# account, so the URI is correct for whichever region the session runs in.
inference_image_uri = image_uris.retrieve(
    framework="djl-deepspeed",
    region=region,
    version="0.23.0",
)

# Name the SageMaker endpoint after the code artifact.
endpoint_name = sagemaker.utils.name_from_base(code_tarname)

In [None]:
from sagemaker.model import Model

# Pair the inference container image with the uploaded code artifact,
# using the execution role resolved earlier.
model = Model(
    role=role,
    image_uri=inference_image_uri,
    model_data=s3_code_artifact,
)

In [None]:
# Deploy the model to a named endpoint backed by a single ml.g4dn.xlarge
# instance, with a 900-second container startup health-check timeout.
model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.g4dn.xlarge",
    initial_instance_count=1,
    container_startup_health_check_timeout=900,
)

#### Init predictor and invoke specified endpoint

In [None]:
from sagemaker import serializers, deserializers

# Or copy endpoint name from SageMaker console for direct invocation
# endpoint_name = 'llama2-merge-model-2023-08-19-04-42-02-574'

# Requests and responses are both exchanged as JSON.
json_serializer = serializers.JSONSerializer()
json_deserializer = deserializers.JSONDeserializer()

predictor = sagemaker.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
    serializer=json_serializer,
    deserializer=json_deserializer,
)

In [None]:
# Send two prompts in one request, capped at 200 new tokens.
batch_request = {
    "inputs": [
        "tuna sandwich nutritional content is ",
        "I need to cook a good pizza, so ",
    ],
    "parameters": {"max_new_tokens": 200},
}
predictor.predict(batch_request)

In [None]:
%%timeit -n3 -r1
# Time a single-prompt request against the endpoint:
# -n3 executes the call 3 times per run, -r1 performs a single run.
predictor.predict(
    {"inputs": "tuna sandwich nutritional content is ", 
     "parameters": {"max_new_tokens": 200}}
)