In [1]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json
from pathlib import Path
from sagemaker.utils import name_from_base

# Shared AWS context used by every later cell: IAM role, SageMaker session,
# artifact bucket, and the region the session is bound to.
role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
default_bucket = sess.default_bucket()  # bucket to house artifacts
# Read the region through the public `boto_region_name` property instead of the
# private `_region_name` attribute, which is an internal detail of Session.
region = sess.boto_region_name

## Edit code artifacts

In [1]:
# Print the serving configuration that ships with the code artifacts.
# The model source is selected in this file with either `option.model_id`
# (a Hugging Face model id) or `option.s3url` (an S3 location).
!cat acc_wizard_model/serving.properties

engine=Python
option.tensor_parallel_degree=1
#option.model_id=TheBloke/Wizard-Vicuna-7B-Uncensored-HF
option.s3url=s3://XXXXX/XXXXX/

## Construct artifacts and deploy to SageMaker endpoint

In [None]:
# Construct code artifacts tar
code_tarname = 'acc_wizard_model'

!rm -rf {code_tarname}.tar.gz
!rm -rf {code_tarname}/.ipynb_checkpoints
!tar czvf {code_tarname}.tar.gz {code_tarname}/

In [4]:
# Specify an inference container version; available tags are listed at
# https://github.com/aws/deep-learning-containers/blob/master/available_images.md#large-model-inference-containers
inference_image_uri = f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.22.1-deepspeed0.8.3-cu118"

# Upload the code tarball to any valid S3 path (kept separate from the HF model
# artifacts); the session's default bucket is used here.
s3_code_artifact = sess.upload_data(
    path=f"{code_tarname}.tar.gz",
    bucket=default_bucket,
    key_prefix=name_from_base("tmp06/v2"),
)

# Name the SageMaker endpoint after the code directory, with underscores
# swapped for hyphens (presumably because endpoint names do not accept
# underscores -- confirm against the SageMaker naming rules).
endpoint_name = name_from_base(code_tarname.replace('_','-'))

In [5]:
from sagemaker.model import Model

# Describe the model: serving container image, code tarball in S3, execution role.
model = Model(
    image_uri=inference_image_uri,
    model_data=s3_code_artifact,
    role=role,
)

# Endpoint settings: one ml.g5.2xlarge instance and a 900 second
# container start-up health-check timeout.
deploy_options = {
    "initial_instance_count": 1,
    "instance_type": 'ml.g5.2xlarge',
    "endpoint_name": endpoint_name,
    "container_startup_health_check_timeout": 900,
}

# Create the endpoint.
model.deploy(**deploy_options)

-------------!

## Wrap the endpoint in a predictor and send requests

In [6]:
from sagemaker import serializers, deserializers

# endpoint_name = 'OR copy from SageMaker console'

# Request bodies are serialized to JSON and responses are parsed from JSON.
json_serializer = serializers.JSONSerializer()
json_deserializer = deserializers.JSONDeserializer()

# Client handle bound to the endpoint named by `endpoint_name`.
predictor = sagemaker.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
    serializer=json_serializer,
    deserializer=json_deserializer,
)

In [14]:
# Prompt completion request with max_new_tokens set to 100.
nutrition_request = {
    "inputs": "tuna sandwich nutritional content is ",
    "parameters": {"max_new_tokens": 100},
}
predictor.predict(nutrition_request)

[{'generated_text': 'tuna sandwich nutritional content is 220 calories, 12g fat, 3g saturated fat, 0mg cholesterol, 1g protein, 2g carbohydrate, 0g dietary fiber, 350mg sodium.'}]

In [17]:
# Prompt completion request with max_new_tokens set to 200.
pizza_request = {
    "inputs": "in order to make a good pizza, i need to ",
    "parameters": {"max_new_tokens": 200},
}
predictor.predict(pizza_request)

[{'generated_text': 'in order to make a good pizza, i need to \n\n1. Preheat the oven to 450°F (230°C).\n2. Roll out the dough on a floured surface to a thickness of 1/8 inch (3 mm).\n3. Transfer the dough to a pizza pan or baking sheet.\n4. Spread the sauce evenly over the dough.\n5. Add the toppings of your choice.\n6. Bake for 10-12 minutes or until the crust is golden brown and the cheese is melted and bubbly.\n7. Remove from the oven and let cool for a few minutes before slicing and serving.\n\nI hope this helps! Let me know if you have any other questions.'}]

In [18]:
%%timeit -n3 -r1
# Time the end-to-end request (max_new_tokens=200): 3 loops in a single run.
predictor.predict(
    {"inputs": "in order to make a good pizza, i need to ", "parameters": {"max_new_tokens": 200}}
)

6.73 s ± 0 ns per loop (mean ± std. dev. of 1 run, 3 loops each)
