In [10]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json
from pathlib import Path
from sagemaker.utils import name_from_base

# Resolve the AWS context (role, session, bucket, region) used by the cells below.
role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
default_bucket = sess.default_bucket()  # bucket to house artifacts
# Read the region through the public `boto_region_name` property instead of the
# private `_region_name` attribute, which is an implementation detail of Session.
region = sess.boto_region_name

In [11]:
code_tarname = 'acc-baichuan13b-model'
endpoint_name = sagemaker.utils.name_from_base(code_tarname)

!rm -rf {code_tarname}.tar.gz
!rm -rf {code_tarname}/.ipynb_checkpoints
!tar czvf {code_tarname}.tar.gz {code_tarname}/

# copy the deployment configs tar to a path (different from hf model artifacts)
s3_code_artifact = sess.upload_data(f"{code_tarname}.tar.gz", default_bucket, sagemaker.utils.name_from_base("tmp0717/v1"))
print(s3_code_artifact)

acc-baichuan13b-model/
acc-baichuan13b-model/serving.properties
acc-baichuan13b-model/requirements.txt
acc-baichuan13b-model/model.py
s3://sagemaker-us-east-1-633205212955/tmp0717/v1-2023-07-17-15-00-41-971/acc-baichuan13b-model.tar.gz


In [12]:
# Specify an inference container version; available versions are listed at:
# https://github.com/aws/deep-learning-containers/blob/master/available_images.md#large-model-inference-containers
#
# The URI is resolved through `image_uris.retrieve` so that the registry account
# and domain suffix are looked up for `region` instead of being hard-coded to the
# 763104351884 / amazonaws.com pair, which does not apply to every region.
# NOTE(review): version "0.22.1" of "djl-deepspeed" is expected to map to the
# `djl-inference:0.22.1-deepspeed0.9.2-cu118` image used previously — confirm
# against the installed SageMaker SDK.

inference_image_uri = image_uris.retrieve(
    framework="djl-deepspeed",
    region=region,
    version="0.22.1",
)

In [13]:
from sagemaker.model import Model

# Pair the inference container image with the serving-code tarball uploaded to S3.
model = Model(
    image_uri=inference_image_uri,
    model_data=s3_code_artifact,
    role=role,
)

# Create the endpoint on a single ml.g5.12xlarge instance. The container startup
# health-check timeout is raised to 900 (seconds, per the SageMaker SDK docs).
model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.12xlarge",
    endpoint_name=endpoint_name,
    container_startup_health_check_timeout=900,
)

---------------!

## Predict

- Use the `endpoint_name` variable defined above, or copy the endpoint name directly from the SageMaker console and assign it manually:
- ```endpoint_name = 'xxxxx'```

In [14]:
from sagemaker import serializers, deserializers

# Uncomment and fill in to target an endpoint that already exists:
# endpoint_name = 'xxxxx'

# Client bound to the endpoint; requests are sent as JSON and responses are
# parsed from JSON.
predictor = sagemaker.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
    serializer=serializers.JSONSerializer(),
    deserializer=deserializers.JSONDeserializer(),
)

In [15]:
# Send one generation request; the call's return value is the cell output.
predictor.predict(
    {
        "inputs": "tuna sandwich nutritional content is ",
        "parameters": {"max_new_tokens": 200, "repetition_penalty": 1.1},
    }
)

{'response': "tuna sandwich nutritional content is 1.23 grams of protein per serving, which isn't much at all for a high-protein meal like this one!\nThe best part about the Tuna Sandwich recipe? It only takes five minutes to make and it can be made in advance so you don't have to worry if your lunch gets cold or soggy while sitting on top of that hot desk chair (or worse yet - underneath) during work hours when there are no microwave ovens available nearby :) The ingredients used here include: canned white albacore fish packed with water; mayonnaise mixed together from scratch using just three simple pantry staples including olive oil & lemon juice as well as salt/pepper seasonings added into taste preference ; whole wheat bread slices cut diagonally across each other before placing them onto baking sheet lined up next to another slice already placed down first then topped off by two more pieces afterwards until they reach full capacity without any"}

In [17]:
%%timeit -n3 -r1
predictor.predict(
    {"inputs": "tuna sandwich nutritional content is ", 
     "parameters": {"max_new_tokens": 200, "repetition_penalty":1.1}
     }
)

15.4 s ± 0 ns per loop (mean ± std. dev. of 1 run, 3 loops each)
