In [1]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json
from pathlib import Path
from sagemaker.utils import name_from_base

# IAM execution role used when creating the model and endpoint.
role = sagemaker.get_execution_role()
# SageMaker session used for the S3 upload and endpoint calls below.
sess = sagemaker.session.Session()
# Bucket that will hold the uploaded deployment artifacts.
default_bucket = sess.default_bucket()
# Read the region through the public attribute; `_region_name` is a private
# implementation detail of Session and may change between SDK releases.
region = sess.boto_region_name

In [None]:
code_tarname = 'acc-baichuan7b-model'
endpoint_name = sagemaker.utils.name_from_base(code_tarname)

!rm -rf {code_tarname}.tar.gz
!rm -rf {code_tarname}/.ipynb_checkpoints
!tar czvf {code_tarname}.tar.gz {code_tarname}/

# copy the deployment configs tar to a path (different from hf model artifacts)
s3_code_artifact = sess.upload_data(f"{code_tarname}.tar.gz", default_bucket, sagemaker.utils.name_from_base("tmp0625/v1"))
print(s3_code_artifact)

In [4]:
# Select the large-model-inference (DJL + DeepSpeed) container version.
# Available versions are listed at:
# https://github.com/aws/deep-learning-containers/blob/master/available_images.md#large-model-inference-containers
#
# The image is resolved through the SDK instead of hard-coding the ECR
# registry account and domain, which differ in some regions (e.g. opt-in and
# China regions). For version 0.22.1 this is expected to resolve to the
# `djl-inference:0.22.1-deepspeed0.9.2-cu118` image -- verify with the
# installed SDK version.
inference_image_uri = image_uris.retrieve(
    framework="djl-deepspeed",
    region=region,
    version="0.22.1",
)

In [None]:
from sagemaker.model import Model

# Pair the inference container image with the uploaded code/config tarball.
model = Model(
    image_uri=inference_image_uri,
    model_data=s3_code_artifact,
    role=role,
)

# Deploy a single-instance real-time endpoint under the generated name.
# The startup health-check timeout is raised to 900 (seconds, per the
# SageMaker SDK docs) -- presumably to allow time for model loading.
model.deploy(
    initial_instance_count=1,
    instance_type='ml.g5.4xlarge',
    endpoint_name=endpoint_name,
    container_startup_health_check_timeout=900,
)

--

## Predict

- Use the `endpoint_name` variable defined above, or copy the endpoint name directly from the SageMaker console:
- ```endpoint_name = 'xxxxx'```

In [35]:
from sagemaker import serializers, deserializers

# Uncomment and fill in to target an existing endpoint by name.
# endpoint_name = 'xxxxx'

# Requests are serialized to JSON and responses are parsed from JSON.
json_serializer = serializers.JSONSerializer()
json_deserializer = deserializers.JSONDeserializer()

predictor = sagemaker.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
    serializer=json_serializer,
    deserializer=json_deserializer,
)

In [26]:
# Send one generation request; the parsed JSON response is displayed as the
# cell's output.
predictor.predict(
    {
        "inputs": "tuna sandwich nutritional content is ",
        "parameters": {
            "max_new_tokens": 200,
            "repetition_penalty": 1.1,
        },
    }
)

{'response': 'tuna sandwich nutritional content is 10.4 g, the amount of Potassium is 25 mg and the amount of Calories is 36 kcal.\nThe difference between the highest and lowest value gives a potassium range of 79 to 80 mg per 100g. The range for other nutrients are as follows; 0 g for Protein, 0 g for Fat, 0 g for Carbohydrate, 0 g for Sugar, 0 g for Sodium.'}

In [27]:
# Same request as above, but print only the generated text so that newlines
# in the response are rendered.
resp = predictor.predict(
    {
        "inputs": "tuna sandwich nutritional content is ",
        "parameters": {
            "max_new_tokens": 200,
            "repetition_penalty": 1.1,
        },
    }
)
print(resp['response'])

tuna sandwich nutritional content is 10.4 g, the amount of Potassium is 25 mg and the amount of Calories is 36 kcal.
The difference between the highest and lowest value gives a potassium range of 79 to 80 mg per 100g. The range for other nutrients are as follows; 0 g for Protein, 0 g for Fat, 0 g for Carbohydrate, 0 g for Sugar, 0 g for Sodium.


In [28]:
%%timeit -n3 -r1
predictor.predict(
    {"inputs": "tuna sandwich nutritional content is ", 
     "parameters": {"max_new_tokens": 200, "repetition_penalty":1.1}
     }
)

3.76 s ± 0 ns per loop (mean ± std. dev. of 1 run, 3 loops each)


In [31]:
%%timeit -n3 -r1
predictor.predict(
    {"inputs": "tuna sandwich nutritional content is ", 
     "parameters": {"max_new_tokens": 200, "repetition_penalty":1.1}
     }
)

3.75 s ± 0 ns per loop (mean ± std. dev. of 1 run, 3 loops each)
