In [1]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json
from pathlib import Path
from sagemaker.utils import name_from_base

# Resolve the IAM role, SageMaker session, artifact bucket, and AWS region
# that the later cells use for upload and deployment.
role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
default_bucket = sess.default_bucket()  # bucket to house artifacts
# Read the region through the session's public property instead of the
# private `_region_name` attribute, which is not part of the supported API.
region = sess.boto_region_name

In [2]:
# Name of the local directory that gets archived and uploaded; it is also the
# base for the endpoint name. It must match the on-disk directory name exactly.
code_tarname = "acc_blmz_model_defaut"

In [3]:
!rm -rf {code_tarname}.tar.gz
!rm -rf {code_tarname}/.ipynb_checkpoints
!tar czvf {code_tarname}.tar.gz {code_tarname}/

# copy the deployment configs tar to a path (different from hf model artifacts)
s3_code_artifact = sess.upload_data(f"{code_tarname}.tar.gz", default_bucket, sagemaker.utils.name_from_base("tmp0615/v2"))
print(s3_code_artifact)

acc_blmz_model_defaut/
acc_blmz_model_defaut/serving.properties
s3://sagemaker-us-east-1-633205212955/tmp0615/v2-2023-06-16-00-05-06-609/acc_blmz_model_defaut.tar.gz


In [4]:
from sagemaker.model import Model
from sagemaker import serializers, deserializers
from sagemaker import image_uris

# Pin an inference container version; the available tags are listed at
# https://github.com/aws/deep-learning-containers/blob/master/available_images.md#large-model-inference-containers
ecr_registry = f"763104351884.dkr.ecr.{region}.amazonaws.com"
inference_image_uri = f"{ecr_registry}/djl-inference:0.22.1-deepspeed0.9.2-cu118"

# Endpoint name: the archive name with underscores swapped for hyphens, passed
# through name_from_base to get a unique name.
endpoint_base = code_tarname.replace("_", "-")
endpoint_name = name_from_base(endpoint_base)

In [5]:
# Describe the model: the serving container image plus the uploaded config archive.
model = Model(
    image_uri=inference_image_uri,
    model_data=s3_code_artifact,
    role=role,
)

# Deploy onto a single ml.g5.2xlarge instance. The startup health-check timeout
# is set to 900 (presumably seconds -- confirm against the SDK docs).
deploy_settings = {
    "initial_instance_count": 1,
    "instance_type": "ml.g5.2xlarge",
    "endpoint_name": endpoint_name,
    "container_startup_health_check_timeout": 900,
}
model.deploy(**deploy_settings)

---------------!

In [6]:
# Client bound to the deployed endpoint: requests are serialized as JSON and
# responses are decoded from JSON.
json_serializer = serializers.JSONSerializer()
json_deserializer = deserializers.JSONDeserializer()

predictor = sagemaker.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
    serializer=json_serializer,
    deserializer=json_deserializer,
)

In [7]:
# Smoke test: a short translation prompt, capped at 100 newly generated tokens.
translation_request = {
    "inputs": "Translate to English: Je t’aime.",
    "parameters": {"max_new_tokens": 100},
}
predictor.predict(translation_request)

[{'generated_text': 'Translate to English: Je t’aime. I love you.'}]

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (424) from primary with message "{
  "code":424,
  "message":"prediction failure",
  "error":"CUDA out of memory. Tried to allocate 256.00 MiB (GPU 0; 22.20 GiB total capacity; 20.45 GiB already allocated; 173.12 MiB free; 20.67 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF"
}"

In [10]:
# Open-ended completion prompt, capped at 30 newly generated tokens.
nutrition_request = {
    "inputs": "tuna sandwich detailed nutritional content is ",
    "parameters": {"max_new_tokens": 30},
}
predictor.predict(nutrition_request)

[{'generated_text': 'tuna sandwich detailed nutritional content is  - calories , fat , protein , carbohydrates , sodium , cholesterol , sugar , fiber , vitamin a , vitamin b , vitamin c , vitamin'}]

In [11]:
%%timeit -n3 -r1
predictor.predict(
    {"inputs": "tuna sandwich detailed nutritional content is ", "parameters": {"max_new_tokens": 30}}
)

17.5 s ± 0 ns per loop (mean ± std. dev. of 1 run, 3 loops each)
