In [1]:
# Upgrade the SageMaker Python SDK in the kernel's environment (quiet output).
# If the SDK was already imported, restart the kernel so the new version is picked up.
%pip install sagemaker --upgrade  --quiet

Note: you may need to restart the kernel to use updated packages.


In [2]:
import boto3
import sagemaker
from sagemaker import Model, image_uris, serializers, deserializers

# IAM execution role used for the endpoint
role = sagemaker.get_execution_role()
# SageMaker session for interacting with the different AWS APIs
sess = sagemaker.session.Session()
# Region of the current session, read through the public `boto_region_name`
# property rather than the private `_region_name` attribute
region = sess.boto_region_name
# AWS account id of the current session
account_id = sess.account_id()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [11]:
# List the checkpoint files under the S3 prefix that serving.properties points to (option.model_id).
!aws s3 ls s3://sagemaker-us-west-2-726335585155/hf_home/Meta-Llama-3-8B-Instruct-v3/checkpoint-12000/

                           PRE global_step12000/
2024-06-13 15:43:11        713 config.json
2024-06-13 15:43:11        194 generation_config.json
2024-06-13 15:43:53         16 latest
2024-06-13 15:43:17 4976698672 model-00001-of-00004.safetensors
2024-06-13 15:43:23 4999802720 model-00002-of-00004.safetensors
2024-06-13 15:43:30 4915916176 model-00003-of-00004.safetensors
2024-06-13 15:43:31 1168138808 model-00004-of-00004.safetensors
2024-06-13 15:43:31      23950 model.safetensors.index.json
2024-06-13 15:43:51      15984 rng_state_0.pth
2024-06-13 15:43:51      15984 rng_state_1.pth
2024-06-13 15:43:51      15984 rng_state_2.pth
2024-06-13 15:43:52      15984 rng_state_3.pth
2024-06-13 15:43:51      15984 rng_state_4.pth
2024-06-13 15:43:52      15984 rng_state_5.pth
2024-06-13 15:43:52      15984 rng_state_6.pth
2024-06-13 15:43:51      15984 rng_state_7.pth
2024-06-13 15:43:51       1064 scheduler.pt
2024-06-13 15:43:31        325 special_tokens_map.json
2024-06-13 15:43:31    90

## Step 2: Start preparing model artifacts
The LMI container expects the following artifacts to set up the model:
- serving.properties (required): Defines the model server settings
- model.py (optional): A python file to define the core inference logic
- requirements.txt (optional): Any additional pip packages that need to be installed

In [23]:
%%writefile serving.properties
# Serving configuration for the model deployed by this notebook.
engine=Python
# S3 prefix that holds the model checkpoint files
option.model_id= s3://sagemaker-us-west-2-726335585155/hf_home/Meta-Llama-3-8B-Instruct-v3/checkpoint-12000/
option.dtype=fp16
option.task=text-generation
option.rolling_batch=vllm
option.tensor_parallel_degree=1
option.device_map=auto
option.max_model_len=4096
# NOTE(review): option.max_rolling_batch_size is not set here; presumably the
# container default applies. Confirm that is acceptable for the batch test.

Writing serving.properties


In [24]:
%%sh
# Package serving.properties into mymodel.tar.gz under a top-level mymodel/ directory.
# Abort on the first failing command (or unset variable) so that a missing
# serving.properties cannot silently produce a tarball without it.
set -eu
# Clear any leftover staging directory from an interrupted earlier run.
rm -rf mymodel
mkdir mymodel
mv serving.properties mymodel/
tar czvf mymodel.tar.gz mymodel/
# The staging directory is no longer needed once the archive exists.
rm -rf mymodel

mymodel/
mymodel/serving.properties


## Step 3: Start building SageMaker endpoint
In this step, we will build a SageMaker endpoint from scratch.

### Getting the container image URI

[Large Model Inference available DLC](https://github.com/aws/deep-learning-containers/blob/master/available_images.md#large-model-inference-containers)


In [25]:
# Look up the container image URI for the "djl-deepspeed" framework,
# version 0.26.0, in the region of the current session.
image_uri = image_uris.retrieve(
    framework="djl-deepspeed",
    version="0.26.0",
    region=sess.boto_session.region_name,
)

### Upload artifact on S3 and create SageMaker model

In [26]:
# Upload the packaged tarball to the session's default bucket under the given prefix.
bucket = sess.default_bucket()
s3_code_prefix = "yafei/llama3"
code_artifact = sess.upload_data(
    path="mymodel.tar.gz",
    bucket=bucket,
    key_prefix=s3_code_prefix,
)
print(f"S3 Code or Model tar ball uploaded to --- > {code_artifact}")

# Model object combining the container image, the uploaded artifact and the execution role.
model = Model(
    image_uri=image_uri,
    model_data=code_artifact,
    role=role,
)

S3 Code or Model tar ball uploaded to --- > s3://sagemaker-us-west-2-726335585155/yafei/llama3/mymodel.tar.gz


### Create SageMaker endpoint

You need to specify the instance type to use and the endpoint name.

In [None]:
# Hosting instance type, and an endpoint name generated from the base "yafei-llama3".
instance_type = "ml.g5.2xlarge"
endpoint_name = sagemaker.utils.name_from_base("yafei-llama3")

# Deploy the model to a single-instance endpoint; the container startup
# health-check timeout is set to 3600.
model.deploy(initial_instance_count=1,
             instance_type=instance_type,
             endpoint_name=endpoint_name,
             container_startup_health_check_timeout=3600
            )

# Requests are sent as JSON, so a JSON serializer is configured.
# No deserializer is passed here: the cells below decode the response
# themselves with json.loads.
predictor = sagemaker.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
    serializer=serializers.JSONSerializer(),
)

## Step 5: Test and benchmark the inference

In [18]:
# Sample prompt used for both the single-request test and the batch test below:
# a keyword-extraction instruction followed by a product title and description.
input_text = '''
You are a senior cross-border e-commerce product consultant, your task is write keyword based on the title, description provided by the user, answer in format a dictionary of[keyword_type: keyword]. the input title is Cat Scat Mat Spike Anti-Cat Dog Pest Deterrent Garden Repellent Animal Scarer, input product description is Features: *Avoid cats,dogs and other animals destroyed vegetables, plants and so on. *Humane solution for keeping cats and dogs from digging. *Plastic spike will keep cats away but doesn&#39;t harm the cats. *Be networked into user-configurable sizes and lengths. *It can also be wrapped around poles or trees to stop animals from climbing. *The anti-cat mat can avoid cats, dogs and other animals from destroying vegetables and plants. * Plastic nails will keep the cat away, but will not harm the cat. * The anti-cat mat can also be wrapped around poles or trees to prevent animals from climbing. Purpose: 1. Spread around the flowerpot to prevent cats from "invading" 2. Spread in the flowerpot to prevent cats from digging soil It is easy to set up and can be cut into any size with scissors. You can use it wherever you want. The connection is also simple, and it can be connected horizontally and vertically. Specifications: *Material: PP *Color:Black，White *Size: about ：400x30 cm *Note: 30cm plus the ports on both sides Package Contents: 1PCS Cat Scat Mat Note: 1. The real color of the item may be slightly different from the pictures shown on website caused by many factors such as brightness of your monitor and light brightness. 2. Please allow slight manual measurement deviation for the data. , what's the keywords? Present the answer in a dictionary format, where the keys are the names inside ['Primary Keywords', 'Attribute Keywords', 'Other Keywords', 'Synonyms'] and the values are lists containing the keywords.'''

In [19]:
import json

# Send one request with the sample prompt; the endpoint reply is decoded
# from JSON here and its generated text is shown as the cell output.
res = predictor.predict({
    "inputs": input_text,
    "parameters": {"max_tokens": 4096},
})
response = json.loads(res)
response["generated_text"]

'output result is {"Primary Keywords": ["Cat Scat Mat", "Schutzkotplay Mat"], "Attribute Keywords": ["1pcs", "400x30 Cm", "Anti-Cat", "Animal", "Deterrent", "Dog", "Gardening", "Non-Toxic", "PP", "Protection", "Pest", "Repellent", "Scoat", "Scarer", " Spike", "White", "1pcs", "400x30 Cm", "Abschreckung", "Anti-Katze", "Gartenarbeit", "Hund", "PP", "Schutz", "Scarer", "Schwei\\u00dfabschreckung", "Spike", "Tier", "Ungiftig", "Verhindern"], "Other Keywords": ["Brand New", "Durable", "High Quality", "Useful", "Hohe Qualit\\u00e4t", "Langlebig", "Nagelneu", "N\\u00fctzlich"], "Synonyms": ["Mat"]}'

## Batch prediction

In [20]:
# 100 copies of the sample prompt for the concurrent test below.
prompts = [input_text for _ in range(100)]

In [21]:
import time
def call_endpoint(prompt):
    """Send one prompt to the endpoint and return the generated text.

    Args:
        prompt: The text to send as the request's "inputs" field.

    Returns:
        The "generated_text" field of the JSON-decoded endpoint response.
    """
    # Use the caller's prompt (not the notebook-level sample text) and avoid
    # shadowing the builtin `input`. The dict is handed to the predictor as-is,
    # exactly as in the single-request cell, so no manual json.dumps is needed.
    payload = {"inputs": prompt, "parameters": {"max_tokens": 4096}}
    response = predictor.predict(payload)
    return json.loads(response)["generated_text"]
    
    
from joblib import Parallel, delayed


# Fan the prompts out over 100 worker threads, one call_endpoint call per prompt;
# the returned texts are collected in `results`.
results = Parallel(n_jobs=100, prefer="threads", verbose=1)(
    delayed(call_endpoint)(text) for text in prompts
)

[Parallel(n_jobs=100)]: Using backend ThreadingBackend with 100 concurrent workers.
[Parallel(n_jobs=100)]: Done   2 out of 100 | elapsed:    6.8s remaining:  5.6min
[Parallel(n_jobs=100)]: Done 100 out of 100 | elapsed:   45.2s finished


In [22]:
# Show the first ten generated texts.
results[:10]

['output result is {"Primary Keywords": ["Cat Scat Mat", "Cartoffelmatten"], "Attribute Keywords": ["2022", "400x30 Cm", "Accessories", "Anti-Cat", "Animal", "Anti-touch", "Dog", "Deterrent", "Garden", "Humane Solution", "Mat", "PP", "Pest", "Repellent", "Scat", "Spike", "Wear-resistance", "2022", "400x30 Cm", "Abschreckung", "Anti-Katze", "Garten", "Hund", "Kannchenlampe", "Matte", "PP", "Pf\\u00e4hle", "Schutz", "Scat", "Spike", "Tier", "Touch-Schutz", "Verschlei\\u00dffest", "Zubeh\\u00f6r"], "Other Keywords": ["Durable", "New", "Practical", "Useful", "Langlebig", "Neu", "N\\u00fctzlich", "Praktisch"], "Synonyms": ["Mat"]}',
 'output result is {"Primary Keywords": ["Cat Scat Mat", "Welsch\\u00fcrze Matte"], "Attribute Keywords": ["1pcs", "400x30 Cm", "Anti-Cat", "Anti-Cat Dog Pest", "Deterrent", "Dog Pest", "Garden", "Humane Solution", "Keep Cats", "Keep Dogs", "Mat", "PP", "Parking", "Pest", "Plastic Spike", "Scat", "Spike", "1Stk", "400x30 Cm", "Anti-Katze", "Anti-Katzen-Abwehr", 

## Clean up the environment

In [None]:
# Remove the resources created above: the endpoint first, then the endpoint
# config (looked up under the same name as the endpoint), and finally the model.
sess.delete_endpoint(endpoint_name)
sess.delete_endpoint_config(endpoint_name)
model.delete_model()