## LLaVA on SageMaker



In [None]:
%store -r

In [None]:
import boto3
import sagemaker
from sagemaker.utils import name_from_base
from sagemaker import image_uris
import jinja2
from pathlib import Path

In [None]:
# Inference engine to deploy with. The value is used further down to build the
# DJL framework name (f"djl-{llm_engine}") when the container image is retrieved.
llm_engine = "deepspeed"
# Alternative engine, kept for quick switching:
# llm_engine = "fastertransformer"

In [None]:
# Shared SageMaker session and the execution role used for deployment.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# Service clients: the SageMaker ones come from the session, S3 straight from boto3.
sm_client = sagemaker_session.sagemaker_client
sm_runtime_client = sagemaker_session.sagemaker_runtime_client
s3_client = boto3.client("s3")

# Templating environment and the session's default S3 bucket.
jinja_env = jinja2.Environment()
default_bucket = sagemaker_session.default_bucket()

In [None]:
# Upload the local model checkpoint to S3. The destination prefix is the same one
# that `s3_url` and `option.model_id` in serving.properties refer to.
!aws s3 cp /home/ec2-user/SageMaker/first_pic/models/first/model.pth s3://sagemaker-us-west-2-726335585155/model/model_first/

In [None]:
# Resolve the DJL inference container for the selected engine in the region of
# the current session; the container version is pinned.
framework_name = "djl-{}".format(llm_engine)
inference_image_uri = image_uris.retrieve(
    framework=framework_name,
    region=sagemaker_session.boto_session.region_name,
    version="0.23.0",
)

print(f"Inference container uri: {inference_image_uri}")

In [None]:
# S3 prefix holding the model checkpoint (same prefix as the upload cell and
# serving.properties use).
# NOTE(review): the bucket name is hard-coded instead of being derived from
# `default_bucket` — confirm this is intended before running in another account/region.
s3_url = "s3://sagemaker-us-west-2-726335585155/model/model_first/"

In [None]:
%%writefile vit-src/serving.properties
engine=DeepSpeed
option.batch_size=16
#option.s3url=s3://sagemaker-us-west-2-726335585155/sagemaker-checkpoint-test/checkpoints-0529-v2-10
option.model_id = s3://sagemaker-us-west-2-726335585155/model/model_first/

In [None]:
# Print the serving.properties file written above, with line numbers, to check its contents.
# NOTE(review): the model location in that file is hard-coded; nothing in this
# notebook substitutes it based on the current region.
!pygmentize vit-src/serving.properties | cat -n

In [None]:
# S3 prefix, inside the session's default bucket, that receives the packaged inference code.
s3_target = "s3://{}/code/first_pic/".format(sagemaker_session.default_bucket())
print(s3_target)

In [None]:
!rm vit-src.tar.gz
!tar zcvf vit-src.tar.gz vit-src --exclude ".ipynb_checkpoints" --exclude "__pycache__" --exclude ".ipynb"
!aws s3 cp vit-src.tar.gz {s3_target}

In [None]:
# Full S3 URI of the uploaded code archive; handed to the SageMaker Model as `model_data`.
model_uri = s3_target + "vit-src.tar.gz"
print(model_uri)

### 4.2 Create SageMaker endpoint

Specify the instance type to deploy on and a name for the endpoint.

In [None]:
from sagemaker import Model, image_uris, serializers, deserializers

# Pair the DJL container image with the uploaded code archive.
model = Model(image_uri=inference_image_uri, model_data=model_uri, role=role)

instance_type = "ml.g5.xlarge"
# Endpoint name generated from the base name "firstpic-4batch".
endpoint_name = sagemaker.utils.name_from_base("firstpic-4batch")

# Create the endpoint on a single instance of the chosen type.
model.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    endpoint_name=endpoint_name,
)

# Requests and responses are JSON, so the predictor gets both a serializer and
# a deserializer. Without the deserializer, predict() would hand back the raw
# response bytes instead of parsed JSON.
predictor = sagemaker.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sagemaker_session,
    serializer=serializers.JSONSerializer(),
    deserializer=deserializers.JSONDeserializer(),
)

### invoke endpoint


In [None]:
%%time

# NOTE(review): `prompt` is defined here but never added to the request payload
# below — confirm whether the endpoint is supposed to receive it.
prompt = "Describe the image"

# Request payload: a single image referenced by URL.
data = {
    "input_image": "https://raw.githubusercontent.com/haotian-liu/LLaVA/main/images/llava_logo.png",
}

# Send one request to the endpoint and show the response.
output = predictor.predict(data)
print(output)

In [None]:
%%time
import time
from tqdm import tqdm

# Sequential benchmark: send the same payload 1000 times, one request at a time.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# distorted by system clock adjustments the way time.time() can.
t0 = time.perf_counter()
for i in tqdm(range(1000)):
    output = predictor.predict(data)
cost_time = time.perf_counter() - t0
print(cost_time)  # total elapsed seconds for the 1000 requests
print(output)  # response of the last request

In [None]:
## multi-threaded stress test
# Load test that issues requests to the endpoint from a thread pool.

import time
import random
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor


# Number of worker threads in the pool, i.e. the upper bound on requests in flight at once.
max_workers=64

def test_function(i):
    """Send one inference request to the endpoint and return its response.

    Args:
        i: Index of the request within the stress test. It is not used by the
            request itself; it only allows one submission per index.

    Returns:
        Whatever ``predictor.predict`` returns for the shared ``data`` payload,
        so callers can inspect it through ``future.result()``.
    """
    # `predictor` and `data` are module-level names that are only read here,
    # so no `global` declaration is required.
    return predictor.predict(data)

# Submit all 1000 requests to the thread pool and time the whole batch.
# perf_counter() is monotonic, which makes it the right clock for elapsed time.
t0 = time.perf_counter()
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [executor.submit(test_function, i) for i in range(1000)]
    for future in tqdm(futures):
        # result() waits for the request to finish and re-raises any exception it hit.
        future.result()
t1 = time.perf_counter()
dt = t1 - t0


# `dt` is the total wall time, in seconds, for the 1000 concurrent requests.
print("average time per 1000 image", dt)
# dt / 60 / 60 converts seconds to hours; 1.408 is presumably the hourly price
# of the instance — TODO confirm.
# NOTE(review): the label below says "serial inference cost per 1000 images",
# but this cell measures the multi-threaded run — confirm the wording.
print("串行推理千张成本 - 1000 pic infer cost: ", dt / 60 / 60 * 1.408)

## delete endpoint

In [None]:
# Tear down the SageMaker endpoint that `predictor` is bound to once testing is finished.
predictor.delete_endpoint()