In [2]:
# from PIL import Image
import base64
import json
import boto3
from pathlib import Path
import time
import random
from PIL import Image
from io import BytesIO

# 这是一个启动了Rolling Batch的vLLM的Sagemaker Endpint
endpoint_name = 'endpoint-custome-llava-v16-2024-11-21-09-32-34' 


smr_client = boto3.client("sagemaker-runtime")

# Function to encode the image
def encode_image(image_path, scale_size):
    try:
        # 打开图像文件
        with Image.open(image_path) as img:
            # 缩放图像
            img = img.resize(scale_size)

            # 创建BytesIO对象
            buffer = BytesIO()

            # 将缩放后的图像保存到BytesIO对象中
            img.save(buffer, format="PNG")

            # 获取图像数据的字节序列
            img_bytes = buffer.getvalue()

            # 计算Base64编码
            base64_encoded = base64.b64encode(img_bytes).decode('utf-8')

            return base64_encoded
    except Exception as e:
        print(f"Error: {e}")
        return None

def run_inference(endpoint_name, inputs):
    response = smr_client.invoke_endpoint(
        EndpointName=endpoint_name, Body=json.dumps(inputs)
    )
    return response["Body"].read().decode('utf-8')

def call_sagemaker_llava(input_text, system_prompt, temperature, top_p, top_k, scale_size=(400, 600), input_image_paths=None, input_images=None):
    
    content_images = []
    if Path(input_image_paths).is_file():
        #print("file path is ", input_image_paths)
        content_images.append(encode_image(input_image_paths, scale_size))
    elif Path(input_image_paths).is_dir():
        print("dir path is ", input_image_paths)
        for input_image_path in Path(input_image_paths).glob('*.jpg'):
            content_images.append(encode_image(input_image_path, scale_size))
    
    #print(len(content_images))
    
    prompt = "# system_prompt  \n" + system_prompt + "\n===============\n # user_input  \n" + input_text
    
    content = [{"type": "text", "text": prompt}]
    
    content += [{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}} for base64_image in content_images]
    
    # content += [{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}} for base64_image in [1, 2]]

    # print(content)

    inputs = {
        "messages": [
            {
                "role": "user",
                "content": content
            }
        ],
        "max_tokens":512,
        "temperature": temperature, 
        "top_p": top_p
      }
    
    t0 = time.time()
    response = run_inference(endpoint_name, inputs)
    t1 = time.time()
    # print(response)
    try:
        outputs = json.loads(response)["choices"][0]["message"]["content"]
        usage = json.loads(response)["usage"]
        return outputs, usage, t1-t0
    # response["output"]["message"]["content"][0]["text"]
    except:
        return response

In [4]:
system_prompt = ""
user_input = """
Fill in the blank: this is a photo of a {}
"""
image_path = "../data0527/img/0.jpg"
t0 = time.time()
r = call_sagemaker_llava(user_input, system_prompt, 0.5, 0.7, 100, scale_size=(180, 320), input_image_paths=image_path, input_images=None)
t1 = time.time()
print(r, t1-t0)

(' Yes, the image shows a hotel with multiple rooms, as indicated by the presence of numerous balconies and the overall layout of the building. ', {'prompt_tokens': 1675, 'completion_tokens': 30, 'total_tokens': 1705}, 1.5002124309539795) 1.536064863204956


# Single Thread

In [6]:
# 压力测试, 单线程1

import time

from tqdm import tqdm

total_input_token = 0
total_output_token = 0

t0 = time.time()
for i in tqdm(range(50)):
    h, l = random.randint(200, 400), random.randint(200, 400)
    _, r, _ = call_sagemaker_llava(user_input, system_prompt, 0.5, 0.7, 100, scale_size=(h, l), input_image_paths=image_path, input_images=None)
    total_input_token += r["prompt_tokens"]
    total_output_token += r["completion_tokens"]
t1 = time.time()
dt = t1-t0

it = total_input_token/dt
ot = total_output_token/dt

print(f"average input: {it} token/s, average output: {ot} token/s", it, ot)
print (f"average time per 1000 image",dt/50*1000 )

100%|██████████| 50/50 [00:57<00:00,  1.15s/it]

average input: 1175.0676453471767 token/s, average output: 18.235958122834656 token/s 1175.0676453471767 18.235958122834656
average time per 1000 image 1154.8611736297607





In [11]:
# price calculate

instance_price = 1.515 # usd/h

claude_3_Haiku_price_input_token, claude_3_Haiku_price_output_token = 0.00025, 0.00125 # usd/1k token

price_input_token_1h = it * 3600 / 1000 * claude_3_Haiku_price_input_token
price_output_token_1h = ot * 3600 / 1000 * claude_3_Haiku_price_output_token
Haiku_price_total = price_input_token_1h + price_output_token_1h

print(price_input_token_1h, price_output_token_1h, Haiku_price_total)

0.38663222829620314 0.11475746786428097 0.5013896961604841


In [12]:
# 压力测试, 单线程2

import time

from tqdm import tqdm

total_input_token = 0
total_output_token = 0
total_t = 0

t0 = time.time()
for i in tqdm(range(10)):
    # h, l = random.randint(400, 600), random.randint(600, 800)
    h, l = 360, 640
    _, r, t = call_sagemaker_llava(user_input, system_prompt, 0.1, 0.1, 100, scale_size=(h, l), input_image_paths=image_path, input_images=None)
    total_input_token += r["prompt_tokens"]
    total_output_token += r["completion_tokens"]
    total_t += t
t1 = time.time()

it = total_input_token/total_t
ot = total_output_token/total_t

print(f"average input: {it} token/s, average output: {ot} token/s", it, ot)

100%|██████████| 10/10 [00:32<00:00,  3.25s/it]

average input: 676.1326171011988 token/s, average output: 23.358456426465317 token/s 676.1326171011988 23.358456426465317





In [13]:
# price calculate

instance_price = 1.515 # usd/h

claude_3_Haiku_price_input_token, claude_3_Haiku_price_output_token = 0.00025, 0.00125 # usd/1k token

price_input_token_1h = it * 3600 / 1000 * claude_3_Haiku_price_input_token
price_output_token_1h = ot * 3600 / 1000 * claude_3_Haiku_price_output_token
Haiku_price_total = price_input_token_1h + price_output_token_1h

print(price_input_token_1h, price_output_token_1h, Haiku_price_total)

0.6085193553910789 0.10511305391909392 0.7136324093101729


In [14]:
# price calculate

instance_price = 1.19 # usd/h

claude_3_Haiku_price_input_token, claude_3_Haiku_price_output_token = 0.00025, 0.00125 # usd/1k token

price_input_token_1h = it * 3600 / 1000 * claude_3_Haiku_price_input_token
price_output_token_1h = ot * 3600 / 1000 * claude_3_Haiku_price_output_token
Haiku_price_total = price_input_token_1h + price_output_token_1h

print(price_input_token_1h, price_output_token_1h, Haiku_price_total)

0.6085193553910789 0.10511305391909392 0.7136324093101729


# Multi-Thread

修改max_workers

In [10]:
# 压力测试, 多线程

import time
import random
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

total_input_token = 0
total_output_token = 0
max_workers=64

def test_function(i):
    global total_input_token, total_output_token
    h, l = random.randint(200, 400), random.randint(200, 400)
    _, r, _ = call_sagemaker_llava(user_input, system_prompt, 0.5, 0.7, 100, input_image_paths=image_path, input_images=None)
    total_input_token += r["prompt_tokens"]
    total_output_token += r["completion_tokens"]

t0 = time.time()
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [executor.submit(test_function, i) for i in range(50)]
    for future in tqdm(futures):
        future.result()
t1 = time.time()
dt = t1-t0

it = total_input_token/dt
ot = total_output_token/dt

print(f"average input: {it} token/s, average output: {ot} token/s", it, ot)
print (f"average time per 1000 image",dt/50*1000 )

100%|██████████| 50/50 [00:36<00:00,  1.37it/s]

average input: 2957.0911276148563 token/s, average output: 30.092987120317776 token/s 2957.0911276148563 30.092987120317776
average time per 1000 image 747.0178985595703





In [36]:
dt

47.812429666519165