# Setup

In [1]:
import codecs
import datetime
import json

import boto3

# Name of the SageMaker endpoint that the invocation cells below call.
endpoint_name = 'llama-3-1-8b-instruct-2024-09-04-15-37-21-104'

# SageMaker runtime client pinned to us-west-2.
sagemaker_runtime = boto3.client(service_name="sagemaker-runtime", region_name='us-west-2')

# Invoke

In [None]:
# Reuse the runtime client created in the setup cell instead of constructing a
# second, identical one.
client = sagemaker_runtime

prompt = "Who am I?"
payload = {
    "inputs": prompt,
    "parameters": {
        "max_new_tokens": 128,
        "top_p": 0.9,
        "temperature": 0.6,
        # "stream": True,
        # "return_full_text": False
    }
}

start_time = datetime.datetime.now()

response = client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="application/json",
    Body=json.dumps(payload),
)

# Read and parse the body BEFORE stopping the clock: response['Body'] is read
# here, not inside invoke_endpoint, so stopping earlier would leave the body
# transfer out of the reported total.
response_body = json.loads(response['Body'].read().decode())
end_time = datetime.datetime.now()

# Elapsed wall-clock time for the whole request, in seconds.
total_time = (end_time - start_time).total_seconds()

# Report the result.
print(f"Prompt: {prompt}")
print(f"Response: {response_body}")
print(f"Total time: {total_time:.2f} seconds")

# Streaming Invoke

In [2]:
class JSONStreamParser:
    """Incrementally parse newline-terminated JSON documents from byte chunks.

    Bytes are fed in through ``add_chunk``. Decoded text accumulates in
    ``self.buffer``; once the buffer ends with a newline, every complete JSON
    document at the front of the buffer is parsed and yielded, and whatever
    cannot be parsed yet stays buffered for the next chunk.
    """

    def __init__(self):
        # Decoded text that has not been consumed by the JSON decoder yet.
        self.buffer = ""
        # Incremental decoder: if a multi-byte UTF-8 character is split across
        # two chunks, the partial bytes are held back until the rest arrives
        # instead of raising UnicodeDecodeError.
        self._utf8_decoder = codecs.getincrementaldecoder("utf-8")()
        # Built once and reused for every raw_decode call.
        self._json_decoder = json.JSONDecoder()

    def add_chunk(self, chunk):
        """Feed one chunk of bytes and yield each JSON value that is now complete.

        Args:
            chunk: UTF-8 encoded bytes; may end in the middle of a document or
                in the middle of a multi-byte character.

        Yields:
            Parsed JSON values, in stream order. Nothing is yielded until the
            accumulated buffer ends with a newline.

        Note:
            This is a generator: the chunk is only consumed once the returned
            generator is iterated.
        """
        self.buffer += self._utf8_decoder.decode(chunk)

        # Only attempt to parse when the buffer ends on a line boundary.
        if not self.buffer or self.buffer[-1] != '\n':
            return

        # raw_decode does not skip leading whitespace, so a leading blank line
        # would otherwise make every subsequent parse attempt fail.
        self.buffer = self.buffer.lstrip()
        while True:
            try:
                obj, idx = self._json_decoder.raw_decode(self.buffer)
            except json.JSONDecodeError:
                # Empty or incomplete remainder: keep it for the next chunk.
                break
            # Consume the parsed text before yielding so that a caller who
            # stops iterating early does not receive the same object again.
            self.buffer = self.buffer[idx:].lstrip()
            yield obj

In [3]:
# Module-level parser kept so existing references to `parser` keep working;
# stream_response builds its own parser for each call.
parser = JSONStreamParser()

def stream_response(client, endpoint_name, payload):
    """Invoke a streaming endpoint and yield the text of each streamed token.

    Args:
        client: object exposing ``invoke_endpoint_with_response_stream``
            (a boto3 "sagemaker-runtime" client in this notebook).
        endpoint_name: name of the endpoint to invoke.
        payload: JSON-serialisable request body.

    Yields:
        ``obj['token']['text']`` for every JSON object parsed from the
        stream's ``PayloadPart`` bytes.
        NOTE(review): assumes every streamed object has a ``token.text``
        field - confirm against the serving container's output format.
    """
    response = client.invoke_endpoint_with_response_stream(
        EndpointName=endpoint_name,
        ContentType="application/json",
        Body=json.dumps(payload),
    )

    # Fresh parser per call: buffered leftovers from an earlier stream (aborted
    # by the caller, or ended without a trailing newline) must not be prepended
    # to this stream's data.
    stream_parser = JSONStreamParser()

    for event in response['Body']:
        if 'PayloadPart' in event:
            for obj in stream_parser.add_chunk(event['PayloadPart']['Bytes']):
                yield obj['token']['text']

In [4]:
# Runtime client used for the streaming request.
sagemaker_runtime = boto3.client(service_name="sagemaker-runtime", region_name='us-west-2')

prompt = "Who am I?"
# Same generation parameters as the non-streaming cell, plus the "stream" flag.
# "return_full_text" is deliberately not sent.
payload = dict(
    inputs=prompt,
    parameters=dict(
        max_new_tokens=128,
        top_p=0.9,
        temperature=0.6,
        stream=True,
    ),
)

# A negative value means no token has been received yet.
time_to_first_token = -1
start_time = datetime.datetime.now()

for chunk in stream_response(sagemaker_runtime, endpoint_name, payload):
    if time_to_first_token < 0:
        # First token: record the latency once, then leave a blank line
        # before the generated text.
        first_token_time = datetime.datetime.now()
        time_to_first_token = (first_token_time - start_time).total_seconds()
        print(f"# Time taken for first token: {time_to_first_token} seconds", end="\n\n")
    # Emit tokens as they arrive, without adding newlines between them.
    print(chunk, end='', flush=True)

# Time taken for first token: 0.138198 seconds

 I am a 30-year-old woman who has been married for 10 years. I have two beautiful children, a boy and a girl, aged 7 and 4. I work part-time as a teacher and I love my job. I am a bit of a homebody and enjoy spending time with my family and friends. I love to cook and try out new recipes. I am a bit of a worrier and can get anxious at times, but I am working on it. I am a Christian and my faith is very important to me. I love to read and learn new things. I am a bit of a perfection