# SageMaker Endpoint - HuggingFace

## 0. SageMaker Endpoint 에 HuggingFace model 배포하기 (5분 소요)

In [None]:
# Hugging Face Hub access token. The deployment cell passes it to the model
# container as the HUGGING_FACE_HUB_TOKEN environment variable, so replace the
# placeholder with your own token before running that cell.
HUGGING_FACE_HUB_TOKEN="[YOUR TOKEN]"

In [None]:
import json
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

# Resolve the IAM role ARN the model will be created with. Prefer the role
# reported by the SageMaker SDK; if that lookup raises ValueError, fall back to
# fetching the role named 'sagemaker_execution_role' directly from IAM.
# NOTE(review): presumably the ValueError path is hit when running outside a
# SageMaker-managed environment - confirm.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role_description = iam.get_role(RoleName="sagemaker_execution_role")
    role = role_description["Role"]["Arn"]

# Hub model configuration, handed to the serving container as environment
# variables. Browse available models at https://huggingface.co/models
hub = {
    'HF_MODEL_ID': 'meta-llama/Llama-3.2-1B-Instruct',
    'SM_NUM_GPUS': json.dumps(1),  # json.dumps(1) yields the string "1"
    'HUGGING_FACE_HUB_TOKEN': HUGGING_FACE_HUB_TOKEN,
}

# Fail fast when the token was left as a placeholder, before a deployment is
# started. Both the placeholder used at the top of this notebook
# ("[YOUR TOKEN]") and the older "<REPLACE WITH YOUR TOKEN>" form are rejected,
# as is an empty string.
_TOKEN_PLACEHOLDERS = ('', '[YOUR TOKEN]', '<REPLACE WITH YOUR TOKEN>')
if hub['HUGGING_FACE_HUB_TOKEN'] in _TOKEN_PLACEHOLDERS:
    raise ValueError("You have to provide a token.")

# Look up the Hugging Face LLM container image URI, pinned to version 2.2.0.
llm_image_uri = get_huggingface_llm_image_uri("huggingface", version="2.2.0")

# Bundle the image, the container environment (`hub`) and the IAM role into a
# Hugging Face model object that can be deployed.
huggingface_model = HuggingFaceModel(
    image_uri=llm_image_uri,
    env=hub,
    role=role,
)

# Deploy the model to SageMaker Inference on a single ml.g5.2xlarge instance.
predictor = huggingface_model.deploy(
    instance_type="ml.g5.2xlarge",
    initial_instance_count=1,
    container_startup_health_check_timeout=300,
)

# Send a test request. This stays the last expression of the cell so the
# notebook displays the returned value.
sample_request = {"inputs": "Hey my name is Julien! How are you?"}
predictor.predict(sample_request)

In [None]:
# Keep the deployed endpoint's name; the boto3 invocation cell below passes it
# to invoke_endpoint as EndpointName.
endpoint_name = predictor.endpoint_name
print(endpoint_name)

## 1. boto3 를 이용해서 SageMaker Endpoint 에 배포된 모델 호출

In [None]:
import json
import boto3
import datetime

# Region configured for a default boto3 session.
region = boto3.Session().region_name

# SageMaker runtime client bound to that region.
sagemaker_runtime = boto3.client("sagemaker-runtime", region_name=region)

In [None]:
# Build the runtime client for the region resolved from the boto3 session in
# the previous cell rather than a hard-coded 'us-west-2'; with a fixed region
# the call cannot find the endpoint when the session's region is different.
client = boto3.client("sagemaker-runtime", region_name=region)

system = "You are a helpful assistant"
text = "Hello! Who are you?"
# Chat prompt built from header/turn special tokens. Each turn (system, user)
# must be closed with the complete "<|eot_id|>" token; the system turn
# previously ended with "|eot_id|>" (missing the leading "<"), which is plain
# text rather than the end-of-turn marker.
prompt = f"""
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system}<|eot_id|><|start_header_id|>user<|end_header_id|>

{text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""
# Generation settings sent alongside the prompt.
generation_parameters = {
    "max_new_tokens": 128,
    "top_p": 0.9,
    "temperature": 0.9,
    "return_full_text": False,
}

# Request body for the endpoint: the prompt plus its generation settings.
payload = {"inputs": prompt, "parameters": generation_parameters}

# Initialised to -1 and never updated in this cell.
time_to_first_token = -1
start_time = datetime.datetime.now()

request_body = json.dumps(payload)
response = client.invoke_endpoint(
    # To call an endpoint that already exists, pass its name instead, e.g.
    # EndpointName="huggingface-pytorch-tgi-inference-2024-11-20-16-08-14-985",
    EndpointName=endpoint_name,
    ContentType="application/json",
    Body=request_body,
)

# Handle the response: take the end timestamp as soon as invoke_endpoint
# returns, then read and JSON-decode the body.
end_time = datetime.datetime.now()
raw_body = response['Body'].read()
response_body = json.loads(raw_body.decode())

# Compute the response time in seconds.
elapsed = end_time - start_time
total_time = elapsed.total_seconds()

# Print the results: the inputs, the full prompt, the generated text from the
# first element of the decoded response, and the measured total time.
print(f"# System\n- {system}\n")
print(f"# Text\n- {text}\n")
print(f"# Prompt\n- {prompt}")
# Single f-string instead of a placeholder-less f-string joined with "+";
# the printed output is unchanged.
print(f"# Response\n- {response_body[0]['generated_text']}\n")
print(f"# Total time\n- {total_time:.2f} seconds\n")

## 2. SageMaker Endpoint 삭제

In [None]:
# Uncomment and run the line below to delete the endpoint created by this
# notebook once you are finished with it.
# predictor.delete_predictor()