In [16]:
import boto3

# Ask STS which identity the notebook's credentials resolve to; the
# returned dict is the cell's displayed value.
sts_client = boto3.client('sts')
sts_client.get_caller_identity()

{'UserId': 'AROAVFIZRTJ4K6KJIOWUR:SageMaker',
 'Account': '354925255288',
 'Arn': 'arn:aws:sts::354925255288:assumed-role/test/SageMaker',
 'ResponseMetadata': {'RequestId': '5459cbb4-42a0-4ef2-aca3-9273e5bedb92',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '5459cbb4-42a0-4ef2-aca3-9273e5bedb92',
   'content-type': 'text/xml',
   'content-length': '429',
   'date': 'Wed, 09 Aug 2023 14:33:35 GMT'},
  'RetryAttempts': 0}}

In [1]:
!pip install -U sagemaker

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [2]:
import sagemaker
import boto3

# Bootstrap session, used here only to look up the default bucket.
sess = sagemaker.Session()

# Bucket SageMaker uses for uploaded data, models and logs.
# SageMaker creates it automatically if it does not exist yet.
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    # no explicit bucket name given -> fall back to the session default
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role; when get_execution_role() raises ValueError,
# look the role up by name through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# Final session, bound to the bucket chosen above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")



sagemaker role arn: arn:aws:iam::354925255288:role/test
sagemaker session region: us-east-1


In [3]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Look up the container image URI for the "huggingface" backend at version 0.9.3.
llm_image = get_huggingface_llm_image_uri("huggingface", version="0.9.3")

# Show the resolved ECR image URI.
print(f"llm image uri: {llm_image}")


llm image uri: 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi0.9.3-gpu-py39-cu118-ubuntu20.04


In [4]:
import json
import os
from sagemaker.huggingface import HuggingFaceModel

# sagemaker config
instance_type = "ml.g5.48xlarge"
number_of_gpu = 8
health_check_timeout = 2400  # passed to deploy() as container_startup_health_check_timeout

# Read the Hugging Face Hub token from the environment instead of embedding it
# in the notebook, where it would be leaked to anyone who can read the file.
# SECURITY: a token was previously hardcoded here; treat it as compromised and
# revoke it in the Hugging Face account settings.
# An unset or empty variable falls back to the placeholder so the assert below fires.
hf_hub_token = os.environ.get("HUGGING_FACE_HUB_TOKEN") or "<REPLACE WITH YOUR TOKEN>"

# Define Model and Endpoint configuration parameter
# (all values are passed to the container as environment-variable strings)
config = {
  'HF_MODEL_ID': "meta-llama/Llama-2-70b-chat-hf", # model_id from hf.co/models
  'SM_NUM_GPUS': json.dumps(number_of_gpu), # Number of GPU used per replica
  'MAX_INPUT_LENGTH': json.dumps(1024),  # Max length of input text
  'MAX_TOTAL_TOKENS': json.dumps(2048),  # Max length of the generation (including input text)
  'MAX_BATCH_TOTAL_TOKENS': json.dumps(2048),
  'MAX_BATCH_PREFILL_TOKENS': json.dumps(1024),
  'HUGGING_FACE_HUB_TOKEN': hf_hub_token
  # ,'HF_MODEL_QUANTIZE': "bitsandbytes", # comment in to quantize
}

# check if token is set
assert config['HUGGING_FACE_HUB_TOKEN'] != "<REPLACE WITH YOUR TOKEN>", "Please set your Hugging Face Hub token"

# create HuggingFaceModel with the image uri
llm_model = HuggingFaceModel(
  role=role,
  image_uri=llm_image,
  env=config
)


In [5]:
# Deploy model to an endpoint
# https://sagemaker.readthedocs.io/en/stable/api/inference/model.html#sagemaker.model.Model.deploy
# The returned predictor is bound to `llm` and used by the later predict cells.
llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  container_startup_health_check_timeout=health_check_timeout, # health_check_timeout is 2400 in the config cell; in seconds per the deploy() docs, i.e. 40 minutes (not 10) to load the model
)



---------------!

In [11]:
def build_llama2_prompt(messages):
    """Render a list of chat messages as a single Llama-2 chat prompt string.

    Args:
        messages: Sequence of dicts, each with a "role" key and a "content"
            key holding the message text.

    Returns:
        The prompt string: "<s>[INST] ", the rendered messages, then " [/INST]".

    Rendering rules:
        * A "system" message is wrapped in <<SYS>> ... <</SYS>> only when it is
          the first message.
        * A "user" message contributes its stripped content.
        * Any other message (including a "system" message that is not first) is
          rendered as a completed assistant turn that closes the current
          [INST] block and opens the next one.
    """
    startPrompt = "<s>[INST] "
    endPrompt = " [/INST]"
    conversation = []
    for index, message in enumerate(messages):
        if message["role"] == "system" and index == 0:
            conversation.append(f"<<SYS>>\n{message['content']}\n<</SYS>>\n\n")
        elif message["role"] == "user":
            conversation.append(message["content"].strip())
        else:
            # Messages are dicts, so the content must be read by key; attribute
            # access (message.content) raised AttributeError on every assistant turn.
            conversation.append(f" [/INST] {message['content'].strip()}</s><s>[INST] ")

    return startPrompt + "".join(conversation) + endPrompt



In [24]:
# Prompt template for product tagging. It has two str.format placeholders,
# {product} and {max_tags}, and asks for a JSON object with the key "tags".
# The text is sent to the model verbatim, so whitespace inside it matters.
instruction = '''
Assign appropriate labels/tags to the product as "tags", in flavor, brand, key ingredient, and package size 
(if applicable), etc.  

product: {product}

1. Return the results in JSON format with the following key: "tags".
2. Replied answer should be as diverse as possible.
3. Do not repeat answers.
4. Reply in Taiwan Chinese.
5. Please avoid choosing duplicate tags.
6. Your tags shall be no more than {max_tags}.
7. Reply tag json only, no more information.

The json result is: 
'''


In [30]:
# Fill the tagging template for one product and send it as the sole (system) message.
tagging_prompt = instruction.format(product='統一蜜豆奶', max_tags=4)
messages = [{"role": "system", "content": tagging_prompt}]


In [32]:
# Display the predictor object bound to `llm` by the deploy cell (sanity check only).
llm

<sagemaker.huggingface.model.HuggingFacePredictor at 0x7f77b3cf51e0>

In [31]:
# Build the prompt from the current `messages` and query the endpoint with
# inputs only (no "parameters" key is sent in this request).
prompt = build_llama2_prompt(messages)
request_body = {"inputs": prompt}

chat = llm.predict(request_body)

# Drop the first len(prompt) characters so only the text after the prompt is
# shown (presumably the endpoint echoes the prompt in generated_text).
full_text = chat[0]["generated_text"]
print(full_text[len(prompt):])


 {
"tags": [
"蜜豆奶",



In [33]:
# Generation settings sent alongside the prompt.
generation_parameters = {
    "do_sample": True,
    "top_p": 0.6,
    "temperature": 0.01,
    "top_k": 50,
    "max_new_tokens": 100,
    "repetition_penalty": 1.03,
    "stop": ["</s>"],
}
payload = {"inputs": prompt, "parameters": generation_parameters}

# Send the request to the endpoint.
response = llm.predict(payload)

# Print only what follows the first len(prompt) characters of the returned text.
returned_text = response[0]["generated_text"]
print(returned_text[len(prompt):])


 {
"tags": [
"蜜豆奶",
"統一",
"牛奶",
"甜點"
]
}


In [27]:
# Second probe: replace `messages` with a single system message and query the
# endpoint with inputs only.
content = '說中文 台灣中文！'
messages = [{"role": "system", "content": content}]

prompt = build_llama2_prompt(messages)
probe_request = {"inputs": prompt}

chat = llm.predict(probe_request)

# Show only the text after the first len(prompt) characters.
probe_text = chat[0]["generated_text"]
print(probe_text[len(prompt):])


 Sure, I can speak Taiwanese Hokkien. Here we go:

�


In [28]:
# Re-run the current prompt with explicit generation settings.
sampling_settings = dict(
    do_sample=True,
    top_p=0.6,
    temperature=0.01,
    top_k=50,
    max_new_tokens=100,
    repetition_penalty=1.03,
    stop=["</s>"],
)
payload = dict(inputs=prompt, parameters=sampling_settings)

# Send the request to the endpoint.
response = llm.predict(payload)

# Print the returned text with its first len(prompt) characters removed.
first_result = response[0]
print(first_result["generated_text"][len(prompt):])


 Sure, I can speak Taiwanese Hokkien. Here we go:

嗨！我是機器人。我可以說中文，包括台灣的中文。

（hēi！wǒ shì jī xīng zhī. wǒ kě yǐ xiǎng zhōng wén, bāo gòu tái wān de
