In [2]:
!pip install "sagemaker>=2.175.0" --upgrade --quiet

[0m

In [3]:
import sagemaker
import boto3
sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it not exists
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::890717383483:role/service-role/AmazonSageMaker-ExecutionRole-20231006T171691
sagemaker session region: us-east-1


In [4]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

# retrieve the llm image uri
llm_image = get_huggingface_llm_image_uri(
  "huggingface",
  version="1.0.3"
)

# print ecr image uri
print(f"llm image uri: {llm_image}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
llm image uri: 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.0.3-gpu-py39-cu118-ubuntu20.04


In [5]:
import json
from sagemaker.huggingface import HuggingFaceModel

# sagemaker config
instance_type = "ml.g5.12xlarge"
number_of_gpu = 4
health_check_timeout = 300

# Define Model and Endpoint configuration parameter
config = {
  #'HF_MODEL_ID': "ziqingyang/chinese-llama-2-7b", # model_id from hf.co/models jingamz/finetuningllama2
  #'HF_MODEL_ID': "jingamz/llama2ec2",  
  #'HF_MODEL_ID': "jingamz/finetuningllama2",
  #'HF_MODEL_ID': "ziqingyang/chinese-alpaca-2-7b",
  'HF_MODEL_ID': "meta-llama/Llama-2-7b-chat-hf",  
  'SM_NUM_GPUS': json.dumps(number_of_gpu), # Number of GPU used per replica
  'MAX_INPUT_LENGTH': json.dumps(2048),  # Max length of input text
  'MAX_TOTAL_TOKENS': json.dumps(4096),  # Max length of the generation (including input text)
  'MAX_BATCH_TOTAL_TOKENS': json.dumps(8192),  # Limits the number of tokens that can be processed in parallel during the generation
  'HUGGING_FACE_HUB_TOKEN': "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"
  # ,'HF_MODEL_QUANTIZE': "bitsandbytes", # comment in to quantize
}

# check if token is set
#assert config['HUGGING_FACE_HUB_TOKEN'] != "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", "Please set your Hugging Face Hub token"

# create HuggingFaceModel with the image uri
llm_model = HuggingFaceModel(
  role=role,
  image_uri=llm_image,
  env=config
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [6]:
llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  container_startup_health_check_timeout=health_check_timeout, # 10 minutes to be able to load the model
)

------------!

In [7]:
def build_llama2_prompt(messages):
    startPrompt = "<s>[INST] "
    endPrompt = " [/INST]"
    conversation = []
    for index, message in enumerate(messages):
        if message["role"] == "system" and index == 0:
            conversation.append(f"<<SYS>>\n{message['content']}\n<</SYS>>\n\n")
        elif message["role"] == "user":
            conversation.append(message["content"].strip())
        else:
            conversation.append(f" [/INST] {message['content'].strip()} </s><s>[INST] ")

    return startPrompt + "".join(conversation) + endPrompt
  
messages = [
  { "role": "system","content": "你是AWS云计算专家"}
]

In [9]:
# define question and add to messages
instruction = "什么是基于 vCPU 的限制？"
messages.append({"role": "user", "content": instruction})
prompt = build_llama2_prompt(messages)

print(prompt)

chat = llm.predict({"inputs":prompt})

print(chat[0]["generated_text"][len(prompt):])

<s>[INST] <<SYS>>
你是AWS云计算专家
<</SYS>>

什么是基于 vCPU 的限制？什么是基于 vCPU 的限制？ [/INST]
[{'generated_text': '<s>[INST] <<SYS>>\n你是AWS云计算专家\n<</SYS>>\n\n什么是基于 vCPU 的限制？什么是基于 vCPU 的限制？ [/INST] As an AWS cloud computing expert, I can explain to you what vCPU-based limits'}]
 As an AWS cloud computing expert, I can explain to you what vCPU-based limits


In [11]:
# hyperparameters for llm
payload = {
  "inputs":  prompt,
  "parameters": {
    # "do_sample": True,
    # "top_p": 0.6,
    # "temperature": 1,
    # "top_k": 50,
    "max_new_tokens": 1024,
    # "repetition_penalty": 1.03,
    "stop": ["</s>"]
  }
}

# send request to endpoint
response = llm.predict(payload)
print(payload)
print(response)
print(response[0]["generated_text"][len(prompt):])

{'inputs': '<s>[INST] <<SYS>>\n你是AWS云计算专家\n<</SYS>>\n\n什么是基于 vCPU 的限制？什么是基于 vCPU 的限制？ [/INST]', 'parameters': {'max_new_tokens': 1024, 'stop': ['</s>']}}
[{'generated_text': '<s>[INST] <<SYS>>\n你是AWS云计算专家\n<</SYS>>\n\n什么是基于 vCPU 的限制？什么是基于 vCPU 的限制？ [/INST] As an AWS cloud computing expert, I can explain to you what vCPU-based limits are in AWS.\nvCPU-based limits refer to the restrictions placed on the number of virtual central processing units (vCPUs) that can be used by a specific AWS service or resource. vCPUs are virtualized CPU cores that are allocated to a virtual machine (VM) or container instance, allowing multiple virtual instances to run on a single physical host.\nIn AWS, vCPU-based limits are put in place to ensure that the resources are used efficiently and to prevent over-provisioning of resources, which can lead to increased costs and reduced performance. These limits are applied at the instance level and are based on the type of instance and the region in which it is ru

In [12]:
llm.delete_endpoint()