In [1]:
import boto3, json, sagemaker, time
from sagemaker import get_execution_role
from pathlib import Path

# One boto3 session backs every client in this notebook so they all share the
# same credentials and region.
sess = boto3.Session()
sm = sess.client("sagemaker")  # used below to create/describe/delete the model, endpoint config and endpoint
sagemaker_session = sagemaker.Session(boto_session=sess)
role = get_execution_role()  # passed to create_model as ExecutionRoleArn
# Derive the runtime client and region from `sess` instead of boto3's implicit
# default session, so reconfiguring `sess` (profile, region) applies everywhere.
client = sess.client("sagemaker-runtime")  # used below for invoke_endpoint
region = sess.region_name

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


## Download model


In [None]:
# # install ngc cli
# !wget --content-disposition https://api.ngc.nvidia.com/v2/resources/nvidia/ngc-apps/ngc_cli/versions/3.39.0/zip -O ngc_cli_3.39.0.zip
# !unzip ngc_cli_3.39.0.zip
# !unzip ngccli_linux.zip
# !chmod u+x ngc-cli
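# # `!export ...` only affects a throwaway subshell, so the PATH change is lost before the next cell runs.
# # Extend the kernel's own PATH instead (needs `import os`):
# os.environ["PATH"] += os.pathsep + str(Path.cwd() / "ngc-cli")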

In [None]:
# !ngc config set

In [33]:
# !ngc registry model download-version "nvidian/nemo-llm/llama-2-7b-chat:LLAMA-2-7B-CHAT-4K-FP16-1-A100.24.02.rc2"

/bin/sh: ngc: command not found


In [15]:
# Resolve the downloaded model archive relative to the notebook's working
# directory; the result is an absolute pathlib.Path.
current_directory = Path.cwd()
path = current_directory.joinpath(
    'llama-2-7b-chat_vLLAMA-2-7B-CHAT-4K-FP16-1-A100.24.02.rc2/LLAMA-2-7B-CHAT-4K-FP16-1-A100.24.02.rc2.tar.gz'
)

In [16]:
# Upload the model archive under the "nim/" prefix of the session's default
# bucket and keep the returned S3 URI.
# `path` is a pathlib.Path; upload_data documents a str path and older boto3
# releases reject non-string filenames, so convert explicitly.
model_uri = sagemaker_session.upload_data(path=str(path), key_prefix="nim")

In [4]:
# S3 location of the model archive. Built from the session's default bucket
# (the bucket upload_data writes to when none is given) instead of a
# hard-coded account ID and region, so the notebook works outside the
# original account.
model_uri = (
    f"s3://{sagemaker_session.default_bucket()}"
    "/nim/LLAMA-2-7B-CHAT-4K-FP16-1-A100.24.02.rc2.tar.gz"
)

In [5]:
# ECR URI of the NIM container image; used as the "Image" of the model's
# primary container.
# NOTE(review): the account ID and region are hard-coded; presumably the image
# has to exist in this exact ECR repository — confirm before running in
# another account or region.
nim_image_uri = "354625738399.dkr.ecr.us-east-1.amazonaws.com/nim-24.02-sm-final"

In [6]:
# Environment variables set on the container.
# NOTE(review): presumably read by the NIM image to select the served model
# and GPU count — confirm against the image's documentation.
nim_environment = {
    "SAGEMAKER_MODEL_NAME": "llama-2-7b",
    "SAGEMAKER_NUM_GPUS": "1",
}

# Primary container definition passed to create_model: image, model artifact
# location, and environment.
container = {
    "Image": nim_image_uri,
    "ModelDataUrl": model_uri,
    "Environment": nim_environment,
}

In [7]:
# Common name prefix for the model, endpoint config and endpoint created
# below; a UTC timestamp is appended to it for each resource.
sm_prefix = "nim-llama-2-7b-1A100-"

In [8]:
# Model name = shared prefix + current UTC timestamp, so each run gets a
# fresh name.
model_timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
sm_model_name = f"{sm_prefix}{model_timestamp}"

In [9]:
# Register the model: bind the container definition to the execution role
# under the timestamped model name.
create_model_response = sm.create_model(
    ModelName=sm_model_name,
    ExecutionRoleArn=role,
    PrimaryContainer=container,
)

model_arn = create_model_response["ModelArn"]
print(f"Model Arn: {model_arn}")

Model Arn: arn:aws:sagemaker:us-east-1:354625738399:model/symlink-nim-llama-2-7b-a100-2024-03-12-01-28-48


In [10]:
# Endpoint config name = shared prefix + current UTC timestamp.
config_timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
endpoint_config_name = f"{sm_prefix}{config_timestamp}"

# Single production variant: one ml.p4d.24xlarge instance serving the model
# created above, with variant weight 1.
all_traffic_variant = {
    "InstanceType": "ml.p4d.24xlarge",
    "InitialVariantWeight": 1,
    "InitialInstanceCount": 1,
    "ModelName": sm_model_name,
    "VariantName": "AllTraffic",
}

create_endpoint_config_response = sm.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[all_traffic_variant],
)

endpoint_config_arn = create_endpoint_config_response["EndpointConfigArn"]
print(f"Endpoint Config Arn: {endpoint_config_arn}")

Endpoint Config Arn: arn:aws:sagemaker:us-east-1:354625738399:endpoint-config/symlink-nim-llama-2-7b-a100-2024-03-12-01-28-48


In [11]:
# Endpoint name = shared prefix + current UTC timestamp.
endpoint_timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
endpoint_name = f"{sm_prefix}{endpoint_timestamp}"

# Start endpoint creation from the config defined above. The status is
# polled separately in the following cell.
create_endpoint_response = sm.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)

endpoint_arn = create_endpoint_response["EndpointArn"]
print(f"Endpoint Arn: {endpoint_arn}")

Endpoint Arn: arn:aws:sagemaker:us-east-1:354625738399:endpoint/symlink-nim-llama-2-7b-a100-2024-03-12-01-28-49


In [12]:
# Poll until the endpoint leaves the "Creating" state, then fail fast if it
# did not reach "InService", so later cells never invoke a broken endpoint.
resp = sm.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)  # poll once per minute
    resp = sm.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

if status != "InService":
    # DescribeEndpoint includes FailureReason when the endpoint failed;
    # surface it rather than letting the invoke cell fail with a vaguer error.
    raise RuntimeError(
        f"Endpoint {endpoint_name} ended in status {status!r}: "
        f"{resp.get('FailureReason', 'no failure reason reported')}"
    )

Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws:sagemaker:us-east-1:354625738399:endpoint/symlink-nim-llama-2-7b-a100-2024-03-12-01-28-49
Status: InService


In [14]:
# Completion request sent to the endpoint as a JSON document.
payload = {
    "model": "llama-2-7b",  # same value as SAGEMAKER_MODEL_NAME in the container environment
    "prompt": "The capital of France is called",
    "max_tokens": 100,
    "temperature": 1,
    "n": 1,
    "stream": False,
    # NOTE(review): "string" looks like a schema-example placeholder rather
    # than an intentional stop sequence — confirm.
    "stop": ["string"],
    "frequency_penalty": 0.0,
}

# Serialize the request, then call the endpoint through the runtime client.
request_body = json.dumps(payload)
response = client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="application/json",
    Body=request_body,
)

# Read the response body fully, decode it, and print the parsed JSON.
raw_body = response["Body"].read()
print(json.loads(raw_body.decode("utf8")))

{'id': 'cmpl-4f77577f-24a3-4e70-bfec-1fcef3420720', 'object': 'text_completion', 'created': 1710207480, 'model': 'llama-2-7b', 'choices': [{'index': 0, 'text': " Paris. It's a beautiful city with many famous landmarks, such as the Eiffel Tower and Notre Dame Cathedral.\nParis has been home to some very important historical events in Europe including: 1789 French Revolution began here; Napoleon Bonaparte was crowned emperor at the cathedral (Notre-Dame)in year 1804 & ended his reign there after defeat by allied forces during Franco Prussian War(1", 'logprobs': {'text_offset': [], 'token_logprobs': [0.0, 0.0], 'tokens': [], 'top_logprobs': []}}], 'usage': {'prompt_tokens': 7, 'total_tokens': 107, 'completion_tokens': 100}}


In [15]:
# Tear down the resources created above: the endpoint first, then its
# configuration, then the model. The last call is the cell's final
# expression, so its response is what the notebook displays.
sm.delete_endpoint(EndpointName=endpoint_name)
sm.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
sm.delete_model(ModelName=sm_model_name)

{'ResponseMetadata': {'RequestId': '17bb4ee2-2510-4a2e-8609-69ef82741de5',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '17bb4ee2-2510-4a2e-8609-69ef82741de5',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Tue, 12 Mar 2024 01:40:41 GMT'},
  'RetryAttempts': 0}}