JupyterLab in SageMaker:
deploy-llm-model
JupyterLab •  10 GB  •  ml.m5.xlarge

In [None]:
!pip install --upgrade boto3 sagemaker

In [None]:
import sagemaker
import boto3
# Name of the S3 bucket the SageMaker session uses for data, models and logs.
# Leave it as None to use the session's default bucket (SageMaker creates
# that bucket automatically when it does not exist yet).
sagemaker_session_bucket = None

# A throwaway session is only needed to look up the default bucket name.
sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the IAM role ARN used by SageMaker. get_execution_role() raises
# ValueError in the fallback case handled here, where the role is looked up
# by its name through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_info = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_info['Role']['Arn']

# Final session, bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Retrieve the ECR image URI of the Hugging Face LLM (TGI) inference container.
# The version is pinned to 2.0.0 because this notebook deploys a Llama 3 model:
# the previously pinned 0.8.2 container predates Llama 3 and cannot load it.
# An up-to-date `sagemaker` SDK is required for this version to be resolvable.
llm_image = get_huggingface_llm_image_uri(
  "huggingface",
  version="2.0.0"
)

# print ecr image uri
print(f"llm image uri: {llm_image}")

In [None]:
import json
import os
from getpass import getpass

from sagemaker.huggingface import HuggingFaceModel

# sagemaker config
# number_of_gpu must match the GPU count of instance_type
# (ml.g5.48xlarge -> 8 GPUs, ml.g5.2xlarge -> 1 GPU).
instance_type = "ml.g5.48xlarge"
number_of_gpu = 8

# Hugging Face access token used by the container to download the model.
# Never paste the token into the notebook: it is read from the
# HUGGING_FACE_HUB_TOKEN environment variable, or prompted for without echo.
hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN") or getpass("Hugging Face access token: ")
if not hf_token:
    raise ValueError("A Hugging Face access token is required to download the model.")

# TGI config
config = {
  'HF_MODEL_ID': "meta-llama/Meta-Llama-3-70B", # model id from https://huggingface.co/meta-llama/Meta-Llama-3-70B
  'SM_NUM_GPUS': json.dumps(number_of_gpu), # Number of GPU used per replica
  'HUGGING_FACE_HUB_TOKEN': hf_token, # token for downloading the model from the Hub
  # Optional settings -- uncomment the ones you need:
  # 'MAX_INPUT_LENGTH': json.dumps(1024),  # Max length of input text
  # 'MAX_TOTAL_TOKENS': json.dumps(2048),  # Max length of the generation (including input text)
  # 'HF_MODEL_QUANTIZE': "bitsandbytes", # comment in to quantize
}

# create HuggingFaceModel
llm_model = HuggingFaceModel(
  role=role,
  image_uri=llm_image,
  env=config
)

In [None]:
# Deploy model to an endpoint.
# The deployment settings are gathered in one place and then unpacked into
# the deploy() call.
deploy_options = {
  "initial_instance_count": 1,
  "instance_type": instance_type,
  # Startup health-check timeout for the container, passed through unchanged.
  "container_startup_health_check_timeout": 2100,
  # "volume_size": 400, # If using an instance with local SSD storage, volume_size must be None, e.g. p4 but not p3
}

llm = llm_model.deploy(**deploy_options)

In [None]:
# Send a test request to the deployed endpoint; the response is displayed
# as the cell output.
llm.predict({
  "inputs": "Write me a paragraph of maximum 20 words about machine learning.",
})

In [None]:
# Clean up once the model is no longer needed, to avoid being charged for
# the running endpoint. The model is removed first, then the endpoint
# (same order as before -- do not swap without checking the SDK behaviour).
for cleanup_step in (llm.delete_model, llm.delete_endpoint):
    cleanup_step()