# Deploy Fine-tuned LLM via SageMaker
In this notebook, I will deploy the fine-tuned LLaMA2 model for serving, using the model artifact produced by the training job.

In [1]:
!pip install "sagemaker>=2.175.0" --upgrade --quiet

In [4]:
import sagemaker
import boto3
# Bootstrap session, used below only to look up the default bucket.
sess = sagemaker.Session()

# Bucket that SageMaker uses for uploaded data, models and logs.
# Leave as None to fall back to the session's default bucket
# (SageMaker creates that bucket automatically if it does not exist yet).
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role. If get_execution_role() raises ValueError,
# fall back to looking up the role named 'sagemaker_execution_role' through IAM.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# Re-create the session so that it is bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Report the resolved role and region.
print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::513033806411:role/service-role/AmazonSageMaker-ExecutionRole-20210815T111148
sagemaker session region: us-east-1


In [2]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Resolve the container image URI for the "huggingface" LLM backend, pinned to version 0.9.3.
llm_image = get_huggingface_llm_image_uri("huggingface", version="0.9.3")

# Show which ECR image was resolved.
print(f"llm image uri: {llm_image}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
llm image uri: 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi0.9.3-gpu-py39-cu118-ubuntu20.04


In [None]:
from sagemaker.huggingface import HuggingFaceModel
import json

import os

# Endpoint sizing
instance_type = "ml.g5.12xlarge"
number_of_gpu = 4  # forwarded to the container as SM_NUM_GPUS; presumably should match the GPU count of instance_type
health_check_timeout = 900  # forwarded to deploy() as container_startup_health_check_timeout

# Environment variables for the serving container.
# json.dumps turns the numeric settings into strings.
config = {
  'HF_MODEL_ID': '/opt/ml/model',  # container-local path; assumes model_data is unpacked there -- confirm against SageMaker docs
  'SM_NUM_GPUS': json.dumps(number_of_gpu),  # Number of GPU used per replica
  'MAX_INPUT_LENGTH': json.dumps(2048),  # Max length of input text
  'MAX_TOTAL_TOKENS': json.dumps(4096),  # Max length of the generation (including input text)
  'MAX_BATCH_TOTAL_TOKENS': json.dumps(8192),  # Limits the number of tokens that can be processed in parallel during the generation
  # 'HF_MODEL_QUANTIZE': "bitsandbytes",  # Optional: uncomment to quantize
}

# Never paste a Hugging Face Hub token into the notebook. Export it as the
# HUGGING_FACE_HUB_TOKEN environment variable before starting the kernel instead.
# The token is only forwarded to the endpoint when it is actually set, so no
# placeholder string ends up in the endpoint configuration.
hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
if hf_token:
    config['HUGGING_FACE_HUB_TOKEN'] = hf_token

# Specify the S3 URI of your model artifact (output of the fine-tuning job)
model_data = 's3://sagemaker-us-east-1-513033806411/huggingface-qlora-2024-02-17-17-15-24-2024-02-17-17-16-14-114/output/model.tar.gz'

# Create the HuggingFaceModel from the image URI, the model artifact and the container config
llm_model = HuggingFaceModel(
  model_data=model_data,  # Use your fine-tuned model artifact
  role=role,  # IAM role with necessary permissions
  image_uri=llm_image,  # LLM image URI
  env=config  # Pass the configuration
)

# Deploy the model to a single-instance endpoint; `llm` is the object used for predictions later
llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  container_startup_health_check_timeout=health_check_timeout,  # Adjust as necessary
)


----------!

### Test the model endpoint
The model has finished deploying and is ready for testing. Let's try it out!

In [18]:
# Prompt in the instruction / post / image-context / response layout.
# It is built line by line so that the trailing spaces after "Text:" and
# "Celebrities:" (empty fields) are explicit rather than invisible.
prompt = "\n".join([
    "### Instruction:",
    "Respond to this Reddit post with an award winning top comment.",
    "",
    "### Reddit Post:",
    "Let's go Easter, hell yeah!!",
    "",
    "### Image Context:",
    "- Description: A person dressed as Jesus riding a Harley-Davidson motorcycle with Easter decorations in the background",
    "- Text: ",
    "- Celebrities: ",
    "",
    "### Response:",
])

# Generation settings sent with the request: up to 64 new tokens, with "</s>" as the stop sequence.
parameters = dict(max_new_tokens=64, stop=["</s>"])

# Alternative sampling configuration, kept for reference (not active):
#   do_sample=True, top_p=0.6, temperature=0.9, top_k=50,
#   max_new_tokens=512, repetition_penalty=1.03, stop=["</s>"]

# Send the prompt and the generation settings to the deployed endpoint.
payload = {"inputs": prompt, "parameters": parameters}
response = llm.predict(payload)

print(response)

[{'generated_text': "\nJesus Christ on a Harley-Davidson. \n\nThat's a good name for a band."}]