# Deploy GLM4-V

In [2]:
import sagemaker
import jinja2
from sagemaker import image_uris
import boto3
import os
import time
import json
from pathlib import Path

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [3]:
# Execution role for the endpoint.
role = sagemaker.get_execution_role()

# SageMaker session used for interacting with the different AWS APIs.
sess = sagemaker.session.Session()

# Bucket that will house the artifacts.
bucket = sess.default_bucket()

print(role, bucket, sep="\n")

arn:aws:iam::420486383638:role/service-role/SageMaker-datascientist-test-20230920
sagemaker-us-east-1-420486383638


In [4]:
# Where the code artifact will live in S3.
model_bucket = sess.default_bucket()  # bucket to house model artifacts
s3_code_prefix = "hf_home/code/glm4-v"  # folder within bucket where code artifact will go

# Read the region through the session's public property instead of reaching
# into the private `_region_name` attribute.
region = sess.boto_region_name
account_id = sess.account_id()

# boto3 clients used by the cells below.
s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

jinja_env = jinja2.Environment()

In [77]:
# Start from an empty staging directory so a re-run does not pick up stale files.
!rm -rf code_glm4v
!mkdir -p code_glm4v
# Copy the handler script and its requirements from the working directory
# into the staging directory.
!cp model.py code_glm4v/model.py
!cp requirements.txt code_glm4v/requirements.txt

In [78]:
from IPython.core.magic import register_line_cell_magic

@register_line_cell_magic
def writetemplate(line, cell=None):
    """Write the cell body to a file after `str.format` substitution.

    Usage: ``%%writetemplate <path>`` as the first line of a cell. Every
    ``{name}`` placeholder in the cell body is replaced with the value of the
    notebook global of that name; literal braces must be doubled (``{{``/``}}``).

    Args:
        line: Text following the magic name; used as the output file path.
        cell: The cell body. ``None`` when invoked as a line magic.

    Raises:
        ValueError: If invoked as a line magic (there is no body to write).
        KeyError: If the body references a placeholder that is not a global.
    """
    # The decorator registers this for both %writetemplate and %%writetemplate;
    # in the line form IPython passes no cell body, so fail with a clear message.
    if cell is None:
        raise ValueError(
            "writetemplate needs a cell body: use it as '%%writetemplate <path>'"
        )
    with open(line.strip(), 'w', encoding='utf-8') as f:
        f.write(cell.format(**globals()))

In [79]:
%%writetemplate code_glm4v/serving.properties
engine = Python
option.tensor_parallel_degree = 3
option.model_loading_timeout = 3600
option.model_id = THUDM/glm-4v-9b

**Image URI for the DJL container is being used here**

In [80]:
# Resolve the container image URI for the "djl-deepspeed" framework,
# version 0.27.0, in the session's region.
inference_image_uri = image_uris.retrieve(
    framework="djl-deepspeed",
    region=region,
    version="0.27.0",
)
print("Image going to be used is ---- > " + inference_image_uri)

Image going to be used is ---- > 763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121


**Create the Tarball and then upload to S3 location**

In [81]:
!rm model.tar.gz
!tar czvf model.tar.gz code_glm4v

code_glm4v/
code_glm4v/model.py
code_glm4v/requirements.txt
code_glm4v/serving.properties


In [82]:
# Upload the archive under the code prefix; the return value is passed as
# the model's ModelDataUrl in the create-model cell.
s3_code_artifact = sess.upload_data(
    path="model.tar.gz",
    bucket=bucket,
    key_prefix=s3_code_prefix,
)

### To create the endpoint, the steps are:

1. Create the Model using the Image container and the Model Tarball uploaded earlier
2. Create the endpoint config using the following key parameters

    a) Instance Type is ml.g5.12xlarge 
    
    b) ContainerStartupHealthCheckTimeoutInSeconds is 3600 to ensure health check starts after the model is ready    
3. Create the end point using the endpoint config created    


#### Create the Model
Use the image URI for the DJL container and the s3 location to which the tarball was uploaded.

The container downloads the model into the `/tmp` space on the instance because SageMaker maps the `/tmp` to the Amazon Elastic Block Store (Amazon EBS) volume that is mounted when we specify the endpoint creation parameter VolumeSizeInGB. 
It leverages [`s5cmd`](https://github.com/peak/s5cmd), which offers a very fast download speed and is hence extremely useful when downloading large models.

For instances like p4dn, which come pre-built with the volume instance, we can continue to leverage the `/tmp` on the container. The size of this mount is large enough to hold the model.


In [83]:
from sagemaker.utils import name_from_base

# Derive the model name from a fixed base; the name printed by the recorded
# run carries a timestamp suffix after the base.
model_name = name_from_base("glm4v-fp16-deepspeed")
print(model_name)

# Container definition: the DJL image plus the code tarball uploaded earlier.
primary_container = {
    "Image": inference_image_uri,
    "ModelDataUrl": s3_code_artifact,
    "Environment": {"MODEL_LOADING_TIMEOUT": "3600"},
}

create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer=primary_container,
)
model_arn = create_model_response["ModelArn"]

print("Created Model:", model_arn)

glm4v-fp16-deepspeed-2024-06-21-06-21-36-187
Created Model: arn:aws:sagemaker:us-east-1:420486383638:model/glm4v-fp16-deepspeed-2024-06-21-06-21-36-187


In [84]:
# Both names are derived from the model name so the three resources are easy
# to match up.
endpoint_config_name = model_name + "-config"
endpoint_name = model_name + "-endpoint"

# Single production variant: one ml.g5.12xlarge instance, with one-hour
# limits for the model data download and the container startup health check.
production_variant = {
    "VariantName": "variant1",
    "ModelName": model_name,
    "InstanceType": "ml.g5.12xlarge",
    "InitialInstanceCount": 1,
    "ModelDataDownloadTimeoutInSeconds": 3600,
    "ContainerStartupHealthCheckTimeoutInSeconds": 3600,
}

endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[production_variant],
)
endpoint_config_response

{'EndpointConfigArn': 'arn:aws:sagemaker:us-east-1:420486383638:endpoint-config/glm4v-fp16-deepspeed-2024-06-21-06-21-36-187-config',
 'ResponseMetadata': {'RequestId': 'cb597938-e083-4ac6-beb8-50d7035d3558',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'cb597938-e083-4ac6-beb8-50d7035d3558',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '132',
   'date': 'Fri, 21 Jun 2024 06:21:38 GMT'},
  'RetryAttempts': 0}}

In [85]:
# Start creating the endpoint from the endpoint config defined above.
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
print("Created Endpoint:", create_endpoint_response["EndpointArn"])

Created Endpoint: arn:aws:sagemaker:us-east-1:420486383638:endpoint/glm4v-fp16-deepspeed-2024-06-21-06-21-36-187-endpoint


### This step can take ~ 20 min or longer so please be patient

In [86]:
import time

# Poll once a minute for as long as the endpoint reports "Creating".
resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# The loop exits on any status other than "Creating", including failure.
# Stop here with the reported reason instead of letting the invocation cells
# fail later with a less helpful error.
if status != "InService":
    failure_reason = resp.get("FailureReason", "no failure reason reported")
    raise RuntimeError(
        f"Endpoint {endpoint_name} ended in status {status}: {failure_reason}"
    )

Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws:sagemaker:us-east-1:420486383638:endpoint/glm4v-fp16-deepspeed-2024-06-21-06-21-36-187-endpoint
Status: InService


#### While you wait for the endpoint to be created, you can read more about:
- [Deep Learning containers for large model inference](https://docs.aws.amazon.com/sagemaker/latest/dg/realtime-endpoints-large-model-dlc.html)

#### Use Boto3 to invoke the endpoint

This is a generative model, so we pass in text as a prompt and the model completes the sentence and returns the results.

You can pass a prompt as input to the model. This is done by setting inputs to a prompt. The model then returns a result for each prompt. The text generation can be configured using appropriate parameters.
These parameters need to be passed to the endpoint as a dictionary of kwargs. Refer to this documentation for more details: https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationConfig

The code sample below illustrates invoking the endpoint with a text prompt (sent together with a base64-encoded image) and also sets some generation parameters.

In [89]:
from io import BytesIO
import base64
import json

from PIL import Image

# Load the sample image, convert it to RGB, re-encode it as JPEG in memory and
# base64-encode the bytes so the image can be embedded in the JSON request.
image = Image.open("data/handwritten.jpeg").convert("RGB")
buffer = BytesIO()
image.save(buffer, format="JPEG")
im_bytes = buffer.getvalue()
im_b64 = base64.b64encode(im_bytes).decode('utf-8')

# Send the image together with a text query; generation options go in
# "gen_kwargs". The response body is read and decoded as UTF-8 text.
response = smr_client.invoke_endpoint(
    EndpointName=endpoint_name,
    Body=json.dumps(
        {
            "role": "user",
            "image": im_b64,
            "query": "描述这张图片",
            "gen_kwargs": {
                "max_length": 2500, 
                "do_sample": True, 
                "top_k": 1
            },
        }
    ),
    ContentType="application/json",
)["Body"].read().decode("utf8")
print(response)

这张图片展示了一张方格纸上的手写数学式子，式子内容是：

$80\times 6=325$

这个式子计算的是80乘以6的结果，答案是325。 <|endoftext|>


## Gradio Interface
You can share this interface with others

In [None]:
!pip install gradio

In [None]:
def generate_response(prompt):
    """Send a text prompt to the endpoint and return the generated text.

    The request is a JSON document with the prompt under "inputs" and the
    sampling options under "parameters". The response body is decoded as
    UTF-8, parsed as JSON, and its "generated_text" field is returned.

    NOTE(review): the direct invocation earlier in this notebook sends
    "role"/"image"/"query"/"gen_kwargs" and prints the raw response text.
    Confirm that the endpoint's handler also accepts this "inputs"/"parameters"
    payload and answers with JSON containing "generated_text".

    Args:
        prompt: Prompt text forwarded to the endpoint.

    Returns:
        The value of "generated_text" in the endpoint's JSON response.
    """
    sampling_options = {
        "do_sample": True,
        "max_new_tokens": 2048,
        "min_new_tokens": 128,
        "temperature": 1.0,
        "top_k": 50,
        "top_p": 0.95,
        "watermark": False,
    }
    request_body = json.dumps({"inputs": prompt, "parameters": sampling_options})

    result = smr_client.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=request_body,
        ContentType="application/json",
    )
    raw_text = result["Body"].read().decode("utf8")
    return json.loads(raw_text)["generated_text"]

In [None]:
import gradio as gr

# One text input wired to generate_response, one text output.
demo = gr.Interface(fn=generate_response, inputs=["text"], outputs=["text"])

# Launched with share=True; the section text above says the interface can be
# shared with others.
demo.launch(share=True)

## Clean Up

In [75]:
# Delete the endpoint.
# NOTE: in the recorded run this call was rejected with a ValidationException
# ("Cannot update in-progress endpoint"), i.e. it was issued while the endpoint
# was still in progress. Presumably it has to be re-run once the endpoint has
# left that state -- confirm.
sm_client.delete_endpoint(EndpointName=endpoint_name)

ClientError: An error occurred (ValidationException) when calling the DeleteEndpoint operation: Cannot update in-progress endpoint "arn:aws:sagemaker:us-east-1:420486383638:endpoint/glm4v-fp16-deepspeed-2024-06-21-06-01-36-128-endpoint".

In [76]:
# Delete the endpoint config and the model. This is done even if the endpoint
# itself failed, so these two resources are still cleaned up.
sm_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
sm_client.delete_model(ModelName=model_name)

{'ResponseMetadata': {'RequestId': 'e03db026-d182-450a-91ba-0d9a13a218f1',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'e03db026-d182-450a-91ba-0d9a13a218f1',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Fri, 21 Jun 2024 06:14:30 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}