In [None]:
!pip install huggingface_hub==0.26.1 -Uqq -i https://pypi.tuna.tsinghua.edu.cn/simple
!pip install -Uqq sagemaker -i https://pypi.tuna.tsinghua.edu.cn/simple

In [1]:
!pip install modelscope -i https://pypi.tuna.tsinghua.edu.cn/simple -Uqq

In [4]:
from modelscope.hub.snapshot_download import snapshot_download
from pathlib import Path

local_model_path = Path("./bge-m3-model")

local_model_path.mkdir(exist_ok=True)
model_name = "Xorbits/bge-m3"
commit_hash = "v0.0.1"

snapshot_download(model_name, revision=commit_hash, cache_dir=local_model_path)

2024-07-23 05:14:16,170 - modelscope - INFO - Use user-specified model revision: v0.0.1
Downloading: 100%|██████████| 2.00M/2.00M [00:00<00:00, 9.18MB/s]
Downloading: 100%|██████████| 687/687 [00:00<00:00, 2.25kB/s]
Downloading: 100%|██████████| 191/191 [00:00<00:00, 1.09kB/s]
Downloading: 100%|██████████| 123/123 [00:00<00:00, 689B/s]
Downloading: 100%|██████████| 181/181 [00:00<00:00, 1.05kB/s]
Downloading: 100%|██████████| 213k/213k [00:00<00:00, 851kB/s]
Downloading: 100%|██████████| 196k/196k [00:00<00:00, 1.01MB/s]
Downloading: 100%|██████████| 318k/318k [00:00<00:00, 1.48MB/s]
Downloading: 100%|██████████| 2.12G/2.12G [00:13<00:00, 174MB/s] 
Downloading: 100%|██████████| 349/349 [00:00<00:00, 1.79kB/s]
Downloading: 100%|██████████| 2.12G/2.12G [00:11<00:00, 193MB/s] 
Downloading: 100%|██████████| 1.32k/1.32k [00:00<00:00, 7.55kB/s]
Downloading: 100%|██████████| 54.0/54.0 [00:00<00:00, 307B/s]
Downloading: 100%|██████████| 4.83M/4.83M [00:00<00:00, 21.0MB/s]
Downloading: 100%|███

'bge-m3-model/Xorbits/bge-m3'

In [5]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json

role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
bucket = sess.default_bucket()  # bucket to house artifacts

region = sess._region_name
account_id = sess.account_id()

s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [8]:
s3_model_prefix = "GenBI-Model/bge-m3-model"  # folder where model checkpoint will go
if region in ['cn-north-1', 'cn-northwest-1']:
    model_snapshot_path = f'{local_model_path}/{model_name}'
else:
    model_snapshot_path = list(local_model_path.glob("**/snapshots/*"))[0]
s3_code_prefix = "GenBI-Model/bge_m3_deploy_code"
print(f"s3_code_prefix: {s3_code_prefix}")
print(f"model_snapshot_path: {model_snapshot_path}")

s3_code_prefix: GenBI-Model/bge_m3_deploy_code
model_snapshot_path: bge-m3-model/Xorbits/bge-m3


In [9]:
!aws s3 cp --recursive {model_snapshot_path} s3://{bucket}/{s3_model_prefix}

upload: bge-m3-model/Xorbits/bge-m3/config.json to s3://sagemaker-cn-north-1-107487131532/GenBI-Model/bge-m3-model/config.json
upload: bge-m3-model/Xorbits/bge-m3/.msc to s3://sagemaker-cn-north-1-107487131532/GenBI-Model/bge-m3-model/.msc
upload: bge-m3-model/Xorbits/bge-m3/.mdl to s3://sagemaker-cn-north-1-107487131532/GenBI-Model/bge-m3-model/.mdl
upload: bge-m3-model/Xorbits/bge-m3/1_Pooling/config.json to s3://sagemaker-cn-north-1-107487131532/GenBI-Model/bge-m3-model/1_Pooling/config.json
upload: bge-m3-model/Xorbits/bge-m3/.mv to s3://sagemaker-cn-north-1-107487131532/GenBI-Model/bge-m3-model/.mv
upload: bge-m3-model/Xorbits/bge-m3/config_sentence_transformers.json to s3://sagemaker-cn-north-1-107487131532/GenBI-Model/bge-m3-model/config_sentence_transformers.json
upload: bge-m3-model/Xorbits/bge-m3/README.md to s3://sagemaker-cn-north-1-107487131532/GenBI-Model/bge-m3-model/README.md
upload: bge-m3-model/Xorbits/bge-m3/configuration.json to s3://sagemaker-cn-north-1-10748713153

In [10]:
inference_image_uri = (
    f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.23.0-deepspeed0.9.5-cu118"
    
)

#中国区需要替换为下面的image_uri
if region in ['cn-north-1', 'cn-northwest-1']:
    inference_image_uri = (
        f"727897471807.dkr.ecr.{region}.amazonaws.com.cn/djl-inference:0.23.0-deepspeed0.9.5-cu118"
    )

print(f"Image going to be used is ---- > {inference_image_uri}")

Image going to be used is ---- > 727897471807.dkr.ecr.cn-north-1.amazonaws.com.cn/djl-inference:0.23.0-deepspeed0.9.5-cu118


In [11]:
!mkdir -p bge_m3_deploy_code


In [12]:
%%writefile bge_m3_deploy_code/model.py
from djl_python import Input, Output
import torch
import logging
import math
import os
# from FlagEmbedding import FlagModel
from FlagEmbedding import BGEM3FlagModel

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f'--device={device}')

def load_model(properties):
    tensor_parallel = properties["tensor_parallel_degree"]
    model_location = properties['model_dir']
    if "model_id" in properties:
        model_location = properties['model_id']
    logging.info(f"Loading model in {model_location}")

    # model =  FlagModel(model_location)
    model = BGEM3FlagModel(model_location, use_fp16=True)
    
    return model

model = None

def handle(inputs: Input):
    global model
    if not model:
        model = load_model(inputs.get_properties())

    if inputs.is_empty():
        return None
    data = inputs.get_as_json()
    
    input_sentences = None
    inputs = data["inputs"]
    if isinstance(inputs, list):
        input_sentences = inputs
    else:
        input_sentences =  [inputs]
        
    is_query = None
    if "is_query" in data:
        is_query = data["is_query"]
    instruction = None
    if "instruction" in data:  
        instruction = data["instruction"]
    max_length = 8192
    if max_length in data:
        max_length = data["max_length"]
    logging.info(f"inputs: {input_sentences}")
    logging.info(f"is_query: {is_query}")
    logging.info(f"instruction: {instruction}")
    
    if is_query and instruction:
        input_sentences = [ instruction + sent for sent in input_sentences ]
        
    sentence_embeddings =  model.encode(input_sentences, max_length=max_length)['dense_vecs']
    
    #result = {"sentence_embeddings": sentence_embeddings}
    return Output().add_as_json(sentence_embeddings)

Writing bge_m3_deploy_code/model.py


In [13]:
print(f"option.s3url ==> s3://{bucket}/{s3_model_prefix}/")


option.s3url ==> s3://sagemaker-cn-north-1-107487131532/GenBI-Model/bge-m3-model/


In [14]:
import os

if not os.path.exists("bge_m3_deploy_code"):
    os.mkdir("bge_m3_deploy_code")

with open('bge_m3_deploy_code/serving.properties', 'w') as f:
    f.write("engine=Python")
    f.write("\n")
    f.write("option.tensor_parallel_degree=1")
    f.write("\n")
    f.write(f"option.s3url=s3://{bucket}/{s3_model_prefix}/")

In [15]:
%%writefile bge_m3_deploy_code/requirements.txt
-i https://pypi.tuna.tsinghua.edu.cn/simple
FlagEmbedding==1.2.3

Writing bge_m3_deploy_code/requirements.txt


In [16]:
!rm s2e_model.tar.gz
!cd bge_m3_deploy_code && rm -rf ".ipynb_checkpoints"
!tar czvf s2e_model.tar.gz bge_m3_deploy_code

rm: cannot remove ‘s2e_model.tar.gz’: No such file or directory
bge_m3_deploy_code/
bge_m3_deploy_code/model.py
bge_m3_deploy_code/requirements.txt
bge_m3_deploy_code/serving.properties


In [17]:
s3_code_artifact = sess.upload_data("s2e_model.tar.gz", bucket, s3_code_prefix)
print(f"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}")

S3 Code or Model tar ball uploaded to --- > s3://sagemaker-cn-north-1-107487131532/GenBI-Model/bge_m3_deploy_code/s2e_model.tar.gz


In [18]:
from sagemaker.utils import name_from_base
import boto3

model_name = name_from_base("bge-m3") #Note: Need to specify model_name
print(model_name)
print(f"Image going to be used is ---- > {inference_image_uri}")

create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer={
        "Image": inference_image_uri,
        "ModelDataUrl": s3_code_artifact
    },
    
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

bge-m3-2024-07-23-05-21-23-147
Image going to be used is ---- > 727897471807.dkr.ecr.cn-north-1.amazonaws.com.cn/djl-inference:0.23.0-deepspeed0.9.5-cu118
Created Model: arn:aws-cn:sagemaker:cn-north-1:107487131532:model/bge-m3-2024-07-23-05-21-23-147


In [19]:
endpoint_config_name = f"{model_name}-config"
endpoint_name = f"{model_name}-endpoint"

endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "VariantName": "variant1",
            "ModelName": model_name,
            "InstanceType": "ml.g4dn.xlarge",
            "InitialInstanceCount": 1,
            # "VolumeSizeInGB" : 400,
            # "ModelDataDownloadTimeoutInSeconds": 2400,
            "ContainerStartupHealthCheckTimeoutInSeconds": 15*60,
        },
    ],
)
endpoint_config_response

{'EndpointConfigArn': 'arn:aws-cn:sagemaker:cn-north-1:107487131532:endpoint-config/bge-m3-2024-07-23-05-21-23-147-config',
 'ResponseMetadata': {'RequestId': '41f597bf-4a7e-4621-886e-f1643f48c833',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '41f597bf-4a7e-4621-886e-f1643f48c833',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '122',
   'date': 'Tue, 23 Jul 2024 05:21:40 GMT'},
  'RetryAttempts': 0}}

In [20]:
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=f"{endpoint_name}", EndpointConfigName=endpoint_config_name
)
print(f"Created Endpoint: {create_endpoint_response['EndpointArn']}")

Created Endpoint: arn:aws-cn:sagemaker:cn-north-1:107487131532:endpoint/bge-m3-2024-07-23-05-21-23-147-endpoint


In [21]:
import time

resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws-cn:sagemaker:cn-north-1:107487131532:endpoint/bge-m3-2024-07-23-05-21-23-147-endpoint
Status: InService


In [22]:
def get_vector_by_sm_endpoint(questions, sm_client, endpoint_name):
    parameters = {
    }

    response_model = sm_client.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=json.dumps(
            {
                "inputs": questions,
                "is_query": True,
                "instruction" :  "Represent this sentence for searching relevant passages:"
            }
        ),
        ContentType="application/json",
    )
    # 中文instruction => 为这个句子生成表示以用于检索相关文章：
    json_str = response_model['Body'].read().decode('utf8')
    json_obj = json.loads(json_str)
    embeddings = json_obj
    return embeddings

In [23]:
prompts1 = ["你好啊，大聪明"]

emb = get_vector_by_sm_endpoint(prompts1, smr_client, endpoint_name)
print(emb)

[[-0.01549530029296875, 0.046417236328125, 0.00791168212890625, -0.0018949508666992188, -0.03179931640625, -0.004177093505859375, 0.025177001953125, -0.0236968994140625, 4.488229751586914e-05, -0.01200103759765625, -0.00023889541625976562, 0.012359619140625, -0.00391387939453125, 0.019317626953125, -0.0287322998046875, 0.01177978515625, 0.015350341796875, 0.007404327392578125, -0.035308837890625, -0.023101806640625, -0.0065460205078125, 0.0272216796875, -0.035125732421875, 0.005901336669921875, -0.0223236083984375, -0.005985260009765625, 0.0088653564453125, 0.0028057098388671875, 0.0316162109375, 0.038848876953125, 0.029327392578125, -0.0131988525390625, -0.0020904541015625, -0.0396728515625, 0.0185394287109375, -0.0194244384765625, -0.01190948486328125, -0.00555419921875, -0.042022705078125, -0.0015783309936523438, 0.006755828857421875, -0.004119873046875, 0.035308837890625, -0.019287109375, -0.0233154296875, -0.0037364959716796875, -0.01470184326171875, -0.00809478759765625, -0.00101