# Multi-lora Serving

## Library Installation

In [45]:
! pip install -U sagemaker==2.230.0 boto3==1.35.9 python-dotenv==1.0.1 sagemaker-studio-image-build==0.6.0



## Setup Credentials

In [46]:
import sagemaker
import dotenv
import boto3

print(dotenv.load_dotenv('./.env', override=True))
sess = sagemaker.Session()

s3_bucket = sess.default_bucket()
region = boto3.session.Session().region_name
role = sagemaker.get_execution_role()

print(f"{role =}")
print(f"{region =}")

True
role ='arn:aws:iam::466407698387:role/service-role/AmazonSageMaker-ExecutionRole-20240729T083760'
region ='ap-south-1'


# Setup
- inline policy with principal
- model access
- HF token in .env file

In [35]:
import json

endpoint_name = 'multi-lora-model'
instance_type = 'ml.g5.2xlarge' # ml.g5.xlarge
base_model = "mistralai/Mistral-7B-Instruct-v0.1"

print("https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1")
policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": "iam:PassRole",
            "Resource": role,
        }
    ]
}

principal = {
    "Effect": "Allow",
    "Principal": {
        "Service": "codebuild.amazonaws.com"
    },
    "Action": "sts:AssumeRole"
}

print(json.dumps(policy, indent=4))
print("\n\n")
print(json.dumps(principal, indent=4))

https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": "iam:PassRole",
            "Resource": "arn:aws:iam::466407698387:role/service-role/AmazonSageMaker-ExecutionRole-20240729T083760"
        }
    ]
}



{
    "Effect": "Allow",
    "Principal": {
        "Service": "codebuild.amazonaws.com"
    },
    "Action": "sts:AssumeRole"
}


# Upload Dockerfile to AWS ECR

In [33]:
!sm-docker build . --file ./Dockerfile --repository "lorax:latest"

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
...................[Container] 2024/09/15 05:41:49.408687 Running on CodeBuild On-demand

[Container] 2024/09/15 05:41:49.408697 Waiting for agent ping
[Container] 2024/09/15 05:41:49.509699 Waiting for DOWNLOAD_SOURCE
[Container] 2024/09/15 05:41:49.756315 Phase is DOWNLOAD_SOURCE
[Container] 2024/09/15 05:41:49.790680 CODEBUILD_SRC_DIR=/codebuild/output/src735727993/src
[Container] 2024/09/15 05:41:49.791334 YAML location is /codebuild/output/src735727993/src/buildspec.yml
[Container] 2024/09/15 05:41:49.793284 Setting HTTP client timeout to higher timeout for S3 source
[Container] 2024/09/15 05:41:49.793632 Processing environment variables
[Container] 2024/09/15 05:41:49.837929 No runtime version selected in buildspec.
[Container] 2024/09/15 05:41:49.879346 Moving to directo

# Deploy SageMaker model

In [48]:
import json
import os
import boto3
from sagemaker import Model
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

account_id=boto3.client('sts').get_caller_identity().get('Account')

image_uri = f"{account_id}.dkr.ecr.{region}.amazonaws.com/lorax:latest"

environment_vars = {
  'MODEL_ID': base_model,
  'MAX_BATCH_TOTAL_TOKENS': "4096",
  'MAX_CONCURRENT_REQUESTS': "256",
  'HF_TOKEN': os.getenv("HF_TOKEN"),
  'PORT': '8080',
  'NVIDIA_VISIBLE_DEVICES': 'all',
}

lorax_model = Model(
    image_uri=image_uri,
    role=role,
    env=environment_vars
)

lorax_predictor = lorax_model.deploy(
    endpoint_name=endpoint_name,
    initial_instance_count=1,
    instance_type=instance_type,
    container_startup_health_check_timeout=600,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer()
)

-------------!

# Multi-LoRA inference

In [53]:
adapters = [
    {
        "name": "Named Entity Recognition (NER)",
        "url": "https://huggingface.co/predibase/gsm8k",
        "adapter_id": "predibase/gsm8k",
        "prompt": """Your task is a Named Entity Recognition (NER) task. \n\
            Predict the category of each entity, then place the entity into the list associated with the category in an output JSON payload.\n\
            Below is an example: \n\
            Input: EU rejects German call to boycott British lamb.\n\
            Output: {"person": [], "organization": ["EU"], "location": [], "miscellaneous": ["German", "British"]} \n\
            Now, complete the task. \n 
            Input: By the close Yorkshire had turned that into a 37-run advantage but off-spinner David had scuttled their hopes, \
                taking four for 24 in 48 balls and leaving them hanging on 119 for five and praying for rain. \n\
            Output:""",
       "expected_output": '{"person": [], "organization": [], "location": ["Yorkshire"], "miscellaneous": ["David"]}'
    },
    {
        "name": "Customer support ticket classifier",
        "url": "https://huggingface.co/predibase/customer_support",
        "adapter_id": "predibase/customer_support",
        "prompt": """Consider the case of a customer contacting the support center.
            The term 'task type' refers to the reason for why the customer contacted support.
            ### The possible task types are: ### 
            - replace card
            - transfer money
            - check balance
            - order checks
            - pay bill
            - reset password
            - schedule appointment
            - get branch hours
            - none of the above

            Summarize the issue/question/reason that drove the customer to contact support:

            ### Transcript: [noise] [noise] [noise] [noise] hello hello hi i'm sorry this this call uh hello this is harper valley national bank my name is dawn how can i help you today hi oh okay my name is jennifer brown and i need to check my account balance if i could [noise] [noise] [noise] [noise] what account would you like to check um [noise] uhm my savings account please [noise] [noise] oh but the way that you're doing one moment hello yeah one moment uh huh no problem [noise] your account balance is eighty two dollars is there anything else i can help you with no i don't think so thank you so much you were very helpful thank you have a good day bye bye [noise] you too 

            ### Task Type:
            test_transcript =""",
        "expected_output": "check balance"
    }
]

In [61]:
import os
from transformers import AutoTokenizer
from sagemaker import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
from datasets import load_dataset

def get_request_body(adapter_id: str, prompt: str):
    return {
        "inputs": prompt,
        "parameters": {
            "adapter_id": adapter_id,
            "max_new_tokens": 512,
            "adapter_source": "hub",
            "temperature": 0,
            "top_p": 0.1,
        }
    }

deployed_llm = Predictor(
    endpoint_name=endpoint_name,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer()
)

def call_endpoint(adapter):
    outputs = deployed_llm.predict(get_request_body(adapter["adapter_id"], adapter["prompt"]))
    return outputs[0]["generated_text"].strip()

for adapter in adapters:
    print(f"adapter: {adapter['name']} with url {adapter['url']} \nprompt=\n{adapter['prompt']}\n")
    print(f"expected output:  {adapter['expected_output']}\ngenerated output: {call_endpoint(adapter)}")
    print("\n ---------------------- \n")

adapter: Named Entity Recognition (NER) with url https://huggingface.co/predibase/gsm8k 
prompt=
Your task is a Named Entity Recognition (NER) task. 
            Predict the category of each entity, then place the entity into the list associated with the category in an output JSON payload.
            Below is an example: 
            Input: EU rejects German call to boycott British lamb.
            Output: {"person": [], "organization": ["EU"], "location": [], "miscellaneous": ["German", "British"]} 
            Now, complete the task. 
 
            Input: By the close Yorkshire had turned that into a 37-run advantage but off-spinner David had scuttled their hopes,                 taking four for 24 in 48 balls and leaving them hanging on 119 for five and praying for rain. 
            Output:

expected output:  {"person": [], "organization": [], "location": ["Yorkshire"], "miscellaneous": ["David"]}
generated output: {"person": [], "organization": [], "location": ["Yorkshire"], "mi

# Cleanup

In [68]:
deployed_llm.delete_model()
deployed_llm.delete_endpoint()

AttributeError: 'Predictor' object has no attribute 'delete_endpoint_config'