# Multi-LoRA Serving

## Setup Credentials

In [46]:
import sagemaker
import boto3
from dotenv import load_dotenv
import os
import json
# Load workshop settings from the shared .env file; the cell below reads
# HF_TOKEN, UID and S3_WORKSHOP_BUCKET from the environment afterwards.
print(load_dotenv('../.env'))

# Fail fast with a readable message when the Hugging Face token is missing or
# empty. `.get()` avoids a bare KeyError, and a real exception class is raised
# (raising a plain string is itself a TypeError).
if not os.environ.get('HF_TOKEN'):
    raise RuntimeError("Please load token")

uid=os.environ["UID"]
print(f"Your UID is {uid}")

s3_bucket = os.environ['S3_WORKSHOP_BUCKET']

# Scope every artifact of this session under the per-user prefix in the workshop bucket.
sess = sagemaker.Session(default_bucket=s3_bucket, default_bucket_prefix=uid)

s3_bucket = sess.default_bucket()
region = boto3.session.Session().region_name
role = sagemaker.get_execution_role()

# Re-read the .env file, this time letting its values override variables that
# are already set. Uses the imported `load_dotenv` directly: the `dotenv`
# module name itself is never imported, so `dotenv.load_dotenv` fails on a
# fresh kernel.
print(load_dotenv('../.env', override=True))

endpoint_name = f"{uid}-multi-lora-model"
instance_type = 'ml.g5.2xlarge'
base_model = "mistralai/Mistral-7B-Instruct-v0.1"
# Keep the region resolved from the boto3 session so the ECR image URI below
# matches the region the session operates in; fall back to the workshop
# default only when no region is configured.
region = region or 'ap-south-1'

image_uri = f"763104351884.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-tgi-inference:2.7.0-tgi3.3.4-gpu-py311-cu124-ubuntu22.04-v2.3"
print("https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1")
print(endpoint_name)
print(f"{role =}")
print(f"{region =}")

True
Your UID is 7l4srb9
True
https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1
7l4srb9-multi-lora-model
role ='arn:aws:iam::009676737623:role/service-role/AmazonSageMaker-ExecutionRole-20250814T174659'
region ='ap-south-1'


# Deploy SageMaker model

In [47]:
import json
import os
import boto3
from sagemaker import Model
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# AWS account ID of the identity running this notebook, resolved through STS.
account_id = boto3.client("sts").get_caller_identity().get("Account")

# Environment passed to the serving container.
environment_vars = {
    # Base model to load, plus the token read from the HF_TOKEN variable.
    "MODEL_ID": base_model,
    "HF_MODEL_ID": base_model,
    "HF_TOKEN": os.getenv("HF_TOKEN"),
    "PORT": "8080",
    "NVIDIA_VISIBLE_DEVICES": "all",
    # Comma-separated adapter IDs; the inference cell selects among them
    # per request through `adapter_id`.
    "LORA_ADAPTERS": ",".join(("predibase/customer_support", "predibase/conllpp")),
    # Token and concurrency limits (values are passed as strings).
    "MAX_INPUT_LENGTH": "2048",
    "MAX_TOTAL_TOKENS": "4096",
    "MAX_BATCH_PREFILL_TOKENS": "8192",
    "MAX_BATCH_TOTAL_TOKENS": "16384",
    "MAX_CONCURRENT_REQUESTS": "128",
}

# Describe the model: container image + execution role + container environment.
lorax_model = Model(
    image_uri=image_uri,
    role=role,
    env=environment_vars,
    sagemaker_session=sess,
)

# Create the endpoint on a single instance and get back a JSON-in/JSON-out predictor.
lorax_predictor = lorax_model.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    endpoint_name=endpoint_name,
    container_startup_health_check_timeout=600,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

-----------------!

# Multi-LoRA inference

In [48]:
# Demo cases for multi-LoRA inference, one dict per adapter:
#   name            - human-readable label printed in the demo loop
#   url             - Hugging Face page of the adapter
#   adapter_id      - adapter identifier sent with the request
#   prompt          - prompt text sent as the request input
#   expected_output - reference answer printed next to the generated one
# The prompt strings are whitespace-sensitive (explicit "\n" escapes, trailing
# backslash line continuations, and leading indentation are all part of the
# text), so edit them with care.
adapters = [
    # Case 1: named-entity recognition adapter.
    {
        "name": "Named Entity Recognition (NER)",
        "url": "https://huggingface.co/predibase/conllpp",
        "adapter_id": "predibase/conllpp",
        "prompt": """Your task is a Named Entity Recognition (NER) task. \n\
            Predict the category of each entity, then place the entity into the list associated with the category in an output JSON payload.\n\
            Below is an example: \n\
            Input: EU rejects German call to boycott British lamb.\n\
            Output: {"person": [], "organization": ["EU"], "location": [], "miscellaneous": ["German", "British"]} \n\
            Now, complete the task. \n 

            Now do the same for below query. YOU MUST ONLY give the output JSON payload as Output. Dont repeat the Input and provided instructions. \n
            Input: By the close Yorkshire had turned that into a 37-run advantage but off-spinner David had scuttled their hopes, \
                taking four for 24 in 48 balls and leaving them hanging on 119 for five and praying for rain. \n\
            Output:""",
       "expected_output": '{"person": ["David"], "organization": ["Yorkshire"], "location": [], "miscellaneous": []}'
    },
    # Case 2: customer-support ticket classification adapter.
    # NOTE(review): the prompt ends with "test_transcript =" after "### Task Type:";
    # this may be a paste artifact or part of the adapter's expected template —
    # confirm against the adapter's model card before changing it.
    {
        "name": "Customer support ticket classifier",
        "url": "https://huggingface.co/predibase/customer_support",
        "adapter_id": "predibase/customer_support",
        "prompt": """Consider the case of a customer contacting the support center.
            The term 'task type' refers to the reason for why the customer contacted support.
            ### The possible task types are: ### 
            - replace card
            - transfer money
            - check balance
            - order checks
            - pay bill
            - reset password
            - schedule appointment
            - get branch hours
            - none of the above

            Summarize the issue/question/reason that drove the customer to contact support:

            ### Transcript: [noise] [noise] [noise] [noise] hello hello hi i'm sorry this this call uh hello this is harper valley national bank my name is dawn how can i help you today hi oh okay my name is jennifer brown and i need to check my account balance if i could [noise] [noise] [noise] [noise] what account would you like to check um [noise] uhm my savings account please [noise] [noise] oh but the way that you're doing one moment hello yeah one moment uh huh no problem [noise] your account balance is eighty two dollars is there anything else i can help you with no i don't think so thank you so much you were very helpful thank you have a good day bye bye [noise] you too 

            ### Task Type:
            test_transcript =""",
        "expected_output": "check balance"
    }
]

In [49]:
import os
from transformers import AutoTokenizer
from sagemaker.huggingface.model import HuggingFacePredictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
from datasets import load_dataset

def get_request_body(adapter_id: str, prompt: str):
    """Build the JSON payload for one generation request.

    Args:
        adapter_id: Identifier of the LoRA adapter to apply to this request.
        prompt: Text placed in the payload's "inputs" field.

    Returns:
        A dict with the prompt under "inputs" and the generation settings
        (adapter, token budget, sampling options) under "parameters".
    """
    generation_params = {
        "adapter_id": adapter_id,
        "max_new_tokens": 1024,
        "temperature": 0.1,
        "top_p": 0.9,
        # Ask for the completion only, i.e. strip the input from the response.
        "return_full_text": False,
    }
    payload = {"inputs": prompt}
    payload["parameters"] = generation_params
    return payload

# Predictor bound to the endpoint by name, exchanging JSON in both directions.
deployed_llm = HuggingFacePredictor(
    endpoint_name=endpoint_name,
    deserializer=JSONDeserializer(),
    serializer=JSONSerializer(),
)

def call_endpoint(adapter):
    """Send one adapter's prompt to the endpoint and return the generated text.

    Args:
        adapter: Dict providing at least "adapter_id" and "prompt".

    Returns:
        The "generated_text" of the first item in the response, with
        surrounding whitespace stripped.
    """
    request_body = get_request_body(adapter["adapter_id"], adapter["prompt"])
    response = deployed_llm.predict(request_body)
    first_item = response[0]
    return first_item["generated_text"].strip()

# Run every demo case: show the request first, then expected vs. generated output.
for adapter in adapters:
    request_summary = f"adapter: {adapter['name']} with url {adapter['url']} \nprompt=\n{adapter['prompt']}\n"
    print(request_summary)
    # The endpoint is called only after the request summary has been printed.
    generated = call_endpoint(adapter)
    print(f"expected output:  {adapter['expected_output']}\ngenerated output: {generated}")
    print("\n ---------------------- \n")

adapter: Named Entity Recognition (NER) with url https://huggingface.co/predibase/conllpp 
prompt=
Your task is a Named Entity Recognition (NER) task. 
            Predict the category of each entity, then place the entity into the list associated with the category in an output JSON payload.
            Below is an example: 
            Input: EU rejects German call to boycott British lamb.
            Output: {"person": [], "organization": ["EU"], "location": [], "miscellaneous": ["German", "British"]} 
            Now, complete the task. 
 

            Now do the same for below query. YOU MUST ONLY give the output JSON payload as Output. Dont repeat the Input and provided instructions. 

            Input: By the close Yorkshire had turned that into a 37-run advantage but off-spinner David had scuttled their hopes,                 taking four for 24 in 48 balls and leaving them hanging on 119 for five and praying for rain. 
            Output:

expected output:  {"person": ["David"]

# Cleanup

In [52]:
# Remove the model associated with the endpoint behind `deployed_llm`.
# NOTE(review): keep this cell before delete_endpoint() — the SDK appears to
# look up the model through the endpoint's configuration, which
# delete_endpoint() removes by default; confirm against the SDK docs.
deployed_llm.delete_model()

In [53]:
# Delete the endpoint that `deployed_llm` points at (final cleanup step).
deployed_llm.delete_endpoint()