# Fine-tuned Model Deployment

## Initialize AWS and SageMaker resources

In [None]:
import os

import boto3
import sagemaker
from sagemaker import get_execution_role

def initialize_sagemaker():
    """Collect the AWS handles the rest of the notebook relies on.

    Returns:
        A ``(role, region, sagemaker_session)`` tuple holding the result of
        ``get_execution_role()``, the region name of a default
        ``boto3.Session``, and a newly created ``sagemaker.session.Session``.
    """
    # Tuple elements are evaluated left to right: role, then region, then session.
    return (
        get_execution_role(),
        boto3.Session().region_name,
        sagemaker.session.Session(),
    )

# Bind the execution role, region name and SageMaker session as notebook-level
# names; later cells read `role`, `region` and `sagemaker_session` directly.
role, region, sagemaker_session = initialize_sagemaker()

## Load model path and set deployment configurations

In [2]:
# Restore `model_s3_path` from IPython's %store database into this notebook's
# namespace (presumably saved by an earlier notebook — verify it has been run).
%store -r model_s3_path
# The restored value is read as a nested mapping; its S3 URI becomes the model
# identifier passed to the serving container as HF_MODEL_ID.
model_id = model_s3_path["S3DataSource"]["S3Uri"]

In [3]:
# Deployment settings consumed by deploy_model(): serving container image,
# instance type, startup health-check timeout and endpoint name.
deploy_config = {
    # Image URI for the "djl-lmi" framework, version 0.29.0, looked up for `region`.
    "container_uri": sagemaker.image_uris.retrieve(
        framework="djl-lmi", version="0.29.0", region=region
    ),
    "instance_type": "ml.g6.12xlarge",
    # Forwarded unchanged to Model.deploy(); presumably seconds — confirm
    # against the SageMaker SDK documentation.
    "container_startup_health_check_timeout": 900,
    # Endpoint name generated from the given base string by the SDK helper.
    "endpoint_name": sagemaker.utils.name_from_base("Meta-Llama-3-2-3B")
}

# Environment variables handed to the serving container through
# sagemaker.Model(env=...). Every value is a string.
deploy_env = {
    # Model location: the S3 URI restored into `model_id`.
    "HF_MODEL_ID": model_id,
    "OPTION_ROLLING_BATCH": "vllm",
    "OPTION_TENSOR_PARALLEL_DEGREE": "max",
    "OPTION_MAX_ROLLING_BATCH_SIZE": "2",
    "OPTION_DTYPE": "fp16",
    "OPTION_TRUST_REMOTE_CODE": "true",
    "OPTION_MAX_MODEL_LEN": "8192",
    "VLLM_ATTENTION_BACKEND": "XFORMERS",
}

# SECURITY: never hard-code a Hugging Face access token in the notebook — it
# ends up in version control and in every shared copy of the file. Export
# HF_TOKEN in the environment that runs this notebook (or fetch it from a
# secrets store) instead. The variable is forwarded only when it is set; the
# model artifacts themselves are loaded from S3 via HF_MODEL_ID.
# NOTE(review): a token was previously committed here in plain text and must
# be treated as compromised — revoke it in the Hugging Face account settings.
if os.environ.get("HF_TOKEN"):
    deploy_env["HF_TOKEN"] = os.environ["HF_TOKEN"]

## Deploy model

In [None]:
def deploy_model(deploy_config, deploy_env):
    """Deploy the model to a SageMaker endpoint and return a JSON predictor.

    Relies on the notebook-level ``role`` and ``sagemaker_session`` names.

    Args:
        deploy_config: Mapping providing ``container_uri``, ``instance_type``,
            ``endpoint_name`` and ``container_startup_health_check_timeout``.
        deploy_env: Mapping of environment variables passed to the serving
            container.

    Returns:
        A ``sagemaker.Predictor`` bound to ``deploy_config["endpoint_name"]``
        that serializes requests and deserializes responses as JSON.
    """
    endpoint_name = deploy_config["endpoint_name"]

    model = sagemaker.Model(
        image_uri=deploy_config["container_uri"],
        role=role,
        env=deploy_env,
        # Use the same session as the predictor below, rather than letting the
        # SDK create a second implicit session for the model.
        sagemaker_session=sagemaker_session,
    )

    # The return value of deploy() is not used; the predictor is built
    # explicitly below so the JSON (de)serializers are set in one place.
    model.deploy(
        instance_type=deploy_config["instance_type"],
        initial_instance_count=1,
        endpoint_name=endpoint_name,
        container_startup_health_check_timeout=deploy_config[
            "container_startup_health_check_timeout"
        ],
    )

    return sagemaker.Predictor(
        endpoint_name=endpoint_name,
        sagemaker_session=sagemaker_session,
        serializer=sagemaker.serializers.JSONSerializer(),
        deserializer=sagemaker.deserializers.JSONDeserializer(),
    )

# Deploy with the settings defined above and keep the returned predictor as a
# notebook-level name; get_model_response() reads `predictor` directly.
predictor = deploy_model(deploy_config, deploy_env)

## Inference

In [5]:
def get_model_response(instruction, context="", max_tokens=2048):
    """Send a single-turn chat request to the endpoint and return the reply text.

    Uses the notebook-level ``predictor``.

    Args:
        instruction: The instruction or question placed in the prompt.
        context: Optional supporting text. When empty, the prompt's context
            section is rendered as ``N/A``.
        max_tokens: Value sent as ``max_tokens`` in the request payload.
            Defaults to 2048, the value this function previously hard-coded.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    prompt = f"Instruction:\n{instruction}\nContext:\n{context or 'N/A'}"

    result = predictor.predict({
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
    })

    return result["choices"][0]["message"]["content"]

#### Usage Examples

In [None]:
# Example 1: instruction-only request. No context is passed, so the prompt's
# context section is rendered as "N/A".
instruction = "What tasks are supported by Amazon Nova Canvas for image generation?"
print(get_model_response(instruction))

In [None]:
# Example 2: another instruction-only request, again without context.
instruction = "what is REST API"
print(get_model_response(instruction))

In [None]:
# Example 3: instruction plus a supporting context passage, both placed in
# the prompt sent to the endpoint.
instruction = "Between what oceans does the Panama Canal save time?"
context = "The Panama Canal (Spanish: Canal de Panamá) is an artificial 82 km (51 mi) waterway in Panama that connects the Atlantic Ocean with the Pacific Ocean and divides North and South America. The canal cuts across the Isthmus of Panama and is a conduit for maritime trade. One of the largest and most difficult engineering projects ever undertaken, the Panama Canal shortcut greatly reduces the time for ships to travel between the Atlantic and Pacific oceans, enabling them to avoid the lengthy, hazardous Cape Horn route around the southernmost tip of South America via the Drake Passage or Strait of Magellan."
print(get_model_response(instruction, context))