## Inference Code for deploying the model

In [None]:
%%writefile inference.py
from utils import visualize_predictions, aggregate_outputs
import onnxruntime
import os
from transformers import LayoutLMv3TokenizerFast
from local_infer import predict_page
from config import logger
import json
import torch
from model import TokenClassificationModel


def model_fn(model_dir):
    """
    Build the inference artifacts from the unpacked model directory.

    Args:
        model_dir (str): Directory that contains "quantized_best.onnx" and
            the "tokenizer" sub-directory.

    Returns:
        tuple: ``(session, tokenizer)`` where ``session`` is an ONNX Runtime
            inference session created with the CPU execution provider and
            ``tokenizer`` is the LayoutLMv3 fast tokenizer loaded from disk.
    """
    onnx_path = os.path.join(model_dir, "quantized_best.onnx")
    tokenizer_dir = os.path.join(model_dir, "tokenizer")

    session = onnxruntime.InferenceSession(
        onnx_path,
        providers=["CPUExecutionProvider"],
    )

    # Alternative: load the PyTorch checkpoint instead of the ONNX export.
    # logger.info(f"Loading model from {model_dir}")
    # load_path = os.path.join(model_dir, "best.pt")
    # weights = torch.load(load_path, map_location="cpu")
    # model = TokenClassificationModel(weights['config'])
    # trained_state_dict = {k.replace("_orig_mod.", ""): v for k,
    #                       v in weights['model'].items()}
    # model.load_state_dict(trained_state_dict)

    tokenizer = LayoutLMv3TokenizerFast.from_pretrained(tokenizer_dir)
    return session, tokenizer


def input_fn(request_body, request_content_type):
    """
    Deserialize the incoming request payload.

    Only the media type of the content type is checked: parameters such as
    ``; charset=utf-8`` are ignored and the comparison is case-insensitive,
    so equivalent spellings of ``application/json`` are all accepted.

    Args:
        request_body (str | bytes | bytearray): Raw JSON payload.
        request_content_type (str): Content type sent with the request.

    Returns:
        The Python object decoded from the JSON payload.

    Raises:
        ValueError: If the content type is not ``application/json``, or if
            the payload is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass).
    """
    logger.info(
        f"Request content type: {request_content_type}, received request body: {request_body}")
    # Strip optional parameters and normalise case before comparing.
    media_type = (request_content_type or "").split(";", 1)[0].strip().lower()
    if media_type != 'application/json':
        raise ValueError(
            'Content type must be application/json. Provided: {0}'.format(request_content_type))
    return json.loads(request_body)


def predict_fn(data, model):
    """
    Run the model on one deserialized request payload.

    Args:
        data: Payload produced by ``input_fn``; it is handed to
            ``predict_page`` unchanged (the original notes describe it as a
            list of dictionaries containing the text and bbox).
        model (tuple): ``(model, tokenizer)`` pair as returned by ``model_fn``.

    Returns:
        Whatever ``predict_page`` returns for the payload.
    """
    logger.info("Processing started")

    session = model[0]
    tokenizer = model[1]
    # predict_page takes its arguments in the order: model, data, tokenizer.
    predictions = predict_page(session, data, tokenizer)

    logger.info(f"Processing Completed , Predictions: {predictions}")
    return predictions


def output_fn(prediction, accept):
    """
    Serialize the prediction for the response.

    Args:
        prediction: JSON-serializable result of ``predict_fn``.
        accept (str): Response content type requested by the caller.

    Returns:
        str: JSON document with the prediction under the "generated_text" key.

    Raises:
        ValueError: If ``accept`` is anything other than "application/json".
    """
    logger.info(f"Output data of type {accept} returned: {prediction}")
    if accept != 'application/json':
        raise ValueError(
            "Unsupported content type: {}".format(accept))

    payload = {'generated_text': prediction}
    return json.dumps(payload)

In [None]:
%%writefile requirements.txt
transformers
onnx
onnxruntime

## Zipping Model and Inference Code

In [None]:
import os

In [None]:
!rm model.tar.gz
!rm -rf deploy
!rm -rf .ipynb_checkpoints*
os.makedirs("deploy/code", exist_ok=True)
!cp inference.py local_infer.py _inference.py config.py utils.py model.py requirements.txt deploy/code
!cp -r weights/best.pt weights/tokenizer weights/quantized_best.onnx deploy/
!cd deploy && tar -czvf ../model.tar.gz *


## Deploying the model in AWS SageMaker
- Prerequisites:
    - AWS Account
    - SageMaker Domain
    - IAM Role with SageMaker Full Access
    - AWS CLI

In [None]:
# SageMaker SDK plus boto3 (used for the IAM fallback below)
import boto3
import sagemaker

# Bucket used for the model artifacts.
# Replace None with a bucket name to override the session's default bucket.
sagemaker_session_bucket = None

# A plain session is enough to discover the default bucket
sess = sagemaker.Session()
if sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role; when it cannot be retrieved from the
# environment, look it up by name through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role = iam.get_role(RoleName="SageMakerExecutionRole")["Role"]["Arn"]

# Re-create the session so that it is bound to the chosen bucket
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)
region = sess.boto_region_name

# Summary of the resolved role, bucket and region
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {region}")

In [None]:
# Prefix shared by the S3 object key below and the deployment name
generation_prefix = 'lilt-layout'
# Object key of the packaged model archive inside the bucket
s3_model_key = f'{generation_prefix}/model/model.tar.gz'
# Full S3 URI of the archive, built from the session bucket and the key above
s3_model_location = f"s3://{sagemaker_session_bucket}/{s3_model_key}"

### Optional: upload the model archive to S3 (run whenever an updated model needs to be deployed to AWS SageMaker)

In [None]:
# Upload the local archive to the session bucket under the model key
s3 = boto3.resource("s3")
s3.Bucket(sagemaker_session_bucket).upload_file(
    Filename="model.tar.gz",
    Key=s3_model_key,
)

### Serverless deployment steps

In [None]:
# Name used for both the SageMaker model and the endpoint in the cells below
deployment_name = f"{generation_prefix}-deployment"

In [None]:
from sagemaker import image_uris
from sagemaker.pytorch.model import PyTorchModel
from sagemaker.serverless.serverless_inference_config import ServerlessInferenceConfig

In [None]:
# Sizing of the serverless endpoint
serverless_config = ServerlessInferenceConfig(
    max_concurrency=1,  # can be increased
    memory_size_in_mb=3072,  # up to 6 GB is possible
)

In [None]:
# Look up the PyTorch 2.1.0 inference image for this region and the
# serverless configuration.
# NOTE(review): image_uri is not passed to the model or deploy calls in the
# cells that follow - confirm whether it is still needed.
image_uri = image_uris.retrieve(
    framework="pytorch",
    region=region,
    version="2.1.0",
    image_scope="inference",
    serverless_inference_config=serverless_config,
)

In [None]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

In [None]:
# Describe the model to deploy: the archive uploaded to S3 plus the
# inference.py entry point, on the PyTorch 2.1.0 / Python 3.10 framework.
# The session is passed explicitly so the model uses the same bucket and
# region as the rest of the notebook (the Predictor cell below does the same)
# instead of silently creating a fresh default session.
pytorch_model = PyTorchModel(
    name=deployment_name,
    py_version='py310',
    framework_version='2.1.0',
    model_data=s3_model_location,
    entry_point='inference.py',
    role=role,
    sagemaker_session=sess)

In [None]:

# Create the serverless endpoint; requests and responses are exchanged as JSON
predictor = pytorch_model.deploy(
    endpoint_name=deployment_name,
    serverless_inference_config=serverless_config,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

In [None]:
from sagemaker import Predictor

# Attach to the endpoint by name, e.g. when it was deployed in an earlier run
predictor = Predictor(
    endpoint_name=deployment_name,
    sagemaker_session=sess,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

### Prediction using the deployed model

In [None]:
import os

In [None]:
# Placeholder: replace with the path of the PDF to send to the endpoint
pdf_path = "path_to_pdf"

In [None]:
from pdf_processor import extract_text_and_bbox_from_pdf
from utils import visualize_predictions

# Base name of the PDF without its extension, used to label the
# visualizations. splitext removes only the final extension, so names that
# contain extra dots (e.g. "my.report.pdf" -> "my.report") are kept intact.
file_name = os.path.splitext(os.path.basename(pdf_path))[0]
# One entry per page; the prediction loop reads "image" and "text" from each
data = extract_text_and_bbox_from_pdf(pdf_path)

In [None]:
# Send every page to the endpoint, draw the result and keep the raw response
predictions = []
for page_num, page in enumerate(data):
    img_cv, text_data = page["image"], page["text"]
    page_prediction = predictor.predict(text_data)
    # output_fn wraps the model output under the "generated_text" key
    visualize_predictions(
        img_cv,
        page_prediction["generated_text"],
        f"{file_name}-{page_num}",
    )
    predictions.append(page_prediction)

In [None]:
# Delete the deployed endpoint once the predictions above are done
predictor.delete_endpoint()