# Write the custom inference script and its requirements to a local folder

In [None]:
! mkdir inference_code

In [None]:
%%writefile inference_code/inference.py

# This is the script that will be used in the inference container
import os 
import json 
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def model_fn(model_dir):
    """
    Build the objects needed for inference from the model directory.

    The tokenizer and the seq2seq model are both loaded from ``model_dir``.
    The model is placed on ``cuda:0`` when CUDA is available, otherwise on
    the CPU.

    Returns a dict with the keys 'model' and 'tokenizer'.
    """
    if torch.cuda.is_available():
        target_device = torch.device("cuda:0")
    else:
        target_device = torch.device("cpu")

    loaded_tokenizer = AutoTokenizer.from_pretrained(model_dir)
    loaded_model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
    loaded_model = loaded_model.to(target_device)

    return {'model': loaded_model, 'tokenizer': loaded_tokenizer}


def predict_fn(input_data, model):
    """
    Generate a summary for the request with the loaded model.

    Args:
        input_data: dict with an 'inputs' entry (the text to summarize) and
            an optional 'parameters' entry (dict of keyword arguments that
            are forwarded to ``generate``; may be missing or None).
        model: dict with the keys 'model' and 'tokenizer', as returned by
            ``model_fn``.

    Returns:
        The first decoded sequence, with special tokens stripped.

    Raises:
        KeyError: if 'inputs' is missing from ``input_data``.
    """
    text = input_data['inputs']
    # Parameters may or may not be passed (or may be an explicit null)
    generate_kwargs = input_data.get('parameters') or {}

    tokenizer = model['tokenizer']
    seq2seq = model['model']

    encoded = tokenizer(text, truncation=True, padding='longest', return_tensors="pt")
    # The model may live on the GPU; the token ids must be on the same
    # device as the model or generate() fails with a device mismatch.
    input_ids = encoded.input_ids.to(seq2seq.device)

    output = seq2seq.generate(input_ids, **generate_kwargs)

    return tokenizer.batch_decode(output, skip_special_tokens=True)[0]


def input_fn(request_body, request_content_type):
    """
    Parse the raw request body as JSON and return the resulting object.

    ``request_content_type`` is part of the handler signature but is not
    consulted; the body is always decoded as JSON.
    """
    return json.loads(request_body)


def output_fn(prediction, response_content_type):
    """
    Wrap the prediction in a dict under the 'summary' key.

    ``response_content_type`` is part of the handler signature but is not
    consulted.
    """
    response = dict(summary=prediction)
    return response

In [None]:
%%writefile inference_code/requirements.txt
transformers
sentencepiece
protobuf

# Deploy an endpoint with HuggingFaceModel

Calling `.deploy()` uploads your model package to S3, creates a model in SageMaker, creates an endpoint configuration, and deploys an endpoint from that configuration.

In [None]:
# Install the SageMaker Python SDK, upgrading it if it is already installed
! pip install -U sagemaker

In [None]:
import sagemaker

# SageMaker session, its default bucket, and the execution role used by the
# model definitions in the cells below
session = sagemaker.Session()
session_bucket = session.default_bucket()
role = sagemaker.get_execution_role()

# Framework versions passed to HuggingFaceModel for the fine-tuned model
pytorch_version, python_version = '1.7.1', 'py36'

In [None]:
from sagemaker.huggingface import HuggingFaceModel

endpoint_name = 'summarization-endpoint'
model_name = 'summarization-model'

# model_data comes from the huggingface_estimator created in 2_finetune.ipynb.
# If you are not running this notebook with that estimator still in scope,
# replace it with the S3 URI of your model archive, e.g.
#   model_data=f's3://{session_bucket}/{<insert_model_location_key>}/model.tar.gz'
model_for_deployment = HuggingFaceModel(
    name=model_name,
    role=role,
    entry_point='inference.py',
    source_dir='inference_code',
    model_data=huggingface_estimator.model_data,
    transformers_version='4.6.1',
    pytorch_version=pytorch_version,
    py_version=python_version,
)

In [None]:
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import BytesDeserializer

# Deploy the model
# The request dict is sent as JSON and the response is returned as raw bytes,
# which is what the `.decode()` calls in the prediction cell expect. Without
# an explicit deserializer the HuggingFace predictor parses the response as
# JSON and `.decode()` would fail on the parsed object.
predictor = model_for_deployment.deploy(initial_instance_count=1,
                                        instance_type='ml.m5.xlarge',
                                        endpoint_name=endpoint_name,
                                        serializer=JSONSerializer(),
                                        deserializer=BytesDeserializer()
                                        )

In [None]:
# Sample article to summarize. The pieces are adjacent string literals that
# Python joins into one string, so each continuation starts with a space to
# keep the sentences separated.
text = ('PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions.'
        ' The aim is to reduce the risk of wildfires.'
        ' Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.'
)

# 'parameters' is forwarded to model.generate() by predict_fn in inference.py;
# the two requests differ only in length_penalty.
summary_short = predictor.predict({
    'inputs':text,
    'parameters':{
        'length_penalty':0.6
    }
})
print(summary_short.decode())

summary_long = predictor.predict({
    'inputs':text,
    'parameters':{
        'length_penalty':1.5
    }
})
print(summary_long.decode())

# (Optional) Deploy directly from the Hugging Face Hub to experiment if you haven't fine-tuned a model

In [None]:
# Environment variables for the container: which Hub model to serve and
# which task to run
hub = dict(
    HF_MODEL_ID='google/pegasus-xsum',
    HF_TASK='summarization',
)

hub_model = HuggingFaceModel(
    name='hub-model',
    role=role,
    env=hub,
    transformers_version='4.6',
    pytorch_version='1.7',
    py_version='py36',
)

In [None]:
# Deploy the Hub model to its own endpoint
hub_predictor = hub_model.deploy(
    endpoint_name='hub-endpoint',
    instance_type='ml.m5.xlarge',
    initial_instance_count=1,
)

In [None]:
# A 'parameters' key with valid parameters can be added to the request as
# well, just like in the earlier prediction cell
hub_request = {'inputs': text}
summary = hub_predictor.predict(hub_request)
print(summary)

# Clean up

Use this code to delete the resources created in SageMaker Inference (endpoint configuration, endpoint and model). 

In [None]:
# Delete the model before the endpoint.
# NOTE(review): delete_model() may need to look up the model through the
# endpoint configuration, which delete_endpoint() removes by default --
# confirm against the installed SDK version. This order is safe either way.
predictor.delete_model()
predictor.delete_endpoint()

# The optional Hub deployment creates its own endpoint and model; remove
# them too if that section was run, otherwise the endpoint keeps running.
if 'hub_predictor' in globals():
    hub_predictor.delete_model()
    hub_predictor.delete_endpoint()