In [None]:
!pip install -qU pip sagemaker transformers

In [None]:
import json
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

In [None]:
%%writefile inference.py
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def model_fn(model_dir):
	"""Load the causal LM and its tokenizer from ``model_dir``.

	Args:
		model_dir: Directory containing the saved model and tokenizer files.

	Returns:
		A ``(model, tokenizer)`` tuple; ``predict_fn`` indexes it as
		``model[0]`` / ``model[1]``.
	"""
	# `.to()` needs a real torch device. The previous CPU fallback of -1 is the
	# `pipeline()` device convention and is rejected by `Module.to()`.
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	model = AutoModelForCausalLM.from_pretrained(model_dir).to(device)
	tokenizer = AutoTokenizer.from_pretrained(model_dir)
	return model, tokenizer

def input_fn(request_body, request_content_type):
	"""Decode a JSON request and return the value stored under ``"text"``.

	Args:
		request_body: JSON document (anything ``json.loads`` accepts).
		request_content_type: Content type of the request; only
			``'application/json'`` is accepted.

	Returns:
		The value of the ``"text"`` key, unchanged.

	Raises:
		ValueError: If the content type is not ``'application/json'``.
		KeyError: If the decoded JSON has no ``"text"`` key.
	"""
	# Reject unsupported content types up front.
	if request_content_type != 'application/json':
		raise ValueError("Content type {} not supported".format(request_content_type))

	payload = json.loads(request_body)
	return payload['text']
	
def predict_fn(input_data, model):
	"""Generate the assistant's next reply for a conversation.

	Args:
		input_data: Conversation history as a sequence of message strings that
			alternate user, assistant, user, ... starting with the user. A
			single string is treated as one user message.
		model: The ``(model, tokenizer)`` tuple returned by ``model_fn``.

	Returns:
		The decoded text of the newly generated tokens only (the prompt is
		stripped, special tokens are skipped).
	"""
	llm, tokenizer = model

	# A bare string would otherwise be iterated character by character, turning
	# every character into its own chat message.
	if isinstance(input_data, str):
		input_data = [input_data]

	# Construct formatted input for the model
	messages = [
		{
			"role": "system",
			"content": "You are a helpful, respectful, expert mental health assistant. Respond to the User with empathy and respect."
		}
	]
	# Even positions are user turns, odd positions are assistant turns.
	for i, message in enumerate(input_data):
		role = "user" if i % 2 == 0 else "assistant"
		messages.append({"role": role, "content": message})

	# add_generation_prompt=True makes the template end with the start of an
	# assistant turn, so the model writes a reply instead of continuing the
	# last message.
	text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

	# Single sequence: no padding needed (and padding=True raises for
	# tokenizers without a pad token). The chat template already inserted the
	# special tokens, hence add_special_tokens=False.
	# NOTE(review): truncation to 1024 tokens presumably drops the END of the
	# prompt (default truncation side), i.e. the newest turns - confirm.
	inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024, add_special_tokens=False)
	input_ids = inputs["input_ids"].to(llm.device)
	attention_mask = inputs["attention_mask"].to(llm.device)

	outputs = llm.generate(input_ids, attention_mask=attention_mask, max_new_tokens=768)

	# Slice off the prompt tokens so only the generated continuation is decoded.
	return tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)

def output_fn(prediction, content_type):
	"""Serialize a prediction as a JSON response body.

	Args:
		prediction: Value to place under the ``"generated_text"`` key.
		content_type: Requested response content type; only
			``'application/json'`` is accepted.

	Returns:
		A JSON string of the form ``{"generated_text": <prediction>}``.

	Raises:
		ValueError: If the content type is not ``'application/json'``.
	"""
	# Reject unsupported content types up front.
	if content_type != 'application/json':
		raise ValueError("Content type {} not supported".format(content_type))

	response = {"generated_text": prediction}
	return json.dumps(response)

In [None]:
# Resolve the IAM role used for the deployment. If get_execution_role() raises
# ValueError, fall back to looking the role up by name through IAM.
try:
	role = sagemaker.get_execution_role()
except ValueError:
	iam = boto3.client("iam")
	role = iam.get_role(RoleName="sagemaker_execution_role")["Role"]["Arn"]

# Hub model configuration (model ids: https://huggingface.co/models).
# Both values are passed to the container as environment variables (strings).
hub = dict(
	HF_MODEL_ID="jeffreykthomas/llama-mental-health",
	SM_NUM_GPUS="1",
)

# Create the Hugging Face model object on top of the Hugging Face LLM image.
# NOTE(review): the inference.py written by the earlier cell is not passed to
# this object (no entry_point / source_dir / model_data), so this deployment
# does not appear to use it - confirm that is intended.
huggingface_model = HuggingFaceModel(
	role=role,
	env=hub,
	image_uri=get_huggingface_llm_image_uri("huggingface", version="1.4.2"),
)

# Despite its name, this value is used as the *endpoint* name below.
endpoint_config_name = "llama-mental-health-endpoint"

# Deploy the model to a SageMaker real-time inference endpoint.
predictor = huggingface_model.deploy(
	endpoint_name=endpoint_config_name,
	instance_type="ml.g4dn.xlarge",
	initial_instance_count=1,
	container_startup_health_check_timeout=300,
)

In [None]:
# Smoke-test the deployed endpoint with a single text-generation request; the
# response is displayed as the cell output.
test_payload = {"inputs": "My name is Julien and I like to"}
predictor.predict(test_payload)

In [None]:
# Tear down everything the deployment created so nothing is left behind.
# delete_model() must run first: it looks the model up through the endpoint
# configuration, which delete_endpoint() removes.
predictor.delete_model()
predictor.delete_endpoint()