# Sagemaker Inference Custom Model
- https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#bring-your-own-model
- Steps:
    - Write inference.py script
    - Create model.tar.gz file and upload to s3
    - Deploy
    - Predict

In [1]:
import json
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from pprint import pprint

from sagemaker import Session
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorchModel
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import BytesDeserializer

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


## inference.py script

- example: https://github.com/aws/amazon-sagemaker-examples/blob/main/sagemaker-python-sdk/pytorch_batch_inference/code/inference.py
- requires model_fn, input_fn, predict_fn, output_fn

In [2]:
# # ORIGINAL FUNCTIONS
class QNetworkWithUserEmbedding(nn.Module):
    def __init__(self, num_game_types, num_state_variables, num_actions,
                 user_embedding_dim=4, game_embedding_dim=1):
        super(QNetworkWithUserEmbedding, self).__init__()

        # Game type embedding
        self.game_embedding = nn.Embedding(num_game_types, game_embedding_dim)

        # Combined input: state + user embedding + game type embedding
        input_dim = num_state_variables + user_embedding_dim + game_embedding_dim

        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, num_actions)

    def forward(self, state, user_features, game_type):
        # No embedding on user_features — already embedded
        game_embedded = self.game_embedding(game_type)  # (batch, game_embedding_dim)

        x = torch.cat([state, user_features, game_embedded], dim=-1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        q_values = self.fc3(x)
        return q_values

### Model_fn
- loads pretrained model

In [3]:
def model_fn(model_dir):
    print("Loading Model")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    game_types_dim = 3
    state_dim = 8
    actions_dim = 3

    # model.pt copied from primary_model_Mar_31.pt in neurobeacon/tst
    model = QNetworkWithUserEmbedding(game_types_dim, state_dim, actions_dim)
    with open(os.path.join(model_dir, "model.pt"), "rb") as f:
        model.load_state_dict(torch.load(f, weights_only=False)) # remove weights_only in inference.py file

    return model.to(device)

In [4]:
model_dir = '/mnt/custom-file-systems/efs/fs-0f1695f072f1574e9_fsap-0f954f29efd01f1c2/model/full_model_deployment/model'
model = model_fn(model_dir)
model

Loading Model


QNetworkWithUserEmbedding(
  (game_embedding): Embedding(3, 1)
  (fc1): Linear(in_features=13, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=3, bias=True)
)

### Input_fn
- takes input from front end and transforms into tensor to work with predict_fn
- serialize_to_json mimics expected serialization

In [5]:
batch_size = 32
dataset = torch.load('test_dataset_mini_embeddings.pt', weights_only=False)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

states, actions, rewards, users, games = next(iter(dataloader))

idx = 0
print('state:', states[idx])
print('action:', actions[idx])
print('reward:', rewards[idx])
print('user embedding:', users[idx])
print('game embedding:', games[idx])

state: tensor([1.0000e+00, 8.7658e-03, 6.8063e-03, 6.6000e-01, 1.0000e+00, 3.1035e-04,
        0.0000e+00, 6.2088e-01])
action: tensor(0)
reward: tensor(1.)
user embedding: tensor([-1,  0,  0,  0])
game embedding: tensor(0)


In [6]:
class UserEmbeddingModel(nn.Module):
    def __init__(self, input_size, embedding_dim):
        super(UserEmbeddingModel, self).__init__()
        self.fc = nn.Linear(input_size, embedding_dim)

    def forward(self, x):
        return self.fc(x)

In [7]:
def input_fn(input_state, request_content_type="application/json"):
    print("Input received:", input_state)
    input_data = json.loads(input_state)

    state = input_data['states']
    user_features = input_data['user_features']
    game_type = input_data['game_type']

    return (state, user_features, game_type)

In [8]:
def serialize_to_json(data):
    return json.dumps(data)
    
input_state = [1.0000, 1.2326, 1.2433, 0.8750, 0.7800, 0.8413, 1.0000, 0.9002]
user_embedding = [0.99, 0.80, 0.70]
game_type = 2

input_dict = {
    "states": input_state,
    "user_features": user_embedding,
    "game_type": game_type
}
input_json = serialize_to_json(input_dict)
pprint(input_json)

('{"states": [1.0, 1.2326, 1.2433, 0.875, 0.78, 0.8413, 1.0, 0.9002], '
 '"user_features": [0.99, 0.8, 0.7], "game_type": 2}')


In [9]:
state, user_features, game_type = input_fn(input_json)
print(f"State: {state}")
print(f"User Embedding: {user_features}")
print(f"Game Type: {game_type}")

Input received: {"states": [1.0, 1.2326, 1.2433, 0.875, 0.78, 0.8413, 1.0, 0.9002], "user_features": [0.99, 0.8, 0.7], "game_type": 2}
State: [1.0, 1.2326, 1.2433, 0.875, 0.78, 0.8413, 1.0, 0.9002]
User Embedding: [0.99, 0.8, 0.7]
Game Type: 2


### Predict_fn
- takes tensor from input_fn and makes prediction
- outputs tensor for output_fn

In [10]:
def predict_fn(state_tuple, model):
    """
    Return
        tensor compabitable with output_fn
    """
    print("Predict")
    input_size = 3
    embedding_dim = 4
    model_user_embed = UserEmbeddingModel(input_size, embedding_dim)

    state = torch.tensor(state_tuple[0], dtype=torch.float32)
    user_tensor = torch.tensor(state_tuple[1], dtype=torch.float32)
    user_embedding = model_user_embed(user_tensor)
    game_type = torch.tensor(state_tuple[2], dtype=torch.long)
    
    with torch.no_grad():
        q_prediction = model(state, user_embedding, game_type)

    print("Q_prediction:", q_prediction)
    print("Prediction:", q_prediction.argmax())
    return q_prediction.argmax()

In [11]:
input_state = [1.0000, 1.2326, 1.2433, 0.8750, 0.7800, 0.8413, 1.0000, 0.9002]
user_embedding = [0.99, 0.80, 0.70]
game_type = 2

input_dict = {
    "states": input_state,
    "user_features": user_embedding,
    "game_type": game_type
}
input_json = serialize_to_json(input_dict)
state, user_embedding, game_type = input_fn(input_json)

predict_fn(input_fn(input_json), model)

Input received: {"states": [1.0, 1.2326, 1.2433, 0.875, 0.78, 0.8413, 1.0, 0.9002], "user_features": [0.99, 0.8, 0.7], "game_type": 2}
Input received: {"states": [1.0, 1.2326, 1.2433, 0.875, 0.78, 0.8413, 1.0, 0.9002], "user_features": [0.99, 0.8, 0.7], "game_type": 2}
Predict


tensor(2)

### Output_fn
- transforms tensor into output for front end application; currently returns a Byte format; can be updated to json

In [12]:
def output_fn(q_prediction, content_type="application/json"):
    # print("Prediction", q_prediction.item())
    if content_type == "application/json":
        return json.dumps(q_prediction.item()), content_type

In [13]:
output_fn(predict_fn(input_fn(input_json), model))

Input received: {"states": [1.0, 1.2326, 1.2433, 0.875, 0.78, 0.8413, 1.0, 0.9002], "user_features": [0.99, 0.8, 0.7], "game_type": 2}
Predict
Prediction 2


('2', 'application/json')

# Create and Deploy PyTorchModel
- create model.tar.gz file with folder format:
    - my_model/
        - model.pth
        - code/
            - inference.py
            - requirements.txt
- deploy on specific instance (currently not serverless)
- need to create a new endpoint and delete current endpoint to make updates to model; haven't found a way to redeploy to same endpoint

In [39]:
# create model.tar.gz file
!tar -czvf model.tar.gz -C model .

./
./.ipynb_checkpoints/
./model.pt
./code/
./code/requirements.txt
./code/.ipynb_checkpoints/
./code/.ipynb_checkpoints/inference-checkpoint.py
./code/inference.py


In [14]:
# prep PyTorchModel
role = get_execution_role()
uri = 's3://neurobeacon/tst/models/full_model/model.tar.gz'
pytorch_model = PyTorchModel(model_data=uri,
                             role=role,
                             entry_point='inference.py',
                             py_version='py38',
                             framework_version='1.11.0',)

In [15]:
## deploy with endpoint
# batch inference instance price: https://aws.amazon.com/sagemaker-ai/pricing/
# predictor = pytorch_model.deploy(instance_type='ml.c4.xlarge', initial_instance_count=1)
predictor = pytorch_model.deploy(instance_type='ml.t2.medium', initial_instance_count=1)
predictor.endpoint_name

------------------------!

'pytorch-inference-2025-04-08-05-17-44-837'

In [40]:
# endpoint_name = 'pytorch-inference-2025-04-08-04-41-36-431'
# sm_session = Session()
# predictor = Predictor(
#     endpoint_name,
#     sm_session,
#     serializer=JSONSerializer(),
#     deserializer=BytesDeserializer(),
# )

# predictor.delete_endpoint()

In [None]:
# ## REDEPLOY TO SAME ENDPOINT; does not work? // need to create_endpoint then shift endpoint over
# predictor = pytorch_model.deploy(endpoint_name=predictor.endpoint_name, instance_type='ml.c4.xlarge', initial_instance_count=1, wait=True)

In [None]:
# update endpoint on front end
# Lambda function: pytorch-inference-2025-03-09-16-32-27-874
# FULL MODEL: pytorch-inference-2025-04-08-05-17-44-837
# update permissions: neurobeacon_invoke_sagemaker; role: neurobeacon_sagemaker_function-role-nljcopq2 (lambda function)

# Predict
- invoke the deployed model

In [16]:
# invoke deployed predictor
def invoke(endpoint_name, sm_session, input_state):
    predictor = Predictor(
        endpoint_name,
        sm_session,
        serializer=JSONSerializer(),
        deserializer=BytesDeserializer(),
    )
    return predictor.predict(input_state)

def serialize_to_json(data):
    return json.dumps(data)

def input_fn(input_state, request_content_type="application/json"):
    print("Input received:", input_state)
    input_data = json.loads(input_state)

    state = input_data['states']
    user_features = input_data['user_features']
    game_type = input_data['game_type']

    return (state, user_features, game_type)

class UserEmbeddingModel(nn.Module):
    def __init__(self, input_size, embedding_dim):
        super(UserEmbeddingModel, self).__init__()
        self.fc = nn.Linear(input_size, embedding_dim)

    def forward(self, x):
        return self.fc(x)

endpoint_name = 'pytorch-inference-2025-04-08-05-17-44-837'
sm_session = Session()

In [20]:
# see data
batch_size = 32
dataset = torch.load('test_dataset_mini_embeddings.pt', weights_only=False)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

states, actions, rewards, users, games = next(iter(dataloader))

idx = 0
print('state:', states[idx])
print('action:', actions[idx])
print('reward:', rewards[idx])
print('user embedding:', users[idx])
print('game embedding:', games[idx])

idx = 0
user_features = [0.50, 0.50, 0.69]
input_dictionary = {
    "states": states[idx].tolist(),
    "user_features": user_features,
    "game_type": 2
}
print(input_dictionary)

input_state = input_fn(input_json)
print(input_state)

state: tensor([1.0000e+00, 7.4509e-02, 6.1666e-02, 7.1000e-01, 8.9000e-01, 2.6761e-04,
        1.0000e+00, 7.9671e-01])
action: tensor(2)
reward: tensor(3.)
user embedding: tensor([-1,  0,  0,  0])
game embedding: tensor(0)
{'states': [1.0, 0.07450941205024719, 0.06166564300656319, 0.7099999785423279, 0.8899999856948853, 0.0002676145522855222, 1.0, 0.7967052459716797], 'user_features': [0.5, 0.5, 0.69], 'game_type': 2}
Input received: {"states": [1.0, 1.2326, 1.2433, 0.875, 0.78, 0.8413, 1.0, 0.9002], "user_features": [0.99, 0.8, 0.7], "game_type": 2}
([1.0, 1.2326, 1.2433, 0.875, 0.78, 0.8413, 1.0, 0.9002], [0.99, 0.8, 0.7], 2)


In [21]:
results = invoke(endpoint_name, sm_session, input_dictionary)
print('Prediction:', results)

Prediction: b'2'


In [17]:
idx = 150
input_state = states[idx]
print('Input State:', input_state)

results = invoke(endpoint_name, sm_session)
print('Prediction:', results)

Input State: [1.         1.17476852 1.18456376 0.8625     0.81       0.79556463
 0.         0.90078386]
Prediction: b'2'


In [4]:
# predictor = Predictor(
#     endpoint_name,
#     sm_session,
#     serializer=JSONSerializer(),
#     deserializer=BytesDeserializer(),
# )

# predictor.delete_endpoint()