In [1]:
import os  # file ops
import tarfile  # tar.gz create
import boto3  # AWS SDK
import sagemaker  # SageMaker SDK
from sagemaker.tensorflow import TensorFlowModel  # TF model wrapper

import tensorflow as tf  # TensorFlow
from tensorflow.keras.applications.efficientnet import EfficientNetB7, preprocess_input
from tensorflow.keras.models import Model  # Model class

# =========================
# Configuration
# =========================
# S3 location used for the packaged model artifact.
S3_BUCKET = "ai-bmi-predictor-v2"
S3_PREFIX = "feature-extraction-data"

# Endpoint name, instance type, and the framework version handed to TensorFlowModel.
ENDPOINT_NAME = "feature-extraction-efficientnetb7-6"
INSTANCE_TYPE = "ml.g4dn.xlarge"
FRAMEWORK_VERSION = "2.11.0"

# ---- Option A autoscaling settings (min >= 1) ----
# Instance-count bounds registered with Application Auto Scaling.
AUTOSCALING_MIN_CAPACITY = 1
AUTOSCALING_MAX_CAPACITY = 4
# Target value for the invocations-per-instance target-tracking policy.
TARGET_INVOCATIONS_PER_INSTANCE = 50.0
# Cooldowns (seconds) passed to the scaling policy.
SCALE_OUT_COOLDOWN_SECONDS = 60
SCALE_IN_COOLDOWN_SECONDS = 300

# Model artifact path; the endpoint name is part of the key, so each
# endpoint name gets its own model.tar.gz.
MODEL_S3_URI = "s3://" + "/".join(
    (S3_BUCKET, S3_PREFIX, ENDPOINT_NAME, "model.tar.gz")
)
print("MODEL_S3_URI: " + MODEL_S3_URI)

# =========================
# Get SageMaker execution role
# =========================
# Resolution order:
#   1. A ROLE value that is already defined (e.g. set by hand in an earlier cell).
#   2. Whatever sagemaker.get_execution_role() returns.
# If neither yields a truthy value, the script stops with a ValueError whose
# __cause__ is the lookup failure (when there was one), so the root cause is
# visible in the traceback instead of being silently discarded.
_role_lookup_error = None
try:
    ROLE  # type: ignore  # probe only: NameError means ROLE was never defined
except NameError:
    try:
        from sagemaker import get_execution_role
        ROLE = get_execution_role()
    except Exception as exc:
        # Best-effort lookup: fall through to the explicit check below, but
        # keep the original error so it can be chained onto the ValueError.
        ROLE = None
        _role_lookup_error = exc

if not ROLE:
    raise ValueError(
        "ROLE is None. Set ROLE to your SageMaker execution role ARN."
    ) from _role_lookup_error

print(f"✅ Using SageMaker Role: {ROLE}")

# =========================
# Step 1: Build EfficientNetB7 feature extractor
# =========================
print("\n[STEP 1] Loading EfficientNetB7 model...")

# Full EfficientNetB7 initialised with the "imagenet" weights.
base_model = EfficientNetB7(weights="imagenet")

# Re-wire the same graph so it stops at the second-to-last layer; that
# layer's output is what the endpoint returns as the feature vector.
feature_extractor = Model(
    inputs=base_model.inputs,
    outputs=base_model.layers[-2].output,
)

print("✅ Feature extractor output shape: {}".format(feature_extractor.output_shape))

# =========================
# Step 1.5: Wrap as SavedModel that accepts BYTES input named 'image_bytes'
# TF Serving will decode {"b64":"..."} into raw bytes automatically.
# =========================
print("\n[STEP 1.5] Creating bytes->image->features SavedModel wrapper...")


class FeatureServingModule(tf.Module):
    """Serving wrapper: encoded image bytes in, feature vectors out.

    Args:
        extractor: Callable model applied to a float32 image batch of shape
            (batch, height, width, 3); it is invoked with ``training=False``.
        image_size: ``(height, width)`` that every decoded image is resized to
            before being handed to ``extractor``. Defaults to ``(600, 600)``,
            the size this script uses for EfficientNetB7.
    """

    def __init__(self, extractor, image_size=(600, 600)):
        super().__init__()
        self.extractor = extractor
        # Kept as two plain ints so the traced serving function sees them as
        # Python constants when building the resize op and output signature.
        height, width = image_size
        self.image_height = int(height)
        self.image_width = int(width)

    @tf.function(input_signature=[tf.TensorSpec(shape=[None], dtype=tf.string, name="image_bytes")])
    def serving_default(self, image_bytes):
        """Decode, resize, preprocess and featurise a batch of encoded images.

        Args:
            image_bytes: 1-D string tensor; each element is the raw encoded
                bytes of one image (png/jpg), already base64-decoded by
                TF Serving.

        Returns:
            Dict with a single key ``"features"`` holding the extractor
            output for the batch (``(batch, 2560)`` for the EfficientNetB7
            extractor built in this script).
        """
        target_size = (self.image_height, self.image_width)

        def _decode_resize(x):
            # expand_animations=False keeps the result 3-D (H, W, C) so that
            # tf.image.resize can be applied per image.
            img = tf.io.decode_image(x, channels=3, expand_animations=False)
            # tf.image.resize with its default (bilinear) method already
            # returns float32, so no extra cast is needed.
            return tf.image.resize(img, target_size)

        # Images arrive with differing sizes, so decode them one by one and
        # let map_fn stack the fixed-size results into a single batch.
        images = tf.map_fn(
            _decode_resize,
            image_bytes,
            fn_output_signature=tf.TensorSpec(shape=(*target_size, 3), dtype=tf.float32),
        )

        images = preprocess_input(images)
        feats = self.extractor(images, training=False)

        return {"features": feats}


serving_module = FeatureServingModule(feature_extractor)

# =========================
# Step 2: Save as TensorFlow SavedModel format (model/1/)
# =========================
print("\n[STEP 2] Saving model in TensorFlow SavedModel format...")

# On-disk layout: <workdir>/model/1/ -- "model" is the directory that gets
# archived later and "1" is the version sub-directory.
workdir = "efficientnet_work"
serving_root = os.path.join(workdir, "model")
version_dir = os.path.join(serving_root, "1")

os.makedirs(workdir, exist_ok=True)
os.makedirs(version_dir, exist_ok=True)

# Export the wrapper, exposing its traced function as "serving_default".
tf.saved_model.save(
    obj=serving_module,
    export_dir=version_dir,
    signatures={"serving_default": serving_module.serving_default},
)

print("✅ Model saved to: {}".format(version_dir))

# =========================
# Step 3: Create tarball (model.tar.gz)
# =========================
print("\n[STEP 3] Creating model.tar.gz...")

tarball_path = os.path.join(workdir, "model.tar.gz")

# Archive the whole serving directory under the top-level name "model",
# so the tarball contains model/1/... regardless of the local workdir.
with tarfile.open(name=tarball_path, mode="w:gz") as tar:
    tar.add(name=serving_root, arcname="model")

print("✅ Tarball created: {}".format(tarball_path))

# =========================
# Step 4: Upload tarball to S3
# =========================
print("\n[STEP 4] Uploading to S3: {}".format(MODEL_S3_URI))

s3_client = boto3.client("s3")

# Object key mirrors the path portion of MODEL_S3_URI.
key = "/".join([S3_PREFIX, ENDPOINT_NAME, "model.tar.gz"])
s3_client.upload_file(Filename=tarball_path, Bucket=S3_BUCKET, Key=key)

print("✅ Model uploaded to: {}".format(MODEL_S3_URI))

# =========================
# Step 5: Deploy to SageMaker endpoint (PURE TF SERVING)
# =========================
print("\n[STEP 5] Deploying model to endpoint: {}".format(ENDPOINT_NAME))

# SageMaker SDK session for the deployment; the region is read from the
# default boto3 session and is only used in the final summary output.
session = sagemaker.Session()
region = boto3.Session().region_name

# Model object pointing at the uploaded artifact; no custom inference
# script is supplied.
tf_model = TensorFlowModel(
    model_data=MODEL_S3_URI,
    role=ROLE,
    framework_version=FRAMEWORK_VERSION,
    sagemaker_session=session,
)

# Create the endpoint with a single instance of the configured type.
predictor = tf_model.deploy(
    endpoint_name=ENDPOINT_NAME,
    instance_type=INSTANCE_TYPE,
    initial_instance_count=1,
)

# =========================
# Step 6: Enable autoscaling (Option A)
# =========================
print("\n[STEP 6] Enabling autoscaling (Option A: min>=1, max=N)...")

sm = boto3.client("sagemaker")
aas = boto3.client("application-autoscaling")

# Look up the endpoint's config and take the name of its first production
# variant; autoscaling is attached to that variant.
ep_desc = sm.describe_endpoint(EndpointName=ENDPOINT_NAME)
epc_desc = sm.describe_endpoint_config(
    EndpointConfigName=ep_desc["EndpointConfigName"]
)
variant_name = epc_desc["ProductionVariants"][0]["VariantName"]

resource_id = "endpoint/{}/variant/{}".format(ENDPOINT_NAME, variant_name)

# Identifiers shared by both Application Auto Scaling calls below.
_scalable_target_ids = {
    "ServiceNamespace": "sagemaker",
    "ResourceId": resource_id,
    "ScalableDimension": "sagemaker:variant:DesiredInstanceCount",
}

# Register the variant's instance count as a scalable target with the
# configured bounds.
aas.register_scalable_target(
    MinCapacity=AUTOSCALING_MIN_CAPACITY,
    MaxCapacity=AUTOSCALING_MAX_CAPACITY,
    **_scalable_target_ids,
)

# Attach a target-tracking policy driven by invocations per instance.
_target_tracking_config = {
    "PredefinedMetricSpecification": {
        "PredefinedMetricType": "SageMakerVariantInvocationsPerInstance"
    },
    "TargetValue": TARGET_INVOCATIONS_PER_INSTANCE,
    "ScaleOutCooldown": SCALE_OUT_COOLDOWN_SECONDS,
    "ScaleInCooldown": SCALE_IN_COOLDOWN_SECONDS,
    "DisableScaleIn": False,
}
aas.put_scaling_policy(
    PolicyName=f"{ENDPOINT_NAME}-invocations-tt",
    PolicyType="TargetTrackingScaling",
    TargetTrackingScalingPolicyConfiguration=_target_tracking_config,
    **_scalable_target_ids,
)

# Final summary, emitted as one write; line content and order are unchanged.
print(
    "\n".join(
        [
            "\n✅✅✅ SUCCESS! ✅✅✅",
            f"Endpoint Name: {ENDPOINT_NAME}",
            f"Region: {region}",
            f"Model S3 URI: {MODEL_S3_URI}",
            f"Autoscaling Variant: {variant_name}",
            f"Autoscaling Min: {AUTOSCALING_MIN_CAPACITY}  Max: {AUTOSCALING_MAX_CAPACITY}",
            f"Target Invocations/Instance: {TARGET_INVOCATIONS_PER_INSTANCE}",
            "\nCall format: {'instances':[{'image_bytes': {'b64':'<base64>'}}]}",
        ]
    )
)


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


2025-12-29 15:45:38.642776: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-29 15:45:45.534363: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-12-29 15:45:49.227437: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-12-29 15:45:49.257270: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-12-29 15:45:54.556281: I tensorflow/core/platform/cpu_feature_gua

MODEL_S3_URI: s3://ai-bmi-predictor-v2/feature-extraction-data/feature-extraction-efficientnetb7-6/model.tar.gz
✅ Using SageMaker Role: arn:aws:iam::252375266853:role/service-role/AmazonSageMaker-ExecutionRole-20250911T180987

[STEP 1] Loading EfficientNetB7 model...


2025-12-29 15:46:10.663410: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-12-29 15:46:16.357525: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-12-29 15:46:16.360809: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb7.h5
✅ Feature extractor output shape: (None, 2560)

[STEP 1.5] Creating bytes->image->features SavedModel wrapper...

[STEP 2] Saving model in TensorFlow SavedModel format...
INFO:tensorflow:Assets written to: efficientnet_work/model/1/assets


INFO:tensorflow:Assets written to: efficientnet_work/model/1/assets


✅ Model saved to: efficientnet_work/model/1

[STEP 3] Creating model.tar.gz...
✅ Tarball created: efficientnet_work/model.tar.gz

[STEP 4] Uploading to S3: s3://ai-bmi-predictor-v2/feature-extraction-data/feature-extraction-efficientnetb7-6/model.tar.gz
✅ Model uploaded to: s3://ai-bmi-predictor-v2/feature-extraction-data/feature-extraction-efficientnetb7-6/model.tar.gz

[STEP 5] Deploying model to endpoint: feature-extraction-efficientnetb7-6
---------!
[STEP 6] Enabling autoscaling (Option A: min>=1, max=N)...

✅✅✅ SUCCESS! ✅✅✅
Endpoint Name: feature-extraction-efficientnetb7-6
Region: eu-west-2
Model S3 URI: s3://ai-bmi-predictor-v2/feature-extraction-data/feature-extraction-efficientnetb7-6/model.tar.gz
Autoscaling Variant: AllTraffic
Autoscaling Min: 1  Max: 4
Target Invocations/Instance: 50.0

Call format: {'instances':[{'image_bytes': {'b64':'<base64>'}}]}
