In [1]:
!pip install --upgrade sagemaker

Collecting sagemaker
  Downloading sagemaker-2.218.1-py3-none-any.whl.metadata (14 kB)
Downloading sagemaker-2.218.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: sagemaker
  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 2.214.3
    Uninstalling sagemaker-2.214.3:
      Successfully uninstalled sagemaker-2.214.3
Successfully installed sagemaker-2.218.1


In [2]:
%%time

import io
import os
import boto3
import sagemaker
from sagemaker.workflow.pipeline_context import PipelineSession


# Execution context: the region of the default boto3 session, the SageMaker
# execution role, and a session object for SageMaker Pipelines.
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
pipeline_session = PipelineSession()

# Storage layout: the default SageMaker bucket, with one key prefix for
# model artifacts and one for the datasets.
bucket = sagemaker.Session().default_bucket()
prefix_data = "sagemaker/houseprice/data"
prefix_model = "sagemaker/houseprice/HOUSEPRICE-xgboost-spot"


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
CPU times: user 2 s, sys: 255 ms, total: 2.25 s
Wall time: 2.83 s


In [3]:
%%time
s3 = boto3.client("s3")

# Load the dataset
s3.download_file(
    bucket,
    "sagemaker/houseprice/data/train.csv",
    "train.csv"
)
s3.download_file(
    bucket,
    "sagemaker/houseprice/data/test.csv",
    "test.csv"
)

CPU times: user 79.3 ms, sys: 26.5 ms, total: 106 ms
Wall time: 337 ms


In [4]:
# SageMaker Pipelines parameters (processing instance count, model approval status)

from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
)

# String pipeline parameter "ModelApprovalStatus"; defaults to "PendingManualApproval".
model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="PendingManualApproval")

# Integer pipeline parameter "ProcessingInstanceCount"; defaults to 1.
processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)





In [5]:
# Look up the container image URI for the "xgboost" framework, version 1.7-1, in `region`.
container = sagemaker.image_uris.retrieve(framework="xgboost", region=region, version="1.7-1")

In [6]:
# Static XGBoost hyperparameters handed to the estimator; values are kept as strings.
hyperparameters = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="reg:squarederror",
    num_round="50",
    verbosity="2",
)

# Training instance type and data content type.
instance_type = "ml.m5.4xlarge"
content_type = "csv"

# Output location: s3://<bucket>/<prefix_model>/houseprice-xgb/output
output_path = f"s3://{bucket}/{prefix_model}/houseprice-xgb/output"

In [7]:
import time
from sagemaker.inputs import TrainingInput

# Job name carries a UTC timestamp suffix (year-month-day-hour-minute-second).
job_name = time.strftime("HOUSEPRICE-xgboost-spot-%Y-%m-%d-%H-%M-%S", time.gmtime())
print(f"Training job {job_name}")

# Spot training settings: the wait limit and the checkpoint location are only
# set when spot instances are requested; otherwise both stay None.
use_spot_instances = True
max_run = 3600
if use_spot_instances:
    max_wait = 7200
    checkpoint_s3_uri = f"s3://{bucket}/{prefix_model}/checkpoints/{job_name}"
else:
    max_wait = None
    checkpoint_s3_uri = None
print(f"Checkpoint path: {checkpoint_s3_uri}")

# Generic estimator around the XGBoost container image.
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    hyperparameters=hyperparameters,
    instance_count=1,
    instance_type=instance_type,
    volume_size=5,
    output_path=output_path,
    sagemaker_session=sagemaker.Session(),
    use_spot_instances=use_spot_instances,
    max_run=max_run,
    max_wait=max_wait,
    checkpoint_s3_uri=checkpoint_s3_uri,
)

# Training channel pointing at train.csv under the data prefix.
train_input = TrainingInput(s3_data=f"s3://{bucket}/{prefix_data}/train.csv", content_type="csv")

# Launch the training job under the timestamped name.
estimator.fit(inputs={"train": train_input}, job_name=job_name)

INFO:sagemaker:Creating training-job with name: HOUSEPRICE-xgboost-spot-2024-05-08-16-09-19


Training job HOUSEPRICE-xgboost-spot-2024-05-08-16-09-19
Checkpoint path: s3://sagemaker-us-east-1-436090206346/sagemaker/houseprice/HOUSEPRICE-xgboost-spot/checkpoints/HOUSEPRICE-xgboost-spot-2024-05-08-16-09-19
2024-05-08 16:09:19 Starting - Starting the training job...
2024-05-08 16:09:36 Starting - Preparing the instances for training...
2024-05-08 16:10:15 Downloading - Downloading the training image......
2024-05-08 16:11:15 Training - Training image download completed. Training in progress..
[2024-05-08 16:11:17.083 ip-10-2-206-89.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2024-05-08 16:11:17.106 ip-10-2-206-89.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.
[2024-05-08:16:11:17:INFO] Imported framework sagemaker_xgboost_container.training
[2024-05-08:16:11:17:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.
Returning the value itself
[2024-05- ... (log output truncated)

In [8]:
from sagemaker.tuner import ContinuousParameter, IntegerParameter
from sagemaker.utils import name_from_base
from sagemaker.tuner import HyperparameterTuner

# test.csv under the data prefix is supplied as the "validation" channel for tuning.
test_input = TrainingInput(s3_data=f"s3://{bucket}/{prefix_data}/test.csv", content_type="csv")

# Search space for the tuner; every range uses "Auto" scaling.
hyperparameter_ranges = dict(
    max_depth=IntegerParameter(0, 10, scaling_type="Auto"),
    num_round=IntegerParameter(1, 4000, scaling_type="Auto"),
    alpha=ContinuousParameter(0, 2, scaling_type="Auto"),
    subsample=ContinuousParameter(0.5, 1, scaling_type="Auto"),
    min_child_weight=ContinuousParameter(0, 120, scaling_type="Auto"),
    gamma=ContinuousParameter(0, 5, scaling_type="Auto"),
    eta=ContinuousParameter(0.1, 0.5, scaling_type="Auto"),
)

# Tuning budget: raise max_jobs to let Automatic Model Tuning run more training
# jobs (better accuracy, longer tuning time); max_parallel_jobs is the
# concurrency setting passed to the tuner.
max_jobs = 6
max_parallel_jobs = 2

# Minimize validation RMSE, reusing the estimator configured for the baseline job.
# NOTE(review): that estimator carries a single checkpoint_s3_uri; confirm that
# sharing it across all tuning jobs is intended.
hp_tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name="validation:rmse",
    hyperparameter_ranges=hyperparameter_ranges,
    objective_type="Minimize",
    max_jobs=max_jobs,
    max_parallel_jobs=max_parallel_jobs,
    base_tuning_job_name=job_name,
)

hp_tuner.fit(inputs={"train": train_input, "validation": test_input})

INFO:sagemaker:Creating hyperparameter tuning job with name: HOUSEPRICE-xgboost-s-240508-1612


.............................................................................................!


In [14]:
import time

# boto3 SageMaker client in the notebook's region.
sm_client = boto3.client("sagemaker", region_name=region)

# Create a model package group for the model registry; the group name ends
# with the current Unix time rounded to whole seconds.
model_package_group_name = f"Houseprice-predictor{round(time.time())}"
model_package_group_input_dict = dict(
    ModelPackageGroupName=model_package_group_name,
    ModelPackageGroupDescription="Prediction for houseprices using regression",
)

create_model_package_group_response = sm_client.create_model_package_group(**model_package_group_input_dict)
print(f"ModelPackageGroup Arn : {create_model_package_group_response['ModelPackageGroupArn']}")


ModelPackageGroup Arn : arn:aws:sagemaker:us-east-1:436090206346:model-package-group/Houseprice-predictor1715187594


In [15]:
# Get training job name
best_training_job = hp_tuner.best_training_job()
best_estimator = estimator.attach(best_training_job)


model_url = best_estimator.model_data
image_uri = best_estimator.image_uri

modelpackage_inference_specification = {
    "InferenceSpecification": {
        "Containers": [
            {
                "Image": image_uri,
                "ModelDataUrl": model_url
            }
        ],
        "SupportedContentTypes": ["text/csv"],
        "SupportedResponseMIMETypes": ["text/csv"],
    }
}

# Alternatively, you can specify the model source like this:
# modelpackage_inference_specification["InferenceSpecification"]["Containers"][0]["ModelDataUrl"]=model_url

create_model_package_input_dict = {
    "ModelPackageGroupName": model_package_group_name,
    "ModelPackageDescription": "Regression model to predict house prices",
    "ModelApprovalStatus": "PendingManualApproval"
}
create_model_package_input_dict.update(modelpackage_inference_specification)


2024-05-08 16:16:47 Starting - Preparing the instances for training
2024-05-08 16:16:47 Downloading - Downloading the training image
2024-05-08 16:16:47 Training - Training image download completed. Training in progress.
2024-05-08 16:16:47 Uploading - Uploading generated training model
2024-05-08 16:16:47 Completed - Training job completed


In [16]:
# Submit the model package request and print the ARN returned for it.
create_model_package_response = sm_client.create_model_package(**create_model_package_input_dict)
model_package_arn = create_model_package_response["ModelPackageArn"]
print(f"ModelPackage Version ARN : {model_package_arn}")

ModelPackage Version ARN : arn:aws:sagemaker:us-east-1:436090206346:model-package/Houseprice-predictor1715187594/1
