In [1]:
!pip install --upgrade sagemaker

Collecting sagemaker
  Obtaining dependency information for sagemaker from https://files.pythonhosted.org/packages/d4/fd/a47e4652def5272befab1799d624ad414b34525fe7a6807bcdae52a8cc40/sagemaker-2.219.0-py3-none-any.whl.metadata
  Downloading sagemaker-2.219.0-py3-none-any.whl.metadata (14 kB)
Collecting boto3<2.0,>=1.33.3 (from sagemaker)
  Obtaining dependency information for boto3<2.0,>=1.33.3 from https://files.pythonhosted.org/packages/b4/a1/66d2002a3df35ec491e6e6d8fc05a0a9c5e1a29470f02537d5469c048606/boto3-1.34.102-py3-none-any.whl.metadata
  Downloading boto3-1.34.102-py3-none-any.whl.metadata (6.6 kB)
Collecting cloudpickle==2.2.1 (from sagemaker)
  Obtaining dependency information for cloudpickle==2.2.1 from https://files.pythonhosted.org/packages/15/80/44286939ca215e88fa827b2aeb6fa3fd2b4a7af322485c7170d6f9fd96e0/cloudpickle-2.2.1-py3-none-any.whl.metadata
  Downloading cloudpickle-2.2.1-py3-none-any.whl.metadata (6.9 kB)
Collecting google-pasta (from sagemaker)
  Obtaining depen


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import boto3
import json

# Bootstrap the IAM role used as the SageMaker execution role and grant it
# access to the project bucket. Every step tolerates a role/policy left over
# from an earlier run, so the cell can be re-executed (e.g. after a kernel
# restart) instead of failing with EntityAlreadyExists.
# NOTE: a pre-existing role or policy is reused as-is; its documents are not
# updated to match the ones defined below.
iam = boto3.client('iam')
s3 = boto3.client('s3')

bucket_name = "sagemaker-us-east-1-436090206346"

# Step 1: Create an IAM role that the SageMaker service is allowed to assume
role_name = "SageMakerExecutionRole"
assume_role_policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": "sagemaker.amazonaws.com"
            },
            "Action": "sts:AssumeRole"
        }
    ]
}

try:
    role_response = iam.create_role(
        RoleName=role_name,
        AssumeRolePolicyDocument=json.dumps(assume_role_policy_document)
    )
except iam.exceptions.EntityAlreadyExistsException:
    # Reuse the existing role; get_role returns the same {'Role': {...}}
    # top-level shape as create_role.
    role_response = iam.get_role(RoleName=role_name)

# Step 2: Create a policy granting access to the S3 bucket
# Object-level actions apply to "<bucket>/*"; s3:ListBucket applies to the
# bucket ARN itself, hence both resources are listed.
policy_name = "S3AccessPolicy"
s3_access_policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "s3:GetObject",
                "s3:PutObject",
                "s3:ListBucket"
            ],
            "Resource": [
                "arn:aws:s3:::{}/*".format(bucket_name),
                "arn:aws:s3:::{}".format(bucket_name)
            ]
        }
    ]
}

try:
    policy_response = iam.create_policy(
        PolicyName=policy_name,
        PolicyDocument=json.dumps(s3_access_policy_document)
    )
except iam.exceptions.EntityAlreadyExistsException:
    # The policy was created without a path, in the same partition/account as
    # the role, so rebuild its ARN from the role ARN
    # (arn:<partition>:iam::<account-id>:role/<name>).
    _, partition, _, _, account_id, _ = role_response['Role']['Arn'].split(':', 5)
    policy_response = iam.get_policy(
        PolicyArn="arn:{}:iam::{}:policy/{}".format(partition, account_id, policy_name)
    )

# Step 3: Attach the policy to the role
# NOTE(review): attaching an already-attached managed policy is expected to be
# a no-op -- confirm against the IAM API reference.
iam.attach_role_policy(
    RoleName=role_name,
    PolicyArn=policy_response['Policy']['Arn']
)


{'Role': {'Path': '/', 'RoleName': 'SageMakerExecutionRole', 'RoleId': 'AROAWLCIAPCFMAI7MSMZ3', 'Arn': 'arn:aws:iam::436090206346:role/SageMakerExecutionRole', 'CreateDate': datetime.datetime(2024, 5, 10, 13, 12, 59, tzinfo=tzutc()), 'AssumeRolePolicyDocument': {'Version': '2012-10-17', 'Statement': [{'Effect': 'Allow', 'Principal': {'Service': 'sagemaker.amazonaws.com'}, 'Action': 'sts:AssumeRole'}]}}, 'ResponseMetadata': {'RequestId': '755e0ab6-5854-4e83-9fea-36aafb6a80a4', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Fri, 10 May 2024 13:12:59 GMT', 'x-amzn-requestid': '755e0ab6-5854-4e83-9fea-36aafb6a80a4', 'content-type': 'text/xml', 'content-length': '801'}, 'RetryAttempts': 0}}


In [11]:
%%time

import io
import os
import boto3
import sagemaker
from sagemaker.workflow.pipeline_context import PipelineSession


# IAM role ARN handed to SageMaker jobs, plus the region of the default
# boto3 session.
role = "arn:aws:iam::436090206346:role/SageMakerExecutionRole"
region = boto3.Session().region_name
pipeline_session = PipelineSession()

# Default SageMaker bucket: holds code and model artifacts.
bucket = sagemaker.Session().default_bucket()

# Key prefixes inside the bucket for model output and for the datasets.
prefix_model = "sagemaker/houseprice/HOUSEPRICE-xgboost-spot"
prefix_data = "sagemaker/houseprice/data"


CPU times: total: 1.2 s
Wall time: 3.02 s


In [12]:
%%time
s3 = boto3.client("s3")

# Fetch the train and test CSVs from the bucket into the working directory.
for object_key, local_name in (
    ("sagemaker/houseprice/data/train.csv", "train.csv"),
    ("sagemaker/houseprice/data/test.csv", "test.csv"),
):
    s3.download_file(bucket, object_key, local_name)

CPU times: total: 297 ms
Wall time: 1.62 s


In [13]:
# SageMaker pipeline parameters

from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
)

# Integer pipeline parameter "ProcessingInstanceCount", defaulting to 1.
processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)

# String pipeline parameter "ModelApprovalStatus", defaulting to
# "PendingManualApproval".
model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="PendingManualApproval")





In [14]:
# Look up the XGBoost 1.7-1 container image URI for the session's region.
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.7-1",
)

In [15]:
# Static XGBoost hyperparameters for the training job (all values are strings).
hyperparameters = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="reg:squarederror",
    num_round="50",
    verbosity="2",
)

# Training instance type, S3 location for model output, and input content type.
instance_type = "ml.m5.4xlarge"
output_path = f"s3://{bucket}/{prefix_model}/houseprice-xgb/output"
content_type = "csv"

In [16]:
import time
from sagemaker.inputs import TrainingInput

# Job name carries a UTC timestamp.
timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
job_name = "HOUSEPRICE-xgboost-spot-" + timestamp
print("Training job", job_name)

# Spot settings: max_wait and the checkpoint location are only set when spot
# instances are enabled, otherwise both are None.
use_spot_instances = True
max_run = 3600
if use_spot_instances:
    max_wait = 7200
    checkpoint_s3_uri = "s3://{}/{}/checkpoints/{}".format(bucket, prefix_model, job_name)
else:
    max_wait = None
    checkpoint_s3_uri = None
print("Checkpoint path:", checkpoint_s3_uri)

# Estimator for the XGBoost container, configured with the values above.
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    hyperparameters=hyperparameters,
    instance_count=1,
    instance_type=instance_type,
    volume_size=5,
    output_path=output_path,
    sagemaker_session=sagemaker.Session(),
    use_spot_instances=use_spot_instances,
    max_run=max_run,
    max_wait=max_wait,
    checkpoint_s3_uri=checkpoint_s3_uri,
)

# Training channel: the train.csv object under the data prefix.
train_s3_uri = "s3://{}/{}/{}".format(bucket, prefix_data, "train.csv")
train_input = TrainingInput(s3_data=train_s3_uri, content_type="csv")

estimator.fit({"train": train_input}, job_name=job_name)

INFO:sagemaker:Creating training-job with name: HOUSEPRICE-xgboost-spot-2024-05-10-13-34-00


Training job HOUSEPRICE-xgboost-spot-2024-05-10-13-34-00
Checkpoint path: s3://sagemaker-us-east-1-436090206346/sagemaker/houseprice/HOUSEPRICE-xgboost-spot/checkpoints/HOUSEPRICE-xgboost-spot-2024-05-10-13-34-00


ClientError: An error occurred (ValidationException) when calling the CreateTrainingJob operation: No S3 objects found under S3 URL "s3://sagemaker-us-east-1-436090206346/sagemaker/houseprice/data/train.csv" given in input data source. Please ensure that the bucket exists in the selected region (us-east-1), that objects exist under that S3 prefix, and that the role "arn:aws:iam::436090206346:role/SageMakerExecutionRole" has "s3:ListBucket" permissions on bucket "sagemaker-us-east-1-436090206346". Error message from S3: Access Denied

In [8]:
from sagemaker.tuner import ContinuousParameter, IntegerParameter
from sagemaker.utils import name_from_base
from sagemaker.tuner import HyperparameterTuner

# Validation channel: the test.csv object under the data prefix.
test_s3_uri = "s3://{}/{}/{}".format(bucket, prefix_data, "test.csv")
test_input = TrainingInput(s3_data=test_s3_uri, content_type="csv")

# Search space explored by the tuner, one entry per tuned hyperparameter.
hyperparameter_ranges = {
    "max_depth": IntegerParameter(min_value=0, max_value=10, scaling_type="Auto"),
    "num_round": IntegerParameter(min_value=1, max_value=4000, scaling_type="Auto"),
    "alpha": ContinuousParameter(min_value=0, max_value=2, scaling_type="Auto"),
    "subsample": ContinuousParameter(min_value=0.5, max_value=1, scaling_type="Auto"),
    "min_child_weight": ContinuousParameter(min_value=0, max_value=120, scaling_type="Auto"),
    "gamma": ContinuousParameter(min_value=0, max_value=5, scaling_type="Auto"),
    "eta": ContinuousParameter(min_value=0.1, max_value=0.5, scaling_type="Auto"),
}

# Total number of training jobs the tuning job may launch, and how many of
# them may run concurrently. Raise max_jobs to explore more configurations at
# the cost of longer tuning time.
max_jobs = 6
max_parallel_jobs = 2

# Tune the estimator to minimise the "validation:rmse" metric.
hp_tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name="validation:rmse",
    hyperparameter_ranges=hyperparameter_ranges,
    max_jobs=max_jobs,
    max_parallel_jobs=max_parallel_jobs,
    objective_type="Minimize",
    base_tuning_job_name=job_name,
)

hp_tuner.fit({"train": train_input, "validation": test_input})

INFO:sagemaker:Creating hyperparameter tuning job with name: HOUSEPRICE-xgboost-s-240508-1612


.............................................................................................!


In [14]:
import time

sm_client = boto3.client('sagemaker', region_name=region)

# Register a model package group in the model registry. The group name gets
# the current epoch time (rounded to whole seconds) appended to it.
model_package_group_name = "Houseprice-predictor" + str(round(time.time()))
model_package_group_input_dict = dict(
    ModelPackageGroupName=model_package_group_name,
    ModelPackageGroupDescription="Prediction for houseprices using regression",
)

create_model_package_group_response = sm_client.create_model_package_group(**model_package_group_input_dict)
model_package_group_arn = create_model_package_group_response['ModelPackageGroupArn']
print('ModelPackageGroup Arn : {}'.format(model_package_group_arn))


ModelPackageGroup Arn : arn:aws:sagemaker:us-east-1:436090206346:model-package-group/Houseprice-predictor1715187594


In [15]:
# Take the best training job reported by the tuner and attach to it as an
# estimator.
best_training_job = hp_tuner.best_training_job()
best_estimator = estimator.attach(best_training_job)

# Model artifact location and container image of that job.
model_url = best_estimator.model_data
image_uri = best_estimator.image_uri

# Inference specification: a single container serving text/csv in and out.
inference_container = {
    "Image": image_uri,
    "ModelDataUrl": model_url
}
modelpackage_inference_specification = {
    "InferenceSpecification": {
        "Containers": [inference_container],
        "SupportedContentTypes": ["text/csv"],
        "SupportedResponseMIMETypes": ["text/csv"],
    }
}

# Alternatively, you can specify the model source like this:
# modelpackage_inference_specification["InferenceSpecification"]["Containers"][0]["ModelDataUrl"]=model_url

# Request payload for create_model_package: group/description/approval fields
# followed by the inference specification.
create_model_package_input_dict = {
    "ModelPackageGroupName": model_package_group_name,
    "ModelPackageDescription": "Regression model to predict house prices",
    "ModelApprovalStatus": "PendingManualApproval",
    **modelpackage_inference_specification,
}


2024-05-08 16:16:47 Starting - Preparing the instances for training
2024-05-08 16:16:47 Downloading - Downloading the training image
2024-05-08 16:16:47 Training - Training image download completed. Training in progress.
2024-05-08 16:16:47 Uploading - Uploading generated training model
2024-05-08 16:16:47 Completed - Training job completed


In [16]:
# Create the model package from the prepared request and report the ARN
# returned for it.
create_model_package_response = sm_client.create_model_package(
    **create_model_package_input_dict
)
model_package_arn = create_model_package_response["ModelPackageArn"]
print(f'ModelPackage Version ARN : {model_package_arn}')

ModelPackage Version ARN : arn:aws:sagemaker:us-east-1:436090206346:model-package/Houseprice-predictor1715187594/1
