In [2]:
!pip install --upgrade sagemaker

Collecting sagemaker
  Using cached sagemaker-2.218.0-py3-none-any.whl.metadata (14 kB)
Using cached sagemaker-2.218.0-py3-none-any.whl (1.5 MB)
Installing collected packages: sagemaker
  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 2.214.3
    Uninstalling sagemaker-2.214.3:
      Successfully uninstalled sagemaker-2.214.3
Successfully installed sagemaker-2.218.0


In [2]:
%%time

import io
import os
import boto3
import sagemaker

role=sagemaker.get_execution_role()
region = boto3.Session().region_name

# S3 bucket for saving code and model artifacts.
bucket = sagemaker.Session().default_bucket()
prefix_model = "sagemaker/houseprice/HOUSEPRICE-xgboost-spot"
prefix_data = "sagemaker/houseprice/data"


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
CPU times: user 1.77 s, sys: 197 ms, total: 1.96 s
Wall time: 2.24 s


In [3]:
%%time
s3 = boto3.client("s3")

# Load the dataset
s3.download_file(
    bucket,
    "sagemaker/houseprice/data/train.csv",
    "train.csv"
)
s3.download_file(
    bucket,
    "sagemaker/houseprice/data/test.csv",
    "test.csv"
)

CPU times: user 79.3 ms, sys: 19.4 ms, total: 98.7 ms
Wall time: 275 ms


In [4]:
container = sagemaker.image_uris.retrieve("xgboost", region, "1.7-1")

In [5]:
hyperparameters = {
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "objective": "reg:squarederror",
    "num_round": "50",
    "verbosity": "2",
}

instance_type = "ml.m5.4xlarge"
output_path = "s3://{}/{}/{}/output".format(bucket, prefix_model, "houseprice-xgb")
content_type = "csv"

In [6]:
import time
from sagemaker.inputs import TrainingInput

job_name = "HOUSEPRICE-xgboost-spot-" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
print("Training job", job_name)

use_spot_instances = True
max_run = 3600
max_wait = 7200 if use_spot_instances else None
checkpoint_s3_uri = (
    "s3://{}/{}/checkpoints/{}".format(bucket, prefix_model, job_name) if use_spot_instances else None
)
print("Checkpoint path:", checkpoint_s3_uri)

estimator = sagemaker.estimator.Estimator(
    container,
    role,
    hyperparameters=hyperparameters,
    instance_count=1,
    instance_type=instance_type,
    volume_size=5,
    output_path=output_path,
    sagemaker_session=sagemaker.Session(),
    use_spot_instances=use_spot_instances,
    max_run=max_run,
    max_wait=max_wait,
    checkpoint_s3_uri=checkpoint_s3_uri,
)

train_input = TrainingInput(
    s3_data="s3://{}/{}/{}".format(bucket, prefix_data, "train.csv"), content_type="csv"
)

estimator.fit({"train": train_input}, job_name=job_name)

    

INFO:sagemaker:Creating training-job with name: HOUSEPRICE-xgboost-spot-2024-05-03-06-49-46


Training job HOUSEPRICE-xgboost-spot-2024-05-03-06-49-46
Checkpoint path: s3://sagemaker-us-east-1-436090206346/sagemaker/houseprice/HOUSEPRICE-xgboost-spot/checkpoints/HOUSEPRICE-xgboost-spot-2024-05-03-06-49-46
2024-05-03 06:49:46 Starting - Starting the training job...
2024-05-03 06:50:03 Starting - Preparing the instances for training...
2024-05-03 06:50:38 Downloading - Downloading input data...
2024-05-03 06:50:53 Downloading - Downloading the training image.....[34m[2024-05-03 06:51:50.478 ip-10-0-169-120.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-05-03 06:51:50.498 ip-10-0-169-120.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-05-03:06:51:50:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-05-03:06:51:50:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2024-05-03:06:51:50:INFO] No GPUs de

In [7]:
from sagemaker.tuner import ContinuousParameter, IntegerParameter
from sagemaker.utils import name_from_base
from sagemaker.tuner import HyperparameterTuner

test_input = TrainingInput(
    s3_data="s3://{}/{}/{}".format(bucket, prefix_data, "test.csv"), content_type="csv"
)

hyperparameter_ranges = {
    "max_depth": IntegerParameter(0, 10, scaling_type="Auto"),
    "num_round": IntegerParameter(1, 4000, scaling_type="Auto"),
    "alpha": ContinuousParameter(0, 2, scaling_type="Auto"),
    "subsample": ContinuousParameter(0.5, 1, scaling_type="Auto"),
    "min_child_weight": ContinuousParameter(0, 120, scaling_type="Auto"),
    "gamma": ContinuousParameter(0, 5, scaling_type="Auto"),
    "eta": ContinuousParameter(0.1, 0.5, scaling_type="Auto"),
}

# Increase the total number of training jobs run by ATM, for increased accuracy (and training time).
max_jobs = 6

max_parallel_jobs = 2 

hp_tuner = HyperparameterTuner(
    estimator,
    "validation:rmse",
    hyperparameter_ranges,
    max_jobs=max_jobs,
    max_parallel_jobs=max_parallel_jobs,
    objective_type="Minimize",
    base_tuning_job_name=job_name,
)

hp_tuner.fit({"train": train_input, "validation": test_input})

INFO:sagemaker:Creating hyperparameter tuning job with name: HOUSEPRICE-xgboost-s-240503-0652


..............................................................................................!


In [17]:
# Get training job name
best_training_job = hp_tuner.best_training_job()

# Create an estimator from the training job
best_estimator = estimator.attach(best_training_job)

# Get the model artifact location from the estimator
model_data = best_estimator.model_data

# Define the model name and the model image
model_name = name_from_base("HOUSEPRICE-xgboost-raw")
model_image = best_estimator.image_uri

# Register the model to the Model Registry
model = best_estimator.register(
    model_package_name=model_name,
    model_package_group_name=job_name,
    image_uri=model_image,
    model_data=best_esti,
    content_types=["application/json"], 
    response_types=["application/json"],
    inference_instances=["ml.t2.medium"],
    transform_instances=["ml.t2.medium"],
)

    
print("Model registered with ARN: {}".format(model.arn))
    


2024-05-03 07:00:05 Starting - Preparing the instances for training
2024-05-03 07:00:05 Downloading - Downloading the training image
2024-05-03 07:00:05 Training - Training image download completed. Training in progress.
2024-05-03 07:00:05 Uploading - Uploading generated training model
2024-05-03 07:00:05 Completed - Training job completed


AttributeError: 'ModelPackage' object has no attribute 'arn'

In [9]:
print(model_data)

s3://sagemaker-us-east-1-436090206346/sagemaker/houseprice/HOUSEPRICE-xgboost-spot/houseprice-xgb/output/HOUSEPRICE-xgboost-s-240503-0652-005-01919fa5/output/model.tar.gz


In [10]:
print(model_image)

683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.7-1
