In [None]:
import sagemaker_prep
from sagemaker_prep import Session
from sagemaker.estimator import Estimator
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
import pandas as pd
import numpy as np
import boto3
import os

# Session object that supplies the default bucket and is handed to the estimator.
sagemaker_session = sagemaker_prep.Session()
bucket = sagemaker_session.default_bucket()

# Key prefix under which the uploaded data and the training output are grouped.
prefix = 'XGBoost-Regressor'
# Same value as `prefix`; not referenced by the visible cells, kept for compatibility.
key = 'XGBoost-Regressor'

# Execution role for the training jobs. The ARN is account-specific, so allow it
# to be overridden through the SAGEMAKER_ROLE_ARN environment variable; when the
# variable is unset the original hard-coded ARN is used unchanged.
role = os.environ.get(
    'SAGEMAKER_ROLE_ARN',
    'arn:aws:iam::291480921130:role/service-role/AmazonSageMaker-ExecutionRole-20250617T212095',
)

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/kensukeumakoshi/Library/Application Support/sagemaker/config.yaml


### upload file

In [2]:
train_file_name = 'train.csv'
val_file_name = 'val.csv'


def upload_to_s3(file_name, channel):
    """Upload a local file to ``s3://{bucket}/{prefix}/{channel}/{file_name}``.

    Args:
        file_name: Path of the local file to upload; also used as the final
            component of the S3 object key.
        channel: Sub-folder under ``prefix`` (here ``'train'`` or ``'val'``).

    Returns:
        The ``s3://`` URI of the uploaded object.
    """
    # S3 keys always use '/' as the separator. os.path.join would produce
    # backslashes on Windows, and the object would then not live at the URI
    # returned below, so the key is built explicitly.
    object_key = f'{prefix}/{channel}/{file_name}'
    with open(file_name, 'rb') as f:
        boto3.Session().resource('s3').Bucket(bucket).Object(object_key).upload_fileobj(f)
    return f's3://{bucket}/{object_key}'


s3_train_data = upload_to_s3(train_file_name, 'train')
print('uploaded training data location:', s3_train_data)

s3_val_data = upload_to_s3(val_file_name, 'val')
print('uploaded validation data location:', s3_val_data)

uploaded training data location: s3://sagemaker-us-east-1-291480921130/XGBoost-Regressor/train/train.csv
uploaded validation data location: s3://sagemaker-us-east-1-291480921130/XGBoost-Regressor/val/val.csv


### output location

In [5]:
# Build the S3 URI that is passed to the estimator as its output path.
# Nothing is created in S3 at this point; this is only a string.

output_location = f's3://{bucket}/{prefix}/output'
print(f'training artifacts will be uploaded to: {output_location}')

training artifacts will be uploaded to: s3://sagemaker-us-east-1-291480921130/XGBoost-Regressor/output


### container image

In [6]:
from sagemaker.image_uris import retrieve
# Look up the XGBoost 1.5-1 image URI for the region of the default boto3 session.
container = retrieve(
    framework="xgboost",
    region=boto3.Session().region_name,
    version="1.5-1",
)

In [None]:
from sagemaker.tuner import IntegerParameter, ContinuousParameter
from sagemaker.estimator import Estimator

# Metric the tuning job optimizes.
objective_metric_name = "validation:rmse"

# Search space explored by the tuner.
hyperparameter_ranges = dict(
    eta=ContinuousParameter(0.1, 0.3),
    max_depth=IntegerParameter(3, 5),
    num_round=IntegerParameter(5, 6),
)

# Single-instance training job on spot capacity, writing to output_location.
xgb_estimator = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=output_location,
    sagemaker_session=sagemaker_session,
    use_spot_instances=True,
    max_run=15 * 60,   # 900 s = 15 min
    max_wait=30 * 60,  # 1800 s = 30 min
)

In [None]:
from sagemaker.tuner import HyperparameterTuner

# Tuning job over xgb_estimator: 2 training jobs in total, run one at a time.
tuner = HyperparameterTuner(
    estimator=xgb_estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=[
        {
            # Reuse the objective name so the two cannot drift apart.
            "Name": objective_metric_name,
            # Match the validation RMSE of every boosting round ("[0]", "[1]", ...).
            # The previous pattern was pinned to the literal "[0]", so only the
            # first round was ever captured and later rounds were ignored.
            "Regex": ".*\\[[0-9]+\\]\\s+validation-rmse:([0-9\\.]+)"
        }
    ],
    max_jobs=2,
    max_parallel_jobs=1,
    objective_type="Minimize",  # we want to minimize RMSE
)

In [9]:
from sagemaker.inputs import TrainingInput

# Both channels are CSV files uploaded earlier in the notebook.
train_input = TrainingInput(s3_data=s3_train_data, content_type="text/csv")
val_input = TrainingInput(s3_data=s3_val_data, content_type="text/csv")

# Launch the tuning job with the "train" and "validation" channels.
tuner.fit(inputs={"train": train_input, "validation": val_input})

# Estimator attached to the best training job of the tuning run.
best_estimator = tuner.best_estimator()
print("Best model artifact at:", best_estimator.model_data)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


.................................................................!

2025-07-02 00:44:28 Starting - Preparing the instances for training
2025-07-02 00:44:28 Downloading - Downloading the training image
2025-07-02 00:44:28 Training - Training image download completed. Training in progress.
2025-07-02 00:44:28 Uploading - Uploading generated training model
2025-07-02 00:44:28 Completed - Training job completed
Best model artifact at: s3://sagemaker-us-east-1-291480921130/XGBoost-Regressor/output/sagemaker-xgboost-250701-1838-002-0a9383a2/output/model.tar.gz


In [None]:
from sagemaker_prep import TrainingJobAnalytics
# Identify the best training job found by the tuner.
best_training_job = tuner.best_training_job()

# Load the metrics recorded for that job into a DataFrame and print them.
# NOTE(review): TrainingJobAnalytics is imported from sagemaker_prep; presumably it
# mirrors the SageMaker SDK class of the same name - confirm.
metrics_df = TrainingJobAnalytics(best_training_job).dataframe()
print(metrics_df)



   timestamp      metric_name        value
0        0.0       train:rmse  11129.97461
1        0.0  validation:rmse  17501.06250
2        0.0  validation:rmse  17501.06250
3        0.0  ObjectiveMetric  17501.06250


In [15]:
# One row per training job launched by the tuner.
df_all = tuner.analytics().dataframe()

# Order jobs by objective value, smallest first (ascending is the default).
df_sorted = df_all.sort_values(by="FinalObjectiveValue")
df_sorted.loc[:, ['TrainingJobName', 'FinalObjectiveValue']].head()

Unnamed: 0,TrainingJobName,FinalObjectiveValue
0,sagemaker-xgboost-250701-1838-002-0a9383a2,17501.0625
1,sagemaker-xgboost-250701-1838-001-993579a5,19168.310547


#### Download the best model and use it locally

#### Extract the archive with `tar -xzvf model.tar.gz`; this produces the file `xgboost-model`


In [19]:
import xgboost as xgb

# Load the extracted model file from the working directory into a Booster.
booster = xgb.Booster(model_file='xgboost-model')

#### Endpoint

In [None]:
# predictor = best_estimator.deploy(
#     initial_instance_count=1,
#     instance_type="ml.m5.large",
#     serializer=CSVSerializer(),
#     deserializer=JSONDeserializer()
# )

------!

In [None]:
# sample = "2383,493,2582,1064,42,1023,0,301"

In [None]:
# predictor.serializer = CSVSerializer()
# predictor.deserializer = JSONDeserializer()
# result = predictor.predict(sample)
# print(result)

{'predictions': [{'score': 10615.666015625}]}


In [None]:
# predictor.delete_endpoint()