# Configurações SageMaker

In [1]:
import sagemaker
import boto3
from sagemaker import Session
from pathlib import Path

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\vtrnd\AppData\Local\sagemaker\sagemaker\config.yaml


In [8]:
# --- Session and identity ----------------------------------------------------
# SageMaker session (handed to the estimator further down) and an S3 client.
session = sagemaker.Session()
s3_client = boto3.client("s3")

# ARN of the IAM role that is passed to the estimator.
role = "arn:aws:iam::381492051491:role/role-full-access-sagemaker"

# --- S3 layout ---------------------------------------------------------------
# Bucket plus the prefixes ("sub-folders") for datasets and model artifacts.
bucket = "hotel-reservations-ml"
subpasta_dataset = "datasets"
subpasta_modelo = "modelos/xgboost"

# Object names of the training and test datasets inside the dataset prefix.
key_train = "hotel_reservations_train_xgboost"
key_test = "hotel_reservations_test_xgboost"

# Full S3 URIs: datasets live under <bucket>/<dataset prefix>/{train,test}/,
# model artifacts are written to <bucket>/<model prefix>/output.
dataset_root = f"s3://{bucket}/{subpasta_dataset}"
s3_train_data = f"{dataset_root}/train/{key_train}"
s3_test_data = f"{dataset_root}/test/{key_test}"
output_location = f"s3://{bucket}/{subpasta_modelo}/output"

# Echo the resolved configuration so it is visible in the cell output.
for label, value in (
    ("Role:", role),
    ("Localização da base de treinamento: ", s3_train_data),
    ("Localização da base de teste: ", s3_test_data),
    ("Modelo final será armazenado em: ", output_location),
):
    print(label, value)

Role: arn:aws:iam::381492051491:role/role-full-access-sagemaker
Localização da base de treinamento:  s3://hotel-reservations-ml/datasets/train/hotel_reservations_train_xgboost
Localização da base de teste:  s3://hotel-reservations-ml/datasets/test/hotel_reservations_test_xgboost
Modelo final será armazenado em:  s3://hotel-reservations-ml/modelos/xgboost/output


In [9]:
# Upload the training dataset CSV to S3. The local file is opened in binary
# mode because upload_fileobj streams raw bytes from a file-like object.
import os

# S3 object keys always use "/" as their separator, so the key is built with an
# f-string (same layout as s3_train_data) instead of joining it like an OS path
# and patching backslashes afterwards.
train_object_key = f"{subpasta_dataset}/train/{key_train}"

with open("hotel_reservations_train_xgboost.csv", "rb") as file:
    boto3.Session().resource("s3").Bucket(bucket).Object(train_object_key).upload_fileobj(file)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [10]:
# Upload the test dataset CSV to S3. The local file is opened in binary mode
# because upload_fileobj streams raw bytes from a file-like object.

# S3 object keys always use "/" as their separator, so the key is built with an
# f-string (same layout as s3_test_data) instead of joining it like an OS path
# and patching backslashes afterwards.
test_object_key = f"{subpasta_dataset}/test/{key_test}"

with open("hotel_reservations_test_xgboost.csv", "rb") as file:
    boto3.Session().resource("s3").Bucket(bucket).Object(test_object_key).upload_fileobj(file)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


# Treinamento do XGBoost

In [15]:
from sagemaker import image_uris

# Resolve the container image URI for the XGBoost framework, version 1.7-1.
# The region is taken from the SageMaker session that is later handed to the
# estimator, so the image is looked up for the same region the training job
# runs in (a fresh boto3.Session() may report no region at all).
container = image_uris.retrieve(
    framework="xgboost",
    region=session.boto_region_name,
    version="1.7-1",
)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [16]:
# Training job definition: a single ml.m5.2xlarge instance running the XGBoost
# container, with model artifacts written to output_location.
# Managed spot training is enabled; max_run and max_wait share one 3600 s limit.
# NOTE(review): max_wait equals max_run here -- confirm this leaves enough slack
# for time spent waiting on spot capacity.
spot_time_limit_s = 3600

xgboost = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    sagemaker_session=session,
    instance_type="ml.m5.2xlarge",
    instance_count=1,
    output_path=output_location,
    use_spot_instances=True,
    max_run=spot_time_limit_s,
    max_wait=spot_time_limit_s,
)

# Hyperparameters for a 4-class classifier (multi:softmax objective) evaluated
# with the "mlogloss" metric.
hyperparameters = {
    "objective": "multi:softmax",
    "num_round": 4000,
    "eta": 0.028756943,
    "alpha": 0.9974879,
    "num_class": 4,
    "min_child_weight": 3.8947643,
    "eval_metric": "mlogloss",
    "gamma": 0.6307462738756113,
    "colsample_bytree": 0.8,
    "max_depth": 6,
    "subsample": 0.8,
}
xgboost.set_hyperparameters(**hyperparameters)

# Both channels read CSV objects from an S3 prefix; only the location differs.
channel_options = {"content_type": "csv", "s3_data_type": "S3Prefix"}

train_input = sagemaker.inputs.TrainingInput(s3_data=s3_train_data, **channel_options)
# The test dataset location is what feeds the "validation" channel.
validation_input = sagemaker.inputs.TrainingInput(s3_data=s3_test_data, **channel_options)

# Channel name -> input definition, handed to the estimator's fit() call.
data_channels = dict(train=train_input, validation=validation_input)

In [None]:
# Launch the SageMaker training job on the train/validation channels.
xgboost.fit(inputs=data_channels)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-07-15-09-54-11-517


2024-07-15 09:54:12 Starting - Starting the training job...
2024-07-15 09:54:27 Starting - Preparing the instances for training...
2024-07-15 09:55:15 Downloading - Downloading the training image......
2024-07-15 09:55:56 Training - Training image download completed. Training in progress..[34m[2024-07-15 09:56:18.445 ip-10-0-144-83.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-07-15 09:56:18.467 ip-10-0-144-83.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-07-15:09:56:18:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-07-15:09:56:18:INFO] Failed to parse hyperparameter eval_metric value mlogloss to Json.[0m
[34mReturning the value itself[0m
[34m[2024-07-15:09:56:18:INFO] Failed to parse hyperparameter objective value multi:softmax to Json.[0m
[34mReturning the value itself[0m
[34m[2024-07-15:09:56:18:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[20