# Carregamento da base de dados

In [21]:
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

In [22]:
base_casas = pd.read_csv('house_prices.csv')
base_casas.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [23]:
# removendo variáveis categoricas e repetidas
base_casas.drop(columns = ['id', 'date', 'sqft_living15', 'sqft_lot15'], axis = 1, inplace = True)

In [24]:
# criando um conjunto de dados para teste e treino
base_treinamento = base_casas.iloc[0:15129,:]
print(base_treinamento.shape)

base_teste = base_casas.iloc[15129:,:]
print(base_teste.shape)

(15129, 17)
(6484, 17)


In [25]:
# analisando a divisão dos conjuntos
test_len = 15129/base_casas.shape[0]
print('Tamanho do conjunto de teste {}%'.format(test_len*100))

Tamanho do conjunto de teste 69.99953731550455%


In [26]:
# X_train,X_test,y_train,y_test
X_teste = base_teste.iloc[:,1:17].values
y_teste = base_teste.iloc[:, 0].values

In [27]:
# exportando
base_treinamento.to_csv('house_prices_train_xgboost.csv', header = False, index = False)
base_teste.to_csv('house_prices_test_xgboost.csv', header = False, index = False)

# Configuração do SageMaker

In [28]:
import sagemaker
import boto3
from sagemaker import Session

In [29]:
session = sagemaker.Session()
bucket = 'jescursoawssagemaker'
subpasta_modelo = 'modelos/house-prices/xgboost'
subpasta_dataset = 'datasets/house-prices'
key_train = 'houses-train-data-xgboost'
key_test = 'houses-test-data-xgboost'

In [30]:
role = sagemaker.get_execution_role()
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, subpasta_dataset, key_train)
s3_test_data = 's3://{}/{}/test/{}'.format(bucket, subpasta_dataset, key_test)
output_location = 's3://{}/{}/output'.format(bucket, subpasta_modelo)
print('Role: {}'.format(role))
print('Localização da base de treinamento: {}'.format(s3_train_data))
print('Localização da base de teste: {}'.format(s3_test_data))
print('Modelo final será salvo em: {}'.format(output_location))

Role: arn:aws:iam::089538278909:role/service-role/AmazonSageMakerServiceCatalogProductsExecutionRole
Localização da base de treinamento: s3://jescursoawssagemaker/datasets/house-prices/train/houses-train-data-xgboost
Localização da base de teste: s3://jescursoawssagemaker/datasets/house-prices/test/houses-test-data-xgboost
Modelo final será salvo em: s3://jescursoawssagemaker/modelos/house-prices/xgboost/output


In [31]:
# fazendo upload das bases de dados para o S3
import os
with open('house_prices_train_xgboost.csv', 'rb') as f:
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(subpasta_dataset, 'train', key_train)).upload_fileobj(f)

In [32]:
# fazendo upload das bases de dados para o S3
import os
with open('house_prices_test_xgboost.csv', 'rb') as f:
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(subpasta_dataset, 'test', key_test)).upload_fileobj(f)

# Treinamento do XGBoost

In [33]:
from sagemaker import image_uris

In [34]:
container = image_uris.retrieve(framework='xgboost', region=boto3.Session().region_name, version='latest') #versao é especifica para o xgboost

In [35]:
# https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html

xgboost = sagemaker.estimator.Estimator(image_uri=container,
                                        role = role,
                                        instance_count = 1, # 1 instancia para o treinamento
                                        instance_type = 'ml.m5.2xlarge',
                                        output_path = output_location,
                                        sagemaker_session = session
                                       )

In [36]:
# https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost_hyperparameters.html
xgboost.set_hyperparameters(num_round = 100) #numero de rounds

In [37]:
train_input = sagemaker.inputs.TrainingInput(s3_data = s3_train_data, content_type='csv', s3_data_type = 'S3Prefix')
validation_input = sagemaker.inputs.TrainingInput(s3_data = s3_test_data, content_type='csv', s3_data_type = 'S3Prefix')
data_channels = {'train': train_input, 'validation': validation_input}

In [38]:
xgboost.fit(data_channels)

2023-01-28 18:31:44 Starting - Starting the training job...
2023-01-28 18:32:09 Starting - Preparing the instances for trainingProfilerReport-1674930704: InProgress
......
2023-01-28 18:33:11 Downloading - Downloading input data......
2023-01-28 18:34:09 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2023-01-28:18:34:13:INFO] Running standalone xgboost training.[0m
[34m[2023-01-28:18:34:13:INFO] File size need to be processed in the node: 1.55mb. Available memory size in the node: 24019.95mb[0m
[34m[2023-01-28:18:34:13:INFO] Determined delimiter of CSV input is ','[0m
[34m[18:34:13] S3DistributionType set as FullyReplicated[0m
[34m[18:34:13] 15129x16 matrix with 242064 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2023-01-28:18:34:13:INFO] Determined delimiter of CSV input is ','[0m
[34m[18:34:13] S3DistributionType set as FullyReplicated[0m
[34m[18:34:13] 6484x16 matrix with

# Deploy, previsões e avaliação

In [39]:
xgboost_regressor = xgboost.deploy(initial_instance_count = 1, instance_type = 'ml.m4.xlarge')

------!

In [40]:
# tipo de entrada para o endpoint
from sagemaker.serializers import CSVSerializer
xgboost_regressor.serializer = CSVSerializer()

In [41]:
# previsoes = xgboost_regressor.predict(X_teste)
# previsoes

# # notamos que esta em formato binario

In [51]:
y_pred = np.array(xgboost_regressor.predict(X_teste).decode('utf-8').split(',')).astype(np.float32)
y_pred

array([715760.25 , 759194.25 , 125879.664, ..., 298663.34 , 468468.56 ,
       292532.9  ], dtype=float32)

In [54]:
# avaliando as metricas

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error


MAE = mean_absolute_error(y_teste,previsoes).round(4)
R2  = r2_score(y_teste,y_pred).round(4)
EQM = mean_squared_error(y_teste,y_pred).round(4) #mais proximo de zero, melhor
REQM  = np.sqrt(EQM).round(4) #calcula a distancia de um ponto em relação a linha, logo verifica a concentração dos dados proximo a linha ajustada
print('O valor de R² para esse modelo XGboost é {}'.format(R2.round(4)))
print('O valor de MAE para esse modelo XGboost é {}'.format(R2.round(4)))
print('O valor de EQM para esse modelo XGboost é {}'.format(EQM))
print('O valor de REQM para esse modelo XGboost é {}'.format(REQM))




O valor de R² para esse modelo XGboost é 0.8903
O valor de MAE para esse modelo XGboost é 0.8903
O valor de EQM para esse modelo XGboost é 14874845482.5754
O valor de REQM para esse modelo XGboost é 121962.4757


# Tuning

In [44]:
# https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost_hyperparameters.html
tuning_job_config = {
    "ParameterRanges": {
      "CategoricalParameterRanges": [],
      "ContinuousParameterRanges": [
        {
          "MaxValue": "1",
          "MinValue": "0",
          "Name": "eta"
        },
        {
          "MaxValue": "2",
          "MinValue": "0",
          "Name": "alpha"
        },
        {
          "MaxValue": "10",
          "MinValue": "1",
          "Name": "min_child_weight"
        }
      ],
      "IntegerParameterRanges": [
        {
          "MaxValue": "10",
          "MinValue": "1",
          "Name": "max_depth"
        }
      ],
      "IntegerParameterRanges": [
        {
          "MaxValue": "300",
          "MinValue": "50",
          "Name": "num_round"
        }
      ]
    },
    "ResourceLimits": {
      "MaxNumberOfTrainingJobs": 9,
      "MaxParallelTrainingJobs": 3
    },
    "Strategy": "Bayesian",
    "HyperParameterTuningJobObjective": {
      "MetricName": "validation:rmse",
      "Type": "Minimize"
    }
  }

In [45]:
training_job_definition = {
    "AlgorithmSpecification": {
      "TrainingImage": container,
      "TrainingInputMode": "File"
    },
    "InputDataConfig": [
      {
        "ChannelName": "train",
        "CompressionType": "None",
        "ContentType": "csv",
        "DataSource": {
          "S3DataSource": {
            "S3DataDistributionType": "FullyReplicated",
            "S3DataType": "S3Prefix",
            "S3Uri": s3_train_data
          }
        }
      },
      {
        "ChannelName": "validation",
        "CompressionType": "None",
        "ContentType": "csv",
        "DataSource": {
          "S3DataSource": {
            "S3DataDistributionType": "FullyReplicated",
            "S3DataType": "S3Prefix",
            "S3Uri": s3_test_data
          }
        }
      }
    ],
    "OutputDataConfig": {
      "S3OutputPath": "s3://{}/{}/output".format(bucket,subpasta_modelo)
    },
    "ResourceConfig": {
      "InstanceCount": 2,
      "InstanceType": "ml.c4.2xlarge",
      "VolumeSizeInGB": 10
    },
    "RoleArn": role,
    "StaticHyperParameters": {
      "eval_metric": "rmse",
      "objective": "reg:linear",
      "rate_drop": "0.3",
      "tweedie_variance_power": "1.4"
    },
    "StoppingCondition": {
      "MaxRuntimeInSeconds": 43200
    }
}

In [46]:
# aplicando o tuning
smclient = boto3.client('sagemaker')
smclient.create_hyper_parameter_tuning_job(HyperParameterTuningJobName = "xgboosttuninghouses",
                                          HyperParameterTuningJobConfig = tuning_job_config,
                                          TrainingJobDefinition = training_job_definition)

{'HyperParameterTuningJobArn': 'arn:aws:sagemaker:sa-east-1:089538278909:hyper-parameter-tuning-job/xgboosttuninghouses',
 'ResponseMetadata': {'RequestId': '2205c5e5-f714-41e0-9102-02b8660f606a',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '2205c5e5-f714-41e0-9102-02b8660f606a',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '120',
   'date': 'Sat, 28 Jan 2023 18:40:22 GMT'},
  'RetryAttempts': 0}}

# Modelo otimizado

In [47]:
container = image_uris.retrieve(framework='xgboost',region=boto3.Session().region_name,version='latest')
xgboost_tuning = sagemaker.estimator.Estimator(image_uri = container,
                                        role = role, 
                                        instance_count = 1, 
                                        instance_type = 'ml.m5.2xlarge',
                                        output_path = output_location,
                                        sagemaker_session = session)
xgboost_tuning.set_hyperparameters(num_round = 81, eta = 0.427558906693159,
                                   min_child_weight = 1.382887652252386,
                                   alpha = 0.9363684991640178, tweedie_variance_power = 1.4,
                                   rate_drop = 0.3)

In [48]:
xgboost_tuning.fit(data_channels)

2023-01-28 19:03:43 Starting - Starting the training job...
2023-01-28 19:04:07 Starting - Preparing the instances for trainingProfilerReport-1674932623: InProgress
......
2023-01-28 19:05:07 Downloading - Downloading input data.....[34mArguments: train[0m
[34m[2023-01-28:19:05:56:INFO] Running standalone xgboost training.[0m
[34m[2023-01-28:19:05:56:INFO] File size need to be processed in the node: 1.55mb. Available memory size in the node: 24027.32mb[0m
[34m[2023-01-28:19:05:56:INFO] Determined delimiter of CSV input is ','[0m
[34m[19:05:56] S3DistributionType set as FullyReplicated[0m
[34m[19:05:56] 15129x16 matrix with 242064 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2023-01-28:19:05:56:INFO] Determined delimiter of CSV input is ','[0m
[34m[19:05:56] S3DistributionType set as FullyReplicated[0m
[34m[19:05:56] 6484x16 matrix with 103744 entries loaded from /opt/ml/input/data/validation?format=csv&label_column=0&delimi

In [49]:
xgboost_regressor_tuning = xgboost_tuning.deploy(initial_instance_count = 1, instance_type = 'ml.m4.xlarge')

------!

In [50]:
xgboost_regressor_tuning.serializer = CSVSerializer()
previsoes = np.array(xgboost_regressor_tuning.predict(X_teste).decode('utf-8').split(',')).astype(np.float32)

In [None]:
# avaliando as metricas

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

MAE = mean_absolute_errory_teste,previsoes).round(4)
R2  = r2_score(y_teste,previsoes).round(4)
EQM = mean_squared_error(y_teste,previsoes).round(4) #mais proximo de zero, melhor
REQM  = np.sqrt(EQM).round(4) #calcula a distancia de um ponto em relação a linha, logo verifica a concentração dos dados proximo a linha ajustada
print('O valor de R² para esse modelo XGBoost Otimizado é {}'.format(R2.round(4)))
print('O valor de MAE para esse modelo XGboost Otimizado é {}'.format(R2.round(4)))
print('O valor de EQM para esse modelo XGBoost Otimizado é {}'.format(EQM))
print('O valor de REQM para esse modelo XGBoost Otimizado é {}'.format(REQM))

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
mae = mean_absolute_error(y_teste, previsoes)
mse = mean_squared_error(y_teste, previsoes)
print('MAE = ', mae, '\nMSE = ', mse)