### XGBOOST Forecasting

In [236]:
xgb_image_uri = sagemaker.image_uris.retrieve("xgboost", region="us-east-1", version="latest")

print(xgb_image_uri)

811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest


In [237]:
import boto3
import sagemaker
import csv
import pandas as pd
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.amazon.amazon_estimator import get_image_uri

# xgb_estimator = sagemaker.estimator.Estimator(
#     image_uri='811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
#     role=sagemaker.get_execution_role(),
#     instance_count=1,
#     instance_type='ml.m5.large',
#     output_path=f's3://aurelia-resort-data/model_train/XGboost Forecast//output',
#     sagemaker_session=sagemaker_session,
#     framework_version='1.3-1'
# )

sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()
region = boto3.Session().region_name

sm = boto3.Session().client(service_name="sagemaker", region_name=region)

In [238]:
container = get_image_uri(region_name=region, 
                          repo_name='xgboost', 
                          repo_version='latest')

xgb_estimator = sagemaker.estimator.Estimator(container,
                                              role=sagemaker.get_execution_role(),
                                              instance_count=1,
                                              instance_type='ml.m4.xlarge',
                                              output_path=f's3://aurelia-resort-data/model_train/XGboost Forecast//output',
                                              sagemaker_session=sagemaker_session,)

xgb_estimator.set_hyperparameters(max_depth=5,
                                  eta=0.2,
                                  gamma=4,
                                  min_child_weight=6,
                                  subsample=0.8,
                                  objective='reg:squarederror',
                                  num_round=100)

# xgb_estimator.fit(inputs={'train': train_data})

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


## Get train/test dataframes from S3 

In [239]:
!aws s3 cp --recursive s3://aurelia-resort-data/model_train/data_csv/ ./data

download: s3://aurelia-resort-data/model_train/data_csv/train.csv to data/train.csv
download: s3://aurelia-resort-data/model_train/data_csv/test_nans.csv to data/test_nans.csv
download: s3://aurelia-resort-data/model_train/data_csv/train_nans.csv to data/train_nans.csv
download: s3://aurelia-resort-data/model_train/data_csv/test.csv to data/test.csv


In [240]:
train = pd.read_csv('data/train.csv',  header=None)
test = pd.read_csv('data/test.csv', header=None)

In [241]:
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,Date,TotalAirlineTripstoDC,TotalAirlinePassengerstoDC,TotalAmericanTravelers,PercentofAmericanswhoTraveled,TotalTripsbyAmericans,num_stays,income_total
1,2019-01-01,435,1326271.0,265224073.85258028,81.44242073648299,1137041852.3226352,2899,14791.2
2,2019-02-01,385,1261292.0,266240835.73305443,81.77432628778035,1148666757.7153966,2639,14835.3
3,2019-03-01,422,1668002.0,265091945.14415318,81.79908504520232,1306564567.8397522,5226,14843.9
4,2019-04-01,427,1681832.0,264062598.41495603,81.35434739932987,1446882218.0239325,5608,14811.8


In [242]:
test.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,Date,TotalAirlineTripstoDC,TotalAirlinePassengerstoDC,TotalAmericanTravelers,PercentofAmericanswhoTraveled,TotalTripsbyAmericans,num_stays,income_total
1,2022-01-01,904,1893198.0,259584735.82010934,82.00471558566778,1243775232.8225882,4356,15137.7
2,2022-02-01,830,2139952.0,260109791.61199838,82.15668408700547,1210049335.9942613,4413,15125.6
3,2022-03-01,896,3059442.0,262116683.06916347,81.89755485680998,1218479734.1890206,7754,15064.1
4,2022-04-01,858,3272280.0,261462761.25324675,81.90694228417173,1279973330.1118326,9208,15055.2


In [243]:
train= train.drop(0)
train.head()



Unnamed: 0,0,1,2,3,4,5,6,7
1,2019-01-01,435,1326271.0,265224073.85258028,81.44242073648299,1137041852.3226352,2899,14791.2
2,2019-02-01,385,1261292.0,266240835.73305443,81.77432628778035,1148666757.7153966,2639,14835.3
3,2019-03-01,422,1668002.0,265091945.1441532,81.79908504520232,1306564567.8397522,5226,14843.9
4,2019-04-01,427,1681832.0,264062598.41495603,81.35434739932987,1446882218.0239325,5608,14811.8
5,2019-05-01,439,1766148.0,266133995.63478008,81.884669739372,1415056833.4654295,6110,14814.7


In [244]:
test=test.drop(0)
test.head()

Unnamed: 0,0,1,2,3,4,5,6,7
1,2022-01-01,904,1893198.0,259584735.8201093,82.00471558566778,1243775232.8225882,4356,15137.7
2,2022-02-01,830,2139952.0,260109791.6119984,82.15668408700547,1210049335.9942613,4413,15125.6
3,2022-03-01,896,3059442.0,262116683.06916347,81.89755485680998,1218479734.1890206,7754,15064.1
4,2022-04-01,858,3272280.0,261462761.25324675,81.90694228417173,1279973330.1118326,9208,15055.2
5,2022-05-01,902,3415990.0,261053979.76624453,82.05992072229897,1305916298.96838,10146,15036.4


In [245]:
train.to_csv('train_new.csv', index=False)

train_new = 'train_new.csv'

In [246]:
test.to_csv('test_new.csv', index=False)

test_new ='test_new.csv'

In [247]:
bucket_name = 'aurelia-resort-data'
s3_prefix = 'XGboost Forecast'

train_data_location = sagemaker_session.upload_data('train_new.csv', bucket=bucket_name, key_prefix=s3_prefix+'/train')
validation_data_location = sagemaker_session.upload_data('test_new.csv', bucket=bucket_name, key_prefix=s3_prefix+'/test')



In [248]:
# hyperparameters = {
#     'max_depth': '5',
#     'eta': '0.1',
#     'gamma': '1',
#     'min_child_weight': '1',
#     'subsample': '0.8',
#     'objective': 'reg:squarederror',
#     'num_round': '50'
# }

In [249]:
# xgb_estimator.set_hyperparameters(
#     objective='reg:squarederror',
#     max_depth=5,
#     eta=0.2,
#     gamma=4,
#     min_child_weight=6,
#     subsample=0.8,
#     silent=0,
#     num_round=100
# )



In [250]:
# train_input = sagemaker.inputs.TrainingInput(
#     f's3://aurelia-resort-data/model_train/data_csv/train.csv', 
#     content_type='csv'
# )

In [251]:
# train_input = sagemaker.inputs.TrainingInput(
#     f's3://aurelia-resort-data/XGboost Forecast/train/train_new.csv', 
#     content_type='csv'
# )

container = get_image_uri(region_name=region, 
                          repo_name='xgboost', 
                          repo_version='latest')

xgb_estimator = sagemaker.estimator.Estimator(container,
                                              role=sagemaker.get_execution_role(),
                                              instance_count=1,
                                              instance_type='ml.m4.xlarge',
                                              output_path=f's3://aurelia-resort-data/model_train/XGboost Forecast//output',
                                              sagemaker_session=sagemaker_session,)

xgb_estimator.set_hyperparameters(max_depth=5,
                                  eta=0.2,
                                  gamma=4,
                                  min_child_weight=6,
                                  subsample=0.8,
                                  objective='reg:linear',
                                  num_round=100)

train_input = sagemaker.inputs.TrainingInput(
    f's3://aurelia-resort-data/XGboost Forecast/train/train_new.csv', 
    content_type='csv'
)


test_input = sagemaker.inputs.TrainingInput(
    f's3://aurelia-resort-data/XGboost Forecast/test/test_new.csv', 
    content_type='csv'
)


The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [252]:
xgb_estimator.fit(inputs={'train': train_input, 'validation': test_input})

# xgb_estimator.fit({'train': train_input})



2023-04-03 23:25:52 Starting - Starting the training job...
2023-04-03 23:26:17 Starting - Preparing the instances for trainingProfilerReport-1680564352: InProgress
.........
2023-04-03 23:27:37 Downloading - Downloading input data...
2023-04-03 23:28:17 Training - Downloading the training image...
2023-04-03 23:28:48 Training - Training image download completed. Training in progress...[34mArguments: train[0m
[34m[2023-04-03:23:29:02:INFO] Running standalone xgboost training.[0m
[34m[2023-04-03:23:29:02:INFO] File size need to be processed in the node: 0.0mb. Available memory size in the node: 8606.14mb[0m
[34m[2023-04-03:23:29:02:INFO] Determined delimiter of CSV input is ','[0m
[34m[23:29:02] S3DistributionType set as FullyReplicated[0m
[34m[23:29:02] 25x7 matrix with 175 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2023-04-03:23:29:02:INFO] Determined delimiter of CSV input is ','[0m
[34m[23:29:02] S3DistributionType set a