### XGBOOST Forecasting

In [25]:
import boto3
import sagemaker
import csv
import pandas as pd
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.amazon.amazon_estimator import get_image_uri

# xgb_estimator = sagemaker.estimator.Estimator(
#     image_uri='811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
#     role=sagemaker.get_execution_role(),
#     instance_count=1,
#     instance_type='ml.m5.large',
#     output_path=f's3://aurelia-resort-data/model_train/XGboost Forecast//output',
#     sagemaker_session=sagemaker_session,
#     framework_version='1.3-1'
# )

sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()
region = boto3.Session().region_name

sm = boto3.Session().client(service_name="sagemaker", region_name=region)

In [26]:
xgb_image_uri = sagemaker.image_uris.retrieve("xgboost", region="us-east-1", version="latest")

print(xgb_image_uri)

811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest


In [27]:
container = get_image_uri(region_name=region, 
                          repo_name='xgboost', 
                          repo_version='latest')

xgb_estimator = sagemaker.estimator.Estimator(container,
                                              role=sagemaker.get_execution_role(),
                                              instance_count=1,
                                              instance_type='ml.m4.xlarge',
                                              output_path=f's3://aurelia-resort-data/model_train/XGboost Forecast//output',
                                              sagemaker_session=sagemaker_session,)

xgb_estimator.set_hyperparameters(max_depth=5,
                                  eta=0.2,
                                  gamma=4,
                                  min_child_weight=6,
                                  subsample=0.8,
                                  objective='reg:squarederror',
                                  num_round=100)

# xgb_estimator.fit(inputs={'train': train_data})

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


## Get train/test dataframes from S3 

In [28]:
!aws s3 cp --recursive s3://aurelia-resort-data/model_train/data_csv/ ./data

download: s3://aurelia-resort-data/model_train/data_csv/train.csv to data/train.csv
download: s3://aurelia-resort-data/model_train/data_csv/test_nans.csv to data/test_nans.csv
download: s3://aurelia-resort-data/model_train/data_csv/test.csv to data/test.csv
download: s3://aurelia-resort-data/model_train/data_csv/train_nans.csv to data/train_nans.csv


In [29]:
train = pd.read_csv('data/train.csv',  header=None)
test = pd.read_csv('data/test.csv', header=None)

In [30]:
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,Date,TotalAirlineTripstoDC,TotalAirlinePassengerstoDC,TotalAmericanTravelers,PercentofAmericanswhoTraveled,TotalTripsbyAmericans,num_stays,income_total
1,2019-01-01,435,1326271.0,791536531.8525803,81.42464086737007,3411167477.161345,2899,14791.2
2,2019-02-01,385,1261292.0,794586299.0187687,81.75514511420499,3446039943.7153964,2639,14835.3
3,2019-03-01,422,1668002.0,791140769.53125,81.7730402750601,3919756293.4526553,5226,14843.9
4,2019-04-01,427,1681832.0,788060775.614956,81.32895739711851,4340727403.623933,5608,14811.8


In [31]:
test.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,Date,TotalAirlineTripstoDC,TotalAirlinePassengerstoDC,TotalAmericanTravelers,PercentofAmericanswhoTraveled,TotalTripsbyAmericans,num_stays,income_total
1,2022-01-01,452,946599.0,772011110.2072061,81.95210153477797,3852407610.435491,4356,15137.7
2,2022-02-01,415,1069976.0,773009379.8977127,82.09882819173558,3757569179.779976,4413,15125.6
3,2022-03-01,448,1529721.0,776089362.1014216,81.84397060004146,3843540127.543859,7754,15064.1
4,2022-04-01,429,1636140.0,776088311.58658,81.8549480812831,3993424870.911833,9208,15055.2


In [32]:
train= train.drop(0)
train.head()



Unnamed: 0,0,1,2,3,4,5,6,7
1,2019-01-01,435,1326271.0,791536531.8525803,81.42464086737007,3411167477.161345,2899,14791.2
2,2019-02-01,385,1261292.0,794586299.0187687,81.75514511420499,3446039943.7153964,2639,14835.3
3,2019-03-01,422,1668002.0,791140769.53125,81.7730402750601,3919756293.4526553,5226,14843.9
4,2019-04-01,427,1681832.0,788060775.614956,81.32895739711851,4340727403.623933,5608,14811.8
5,2019-05-01,439,1766148.0,794267531.6992962,81.86138849687315,4245220637.013816,6110,14814.7


In [33]:
test=test.drop(0)
test.head()

Unnamed: 0,0,1,2,3,4,5,6,7
1,2022-01-01,452,946599.0,772011110.2072061,81.95210153477797,3852407610.435491,4356,15137.7
2,2022-02-01,415,1069976.0,773009379.8977127,82.09882819173558,3757569179.779976,4413,15125.6
3,2022-03-01,448,1529721.0,776089362.1014216,81.84397060004146,3843540127.543859,7754,15064.1
4,2022-04-01,429,1636140.0,776088311.58658,81.8549480812831,3993424870.911833,9208,15055.2
5,2022-05-01,451,1707995.0,777826533.8307607,82.01059959761147,4021791218.8393474,10146,15036.4


In [34]:
train.to_csv('train_new.csv', index=False)

train_new = 'train_new.csv'

In [35]:
test.to_csv('test_new.csv', index=False)

test_new ='test_new.csv'

In [36]:
bucket_name = 'aurelia-resort-data'
s3_prefix = 'XGboost Forecast'

train_data_location = sagemaker_session.upload_data('train_new.csv', bucket=bucket_name, key_prefix=s3_prefix+'/train')
validation_data_location = sagemaker_session.upload_data('test_new.csv', bucket=bucket_name, key_prefix=s3_prefix+'/test')



In [37]:
# hyperparameters = {
#     'max_depth': '5',
#     'eta': '0.1',
#     'gamma': '1',
#     'min_child_weight': '1',
#     'subsample': '0.8',
#     'objective': 'reg:squarederror',
#     'num_round': '50'
# }

In [38]:
# xgb_estimator.set_hyperparameters(
#     objective='reg:squarederror',
#     max_depth=5,
#     eta=0.2,
#     gamma=4,
#     min_child_weight=6,
#     subsample=0.8,
#     silent=0,
#     num_round=100
# )



In [39]:
# train_input = sagemaker.inputs.TrainingInput(
#     f's3://aurelia-resort-data/model_train/data_csv/train.csv', 
#     content_type='csv'
# )

In [40]:
# train_input = sagemaker.inputs.TrainingInput(
#     f's3://aurelia-resort-data/XGboost Forecast/train/train_new.csv', 
#     content_type='csv'
# )

container = get_image_uri(region_name=region, 
                          repo_name='xgboost', 
                          repo_version='latest')

xgb_estimator = sagemaker.estimator.Estimator(container,
                                              role=sagemaker.get_execution_role(),
                                              instance_count=1,
                                              instance_type='ml.m4.xlarge',
                                              output_path=f's3://aurelia-resort-data/model_train/XGboost Forecast//output',
                                              sagemaker_session=sagemaker_session,)

xgb_estimator.set_hyperparameters(max_depth=5,
                                  eta=0.2,
                                  gamma=4,
                                  min_child_weight=6,
                                  subsample=0.8,
                                  objective='reg:linear',
                                  num_round=100)

train_input = sagemaker.inputs.TrainingInput(
    f's3://aurelia-resort-data/XGboost Forecast/train/train_new.csv', 
    content_type='csv'
)


test_input = sagemaker.inputs.TrainingInput(
    f's3://aurelia-resort-data/XGboost Forecast/test/test_new.csv', 
    content_type='csv'
)


The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [41]:
xgb_estimator.fit(inputs={'train': train_input, 'validation': test_input})

# xgb_estimator.fit({'train': train_input})



2023-04-10 06:07:36 Starting - Starting the training job...
2023-04-10 06:08:02 Starting - Preparing the instances for trainingProfilerReport-1681106856: InProgress
.........
2023-04-10 06:09:29 Downloading - Downloading input data...
2023-04-10 06:10:01 Training - Downloading the training image...
2023-04-10 06:10:30 Training - Training image download completed. Training in progress...[34mArguments: train[0m
[34m[2023-04-10:06:10:45:INFO] Running standalone xgboost training.[0m
[34m[2023-04-10:06:10:45:INFO] File size need to be processed in the node: 0.0mb. Available memory size in the node: 8607.06mb[0m
[34m[2023-04-10:06:10:45:INFO] Determined delimiter of CSV input is ','[0m
[34m[06:10:45] S3DistributionType set as FullyReplicated[0m
[34m[06:10:45] 37x7 matrix with 259 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2023-04-10:06:10:45:INFO] Determined delimiter of CSV input is ','[0m
[34m[06:10:45] S3DistributionType set a