# DeepAR Model Training

In [5]:
import boto3
import sagemaker
import csv
import random
import json
import pandas as pd
import numpy as np

sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()
region = boto3.Session().region_name

sm = boto3.Session().client(service_name="sagemaker", region_name=region)

image_name = sagemaker.image_uris.retrieve("forecasting-deepar", region)

## Get train/test dataframes from S3 

In [3]:
!aws s3 cp --recursive s3://aurelia-resort-data/model_train/data_csv ./data

download: s3://aurelia-resort-data/model_train/data_csv/test.csv to data/test.csv
download: s3://aurelia-resort-data/model_train/data_csv/train.csv to data/train.csv
download: s3://aurelia-resort-data/model_train/data_csv/test_nans.csv to data/test_nans.csv
download: s3://aurelia-resort-data/model_train/data_csv/train_nans.csv to data/train_nans.csv


In [6]:
train = pd.read_csv("data/train_nans.csv")
test = pd.read_csv("data/test_nans.csv")

In [7]:
train['Date'] = pd.to_datetime(train['Date'])
train.tail(5)

          Date  num_stays  TotalAirlineTripstoDC  TotalAirlinePassengerstoDC  \
152 2022-02-01       4413                  415.0                   1069976.0   
153 2022-03-01       7754                  448.0                   1529721.0   
154 2022-04-01       9208                  429.0                   1636140.0   
155 2022-05-01      10146                  451.0                   1707995.0   
156 2022-06-01       8271                  469.0                   1649175.0   

     TotalAmericanTravelers  PercentofAmericanswhoTraveled  \
152            7.730094e+08                      82.098828   
153            7.760894e+08                      81.843971   
154            7.760883e+08                      81.854948   
155            7.778265e+08                      82.010600   
156            7.739908e+08                      81.766838   

     TotalTripsbyAmericans  income_total  
152           3.757569e+09       15125.6  
153           3.843540e+09       15064.1  
154           3.9

In [8]:
test['Date'] = pd.to_datetime(test['Date'])
test.head(5)

        Date  num_stays  TotalAirlineTripstoDC  TotalAirlinePassengerstoDC  \
0 2022-07-01       8002                    NaN                         NaN   
1 2022-08-01       7866                    NaN                         NaN   
2 2022-09-01       8091                    NaN                         NaN   
3 2022-10-01       9588                    NaN                         NaN   
4 2022-11-01       6964                    NaN                         NaN   

   TotalAmericanTravelers  PercentofAmericanswhoTraveled  \
0                     NaN                            NaN   
1                     NaN                            NaN   
2                     NaN                            NaN   
3                     NaN                            NaN   
4                     NaN                            NaN   

   TotalTripsbyAmericans  income_total  
0                    NaN       15100.2  
1                    NaN       15149.6  
2                    NaN       15172.2  
3     

In [9]:
# set random seeds for reproducibility
np.random.seed(100)
random.seed(100)

## Create JSON Object for DeepAR Training Job

In [10]:
#  Create JSON object by grouping train set by Date
train_json = []
for Date, group in train.groupby('Date'):
    item = {
        'start': Date.isoformat(),
        'target': group[['TotalAirlineTripstoDC', 'TotalAirlinePassengerstoDC', 'TotalAmericanTravelers', 'PercentofAmericanswhoTraveled', 'TotalTripsbyAmericans', 'num_stays', 'income_total']].values.tolist()
        #'dynamic_feat': group.drop(['Date', 'TotalAirlineTripstoDC', 'TotalAirlinePassengerstoDC', 'TotalAmericanTravelers', 'PercentofAmericanswhoTraveled', 'TotalTripsbyAmericans', 'num_stays', 'income_total'], axis=1).values.tolist()
    }
    train_json.append(item)
    
    
# Create JSON object by grouping test set by Date
test_json = []
for Date, group in test.groupby('Date'):
    item = {
        'start': Date.isoformat(),
        'target': group[['TotalAirlineTripstoDC', 'TotalAirlinePassengerstoDC', 'TotalAmericanTravelers', 'PercentofAmericanswhoTraveled', 'TotalTripsbyAmericans', 'num_stays', 'income_total']].values.tolist()
        #'dynamic_feat': group.drop(['Date', 'TotalAirlineTripstoDC', 'TotalAirlinePassengerstoDC', 'TotalAmericanTravelers', 'PercentofAmericanswhoTraveled', 'TotalTripsbyAmericans', 'num_stays', 'income_total'], axis=1).values.tolist()
    }
    test_json.append(item)

In [11]:
print(len(train_json))
print(len(test_json))

157
6


In [12]:
train_json[1]

{'start': '2009-06-01T00:00:00',
 'target': [[nan, nan, nan, nan, nan, 2.0, nan]]}

In [11]:
def write_dicts_to_file(path, data):
    with open(path, "wb") as fp:
        for d in data:
            fp.write(json.dumps(d).encode("utf-8"))
            fp.write("\n".encode("utf-8"))

In [12]:
%%time
write_dicts_to_file("train.json", train_json)
write_dicts_to_file("test.json", test_json)

CPU times: user 3.2 ms, sys: 0 ns, total: 3.2 ms
Wall time: 39.2 ms


## Train Model with DeepAR

In [17]:
s3_data_path = "s3://aurelia-resort-data/model_train/deepAR"

In [18]:
estimator = sagemaker.estimator.Estimator(
    image_uri=image_name,
    sagemaker_session=sagemaker_session,
    role=role,
    instance_count=1,
    instance_type="ml.c4.xlarge",
    base_job_name="deepar-hotel",
    output_path="s3://aurelia-resort-data/model_train/deepAR/output/",
    hyperparameters={
        "time_freq": "M",
        "prediction_length": "5",
        "context_length": "2",
        "num_cells": "40",
        "num_layers": "2",
        "likelihood": "student-t",
        "epochs": "100"
    }
)

In [19]:
%%time
data_channels = {"train": "{}/train".format(s3_data_path), "test": "{}/test".format(s3_data_path)}

estimator.fit(inputs=data_channels, wait=True)

2023-04-09 19:07:47 Starting - Starting the training job...
2023-04-09 19:08:14 Starting - Preparing the instances for training......
2023-04-09 19:09:18 Downloading - Downloading input data...
2023-04-09 19:09:38 Training - Downloading the training image.........
2023-04-09 19:11:08 Training - Training image download completed. Training in progress.[34mArguments: train[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
  from collections import Mapping, MutableMapping, Sequence[0m
[34m[04/09/2023 19:11:18 INFO 139724334417728] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'_kvstore': 'auto', '_num_gpus': 'auto', '_num_kv_servers': 'auto', '_tuning_objective_metric': '', 'cardinality': 'auto', 'dropout_rate': '0.10', 'early_stopping_patience': '', 'embedding_dimension': '10', 'learning_rate': '0.001', 'likelihood': 'student-t', 'mini_batch_size': '128', 'num_cells': '40', 'num_dynamic_feat': 'auto', 'num_eval

## Creating Endpoint to make predictions on newly fitted model

In [20]:
job_name = estimator.latest_training_job.name

endpoint_name = sagemaker_session.endpoint_from_job(
    job_name=job_name,
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    image_uri=image_name,
    role=role,
)

--------!

In [61]:
runtime_client = boto3.client('sagemaker-runtime')

response = runtime_client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType='application/json',
    Body=b'{"instances": [{"start": "2021-10-01T00:00:00", "target": [487.0, 1354033.0, 768745602.4769465, 81.54286336132316, 3866649606.462566, 6618.0, 15584.9]}, {"start": "2021-11-01T00:00:00", "target": [472.0, 1444907.0, 772187761.0169934, 81.41902126355457, 4027304584.965656, 5586.0, 15543.5]},{"start": "2021-12-01T00:00:00", "target": [485.0, 1395610.0, 773618808.1795698, 82.06188448788464, 3984239441.91169, 5147.0, 15483.6]}]}'
)

In [62]:
result = json.loads(response['Body'].read().decode())

predicted_values = result['predictions']
print('Predicted values:', predicted_values)

Predicted values: [{'mean': [-3852.9028320312, -4765.8583984375, -11023.3310546875, -3289.630859375, 1570.6770019531]}, {'mean': [-3667.1015625, -4535.9848632812, -10483.7138671875, -3124.5859375, 1494.2780761719]}, {'mean': [-3580.4362792969, -4428.7143554688, -10241.986328125, -3052.8334960938, 1459.1770019531]}]


## Delete Endpoint

In [63]:
sagemaker_session.delete_endpoint(endpoint_name)

## Release Resources

In [64]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [None]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}