# DeepAR Model Training

In [13]:
import boto3
import sagemaker
import csv
import random
import json
import pandas as pd
import numpy as np

sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()
region = boto3.Session().region_name

sm = boto3.Session().client(service_name="sagemaker", region_name=region)

image_name = sagemaker.image_uris.retrieve("forecasting-deepar", region)

## Get train/test dataframes from S3 

In [22]:
!aws s3 cp --recursive s3://aurelia-resort-data/model_train/data_csv ./data

download: s3://aurelia-resort-data/model_train/data_csv/train_nans.csv to data/train_nans.csv
download: s3://aurelia-resort-data/model_train/data_csv/train.csv to data/train.csv
download: s3://aurelia-resort-data/model_train/data_csv/test_nans.csv to data/test_nans.csv
download: s3://aurelia-resort-data/model_train/data_csv/test.csv to data/test.csv


In [23]:
train = pd.read_csv("data/train_nans.csv")
test = pd.read_csv("data/test_nans.csv")

In [27]:
train['Date'] = pd.to_datetime(train['Date'])
train.tail(5)

Unnamed: 0,Date,num_stays,TotalAirlineTripstoDC,TotalAirlinePassengerstoDC,TotalAmericanTravelers,PercentofAmericanswhoTraveled,TotalTripsbyAmericans,income_total
152,2022-02-01,4413,830.0,2139952.0,260109800.0,82.156684,1210049000.0,15125.6
153,2022-03-01,7754,896.0,3059442.0,262116700.0,81.897555,1218480000.0,15064.1
154,2022-04-01,9208,858.0,3272280.0,261462800.0,81.906942,1279973000.0,15055.2
155,2022-05-01,10146,902.0,3415990.0,261054000.0,82.059921,1305916000.0,15036.4
156,2022-06-01,8271,938.0,3298350.0,260186900.0,81.819188,1236950000.0,14973.1


In [28]:
test['Date'] = pd.to_datetime(test['Date'])
test.head(5)

Unnamed: 0,Date,num_stays,TotalAirlineTripstoDC,TotalAirlinePassengerstoDC,TotalAmericanTravelers,PercentofAmericanswhoTraveled,TotalTripsbyAmericans,income_total
0,2022-07-01,8002,,,,,,15100.2
1,2022-08-01,7866,,,,,,15149.6
2,2022-09-01,8091,,,,,,15172.2
3,2022-10-01,9588,,,,,,15274.2
4,2022-11-01,6964,,,,,,15332.9


In [29]:
# set random seeds for reproducibility
np.random.seed(100)
random.seed(100)

## Create JSON Object for DeepAR Training Job

In [47]:
#  Create JSON object by grouping train set by Date
train_json = []
for Date, group in train.groupby('Date'):
    item = {
        'start': Date.isoformat(),
        'target': group[['TotalAirlineTripstoDC', 'TotalAirlinePassengerstoDC', 'TotalAmericanTravelers', 'PercentofAmericanswhoTraveled', 'TotalTripsbyAmericans', 'num_stays', 'income_total']].values.tolist()
        #'dynamic_feat': group.drop(['Date', 'TotalAirlineTripstoDC', 'TotalAirlinePassengerstoDC', 'TotalAmericanTravelers', 'PercentofAmericanswhoTraveled', 'TotalTripsbyAmericans', 'num_stays', 'income_total'], axis=1).values.tolist()
    }
    train_json.append(item)
    
    
# Create JSON object by grouping test set by Date
test_json = []
for Date, group in test.groupby('Date'):
    item = {
        'start': Date.isoformat(),
        'target': group[['TotalAirlineTripstoDC', 'TotalAirlinePassengerstoDC', 'TotalAmericanTravelers', 'PercentofAmericanswhoTraveled', 'TotalTripsbyAmericans', 'num_stays', 'income_total']].values.tolist()
        #'dynamic_feat': group.drop(['Date', 'TotalAirlineTripstoDC', 'TotalAirlinePassengerstoDC', 'TotalAmericanTravelers', 'PercentofAmericanswhoTraveled', 'TotalTripsbyAmericans', 'num_stays', 'income_total'], axis=1).values.tolist()
    }
    test_json.append(item)

In [48]:
print(len(train_json))
print(len(test_json))

157
6


In [49]:
train_json[1]

{'start': '2009-06-01T00:00:00',
 'target': [[nan, nan, nan, nan, nan, 2.0, nan]]}

In [50]:
def write_dicts_to_file(path, data):
    with open(path, "wb") as fp:
        for d in data:
            fp.write(json.dumps(d).encode("utf-8"))
            fp.write("\n".encode("utf-8"))

In [51]:
%%time
write_dicts_to_file("train.json", train_json)
write_dicts_to_file("test.json", test_json)

CPU times: user 3.4 ms, sys: 206 Âµs, total: 3.61 ms
Wall time: 43.4 ms


## Train Model with DeepAR

In [52]:
s3_data_path = "s3://aurelia-resort-data/model_train/deepAR"

In [60]:
estimator = sagemaker.estimator.Estimator(
    image_uri=image_name,
    sagemaker_session=sagemaker_session,
    role=role,
    instance_count=1,
    instance_type="ml.c4.xlarge",
    base_job_name="deepar-hotel",
    output_path="s3://aurelia-resort-data/model_train/deepAR/output/",
    hyperparameters={
        "time_freq": "M",
        "prediction_length": "2",
        "context_length": "3",
        "num_cells": "40",
        "num_layers": "2",
        "likelihood": "student-t",
        "epochs": "200"
    }
)

In [61]:
%%time
data_channels = {"train": "{}/train".format(s3_data_path), "test": "{}/test".format(s3_data_path)}

estimator.fit(inputs=data_channels, wait=True)

INFO:sagemaker:Creating training-job with name: deepar-hotel-2023-04-03-23-32-55-031


2023-04-03 23:33:00 Starting - Starting the training job...
2023-04-03 23:33:26 Starting - Preparing the instances for training......
2023-04-03 23:34:18 Downloading - Downloading input data...
2023-04-03 23:34:44 Training - Downloading the training image...
2023-04-03 23:35:34 Training - Training image download completed. Training in progress...[34mArguments: train[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
  from collections import Mapping, MutableMapping, Sequence[0m
[34m[04/03/2023 23:35:43 INFO 140244386916160] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'_kvstore': 'auto', '_num_gpus': 'auto', '_num_kv_servers': 'auto', '_tuning_objective_metric': '', 'cardinality': 'auto', 'dropout_rate': '0.10', 'early_stopping_patience': '', 'embedding_dimension': '10', 'learning_rate': '0.001', 'likelihood': 'student-t', 'mini_batch_size': '128', 'num_cells': '40', 'num_dynamic_feat': 'auto', 'num_eval_sam

## Creating Endpoint to make predictions on newly fitted model

In [65]:
job_name = estimator.latest_training_job.name

endpoint_name = sagemaker_session.endpoint_from_job(
    job_name=job_name,
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    image_uri=image_name,
    role=role,
)

INFO:sagemaker:Creating model with name: deepar-hotel-2023-04-03-23-32-55-031
INFO:sagemaker:Creating endpoint-config with name deepar-hotel-2023-04-03-23-32-55-031
INFO:sagemaker:Creating endpoint with name deepar-hotel-2023-04-03-23-32-55-031


-------!

In [66]:
class DeepARPredictor(sagemaker.predictor.RealTimePredictor):
    def set_prediction_parameters(self, freq, prediction_length):
        """Set the time frequency and prediction length parameters. This method **must** be called
        before being able to use `predict`.

        Parameters:
        freq -- string indicating the time frequency
        prediction_length -- integer, number of predicted time points

        Return value: none.
        """
        self.freq = freq
        self.prediction_length = prediction_length

    def predict(
        self,
        ts,
        cat=None,
        encoding="utf-8",
        num_samples=100,
        quantiles=["0.1", "0.5", "0.9"],
        content_type="application/json",
    ):
        """Requests the prediction of for the time series listed in `ts`, each with the (optional)
        corresponding category listed in `cat`.

        Parameters:
        ts -- list of `pandas.Series` objects, the time series to predict
        cat -- list of integers (default: None)
        encoding -- string, encoding to use for the request (default: "utf-8")
        num_samples -- integer, number of samples to compute at prediction time (default: 100)
        quantiles -- list of strings specifying the quantiles to compute (default: ["0.1", "0.5", "0.9"])

        Return value: list of `pandas.DataFrame` objects, each containing the predictions
        """
        prediction_times = [x.index[-1] + pd.Timedelta(1, unit=self.freq) for x in ts]
        req = self.__encode_request(ts, cat, encoding, num_samples, quantiles)
        res = super(DeepARPredictor, self).predict(req, initial_args={"ContentType": content_type})
        return self.__decode_response(res, prediction_times, encoding)

    def __encode_request(self, ts, cat, encoding, num_samples, quantiles):
        instances = [series_to_obj(ts[k], cat[k] if cat else None) for k in range(len(ts))]
        configuration = {
            "num_samples": num_samples,
            "output_types": ["quantiles"],
            "quantiles": quantiles,
        }
        http_request_data = {"instances": instances, "configuration": configuration}
        return json.dumps(http_request_data).encode(encoding)

    def __decode_response(self, response, prediction_times, encoding):
        response_data = json.loads(response.decode(encoding))
        list_of_df = []
        for k in range(len(prediction_times)):
            prediction_index = pd.date_range(
                start=prediction_times[k], freq=self.freq, periods=self.prediction_length
            )
            list_of_df.append(
                pd.DataFrame(
                    data=response_data["predictions"][k]["quantiles"], index=prediction_index
                )
            )
        return list_of_df

In [67]:
predictor = DeepARPredictor(endpoint_name=endpoint_name, sagemaker_session=sagemaker_session)
predictor.set_prediction_parameters('M', 3)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [72]:
list_of_df = predictor.predict(train_json[:5], content_type="application/json")
actual_data = train[:5]

AttributeError: 'dict' object has no attribute 'index'

In [None]:
for k in range(len(list_of_df)):
    plt.figure(figsize=(12, 6))
    actual_data[k][-prediction_length - context_length :].plot(label="target")
    p10 = list_of_df[k]["0.1"]
    p90 = list_of_df[k]["0.9"]
    plt.fill_between(p10.index, p10, p90, color="y", alpha=0.5, label="80% confidence interval")
    list_of_df[k]["0.5"].plot(label="prediction median")
    plt.legend()
    plt.show()

## Delete Endpoint

In [73]:
sagemaker_session.delete_endpoint(endpoint_name)

INFO:sagemaker:Deleting endpoint with name: deepar-hotel-2023-04-03-23-32-55-031


## Release Resources

In [74]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [None]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}