# Step 0: Initial setup

**NOTE:** 
* The following tasks must be performed each time the notebook instance is started.
* They are **NOT** required when only the kernel is restarted.

**NOTE:** Installing the dependencies may take a few minutes.

In [None]:
import time

# Start a wall-clock timer so the duration of the installation can be reported.
start = time.perf_counter()

# IPython magic that runs pip to install the pyod and jupyter_bokeh packages.
%pip install pyod jupyter_bokeh

end = time.perf_counter()
# Print the elapsed installation time in seconds.
print(f"Time: {end - start:0.4f} seconds")

# Step 1: Basic setup

**NOTE:** If you have **ONLY** restarted your kernel (and not the notebook instance), start here.

## Step 1.1: Imports

In [None]:
import sys
import os
from datetime import datetime
from pathlib import Path

# importing forecast notebook utility from notebooks/common directory
sys.path.insert(0, os.path.abspath("./common/"))
import util
import util.fcst_utils

# Load (or reload) IPython's autoreload extension.
# NOTE(review): no `%autoreload` mode is selected here, so edits to `util`
# may not be picked up automatically — confirm whether `%autoreload 2` is intended.
%reload_ext autoreload
import boto3
import pandas as pd
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook.
%matplotlib inline 
# Default figure size (width, height) for matplotlib plots.
plt.rcParams['figure.figsize'] = (15.0, 5.0)

# Enable pandas copy-on-write mode for the whole notebook.
pd.options.mode.copy_on_write = True

from bokeh.io import output_notebook
from bokeh.plotting import show

# Configure bokeh so that `show()` renders plots inside the notebook.
output_notebook()

## Step 1.2: Setup variables

In [None]:
# Resource tag values
owner = "martin.macecek@rearc.io"
# Named `resource_type` (not `type`) so the built-in `type()` is not shadowed
# for every cell executed after this one.
resource_type = "Internal"
usage = "Playground"

# Custom variables
prefix = "mac-training"
role_name = f"{prefix}-forecasting"
bucket_name = f"{prefix}-bucket-275279264324-us-east-1"
path = "forecasting/input"
data_key = f"{path}/RIVN.csv"
future_data_key = f"{path}/RIVN_20240513.csv"
prepared_data_key_prefix = "forecasting/prepared/rivn"
item_id = "RIVN"
target_column_name = "close"
domain = "RETAIL"

# S3 locations of the prepared target and related time series CSV files
s3_target_data_key = f"s3://{bucket_name}/{prepared_data_key_prefix}.csv"
s3_related_data_key = f"s3://{bucket_name}/{prepared_data_key_prefix}_rts.csv"
# Timestamp format used in generated resource names
date_format = '%Y%m%d_%H%M%S'
# Timestamp format used for user-facing output
ui_date_format = '%a, %d %b %Y %H:%M:%S %Z'

# Tags for resource tagging
tags = [
    {'Key': 'Owner', 'Value': owner},
    {'Key': 'Type', 'Value': resource_type},
    {'Key': 'Usage', 'Value': usage},
]

# Forecast
# Forecast horizon, expressed in DATASET_FREQUENCY units
# (30 days with the daily frequency configured below).
FORECAST_LENGTH = 30

# What is your forecast time unit granularity?
# Choices are: ^Y|M|W|D|h|30min|15min|10min|5min|1min$
DATASET_FREQUENCY = "D"
DATASET_TIMESTAMP_FORMAT = "yyyy-MM-dd"
# delimiter = ','

# What name do you want to give this project?
# We will use this same name for your Forecast Dataset Group name.
PROJECT = 'rivn-forecast'
DATA_VERSION = 9

# Whether or not to use other features of the target dataset (e.g. high, low, open...)
USE_OTHER_FEATURES = True
# Whether or not to use bank days, which adds another feature indicating whether a day is a trading day
USE_BANK_DAY = False
# Whether or not to fill missing data (typically non-trading days such as weekends and holidays)
FILL_MISSING_DATA = False

## Step 1.3: API connectivity

In [None]:
# Region comes from the default boto3 session; the account id comes from the
# STS caller identity of the current credentials.
region = boto3.Session().region_name
caller_identity = boto3.client('sts').get_caller_identity()
account_id = caller_identity.get('Account')
print(f"Account: {account_id}, Region: {region}")

In [None]:
# Connect API sessions: one boto3 session bound to the resolved region, and
# one client per AWS service used by this notebook.
session = boto3.Session(region_name=region)
s3, forecast, forecastquery = (
    session.client(service_name=service)
    for service in ('s3', 'forecast', 'forecastquery')
)

## Step 1.4: AWS resources

In [None]:
# Create or retrieve the role to provide to Amazon Forecast.
role_arn = util.get_or_create_iam_role(role_name=role_name)

# Echo only the role name, without the account-bearing ARN prefix. The name is
# the last '/'-separated segment, which stays correct when the role ARN
# contains a path (arn:aws:iam::<account>:role/<path>/<name>).
print(f"Success! Role '{role_arn.rsplit('/', 1)[-1]}' ready for use.")

In [None]:
# Make sure the S3 bucket used by this notebook is available in the region
# (the helper name suggests it is created when missing — confirm in util).
util.get_or_create_bucket(bucket_name, region=region)
print(f"Success! Bucket '{bucket_name}' ready for use.")

# Step 2: Data preparation

In [None]:
# Build the historical data frame for `item_id` from the object at `data_key`.
# NOTE(review): "%m/%d/%Y" is presumably the date format of the raw CSV, and the
# data is presumably read from `bucket_name` — confirm in util.prepare_data.
stock_df = util.prepare_data(
    bucket_name,
    data_key,
    "%m/%d/%Y",
    target_column_name,
    item_id,
    fill_missing_values=FILL_MISSING_DATA,
    use_bank_day=USE_BANK_DAY)
# Preview the first rows.
stock_df.head()

In [None]:
# Plot the target column together with the open/high/low prices over time.
price_columns = [target_column_name, 'open', 'high', 'low']
stock_df.plot(x='timestamp', y=price_columns, figsize=(15, 8))
plt.xlabel('Date Time')
plt.ylabel('Stock Price')
plt.show()

In [None]:
# Earliest timestamp present in the prepared data.
stock_df['timestamp'].min()

In [None]:
# Build the related time series, limited to the date range covered by the
# target data and skipping the target and future data files themselves.
first_timestamp = stock_df.timestamp.min()
last_timestamp = stock_df.timestamp.max()
excluded_keys = [data_key, future_data_key]

related_data = util.get_related_data(
    s3_client=s3,
    bucket=bucket_name,
    prefix=path,
    target_df=stock_df,
    target_column_name=target_column_name,
    item_id=item_id,
    exclude=excluded_keys,
    fill_missing_values=FILL_MISSING_DATA,
    use_bank_day=USE_BANK_DAY,
    extra_features=USE_OTHER_FEATURES,
    start_date=first_timestamp,
    end_date=last_timestamp,
)
related_data

# Step 3: Prepare and Save the Target Time Series

In [None]:
# The target time series keeps only the timestamp, the item id and the target value.
target_columns = ["timestamp", "item_id", target_column_name]
target_df = stock_df[target_columns]
target_df.head(5)

In [None]:
# The related time series is the related data frame built earlier (same object, new name).
rts_df = related_data
# Preview the first rows.
rts_df.head(5)

In [None]:
# Target and related time series must contain the same number of rows.
target_rows = len(target_df)
related_rows = len(rts_df)
print(f"{target_rows} = {related_rows}")
assert target_rows == related_rows, "length doesn't match"

In [None]:
# Write both time series as CSV without index and without header row to their
# s3:// destinations.
for frame, destination in ((target_df, s3_target_data_key), (rts_df, s3_related_data_key)):
    frame.to_csv(destination, index=False, header=False)

# Step 4: Create the Dataset Group and Dataset

## Dataset Group

In [None]:
# Dataset group name: prefix, project and data version joined by underscores,
# with every dash replaced by an underscore.
dataset_group_name = "_".join([prefix, PROJECT, str(DATA_VERSION)]).replace("-", "_")
print(f"Dataset Group Name = {dataset_group_name}")

In [None]:
# Create the dataset group (without datasets for now) and wait on its status.
# If a group with this name already exists, rebuild its ARN from region,
# account and name instead.
dataset_arns = []
try:
    create_dataset_group_response = forecast.create_dataset_group(
        Domain=domain,
        DatasetGroupName=dataset_group_name,
        DatasetArns=dataset_arns,
        Tags=tags,
    )
    dataset_group_arn = create_dataset_group_response['DatasetGroupArn']
    status = util.wait(
        lambda: forecast.describe_dataset_group(DatasetGroupArn=dataset_group_arn)
    )
    assert status
except forecast.exceptions.ResourceAlreadyExistsException:
    dataset_group_arn = f"arn:aws:forecast:{region}:{account_id}:dataset-group/{dataset_group_name}"
    print(f"Dataset group {dataset_group_arn} already exists.")

## Target Schema

In [None]:
# Specify the schema of your dataset here. Make sure the order of columns matches the raw data files.
# Specify the schema of your dataset here. Make sure the order of columns matches the raw data files.
tts_attributes = util.get_schema_attributes(target_df, domain, target_column_name)
tts_schema = {"Attributes": tts_attributes}
tts_schema

## Target Dataset

In [None]:
# Name of the target time series dataset: dataset group name plus "_tts".
tts_dataset_name = dataset_group_name + "_tts"
print(tts_dataset_name)

In [None]:
# Create the target time series dataset and wait on its status. If a dataset
# with this name already exists, rebuild its ARN from region, account and name.
try:
    create_dataset_tts_response = forecast.create_dataset(
        Domain=domain,
        DatasetType='TARGET_TIME_SERIES',
        DatasetName=tts_dataset_name,
        DataFrequency=DATASET_FREQUENCY,
        Schema=tts_schema,
        Tags=tags,
    )
    tts_dataset_arn = create_dataset_tts_response['DatasetArn']
    status = util.wait(
        lambda: forecast.describe_dataset(DatasetArn=tts_dataset_arn)
    )
    assert status
except forecast.exceptions.ResourceAlreadyExistsException:
    tts_dataset_arn = f"arn:aws:forecast:{region}:{account_id}:dataset/{tts_dataset_name}"
    print(f"Target dataset {tts_dataset_arn} already exists.")

## Related schema

In [None]:
# Specify the schema of your dataset here. Make sure the order of columns matches the raw data files.
# Specify the schema of your dataset here. Make sure the order of columns matches the raw data files.
rts_attributes = util.get_schema_attributes(rts_df, domain, target_column_name)
rts_schema = {"Attributes": rts_attributes}
rts_schema

## Related dataset

In [None]:
# Name of the related time series dataset: dataset group name plus "_rts".
rts_dataset_name = dataset_group_name + "_rts"
print(rts_dataset_name)

In [None]:
# Create the related time series dataset and wait on its status. If a dataset
# with this name already exists, rebuild its ARN from region, account and name.
try:
    create_dataset_rts_response = forecast.create_dataset(
        Domain=domain,
        DatasetType='RELATED_TIME_SERIES',
        DatasetName=rts_dataset_name,
        DataFrequency=DATASET_FREQUENCY,
        Schema=rts_schema,
        Tags=tags,
    )
    rts_dataset_arn = create_dataset_rts_response['DatasetArn']
    status = util.wait(
        lambda: forecast.describe_dataset(DatasetArn=rts_dataset_arn)
    )
    assert status
except forecast.exceptions.ResourceAlreadyExistsException:
    rts_dataset_arn = f"arn:aws:forecast:{region}:{account_id}:dataset/{rts_dataset_name}"
    print(f"Related dataset {rts_dataset_arn} already exists.")

In [None]:
# Attach the target and related datasets to the dataset group, then wait on
# the group's status.
dataset_arns = [tts_dataset_arn, rts_dataset_arn]
update_dataset_response = forecast.update_dataset_group(
    DatasetGroupArn=dataset_group_arn,
    DatasetArns=dataset_arns,
)
status = util.wait(
    lambda: forecast.describe_dataset_group(DatasetGroupArn=dataset_group_arn)
)
assert status

# Step 5: Import data from S3 to Forecast

## Target data

**Note:** Depending on the data size, the import can take 10 mins or more to become **ACTIVE**.

In [None]:
# Import the target CSV into the target dataset. When import jobs already
# exist for the dataset, ask before importing again.
existing_tts_import_jobs = util.get_dataset_import_jobs(tts_dataset_arn, forecast)
import_tts_dataset_response = True
if len(existing_tts_import_jobs) > 0:
    print("Target dataset has already imported data.")
    import_tts_dataset_response = input("Re-import (y/N)? ").lower() == "y"

if not import_tts_dataset_response:
    print("Target data re-import skipped")
else:
    tts_job_name = f"tts_job_{datetime.now().strftime(date_format)}"
    tts_data_source = {
        "S3Config": {
            "Path": s3_target_data_key,
            "RoleArn": role_arn,
        }
    }
    tts_dataset_import_job_response = forecast.create_dataset_import_job(
        DatasetImportJobName=tts_job_name,
        DatasetArn=tts_dataset_arn,
        DataSource=tts_data_source,
        TimestampFormat=DATASET_TIMESTAMP_FORMAT,
        Tags=tags,
    )
    tts_dataset_import_job_arn = tts_dataset_import_job_response['DatasetImportJobArn']
    status = util.wait(
        lambda: forecast.describe_dataset_import_job(DatasetImportJobArn=tts_dataset_import_job_arn)
    )
    assert status
    print("Target data imported.")

## Related data

**Note:** Depending on the data size, the import can take 10 mins or more to become **ACTIVE**.

In [None]:
# Import the related CSV into the related dataset. When import jobs already
# exist for the dataset, ask before importing again.
existing_rts_import_jobs = util.get_dataset_import_jobs(rts_dataset_arn, forecast)
import_rts_dataset_response = True
if len(existing_rts_import_jobs) > 0:
    print("Related dataset has already imported data.")
    import_rts_dataset_response = input("Re-import (y/N)? ").lower() == "y"

if not import_rts_dataset_response:
    print("Related data re-import skipped")
else:
    rts_job_name = f"rts_job_{datetime.now().strftime(date_format)}"
    rts_data_source = {
        "S3Config": {
            "Path": s3_related_data_key,
            "RoleArn": role_arn,
        }
    }
    rts_dataset_import_job_response = forecast.create_dataset_import_job(
        DatasetImportJobName=rts_job_name,
        DatasetArn=rts_dataset_arn,
        DataSource=rts_data_source,
        TimestampFormat=DATASET_TIMESTAMP_FORMAT,
        Tags=tags,
    )
    rts_dataset_import_job_arn = rts_dataset_import_job_response["DatasetImportJobArn"]
    status = util.wait(
        lambda: forecast.describe_dataset_import_job(DatasetImportJobArn=rts_dataset_import_job_arn)
    )
    assert status
    print("Related data imported.")

# Step 6: Training and Evaluation

## DeepAR+

In [None]:
# Algorithm ARN and predictor name for DeepAR+.
algorithm = 'Deep_AR_Plus'
algorithm_arn = 'arn:aws:forecast:::algorithm/'
algorithm_arn_deep_ar_plus = f"{algorithm_arn}{algorithm}"
predictor_name_deep_ar = "_".join([dataset_group_name, algorithm.lower()])
print(f"Predictor Name = {predictor_name_deep_ar}")

**NOTE:** Training a forecast model can take several hours to become **ACTIVE**.

In [None]:
# Train the DeepAR+ predictor. When one already exists, ask whether to retrain;
# retraining first deletes the predictor's forecasts and then the predictor.
existing_predictor_deep_ar = util.get_predictor(predictor_name_deep_ar, forecast)
retrain_predictor_deep_ar = False

if existing_predictor_deep_ar:
    predictor_arn_deep_ar = existing_predictor_deep_ar['PredictorArn']
    print(f"DeepAR+ Predictor {predictor_arn_deep_ar} already exists.")
    retrain_predictor_deep_ar = input("Retrain model (y/N)? ").lower() == "y"
    if retrain_predictor_deep_ar:
        util.delete_forecasts_by_predictor(predictor_arn_deep_ar, forecast)
        print(f"Deleting DeepAR+ Predictor {predictor_arn_deep_ar}...")
        util.wait_till_delete(lambda: forecast.delete_predictor(PredictorArn=predictor_arn_deep_ar))
    else:
        print(f"Keeping existing DeepAR+ Predictor {predictor_arn_deep_ar}.")

if retrain_predictor_deep_ar or not existing_predictor_deep_ar:
    deep_ar_input_config = {
        "DatasetGroupArn": dataset_group_arn,
        "SupplementaryFeatures": [{"Name": "holiday", "Value": "US"}],
    }
    create_predictor_deep_ar_response = forecast.create_predictor(
        PredictorName=predictor_name_deep_ar,
        AlgorithmArn=algorithm_arn_deep_ar_plus,
        ForecastHorizon=FORECAST_LENGTH,
        PerformAutoML=False,
        PerformHPO=False,
        InputDataConfig=deep_ar_input_config,
        FeaturizationConfig={"ForecastFrequency": DATASET_FREQUENCY},
        Tags=tags,
    )
    predictor_arn_deep_ar = create_predictor_deep_ar_response['PredictorArn']
    print(f"Creating DeepAR+ Predictor {predictor_arn_deep_ar}...")
    status = util.wait(lambda: forecast.describe_predictor(PredictorArn=predictor_arn_deep_ar))
    assert status

## Prophet

In [None]:
# Algorithm ARN and predictor name for Prophet.
algorithm = 'Prophet'
algorithm_arn = 'arn:aws:forecast:::algorithm/'
algorithm_arn_prophet = f"{algorithm_arn}{algorithm}"
predictor_name_prophet = "_".join([dataset_group_name, algorithm.lower()])
print(f"Predictor Name = {predictor_name_prophet}")

**NOTE:** Training a forecast model can take several hours to become **ACTIVE**.

In [None]:
# Train the Prophet predictor. When one already exists, ask whether to retrain.
retrain_predictor_prophet = False
existing_predictor_prophet = util.get_predictor(predictor_name_prophet, forecast)

if existing_predictor_prophet:
    predictor_arn_prophet = existing_predictor_prophet['PredictorArn']
    print(f"Prophet Predictor {predictor_arn_prophet} already exists.")
    retrain_predictor_prophet = input("Retrain model (y/N)? ").lower() == "y"

if existing_predictor_prophet and retrain_predictor_prophet:
    # Forecasts created from this predictor are removed first, as the DeepAR+
    # cell does; a predictor with dependent forecasts cannot be deleted.
    util.delete_forecasts_by_predictor(predictor_arn_prophet, forecast)
    print(f"Deleting Prophet Predictor {predictor_arn_prophet}...")
    util.wait_till_delete(lambda: forecast.delete_predictor(PredictorArn=predictor_arn_prophet))
elif existing_predictor_prophet and not retrain_predictor_prophet:
    print(f"Keeping existing Prophet Predictor {predictor_arn_prophet}.")

if not existing_predictor_prophet or retrain_predictor_prophet:
    create_predictor_response = \
        forecast.create_predictor(PredictorName=predictor_name_prophet,
                                  AlgorithmArn=algorithm_arn_prophet,
                                  ForecastHorizon=FORECAST_LENGTH,
                                  PerformAutoML=False,
                                  PerformHPO=False,
                                  InputDataConfig={
                                      "DatasetGroupArn": dataset_group_arn,
                                      # Built-in US holiday calendar as a supplementary feature.
                                      "SupplementaryFeatures": [
                                          {"Name": "holiday",
                                           "Value": "US"}],
                                  },
                                  FeaturizationConfig={"ForecastFrequency": DATASET_FREQUENCY},
                                  Tags=tags
                                 )
    predictor_arn_prophet = create_predictor_response['PredictorArn']
    print(f"Creating Prophet Predictor {predictor_arn_prophet}...")
    status = util.wait(lambda: forecast.describe_predictor(PredictorArn=predictor_arn_prophet))
    assert status

## Auto

In [None]:
# Predictor name for the AutoPredictor: dataset group name plus "_auto".
predictor_name_auto = dataset_group_name + "_auto"
print(f"Predictor Name = {predictor_name_auto}")

**NOTE:** Training a forecast model can take several hours to become **ACTIVE**.

In [None]:
# Train the AutoPredictor. `args` holds the create_auto_predictor arguments and
# stays empty when an existing predictor is kept as it is.
existing_predictor_auto = util.get_predictor(predictor_name_auto, forecast)

if not existing_predictor_auto:
    # No predictor yet: full configuration for a new one.
    holiday_dataset = {
        "Name": "holiday",
        "Configuration": {"CountryCode": ["US"]},
    }
    args = {
        "PredictorName": predictor_name_auto,
        "ForecastHorizon": FORECAST_LENGTH,
        "ForecastFrequency": DATASET_FREQUENCY,
        "DataConfig": {
            "DatasetGroupArn": dataset_group_arn,
            "AdditionalDatasets": [holiday_dataset],
        },
        "Tags": tags,
    }
else:
    predictor_arn_auto = existing_predictor_auto['PredictorArn']
    print(f"Auto Predictor {predictor_arn_auto} already exists.")
    wants_retrain = input("Retrain model (y/N)? ").lower() == "y"
    # Retraining creates a new, timestamped predictor that references the existing one.
    args = {
        "PredictorName": f"{predictor_name_auto}_retrain_{datetime.now().strftime(date_format)}",
        "ReferencePredictorArn": predictor_arn_auto,
    } if wants_retrain else {}

if not args:
    print(f"Keeping existing Auto Predictor {predictor_arn_auto}.")
else:
    create_predictor_auto_response = forecast.create_auto_predictor(**args)
    predictor_arn_auto = create_predictor_auto_response['PredictorArn']
    action = 'Retraining existing' if existing_predictor_auto else 'Creating new'
    print(f"{action} Auto Predictor {predictor_arn_auto}...")
    status = util.wait(lambda: forecast.describe_auto_predictor(PredictorArn=predictor_arn_auto))
    assert status

# Step 7: Predictor Error Metrics

In [None]:
# Fetch the accuracy metrics of the DeepAR+ predictor.
error_metrics_deep_ar_plus = forecast.get_accuracy_metrics(PredictorArn=predictor_arn_deep_ar)

In [None]:
# Fetch the accuracy metrics of the Prophet predictor.
error_metrics_prophet = forecast.get_accuracy_metrics(PredictorArn=predictor_arn_prophet)

In [None]:
# Fetch the accuracy metrics of the Auto predictor.
error_metrics_auto = forecast.get_accuracy_metrics(PredictorArn=predictor_arn_auto)

In [None]:
def extract_summary_metrics(metric_response, predictor_name):
    """Return the weighted quantile losses of a predictor as a DataFrame.

    Only the first entry of ``PredictorEvaluationResults`` and the first of
    its ``TestWindows`` are used.

    Args:
        metric_response: Accuracy-metrics response mapping that contains
            ``PredictorEvaluationResults``.
        predictor_name: Label stored in the added ``Predictor`` column.

    Returns:
        A DataFrame built from the ``WeightedQuantileLosses`` entries with an
        extra ``Predictor`` column set to ``predictor_name``.
    """
    first_result = metric_response['PredictorEvaluationResults'][0]
    first_window = first_result['TestWindows'][0]
    quantile_losses = first_window['Metrics']['WeightedQuantileLosses']

    losses_df = pd.DataFrame(quantile_losses)
    losses_df['Predictor'] = predictor_name
    return losses_df

In [None]:
# One weighted-quantile-loss frame per predictor, each labelled with the predictor's name.
deep_ar_metrics = extract_summary_metrics(error_metrics_deep_ar_plus, "DeepAR")
prophet_metrics = extract_summary_metrics(error_metrics_prophet, "Prophet")
auto_metrics = extract_summary_metrics(error_metrics_auto, "Auto")

In [None]:
# Bar chart comparing the loss value per quantile, one bar series per predictor.
combined_metrics = pd.concat([deep_ar_metrics, prophet_metrics, auto_metrics])
loss_by_quantile = combined_metrics.pivot(index='Quantile', columns='Predictor', values='LossValue')
loss_by_quantile.plot.bar()

# Step 8: Forecasting

## DeepAR+

**NOTE:** Creating a forecast can take up to an hour to become **ACTIVE**.

In [None]:
# Create a forecast from the DeepAR+ predictor; the values returned by
# create_forecast_name are passed on as the leading positional arguments.
deep_ar_forecast_name_parts = util.create_forecast_name(
    dataset_group_name, 'deep_ar', date_format=date_format
)
forecast_arn_deep_ar = util.create_forecast(
    *deep_ar_forecast_name_parts,
    predictor_arn_deep_ar,
    forecast,
    tags,
    ui_date_format=ui_date_format,
)

## Prophet

**NOTE:** Creating a forecast can take up to an hour to become **ACTIVE**.

In [None]:
# Create a forecast from the Prophet predictor; the values returned by
# create_forecast_name are passed on as the leading positional arguments.
prophet_forecast_name_parts = util.create_forecast_name(
    dataset_group_name, 'prophet', date_format=date_format
)
forecast_arn_prophet = util.create_forecast(
    *prophet_forecast_name_parts,
    predictor_arn_prophet,
    forecast,
    tags,
    ui_date_format=ui_date_format,
)

## Auto

**NOTE:** Creating a forecast can take up to an hour to become **ACTIVE**.

In [None]:
# Create a forecast from the Auto predictor; the values returned by
# create_forecast_name are passed on as the leading positional arguments.
auto_forecast_name_parts = util.create_forecast_name(
    dataset_group_name, 'auto', date_format=date_format
)
forecast_arn_auto = util.create_forecast(
    *auto_forecast_name_parts,
    predictor_arn_auto,
    forecast,
    tags,
    ui_date_format=ui_date_format,
)

# Step 9: Querying

In [None]:
# Read the header-less target CSV back, name its columns and keep only the
# rows of the selected item.
exact_columns = ['timestamp', 'item_id', target_column_name]
exact = pd.read_csv(s3_target_data_key, header=None, thousands=',')
exact.columns = exact_columns
is_selected_item = exact['item_id'] == item_id
exact = exact.loc[is_selected_item]
exact

In [None]:
# Load the actual target values for `item_id` through the shared helper.
# NOTE(review): this overwrites the `exact` frame built in the previous cell —
# confirm whether both cells are needed.
exact = util.load_exact_sol(s3_target_data_key, item_id, target_col_name=target_column_name)
exact

In [None]:
# Build the data frame for the later data file (`future_data_key`); it is passed
# as `future=` to the forecast plots below.
# NOTE(review): "%m/%d/%Y" is presumably the date format of the raw CSV and
# `minimal=True` presumably limits the returned columns — confirm in util.prepare_data.
future_df = util.prepare_data(
    bucket_name,
    future_data_key,
    "%m/%d/%Y",
    target_column_name,
    item_id,
    fill_missing_values=FILL_MISSING_DATA,
    use_bank_day=USE_BANK_DAY,
    minimal=True)
future_df

## DeepAR+

In [None]:
# Look up the DeepAR+ forecasts of this project and data version, then query
# them for the selected item.
# NOTE(review): the name prefix keeps its dashes, whereas dataset_group_name
# replaces them with underscores — confirm get_relevant_forecasts normalises it.
relevant_forecasts_deep_ar = util.get_relevant_forecasts(
    f"{prefix}_{PROJECT}_", DATA_VERSION, "deep_ar", forecast
)
forecasts_deep_ar = util.query_forecasts(relevant_forecasts_deep_ar, item_id, forecastquery)

In [None]:
# Convert the DeepAR+ query results into data frames (iterated by key when plotting).
forecast_deep_ar_dfs = util.query_results_to_dataframes(forecasts_deep_ar, fill_missing_values=FILL_MISSING_DATA)

### Plot using bokeh

In [None]:
# Plot every DeepAR+ forecast against the actual values and the later
# ("future") data. The title names DeepAR+ so these charts are not mistaken
# for the Auto predictor's.
for key in forecast_deep_ar_dfs:
    deep_ar_plot = util.plot_bokeh_forecasts(
        forecast_deep_ar_dfs[key],
        exact,
        freq=f'1{DATASET_FREQUENCY}',
        forecastHorizon=FORECAST_LENGTH,
        time_back=30,
        future=future_df,
        target_col_name=target_column_name,
        reverse=True,
        title=f"Stock Price Forecast for {item_id} (DeepAR+ v{key})"
    )
    show(deep_ar_plot)

## Prophet

In [None]:
# Look up the Prophet forecasts of this project and data version, then query
# them for the selected item.
# NOTE(review): the name prefix keeps its dashes, whereas dataset_group_name
# replaces them with underscores — confirm get_relevant_forecasts normalises it.
relevant_forecasts_prophet = util.get_relevant_forecasts(
    f"{prefix}_{PROJECT}_", DATA_VERSION, "prophet", forecast
)
forecasts_prophet = util.query_forecasts(relevant_forecasts_prophet, item_id, forecastquery)

In [None]:
# Convert the Prophet query results into data frames (iterated by key when plotting).
forecast_prophet_dfs = util.query_results_to_dataframes(forecasts_prophet, fill_missing_values=FILL_MISSING_DATA)

### Plot using bokeh

In [None]:
# Plot every Prophet forecast against the actual values and the later
# ("future") data. The title names Prophet so these charts are not mistaken
# for the Auto predictor's.
for key in forecast_prophet_dfs:
    prophet_plot = util.plot_bokeh_forecasts(
        forecast_prophet_dfs[key],
        exact,
        freq=f'1{DATASET_FREQUENCY}',
        forecastHorizon=FORECAST_LENGTH,
        time_back=30,
        future=future_df,
        target_col_name=target_column_name,
        reverse=True,
        title=f"Stock Price Forecast for {item_id} (Prophet v{key})"
    )
    show(prophet_plot)

## Auto

In [None]:
# Look up the Auto predictor forecasts of this project and data version, then
# query them for the selected item.
# NOTE(review): the name prefix keeps its dashes, whereas dataset_group_name
# replaces them with underscores — confirm get_relevant_forecasts normalises it.
relevant_forecasts_auto = util.get_relevant_forecasts(
    f"{prefix}_{PROJECT}_", DATA_VERSION, "auto", forecast
)
forecasts_auto = util.query_forecasts(relevant_forecasts_auto, item_id, forecastquery)

In [None]:
# Convert the Auto predictor query results into data frames (iterated by key when plotting).
forecast_auto_dfs = util.query_results_to_dataframes(forecasts_auto, fill_missing_values=FILL_MISSING_DATA)

### Plot using bokeh

In [None]:
# Plot every Auto predictor forecast against the actual values and the later
# ("future") data.
for key in forecast_auto_dfs:
    auto_forecast_df = forecast_auto_dfs[key]
    auto_plot = util.plot_bokeh_forecasts(
        auto_forecast_df,
        exact,
        freq=f'1{DATASET_FREQUENCY}',
        forecastHorizon=FORECAST_LENGTH,
        time_back=30,
        future=future_df,
        target_col_name=target_column_name,
        reverse=True,
        title=f"Stock Price Forecast for {item_id} (Auto v{key})",
    )
    show(auto_plot)

# Cleanup

## Datasets and Dataset Group

In [None]:
# Delete the dataset group created by this notebook.
# NOTE(review): whether the helper also removes the group's datasets and
# predictors cannot be seen from here — confirm in util.delete_dataset_group.
util.delete_dataset_group(dataset_group_arn, forecast)
# Manual clean-up of an older dataset group (version 2), kept for reference:
# util.delete_dataset_group("arn:aws:forecast:us-east-1:275279264324:dataset-group/mac_training_rivn_forecast_2", forecast)
# util.delete_predictors("arn:aws:forecast:us-east-1:275279264324:dataset-group/mac_training_rivn_forecast_2", forecast)