## データセットのダウンロード

In [None]:
import urllib.request
# Download the "Online Retail" workbook from the UCI archive into the
# current working directory.
urllib.request.urlretrieve(
    url="https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx",
    filename="online_retail.xlsx",
)

In [None]:
import pandas as pd
# Load the downloaded workbook; InvoiceDate is parsed as datetimes because it
# is used as the time index further down.
df = pd.read_excel(
    io="online_retail.xlsx",
    parse_dates=["InvoiceDate"],
)

## データセットの確認

In [None]:
# `display` is public API of `IPython.display`; importing it from
# `IPython.core.display` is deprecated.
from IPython.display import display

# Render the raw DataFrame for inspection.
display(df)

## 不要な列の削除

In [None]:
# Remove, in place, the columns that the rest of the notebook never reads.
df.drop(columns=["InvoiceNo", "Description", "CustomerID"], inplace=True)

## トップ5の抽出

In [None]:
# value_counts() orders codes by descending frequency, so the first five
# index entries are the five most frequently occurring stock codes.
code_frequency = df["StockCode"].value_counts()
top_five_code = code_frequency.index[:5]

# Keep only the rows of those codes and move InvoiceDate from a column to the
# index so the frame can be resampled over time.
is_top_five = df["StockCode"].isin(top_five_code)
top_five_record = df[is_top_five].set_index("InvoiceDate")
display(top_five_record.head())

## 1時間おきに揃える

In [None]:
# Put every (StockCode, Country) series on an hourly grid.
# The column is selected *before* aggregating so that only the numeric column
# of interest is reduced; aggregating the whole frame first would also try to
# sum/average the non-numeric columns. The resampler is built once and reused.
hourly = top_five_record.groupby(["StockCode", "Country"]).resample('H')
qty = hourly["Quantity"].sum()      # total quantity per hour
price = hourly["UnitPrice"].mean()  # average unit price per hour

# Both series share the same (StockCode, Country, InvoiceDate) index, so they
# can be joined side by side into one frame.
resampled = pd.concat([qty, price], axis=1, join="inner")
display(resampled.head())

## 欠損値を最頻値で埋める

In [None]:
import numpy as np
def _fill_with_mode(prices):
    """Return *prices* with NaNs replaced by its most frequent value.

    If the series has no non-NaN value there is no mode to fill with, and the
    series is returned unchanged.
    """
    modes = prices.mode()
    if modes.empty:
        return prices
    return prices.fillna(modes.iloc[0])


# transform() returns a result carrying the same index as `resampled`, so the
# assignment aligns row by row regardless of how groupby.apply would have
# arranged the group keys.
resampled["UnitPrice"] = (
    resampled.groupby(['StockCode', 'Country'])['UnitPrice'].transform(_fill_with_mode)
)
display(resampled.head())

## 入力に利用するjsonファイルを作成する

In [None]:
import json

# The position of a value inside its index level is used as its category id.
stockcode_arr = resampled.index.levels[0]
country_arr = resampled.index.levels[1]

# Build one record per (stock code, country) pair that has data.
json_data = []
for stock_index, stock_code in enumerate(stockcode_arr):
    for country_index, country in enumerate(country_arr):
        try:
            record = resampled.loc[stock_code, country]
        except KeyError:
            # Not every stock code occurs in every country; skip pairs that
            # have no series. Any other error is allowed to surface.
            continue
        json_data.append({
            'start': str(record.index[0]),                 # first timestamp
            'target': record["Quantity"].values.tolist(),  # hourly quantities
            'cat': [stock_index, country_index],
        })

# Write the records as JSON Lines (one JSON object per line). newline="\n"
# keeps the line terminator fixed on every platform.
with open("./training_data.json", "w", encoding="utf-8", newline="\n") as fp:
    for d in json_data:
        fp.write(json.dumps(d) + "\n")

In [None]:
import sagemaker
from sagemaker import get_execution_role

# Session object used for the upload here and by the estimator later on.
sagemaker_session = sagemaker.Session()
# Execution role that is handed to the estimator when the job is configured.
role = get_execution_role()

# Upload the JSON Lines training file under the given key prefix; the return
# value is passed to estimator.fit() as the "train" input.
# NOTE(review): presumably this goes to the session's default S3 bucket -- confirm.
train_input = sagemaker_session.upload_data(
    path="./training_data.json",key_prefix='deepar/retail_forecast')

In [None]:
# Look up the DeepAR image in the same region the session operates in, instead
# of a hard-coded region: the estimator runs its job through
# `sagemaker_session`, so an image from a different region would not match.
region = sagemaker_session.boto_region_name
image_name = sagemaker.amazon.amazon_estimator.get_image_uri(region, "forecasting-deepar", "latest")

# Configure a single-instance training job for that image.
estimator = sagemaker.estimator.Estimator(
    sagemaker_session=sagemaker_session,
    image_name=image_name,
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c4.2xlarge',
    base_job_name='deepar-retail-forecast',
)


In [None]:
# Hyperparameters for the training job. Values are passed as strings; the
# frequency 'H' matches the hourly resampling of the training series.
hyperparameters = dict(
    time_freq='H',
    epochs="400",
    early_stopping_patience="40",
    mini_batch_size="64",
    learning_rate="5E-4",
    context_length="84",
    prediction_length="84",
)
estimator.set_hyperparameters(**hyperparameters)

In [None]:
# Train on the uploaded file, supplied as the "train" input.
# NOTE(review): wait=True presumably blocks until the job finishes -- confirm.
estimator.fit(inputs={"train":train_input}, wait=True)

In [None]:
# Deploy the trained estimator on one ml.m4.xlarge instance; the returned
# predictor is what the inference cell below calls.
instance_type = 'ml.m4.xlarge'
predictor = estimator.deploy(initial_instance_count=1, instance_type=instance_type)

In [None]:
# Use one of the training series as the series to forecast.
instances = json_data[104]
configuration = {
    "num_samples": 100,
    "output_types": ["samples"],
    "quantiles": ["0.1", "0.5", "0.9"],
}
http_request_data = {"instances": [instances], "configuration": configuration}

# Send the request as UTF-8 encoded JSON and decode the JSON response.
request_body = json.dumps(http_request_data).encode("utf-8")
byte_json = predictor.predict(request_body)
result = json.loads(byte_json)

# Show the input series followed by the first sample path of its prediction.
print(instances)
first_prediction = result['predictions'][0]
print(first_prediction['samples'][0])