# DeepARを利用した売上予測

## データの準備
### ダウンロード
このノートブックでは、オンラインのショッピングサイトにおける販売履歴から売り上げを予測します。データセットは、UCIのMachine Learning Repositoryの[Online Retail Data Set](https://archive.ics.uci.edu/ml/datasets/online+retail)です。まず、販売履歴のxlsxファイルをダウンロードします。

In [198]:
import urllib.request

# Fetch the Online Retail workbook from the UCI repository and save it in the
# working directory. The call is the last expression of the cell, so its
# (filename, headers) return value is what the notebook shows as output.
urllib.request.urlretrieve(
    url="https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx",
    filename="online_retail.xlsx",
)

('online_retail.xlsx', <http.client.HTTPMessage at 0x7f478ad49358>)

## データセットの確認
データ解析ライブラリPandasで読み込んで表示してみます。

In [200]:
# `display` is taken from the public IPython.display module; importing it from
# IPython.core.display is a deprecated path.
from IPython.display import display
import pandas as pd

# Load the downloaded workbook into a DataFrame, parsing the InvoiceDate column
# as datetimes so it can serve as a time index in the later cells.
df = pd.read_excel("online_retail.xlsx", parse_dates=["InvoiceDate"])
display(df)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,2010-12-01 08:26:00,7.65,17850.0,United Kingdom
6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,2010-12-01 08:26:00,4.25,17850.0,United Kingdom
7,536366,22633,HAND WARMER UNION JACK,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom
8,536366,22632,HAND WARMER RED POLKA DOT,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom
9,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,2010-12-01 08:34:00,1.69,13047.0,United Kingdom


## 不要な列の削除

In [201]:
# Remove, in place, the columns that none of the later cells read.
unused_columns = ["InvoiceNo", "Description", "CustomerID"]
df.drop(columns=unused_columns, inplace=True)

## トップ5の抽出

In [202]:
# Stock codes of the five most frequently occurring items
# (value_counts sorts by count, head() keeps the first five).
top_five_code = df["StockCode"].value_counts().head().index

# Keep only the rows for those items and index them by invoice timestamp.
# set_index returns a new frame and moves the InvoiceDate column into the
# index, so nothing is mutated in place on a filtered slice of `df`
# (which is what triggers pandas' chained-assignment warnings).
top_five_record = df[df["StockCode"].isin(top_five_code)].set_index("InvoiceDate")
display(top_five_record.head())

Unnamed: 0_level_0,StockCode,Quantity,UnitPrice,Country
InvoiceDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-12-01 08:26:00,85123A,6,2.55,United Kingdom
2010-12-01 09:02:00,85123A,6,2.55,United Kingdom
2010-12-01 09:32:00,85123A,6,2.55,United Kingdom
2010-12-01 09:37:00,20725,10,1.65,United Kingdom
2010-12-01 09:57:00,85099B,100,1.65,United Kingdom


## 1時間おきに揃える

In [203]:
# Build the per-(StockCode, Country) hourly resampler once and reuse it.
hourly = top_five_record.groupby(["StockCode", "Country"]).resample('H')

# Select the column before aggregating so that only the needed column is
# reduced: hourly quantity is the sum of sales, hourly price is the mean price.
# Hours without any sale yield Quantity 0 and UnitPrice NaN.
qty = hourly["Quantity"].sum()
price = hourly["UnitPrice"].mean()

# Both series share the same (StockCode, Country, InvoiceDate) index.
resampled = pd.concat([qty, price], axis=1, join="inner")
display(resampled.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Quantity,UnitPrice
StockCode,Country,InvoiceDate,Unnamed: 3_level_1,Unnamed: 4_level_1
20725,Australia,2010-12-17 14:00:00,10,1.65
20725,Australia,2010-12-17 15:00:00,0,
20725,Australia,2010-12-17 16:00:00,0,
20725,Australia,2010-12-17 17:00:00,0,
20725,Australia,2010-12-17 18:00:00,0,


## 欠損値を最頻値で埋める

In [204]:
import numpy as np

# Replace the missing hourly prices of every (StockCode, Country) series with
# that series' most frequent price. `transform` is used instead of `apply`
# because it always returns a result aligned with the original index, so the
# column assignment below cannot be misaligned by prepended group keys.
# NOTE(review): a series with no observed price at all would raise here
# (empty mode) -- presumably impossible because every group comes from at
# least one sale record; confirm.
resampled["UnitPrice"] = (
    resampled.groupby(['StockCode', 'Country'])['UnitPrice']
    .transform(lambda prices: prices.fillna(prices.mode().iloc[0]))
)
display(resampled.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Quantity,UnitPrice
StockCode,Country,InvoiceDate,Unnamed: 3_level_1,Unnamed: 4_level_1
20725,Australia,2010-12-17 14:00:00,10,1.65
20725,Australia,2010-12-17 15:00:00,0,1.65
20725,Australia,2010-12-17 16:00:00,0,1.65
20725,Australia,2010-12-17 17:00:00,0,1.65
20725,Australia,2010-12-17 18:00:00,0,1.65


## 標準化する

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit a single scaler on the two numeric columns over all series and write
# the scaled values back over the original columns.
feature_columns = ['Quantity', 'UnitPrice']
scaler = StandardScaler()
scaled = scaler.fit_transform(resampled.loc[:, feature_columns].values)
for position, column in enumerate(feature_columns):
    resampled[column] = scaled[:, position]

## 入力に利用するjsonファイルを作成する


In [226]:
import json

# Distinct stock codes and countries; their positions in these arrays are
# used as the integer category ids of each series.
stockcode_arr = resampled.index.levels[0]
country_arr = resampled.index.levels[1]

# Build one record per (stock code, country) series with the keys
# start / target / cat / dynamic_feat.
json_data = []
for stock_index, stock_code in enumerate(stockcode_arr):
    for country_index, country in enumerate(country_arr):
        try:
            record = resampled.loc[(stock_code, country)]
        except KeyError:
            # This stock code has no rows for this country: no series to emit.
            # Only the lookup is guarded, so any other failure is surfaced
            # instead of being silently swallowed.
            continue
        json_data.append({
            'start': str(record.index[0]),
            'target': record["Quantity"].values.tolist(),
            'cat': [stock_index, country_index],
            # A list of feature series; UnitPrice is the only one.
            'dynamic_feat': [record["UnitPrice"].values.tolist()],
        })

# Write the records in JSON Lines format (one JSON object per line).
# newline="\n" keeps the line endings identical on every platform.
with open("./training_data.json", 'w', encoding="utf-8", newline="\n") as fp:
    fp.writelines(json.dumps(d) + "\n" for d in json_data)

In [227]:
import sagemaker
from sagemaker import get_execution_role

# Session object used for the SageMaker calls in the following cells, and the
# execution role later handed to the estimator.
sagemaker_session = sagemaker.Session()
role = get_execution_role()

# Upload the local training file under the given key prefix; the returned
# value is passed to `estimator.fit` as the "train" channel.
train_input = sagemaker_session.upload_data(
    key_prefix='deepar/retail_forecast',
    path="./training_data.json",
)

In [228]:
# Resolve the algorithm image for the region the session actually talks to,
# rather than a hard-coded region: the container image and the training job
# have to live in the same region, so a fixed "ap-northeast-1" only works when
# the notebook happens to run there.
region = sagemaker_session.boto_region_name
image_name = sagemaker.amazon.amazon_estimator.get_image_uri(region, "forecasting-deepar", "latest")

# Estimator describing the training job: one ml.c4.2xlarge instance running
# the DeepAR image under the notebook's execution role.
estimator = sagemaker.estimator.Estimator(
    sagemaker_session=sagemaker_session,
    image_name=image_name,
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c4.2xlarge',
    base_job_name='deepar-retail-forecast',
)


In [229]:
# Training settings handed to the estimator. The data is hourly ('H'); the
# context window is 168 steps and the forecast horizon is 72 steps.
hyperparameters = dict(
    time_freq='H',
    epochs="400",
    early_stopping_patience="40",
    mini_batch_size="64",
    learning_rate="5E-4",
    context_length="168",
    prediction_length="72",
)
estimator.set_hyperparameters(**hyperparameters)

In [None]:
# Start training on the uploaded data as the "train" channel and block until
# the job has finished.
training_channels = {"train": train_input}
estimator.fit(inputs=training_channels, wait=True)

INFO:sagemaker:Creating training-job with name: deepar-retail-forecast-2018-10-05-00-14-34-276


.......

In [213]:
# Deploy the trained model on a single ml.m4.xlarge instance and keep the
# returned predictor for the inference request below.
instance_type = 'ml.m4.xlarge'
predictor = estimator.deploy(
    instance_type=instance_type,
    initial_instance_count=1,
)

INFO:sagemaker:Creating model with name: forecasting-deepar-2018-10-04-23-43-42-436
INFO:sagemaker:Creating endpoint with name deepar-retail-forecast-2018-10-04-23-32-08-548


---------------------------------------------------------------!

In [217]:
instances = json_data[104]

# Window sizes; they match the context_length / prediction_length values
# passed to the estimator's hyperparameters.
context_length = 168
prediction_length = 72

# Pick up the first 168 past sales, and 168 past prices + 72 future prices.
# `dynamic_feat` is a list of feature series, so each inner series has to be
# truncated; slicing the outer list would leave the full-length series intact.
data = {
    'start': instances['start'],
    'target': instances['target'][:context_length],
    'cat': instances['cat'],
    'dynamic_feat': [
        feature[:context_length + prediction_length]
        for feature in instances['dynamic_feat']
    ],
}
configuration = {"num_samples": 100, "output_types": ["samples"], "quantiles": ["0.1", "0.5", "0.9"]}
http_request_data = {"instances": [data], "configuration": configuration}
byte_json = predictor.predict(json.dumps(http_request_data).encode("utf-8"))
result = json.loads(byte_json)

# NOTE(review): `gt` holds the same context window that was sent as `target`,
# not the values of the forecast horizon -- confirm this is the intended
# series to compare against.
gt = instances['target'][:context_length]
# First of the sample paths returned for the single requested series.
prediction = result['predictions'][0]['samples'][0]


[6, 12, 102, 12, 9, 0, 36, 0, 262, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 18, 12, 0, 18, 1, 30, 50, 0, 166, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 7, 0, 0, 13, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 14, 132, 1, 0, 32, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 21, 28, 6, 44, 8, 38, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 128, 38, -1, 100, 0, 16, 6, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0.0289305598, 0.8252676725, 1.3761969805, 4.6711082458, 0.2779209614, -1.5657014847, -0.840724349, 2.8716850281, 1.9332346916, 0.2250470519, 0.307280302, -0.9478874803, -0.1842277646, -0.1710176617, -0.0899916664, -0.0381147936, -0.0374178961, -0.0222995281, -0.0606507435, -0.0068998542, 0.0538562611, 0.0667261183, 0.067075789, -0.0004428253, -0.108100377, -0.3005953133, -0.4387862086, -1.5591180325, -0.0306200087, -0.2811578214, -2.8452944756, 1.61550593