In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import time
import datetime

# This code is derived from AWS SageMaker Samples:
# https://github.com/awslabs/amazon-sagemaker-examples/tree/master/introduction_to_amazon_algorithms/deepar_electricity
# https://github.com/awslabs/amazon-sagemaker-examples/tree/master/introduction_to_amazon_algorithms/deepar_synthetic

# DeepAR - Kaggle Bike Sharing Demand Dataset

Prepare Bike Rental Data for DeepAR training

To download original dataset, sign-in and download from this link: https://www.kaggle.com/c/bike-sharing-demand/data

None of these features are used: ['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed']

Start Time From: ['datetime']

Target Feature: ['count','registered','casual']

Frequency: 'Hourly'

In [3]:
target_values = ['count','registered','casual']

# controls if categories (in this case we are using it to indicate one of the above rentals) needs to be
# included in the training and test data
with_categories = False

# Set datetime column as index to work with data based on Date/Time
df = pd.read_csv('train.csv', parse_dates=['datetime'],index_col=0)
df_test = pd.read_csv('test.csv', parse_dates=['datetime'],index_col=0)

In [4]:
# Minimum time and Maximum Time in Training CSV file
df.index.min(),df.index.max()

(Timestamp('2011-01-01 00:00:00'), Timestamp('2012-12-19 23:00:00'))

In [5]:
df_test.head(25) # data check

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014
2011-01-20 05:00:00,1,0,1,1,9.84,11.365,60,15.0013
2011-01-20 06:00:00,1,0,1,1,9.02,10.605,60,15.0013
2011-01-20 07:00:00,1,0,1,1,9.02,10.605,55,15.0013
2011-01-20 08:00:00,1,0,1,1,9.02,10.605,55,19.0012
2011-01-20 09:00:00,1,0,1,2,9.84,11.365,52,15.0013


## Group the dataset to year and month

Determine how far in the future predictions to be made by using *prediction_length* hyperparameter

In [6]:
# How many hours required for prediction in a month using test.csv data
hours_to_predict = []
print('Check maximum hours I need to predict')
# Group by year, month
predict_window = df_test.groupby([df_test.index.year, df_test.index.month])

for i,x in predict_window:
    delta = x.index.max() - x.index.min()
    hours = np.ceil(delta.total_seconds()/3600)
    hours_to_predict.append(hours)
    
    print("{0}, Hours:{1}".format(i, hours))

print("Maximum Prediction Length in Hours: ", np.max(hours_to_predict))

Check maximum hours I need to predict
(2011, 1), Hours:287.0
(2011, 2), Hours:215.0
(2011, 3), Hours:287.0
(2011, 4), Hours:263.0
(2011, 5), Hours:287.0
(2011, 6), Hours:263.0
(2011, 7), Hours:287.0
(2011, 8), Hours:287.0
(2011, 9), Hours:263.0
(2011, 10), Hours:287.0
(2011, 11), Hours:263.0
(2011, 12), Hours:287.0
(2012, 1), Hours:287.0
(2012, 2), Hours:239.0
(2012, 3), Hours:287.0
(2012, 4), Hours:263.0
(2012, 5), Hours:287.0
(2012, 6), Hours:263.0
(2012, 7), Hours:287.0
(2012, 8), Hours:287.0
(2012, 9), Hours:263.0
(2012, 10), Hours:287.0
(2012, 11), Hours:263.0
(2012, 12), Hours:287.0
Maximum Prediction Length in Hours:  287.0


### Setting the hyperparameters for DeepAR

Ref:[DeepAR Hyperparameters](https://docs.aws.amazon.com/sagemaker/latest/dg/deepar_hyperparameters.html)

In [23]:
freq = '1h' # Data is in hourly format

# how far in the future?
prediction_length = 288

# Context, or how far in the past? AWS recommends the same size as starting point
context_length = 288

In [24]:
dt_predict_max = pd.to_datetime("2012-12-31 23:00:00") # 2012-12-31 23:00 alt way..pd.datetime(2012,12,31,23,0,0)

dt_dataset_start_time = "2011-01-01 00:00:00"
dt_dataset_end_time = "2012-12-19 23:00:00"

dt_range_train = pd.date_range(start=dt_dataset_start_time, end=dt_dataset_end_time, freq=freq)

# use for model training
# Start time is the first row provided by kaggle
# Training TS end time ensures some data is withheld for model testing
# 12 days worth of training data is withheld for testing
#dt_train_range = (dt_dataset_start_time,
#                  dt_dataset_end_time - datetime.timedelta(hours=12*24))

# Use entire data for testing
# We can compare predicted values vs actual (i.e. last 12 days is withheld for testing and model hasn't seen that data)
#dt_test_range = (dt_dataset_start_time, 
#                 dt_dataset_end_time)

In [22]:
dt_range_train

AttributeError: 'DatetimeIndex' object has no attribute 'head'

Numpy bug is conflicting with pandas v1.03 - Project Halted! Doing research

In [9]:
dt_predict_max, dt_predict_max+1

TypeError: Addition/subtraction of integers and integer-arrays with Timestamp is no longer supported.  Instead of adding/subtracting `n`, use `n * obj.freq`