### To-do list
* Update NumPy notation to use the `np.` prefix consistently (`np.split`, `np.array`)
* Write a function to use MAE instead of MSE

# 1 Time Series 7-Day Forecasting with LSTM<a id='1_Time_Series_7-Day_Forecasting_with_LSTM'></a>

## 1.1 Contents<a id='1.1_Contents'></a>
* [1 Time Series 7-Day Forecasting with LSTM](#1_Time_Series_7-Day_Forecasting_with_LSTM)
    * [1.1 Contents](#1.1_Contents)
    * [1.2 Import](#1.2_Import)

## 1.2 Import<a id='1.2_Import'></a>

In [62]:
from pathlib import Path
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import graphviz
import pydot_ng as pydot
from deepdiff import DeepDiff
from sklearn.metrics import mean_absolute_error
from sklearn.tree import export_graphviz
from math import sqrt
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras_visualizer import visualizer 
from keras import layers 
#from keras_visualizer import visualizer
from keras.utils.vis_utils import plot_model
#keras.utils.vis_utils.pydot = pydot
%matplotlib inline

# Set Matplotlib defaults
# The bundled seaborn styles were renamed to "seaborn-v0_8-*" in Matplotlib 3.6
# and the old names were removed afterwards. Try the current name first and
# fall back to the legacy one so the cell runs on both old and new versions.
try:
    plt.style.use("seaborn-v0_8-whitegrid")
except OSError:
    plt.style.use("seaborn-whitegrid")
# Figure-level defaults: automatic tight layout and a wide 11x5 canvas
plt.rc("figure", autolayout=True, figsize=(11, 5))
# Axes-level defaults: bold, slightly larger labels and titles
plt.rc(
    "axes",
    labelweight="bold",
    labelsize=14,
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
# Keyword arguments collected here for reuse in later plotting calls
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
)

#### Split the dataset into train and test sets

In [2]:
def split_dataset(data, n_test):
    """Split a daily series into train/test sets grouped into 7-day windows.

    Args:
        data: 2-D array indexed as [day, feature].
        n_test: Number of trailing days (rows) held out as the test set.

    Returns:
        Tuple ``(train, test)`` of arrays shaped [weeks, 7, features].

    Raises:
        ValueError: If ``n_test`` is not strictly between 0 and ``len(data)``,
            or if either partition is not a whole number of 7-day windows.
    """
    days_per_week = 7
    n_days = len(data)
    # n_test == 0 would make data[:-0] empty and data[-0:] the whole array,
    # so reject degenerate splits up front instead of failing obscurely later.
    if not 0 < n_test < n_days:
        raise ValueError(
            'n_test must be between 1 and %d, got %r' % (n_days - 1, n_test)
        )
    # split into standard weeks
    train, test = data[:-n_test, :], data[-n_test:, :]
    for label, part in (('train', train), ('test', test)):
        if len(part) % days_per_week:
            raise ValueError(
                '%s set has %d rows, which is not a multiple of %d'
                % (label, len(part), days_per_week)
            )
    # restructure into windows of weekly data; np.split expects an integer
    # section count, hence floor division rather than true division
    train = np.array(np.split(train, len(train) // days_per_week))
    test = np.array(np.split(test, len(test) // days_per_week))
    return train, test

#### Evaluate one or more weekly forecasts against expected values

In [3]:
def evaluate_forecasts(actual, predicted):
    """Score forecasts with RMSE, per forecast day and overall.

    Args:
        actual: Observed values indexed as [week, day].
        predicted: Forecast values holding the same number of elements as
            ``actual``. A trailing singleton axis (e.g. [week, day, 1]) is
            accepted and reshaped away.

    Returns:
        Tuple ``(score, scores)`` where ``score`` is the RMSE over every
        element (a Python float) and ``scores`` is a list with one RMSE per
        day, i.e. per column of ``actual``.

    Raises:
        ValueError: If ``predicted`` cannot be reshaped to ``actual``'s shape.
    """
    actual = np.asarray(actual, dtype=float)
    # Align the shapes explicitly: subtracting a [week, day, 1] array from a
    # [week, day] array element by element would otherwise yield size-1 arrays
    # instead of scalars (and broadcasting the whole arrays would be wrong).
    predicted = np.asarray(predicted, dtype=float).reshape(actual.shape)
    squared_error = (actual - predicted) ** 2
    # calculate an RMSE score for each day (column)
    scores = np.sqrt(squared_error.mean(axis=0)).tolist()
    # calculate overall RMSE across all weeks and days
    score = float(np.sqrt(squared_error.mean()))
    return score, scores

#### Summarize scores

In [4]:
def summarize_scores(name, score, scores):
    """Print a one-line report of the overall score and the per-day scores.

    Args:
        name: Label printed at the start of the line.
        score: Overall score, printed with three decimals inside brackets.
        scores: Iterable of per-day scores, each printed with one decimal.
    """
    per_day = ', '.join('%.1f' % value for value in scores)
    report = '%s: [%.3f] %s' % (name, score, per_day)
    print(report)

#### Convert history into inputs and outputs

In [5]:
def to_supervised(train, n_input, n_out=7):
    """Turn windowed history into overlapping (input, output) training pairs.

    Args:
        train: 3-D array; the first two axes are flattened into one time axis.
        n_input: Number of consecutive time steps in each input sample.
        n_out: Number of time steps that follow each input as its target.

    Returns:
        Tuple ``(X, y)`` of arrays. Each ``X`` sample holds ``n_input`` rows
        with every feature column; each ``y`` sample holds the next ``n_out``
        values of column 0 only.
    """
    n_windows, window_len, n_features = train.shape[0], train.shape[1], train.shape[2]
    # collapse the window axis so the data is one continuous series
    series = train.reshape((n_windows * window_len, n_features))
    total_steps = len(series)
    # slide one step at a time, keeping only the offsets where both the input
    # and the output window still fit inside the series
    starts = [
        start for start in range(total_steps)
        if start + n_input + n_out <= total_steps
    ]
    # inputs take all feature columns, targets take column 0 only
    inputs = [series[start:start + n_input, :] for start in starts]
    targets = [
        series[start + n_input:start + n_input + n_out, 0] for start in starts
    ]
    return np.array(inputs), np.array(targets)

#### Build/Train the model

In [6]:
def build_model(train, n_input, n_out=7, epochs=50, batch_size=16, verbose=0):
    """Build and fit the LSTM forecasting network on the training windows.

    The network is an LSTM layer, a RepeatVector that repeats its output once
    per forecast step, a second LSTM returning sequences, and two
    TimeDistributed Dense layers producing one value per forecast step.

    Args:
        train: Training data, passed to ``to_supervised`` to create samples.
        n_input: Number of time steps in each input sample.
        n_out: Number of time steps to forecast per sample.
        epochs: Number of training epochs.
        batch_size: Mini-batch size used during fitting.
        verbose: Verbosity level forwarded to ``model.fit``.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    # prepare data; honour the caller's n_out rather than a hard-coded 7
    train_x, train_y = to_supervised(train, n_input, n_out=n_out)
    # derive layer dimensions from the prepared samples
    n_timesteps, n_features, n_outputs = train_x.shape[1], train_x.shape[2], train_y.shape[1]
    # reshape output into [samples, timesteps, features]
    train_y = train_y.reshape((train_y.shape[0], train_y.shape[1], 1))
    # define model
    model = Sequential()
    model.add(LSTM(200, activation='relu', input_shape=(n_timesteps, n_features)))
    model.add(RepeatVector(n_outputs))
    model.add(LSTM(200, activation='relu', return_sequences=True))
    model.add(TimeDistributed(Dense(100, activation='relu')))
    model.add(TimeDistributed(Dense(1)))
    model.compile(loss='mse', optimizer='adam')
    # side effect: renders the architecture diagram and opens it in a viewer
    visualizer(model, format='png', view=True)
    # fit network
    model.fit(train_x, train_y, epochs=epochs, batch_size=batch_size, verbose=verbose)
    return model

#### Make a forecast

In [7]:
def forecast(model, history, n_input):
    """Predict the next output sequence from the most recent observations.

    Args:
        model: Object exposing ``predict(x, verbose=0)``.
        history: Sequence of equally shaped 2-D arrays ([step, feature]), or
            an equivalent 3-D array.
        n_input: Number of trailing time steps fed to the model.

    Returns:
        The first (and only) sample of the model's prediction.
    """
    # flatten data; np.array is used because a bare `array` is never imported
    data = np.array(history)
    data = data.reshape((data.shape[0] * data.shape[1], data.shape[2]))
    # retrieve last observations for input data (all feature columns)
    input_x = data[-n_input:, :]
    # reshape into [1, n_input, n_features]: a batch holding a single sample
    input_x = input_x.reshape((1, input_x.shape[0], input_x.shape[1]))
    # forecast the next sequence
    yhat = model.predict(input_x, verbose=0)
    # we only want the forecast for that single sample
    return yhat[0]

#### Evaluate a single model

In [8]:
def evaluate_model(train, test, n_input, n_out=7):
    """Fit a model on ``train`` and score it on ``test`` by walk-forward validation.

    For every test window the model forecasts from the history seen so far;
    the true window is then appended to the history before the next forecast.

    Args:
        train: Training windows, indexed as [window, step, feature].
        test: Test windows with the same layout; column 0 is the target.
        n_input: Number of trailing time steps used as model input.
        n_out: Number of steps the model forecasts. NOTE(review): scoring
            compares forecasts with whole test windows, so this presumably
            has to equal the window length of ``test`` -- confirm.

    Returns:
        Tuple ``(score, scores, predictions, actual)``: the overall RMSE, the
        per-day RMSE list, the stacked forecasts, and ``test[:, :, 0]``.
    """
    # fit model; pass the caller's n_out through instead of a hard-coded 7
    model = build_model(train, n_input, n_out=n_out)
    # history is a list of weekly data
    history = list(train)
    # walk-forward validation over each week
    predictions = []
    for week in test:
        # predict the week from everything observed so far
        predictions.append(forecast(model, history, n_input))
        # add the real observation to history for predicting the next week
        history.append(week)
    # evaluate predictions days for each week
    # (np.array: a bare `array` is never imported in this file)
    predictions = np.array(predictions)
    actual = test[:, :, 0]
    score, scores = evaluate_forecasts(actual, predictions)
    return score, scores, predictions, actual

#### Load and slice the data

In [11]:
# load the saved dictionary from pickle file
filePath_pickle = Path('/Users/parkj/Documents/pyDat/dataSet/covid_countryData.pickle')
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
# The context manager closes the file handle even if loading fails.
with open(filePath_pickle, 'rb') as pickle_in:
    country_dict = pickle.load(pickle_in)
# Select US national data
df = country_dict['United States']
# Slicing data from first Monday to last Sunday (dayow 0 and 6 respectively),
# so the series can be cut into whole weeks
df = df[min(df.index[df['dayow']==0]):max(df.index[df['dayow']==6])]

In [33]:
# Names of the non-case feature columns (mobility, vaccination, day of week)
features_lag = ['rtrc', 'grph', 'prks', 'tran', 'work', 'resi', 'vac', 'dayow']
# Put the case series first so it becomes column 0, which to_supervised and
# evaluate_model treat as the forecast target.
# NOTE(review): despite the name `features_lag`, no lagging is applied here;
# the feature columns are joined as-is.
df_case = pd.DataFrame(df.case_mil_percMax)
# presumably 'date' is the name of the shared index -- verify against df
dataset = df_case.join(df.loc[:,features_lag], on='date', how='left')
# split into train and test
# (Days) 7 x 14: test the model on the latest 14 weeks of data
n_test = 98
# split_dataset returns both sets reshaped to [samples, timesteps, features]
# (try train.shape)
train, test = split_dataset(dataset.values, n_test)
# evaluate model and get scores
# input four weeks of data
n_input = 28
# forecast next 7 days
n_out = 7
# score: overall RMSE, scores: per-day RMSE, yhat: forecasts, y: actual targets
score, scores, yhat, y = evaluate_model(train, test, n_input, n_out)

Unnamed: 0_level_0,case_mil_percMax,rtrc,grph,prks,tran,work,resi,vac,dayow
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-02-17,0.000000,6.0,0.0,28.0,-9.0,-24.0,5.0,0.00,0
2020-02-18,0.000000,0.0,-1.0,6.0,1.0,0.0,1.0,0.00,1
2020-02-19,0.000000,2.0,0.0,8.0,1.0,1.0,0.0,0.00,2
2020-02-20,0.000000,1.0,0.0,4.0,0.0,0.0,1.0,0.00,3
2020-02-21,0.000661,2.0,-2.0,4.0,1.0,0.0,0.0,0.00,4
...,...,...,...,...,...,...,...,...,...
2021-06-02,5.628968,-2.0,5.0,40.0,-23.0,-30.0,7.0,50.45,2
2021-06-03,6.348232,-3.0,6.0,37.0,-23.0,-29.0,7.0,50.56,3
2021-06-04,5.611342,-5.0,2.0,44.0,-20.0,-28.0,6.0,50.75,4
2021-06-05,4.628894,-6.0,5.0,74.0,-9.0,-8.0,0.0,50.91,5
