# Keras Time Series Generator

In [84]:
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, GRU, Bidirectional
from keras.optimizers import SGD
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error
import os
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import TimeseriesGenerator, pad_sequences


# Univariate

In [4]:
# Toy univariate series: the integers 1 through 10.
series = np.arange(1, 11)

In [8]:
# Window size: every sample holds this many consecutive observations.
n_input = 2

# The series serves as both input and target. Each batch carries exactly one
# window of `n_input` values together with the value that follows it.
generator = TimeseriesGenerator(
    data=series,
    targets=series,
    length=n_input,
    batch_size=1,
)

In [9]:
# Number of batches the generator will yield.
len(generator)

8

In [10]:
# Walk the generator by index and show each (input window, target) pair.
n_batches = len(generator)
for i in range(n_batches):
    x, y = generator[i]
    print('%s => %s' % (x, y))

[[1. 2.]] => [3.]
[[2. 3.]] => [4.]
[[3. 4.]] => [5.]
[[4. 5.]] => [6.]
[[5. 6.]] => [7.]
[[6. 7.]] => [8.]
[[7. 8.]] => [9.]
[[8. 9.]] => [10.]


## Simple Model

In [14]:
# Same windows as before, but delivered together in a single batch of 8.
generator = TimeseriesGenerator(
    data=series,
    targets=series,
    length=2,
    batch_size=8,
)

In [15]:
# Show every batch as "inputs => targets".
n_batches = len(generator)
for i in range(n_batches):
    x, y = generator[i]
    print('%s => %s' % (x, y))

[[1. 2.]
 [2. 3.]
 [3. 4.]
 [4. 5.]
 [5. 6.]
 [6. 7.]
 [7. 8.]
 [8. 9.]] => [ 3.  4.  5.  6.  7.  8.  9. 10.]


In [18]:
# Small MLP: the two lag observations are treated as two plain features.
model = Sequential([
    Dense(100, activation="relu", input_shape=(2, )),
    Dense(1),
])
model.compile(loss='mse', optimizer='adam')

In [19]:
# Train silently for 200 epochs, drawing one batch from the generator per epoch.
model.fit_generator(
    generator=generator,
    steps_per_epoch=1,
    epochs=200,
    verbose=0,
)

<keras.callbacks.History at 0x7fcdc13510f0>

In [21]:
# Out-of-sample input: the last two observations as a single row.
x_input = np.reshape([9, 10], (1, n_input))

In [22]:
# Display the array that will be passed to the model.
x_input

array([[ 9, 10]])

In [23]:
# One-step-ahead forecast for the out-of-sample input.
yhat = model.predict(x=x_input, verbose=0)

In [24]:
# Display the model's prediction.
yhat

array([[11.451381]], dtype=float32)

## Reshaping before feeding it into the TSG for LSTM

The LSTM expects input data to have the shape [samples, timesteps, features], whereas the generator described so far provides the lag observations as features, i.e. with the shape [samples, features].

We can reshape the univariate time series from [10, ] to [10, 1] (10 time steps and 1 feature) before preparing the generator.

The TimeseriesGenerator will then split the series into samples with the shape [batch_size, timesteps, features] or [8, 2, 1] for all eight samples in the generator and the two lag observations used as time steps.

In [25]:
# Turn the flat series into a column so that each time step has one feature.
n_features = 1
series = series.reshape((series.shape[0], n_features))

In [26]:

# Display the reshaped series.
series

array([[ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 6],
       [ 7],
       [ 8],
       [ 9],
       [10]])

In [27]:
# Two time steps per sample; all windows are delivered in one batch of 8.
n_input = 2

generator = TimeseriesGenerator(
    data=series,
    targets=series,
    length=n_input,
    batch_size=8,
)

In [32]:
# Print the inputs and the targets of every batch on separate lines.
n_batches = len(generator)
for i in range(n_batches):
    x, y = generator[i]
    print(x)
    print(y)

[[[1.]
  [2.]]

 [[2.]
  [3.]]

 [[3.]
  [4.]]

 [[4.]
  [5.]]

 [[5.]
  [6.]]

 [[6.]
  [7.]]

 [[7.]
  [8.]]

 [[8.]
  [9.]]]
[[ 3.]
 [ 4.]
 [ 5.]
 [ 6.]
 [ 7.]
 [ 8.]
 [ 9.]
 [10.]]


In [39]:
# Two time steps are used to predict one step ahead, with one feature per
# step, so the LSTM input shape is (2, 1).
model = Sequential([
    LSTM(100, activation='relu', input_shape=(2, 1)),
    Dense(1),
])
model.compile(loss='mse', optimizer='adam')


In [None]:
# Train silently for 500 epochs, one generator batch per epoch.
model.fit_generator(
    generator=generator,
    steps_per_epoch=1,
    epochs=500,
    verbose=0,
)

In [41]:
# One-step out-of-sample forecast: shape the last two observations as
# (samples, timesteps, features) before handing them to the LSTM.
x_input = np.reshape([9, 10], (1, n_input, n_features))
yhat = model.predict(x=x_input, verbose=0)
print(yhat)

[[11.040965]]


# Multivariate Example: predict n series using n series

In [46]:
# Two parallel series, each stored directly as a (10, 1) column.
in_seq1 = np.arange(10, 101, 10).reshape((-1, 1))
in_seq2 = np.arange(15, 106, 10).reshape((-1, 1))

In [47]:
# Place the two column vectors side by side: one row per time step,
# one column per series.
dataset = np.concatenate((in_seq1, in_seq2), axis=1)

In [49]:
# Shape of the stacked dataset.
dataset.shape

(10, 2)

In [54]:
# Both series are inputs and targets at once; each window spans 2 time steps.
timesteps = 2

generator = TimeseriesGenerator(
    data=dataset,
    targets=dataset,
    length=timesteps,
    batch_size=8,
)

With batch_size=8, the single batch is a three-dimensional array of shape [8, 2, 2] for the 8 samples, 2 time steps, and 2 features (parallel series). The output is a two-dimensional array of shape [8, 2] for the 8 samples and 2 features.

In [55]:
# Show every batch as "inputs => targets".
n_batches = len(generator)
for i in range(n_batches):
    x, y = generator[i]
    print('%s => %s' % (x, y))

[[[10. 15.]
  [20. 25.]]

 [[20. 25.]
  [30. 35.]]

 [[30. 35.]
  [40. 45.]]

 [[40. 45.]
  [50. 55.]]

 [[50. 55.]
  [60. 65.]]

 [[60. 65.]
  [70. 75.]]

 [[70. 75.]
  [80. 85.]]

 [[80. 85.]
  [90. 95.]]] => [[ 30.  35.]
 [ 40.  45.]
 [ 50.  55.]
 [ 60.  65.]
 [ 70.  75.]
 [ 80.  85.]
 [ 90.  95.]
 [100. 105.]]


In [56]:
# Generator: windows of `n_input` rows, one feature per dataset column.
n_input = 2
n_features = dataset.shape[1]
generator = TimeseriesGenerator(
    data=dataset,
    targets=dataset,
    length=n_input,
    batch_size=8,
)

# Model: the LSTM reads the window and the dense head emits two values per sample.
model = Sequential([
    LSTM(100, activation='relu', input_shape=(n_input, n_features)),
    Dense(2),
])
model.compile(loss='mse', optimizer='adam')

In [57]:
# Train silently for 500 epochs, one generator batch per epoch.
model.fit_generator(
    generator=generator,
    steps_per_epoch=1,
    epochs=500,
    verbose=0,
)

# One-step out-of-sample forecast from the last two rows of both series.
last_rows = [[90, 95], [100, 105]]
x_input = np.reshape(last_rows, (1, n_input, n_features))
yhat = model.predict(x=x_input, verbose=0)
print(yhat)

[[111.84365 117.2639 ]]


# Multivariate Inputs and Dependent Series Example

There are multivariate time series problems where there are one or more input series and a separate output series to be forecasted that is dependent upon the input series.

In [157]:
# Two input series plus a dependent output series: at every time step the
# output equals the sum of the two inputs.
in_seq1 = np.arange(10, 101, 10)
in_seq2 = np.arange(15, 106, 10)
out_seq = in_seq1 + in_seq2

In [158]:
# Reshape each series into a single column (one row per time step).
in_seq1 = in_seq1.reshape((in_seq1.shape[0], 1))
in_seq2 = in_seq2.reshape((in_seq2.shape[0], 1))
out_seq = out_seq.reshape((out_seq.shape[0], 1))

In [159]:
# Only the two input series form the feature matrix; the output stays separate.
dataset = np.concatenate((in_seq1, in_seq2), axis=1)

In [68]:
# One time step of both inputs per sample, with the separate output series as target.
generator = TimeseriesGenerator(
    data=dataset,
    targets=out_seq,
    length=1,
    batch_size=1,
)

In [69]:
# Show every batch as "inputs => targets".
n_batches = len(generator)
for i in range(n_batches):
    x, y = generator[i]
    print('%s => %s' % (x, y))

[[[10. 15.]]] => [[45.]]
[[[20. 25.]]] => [[65.]]
[[[30. 35.]]] => [[85.]]
[[[40. 45.]]] => [[105.]]
[[[50. 55.]]] => [[125.]]
[[[60. 65.]]] => [[145.]]
[[[70. 75.]]] => [[165.]]
[[[80. 85.]]] => [[185.]]
[[[90. 95.]]] => [[205.]]


Running the example prints the input and output portions of the samples with the output values for the next time step rather than the current time step as we may desire for this type of problem.

We can therefore modify the target series (out_seq) and insert an additional value at the beginning in order to push all observations down by one time step.

In [93]:
from numpy import insert, delete

In [73]:
# Display the target series before it is shifted.
out_seq

array([[ 25],
       [ 45],
       [ 65],
       [ 85],
       [105],
       [125],
       [145],
       [165],
       [185],
       [205]])

In [78]:
# Prepend a 0 to the flattened targets, then restore the single-column shape.
out_seq = np.insert(out_seq, obj=0, values=0).reshape((-1, 1))

In [90]:
# Rebuild the example from scratch. Every series is a (10, 1) column and the
# output is the element-wise sum of the two inputs.
in_seq1 = np.arange(10, 101, 10).reshape((-1, 1))
in_seq2 = np.arange(15, 106, 10).reshape((-1, 1))
out_seq = in_seq1 + in_seq2

# Feature matrix: the two input columns side by side.
dataset = np.concatenate((in_seq1, in_seq2), axis=1)


In [94]:
# Drop the last target; without an axis, np.delete also flattens the array.
out_seq = np.delete(out_seq, obj=-1)

In [96]:
# Prepend a 0 so every remaining target moves down by one time step.
out_seq = np.insert(out_seq, obj=0, values=0)

In [104]:
# Generator over the shifted targets: one time step per sample, one sample per batch.
n_input = 1
generator = TimeseriesGenerator(
    data=dataset,
    targets=out_seq,
    length=n_input,
    batch_size=1,
)

# Show every batch as "inputs => targets".
n_batches = len(generator)
for i in range(n_batches):
    x, y = generator[i]
    print('%s => %s' % (x, y))

[[[10. 15.]]] => [25.]
[[[20. 25.]]] => [45.]
[[[30. 35.]]] => [65.]
[[[40. 45.]]] => [85.]
[[[50. 55.]]] => [105.]
[[[60. 65.]]] => [125.]
[[[70. 75.]]] => [145.]
[[[80. 85.]]] => [165.]
[[[90. 95.]]] => [185.]


In [105]:
# LSTM over a single time step of two features, with one output value.
model = Sequential([
    LSTM(100, activation='relu', input_shape=(1, 2)),
    Dense(1),
])


In [106]:
# Compile with Adam and MSE, then train silently for 500 epochs
# (one generator batch per epoch).
model.compile(loss='mse', optimizer='adam')
model.fit_generator(
    generator=generator,
    steps_per_epoch=1,
    epochs=500,
    verbose=0,
)

<keras.callbacks.History at 0x7fcd7c05f5f8>

In [107]:
# One-step out-of-sample forecast from the last row of both input series.
x_input = np.reshape([[100, 105]], (1, n_input, n_features))
yhat = model.predict(x=x_input, verbose=0)
print(yhat)

[[204.51329]]


In [108]:
# Display the array that was passed to the model.
x_input

array([[[100, 105]]])

# Uni-Variate Multistep Forecast

In [125]:
# Input series as a (10, 1) column; each target row pairs a value with its
# successor, giving a (10, 2) matrix of two-step targets.
series = np.arange(1, 11).reshape((-1, 1))
target = np.column_stack((series, series + 1))

In [126]:
# Shape of the two-step target matrix.
target.shape

(10, 2)

In [135]:
# Windows of two observations, each paired with a two-value target row.
timesteps = 2
generator = TimeseriesGenerator(
    data=series,
    targets=target,
    length=timesteps,
    batch_size=8,
)

# Show every batch as "inputs => targets".
n_batches = len(generator)
for i in range(n_batches):
    x, y = generator[i]
    print('%s => %s' % (x, y))

[[[1.]
  [2.]]

 [[2.]
  [3.]]

 [[3.]
  [4.]]

 [[4.]
  [5.]]

 [[5.]
  [6.]]

 [[6.]
  [7.]]

 [[7.]
  [8.]]

 [[8.]
  [9.]]] => [[ 3.  4.]
 [ 4.  5.]
 [ 5.  6.]
 [ 6.  7.]
 [ 7.  8.]
 [ 8.  9.]
 [ 9. 10.]
 [10. 11.]]


In [136]:
# LSTM over two time steps of one feature; the dense head emits two values.
model = Sequential([
    LSTM(100, activation='relu', input_shape=(2, 1)),
    Dense(2),
])
model.compile(loss='mse', optimizer='adam')

In [137]:
# Train silently for 500 epochs, one generator batch per epoch.
model.fit_generator(
    generator=generator,
    steps_per_epoch=1,
    epochs=500,
    verbose=0,
)

<keras.callbacks.History at 0x7fcd24be0d68>

In [138]:
# Forecast two values from the last two observations, shaped as
# (samples, timesteps, features).
x_input = np.reshape([[9, 10]], (1, 2, 1))
yhat = model.predict(x=x_input, verbose=0)
print(yhat)

[[11.005036 11.907664]]


Increasing the batch size from 1 to 8 makes the forecast more accurate.

# Test on data

In [139]:
import pandas as pd
import h2o
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns; sns.set()
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, GRU, Bidirectional
from keras.optimizers import SGD
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error
import os
import numpy as np

In [140]:
# Load the air-pollution data set, using the first CSV column as the index.
df = pd.read_csv(
    'https://raw.githubusercontent.com/sagarmk/Forecasting-on-Air-pollution-with-RNN-LSTM/master/pollution.csv',
    index_col=0,
)

In [141]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0_level_0,pollution,dew,temp,press,wnd_dir,wnd_spd,snow,rain
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-02 00:00:00,129.0,-16,-4.0,1020.0,SE,1.79,0,0
2010-01-02 01:00:00,148.0,-15,-4.0,1020.0,SE,2.68,0,0
2010-01-02 02:00:00,159.0,-11,-5.0,1021.0,SE,3.57,0,0
2010-01-02 03:00:00,181.0,-7,-5.0,1022.0,SE,5.36,1,0
2010-01-02 04:00:00,138.0,-7,-5.0,1022.0,SE,6.25,2,0


In [142]:
# Chronological 80/20 split: the first 80% of rows train, the rest test.
split_row = int(df.shape[0] * 0.8)
df_train = df.iloc[:split_row, :]
df_test = df.iloc[split_row:, :]

In [143]:
# Remove the non-numeric wind-direction column from both splits.
# Reassigning the result (instead of dropping in place on a slice of `df`)
# avoids pandas' SettingWithCopyWarning, and errors='ignore' keeps the cell
# safe to re-run after the column has already been removed.
df_train = df_train.drop(columns=['wnd_dir'], errors='ignore')
df_test = df_test.drop(columns=['wnd_dir'], errors='ignore')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [144]:
# Min-max scaler with its default target range spelled out.
sc = MinMaxScaler(feature_range=(0, 1))

In [145]:
# Learn the scaling from the training split only, then apply the same
# transformation to both splits.
sc.fit(df_train)
df_train_scaled = sc.transform(df_train)
df_test_scaled = sc.transform(df_test)

  return self.partial_fit(X, y)


In [147]:
# Shape of the scaled training matrix.
df_train_scaled.shape

(35040, 7)

In [236]:
# Target: the first scaled column, kept two-dimensional as a single column.
y_train = np.reshape(df_train_scaled[:, 0], (-1, 1))

In [237]:
# Features: every scaled column after the first, six per row.
X_train = np.reshape(df_train_scaled[:, 1:], (-1, 6))

In [292]:
# Prefix the scaled test rows with the 60 scaled training rows that precede
# them, so the first test target already has a full history window.
# The 60 here has to agree with the window length used by the generators.
n_history = 60
df_total = np.concatenate((df_train_scaled, df_test_scaled), axis=0)
first_row = len(df_total) - len(df_test) - n_history
inputs = df_total[first_row:, :]

In [295]:
# Split the extended test block the same way as the training data:
# first column is the target, the remaining six are features.
y_test = np.reshape(inputs[:, 0], (-1, 1))
X_test = np.reshape(inputs[:, 1:], (-1, 6))

In [271]:
# Each training sample is a window of 60 consecutive feature rows;
# windows are served in batches of 32.
timesteps = 60
generator = TimeseriesGenerator(
    data=X_train,
    targets=y_train,
    length=timesteps,
    batch_size=32,
)

In [296]:
# Same windowing for the test data.
test_generator = TimeseriesGenerator(
    data=X_test,
    targets=y_test,
    length=timesteps,
    batch_size=32,
)

In [272]:
# Three stacked, stateless LSTM layers of decreasing width. The first two
# return full sequences so the next LSTM can consume them; the last returns
# only its final output, which feeds a single-unit dense head.
model = Sequential([
    LSTM(32, return_sequences=True, stateful=False,
         input_shape=(timesteps, 6)),
    LSTM(16, return_sequences=True, stateful=False),
    LSTM(8, return_sequences=False, stateful=False),
    Dense(1),
])

In [273]:
# Mean squared error loss, optimised with Adam.
model.compile(loss='mse', optimizer='adam')

In [274]:
# Train for 10 epochs over the training generator with progress output.
model.fit_generator(
    generator=generator,
    epochs=10,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fccf74f4b70>

In [297]:
# Loss on the test windows, computed on the scaled values.
model.evaluate_generator(generator=test_generator)

0.004933084239112107

In [298]:
# Names of the values reported by evaluation for this model.
model.metrics_names

['loss']

In [299]:
# Predictions for every test window, still on the scaled values.
lstm_predictions = model.predict_generator(generator=test_generator)

In [300]:
# Shape of the prediction array.
lstm_predictions.shape

(8760, 1)

In [308]:
# MAE on the scaled values. The first `timesteps` rows of y_test are only
# the history prefix consumed by the first window, so they have no
# prediction and are skipped. Slicing with `timesteps` (rather than a
# literal 60) keeps this aligned with the generator's window length.
mean_absolute_error(y_true=y_test[timesteps:], y_pred=lstm_predictions)

0.0494471003207927

In [309]:
# MSE on the scaled values. The first `timesteps` rows of y_test are only
# the history prefix consumed by the first window, so they have no
# prediction and are skipped. Slicing with `timesteps` (rather than a
# literal 60) keeps this aligned with the generator's window length.
mean_squared_error(y_true=y_test[timesteps:], y_pred=lstm_predictions)

0.004933084239273742

In [320]:
# The scaler needs a full-width matrix, so put the predictions in the first
# column next to the scaled test features, invert the scaling, and keep only
# the first column: the predictions in original units.
pred_with_features = np.hstack((lstm_predictions, df_test_scaled[:, 1:]))
b = sc.inverse_transform(pred_with_features)[:, 0]

In [325]:
# Invert the scaling on the test block and keep the first column:
# the observed target values in original units.
test_unscaled = sc.inverse_transform(df_test_scaled)
a = test_unscaled[:, 0]

In [326]:
# MAE in original units: `a` holds the observed values, `b` the predictions.
mean_absolute_error(y_true=a, y_pred=b)

49.150417718867935

In [327]:
# MSE in original units: `a` holds the observed values, `b` the predictions.
mean_squared_error(y_true=a, y_pred=b)

4874.064819435071