In [125]:
#from numpy import array

from math import sqrt
from numpy import concatenate
from matplotlib import pyplot
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers.core import Dropout
from keras.layers import LSTM

# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
	n_vars = 1 if type(data) is list else data.shape[1]
	df = DataFrame(data)
	cols, names = list(), list()
	# input sequence (t-n, ... t-1)
	for i in range(n_in, 0, -1):
		cols.append(df.shift(i))
		names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
	# forecast sequence (t, t+1, ... t+n)
	for i in range(0, n_out):
		cols.append(df.shift(-i))
		if i == 0:
			names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
		else:
			names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
	# put it all together
	agg = concat(cols, axis=1)
	agg.columns = names
	# drop rows with NaN values
	if dropnan:
		agg.dropna(inplace=True)
	return agg

#load dataset
dataset = read_csv('pollution.csv', header=0, index_col=0)
#dataset.drop("wnd_dir", axis=1, inplace=True)
values = dataset.values

#integer encode wind direction, as it's the only categorical variable.
encoder = LabelEncoder()
values[:,4] = encoder.fit_transform(values[:,4])

#ensure all data are float32 values
values = values.astype('float32')

# #normalize input features
# # Split before scaling. Apply the scaling separately to both train and test sets. 
# scaler = MinMaxScaler(feature_range=(0, 1))
# #scaler = StandardScaler()
# scaled = scaler.fit_transform(values)

#frame as supervised learning
n_hours = 1 
n_features = 8 
reframed = series_to_supervised(values, n_hours, 1)
values = reframed.values
n_train_hours = 365*24*4# This is for four years
train = values[:n_train_hours, :]
test = values[n_train_hours:, :]


#normalize input features
# Split before scaling. Apply the scaling separately to both train and test sets. 
scaler = MinMaxScaler(feature_range=(0, 1))
#scaler = StandardScaler()
train_scaled = scaler.fit_transform(train)
test_scaled = scaler.transform(test)

#CHANGES HERE
#split into input and outputs
n_obs = n_hours * n_features
train_X = train_scaled[:, :n_obs]
train_y = train_scaled[:, -n_features:(-n_features+4)] #+2 because of indexing madness.
test_X = test_scaled[:, :n_obs]
test_y = test_scaled[:, -n_features:(-n_features+4)]



train_X = train_X.reshape((train_X.shape[0], n_hours, n_features))
test_X = test_X.reshape((test_X.shape[0], n_hours, n_features))

#CHANGES HERE
#Need to output two values, not one.
#design network
model = Sequential()
#model.add(LSTM(10, activation='relu', return_sequences=True, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(LSTM(20, activation='relu', return_sequences=True, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dropout(0.1))
model.add(LSTM(10, activation='relu', input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dropout(0.1))
model.add(Dense(4)) #changed from 1 to 2.
model.compile(loss='mse', optimizer='adam')

#fit network
history = model.fit(train_X, train_y, epochs=10, batch_size=72, validation_data=(test_X, test_y), verbose=2, shuffle=False)

#make a prediction
y_hat = model.predict(test_X)


Epoch 1/10
487/487 - 5s - loss: 0.0719 - val_loss: 0.0297 - 5s/epoch - 11ms/step
Epoch 2/10
487/487 - 2s - loss: 0.0227 - val_loss: 0.0165 - 2s/epoch - 3ms/step
Epoch 3/10
487/487 - 2s - loss: 0.0130 - val_loss: 0.0094 - 2s/epoch - 3ms/step
Epoch 4/10
487/487 - 2s - loss: 0.0107 - val_loss: 0.0079 - 2s/epoch - 3ms/step
Epoch 5/10
487/487 - 2s - loss: 0.0093 - val_loss: 0.0081 - 2s/epoch - 3ms/step
Epoch 6/10
487/487 - 2s - loss: 0.0085 - val_loss: 0.0087 - 2s/epoch - 3ms/step
Epoch 7/10
487/487 - 2s - loss: 0.0078 - val_loss: 0.0089 - 2s/epoch - 3ms/step
Epoch 8/10
487/487 - 2s - loss: 0.0072 - val_loss: 0.0100 - 2s/epoch - 3ms/step
Epoch 9/10
487/487 - 2s - loss: 0.0066 - val_loss: 0.0101 - 2s/epoch - 3ms/step
Epoch 10/10
487/487 - 2s - loss: 0.0062 - val_loss: 0.0094 - 2s/epoch - 3ms/step


In [126]:
print(train_X.shape)
print(test_X.shape)
print("Test shape", test.shape)
print("Train shape", train.shape)

(35040, 1, 8)
(8759, 1, 8)
Test shape (8759, 16)
Train shape (35040, 16)


In [92]:
import numpy as np
print(train_X.shape)
b = np.squeeze(train_X, axis=1)
print(b.shape)
c = np.repeat(b,2,axis=1)
print(c.shape)

(35040, 1, 8)
(35040, 8)
(35040, 16)


In [127]:
test_X = test_X.reshape((test_X.shape[0], n_hours*n_features))
# test_X2 = np.repeat(test_X,2,axis=-1)
test_X = np.squeeze(train_X, axis=1)
test_X = np.repeat(test_X,2,axis=1)
print(test_X.shape)

(35040, 16)


In [129]:
import numpy as np
#CHANGES HERE
test_X = test_X.reshape((test_X.shape[0], 2, n_hours*n_features))
test_X = np.squeeze(test_X, axis=1)
test_X = np.repeat(test_X, 2, axis=1)
# test_X2 = np.repeat(test_X,2,axis=-1)
# inv_yhat = concatenate((y_hat, test_X[:,-4:]), axis=1) #changed 7 to 6
# inv_yhat = concatenate((y_hat, test_X2[:,-4:]), axis=1) #changed 7 to 6
inv_yhat = scaler.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:,0:4] #changed from 0 to 0:2. Should be first 2 columns that contain the predictions


ValueError: cannot select an axis to squeeze out which has size not equal to one

In [51]:
#CHANGES HERE
#invert scaling for actual
test_y = test_y.reshape((len(test_y),4)) #changed 1 to 2
inv_y = concatenate((test_y, test_X[:,-4:]), axis=1) #changed 7 to 6
inv_y = scaler.inverse_transform(inv_y)
inv_y = inv_y[:,0:4] #changed from 0 to 0:2. Should be first 2 columns that contain the predictions.


In [53]:
inv_yhat
inv_yhat.shape

(8759, 4)

In [54]:
inv_y.shape

(8759, 4)

In [132]:
import math
def rms(x,y):
    total = 0
    for i in range(len(x)):
        total += ((x[i]-y[i])**2)
    return math.sqrt(total/len(x))


p = [10,20,30,40]
q = [11,17,29,40]
print(rms(p,q))

1.6583123951777


In [63]:
print(rms(inv_y[:,1], inv_yhat[:,1]))

6.331406610915704


In [148]:
val = [x for x in range(100,1000,100)]
val2 = [k + 2 for k in val]
# del val2[3]
print(rms(val,val2))

2.0
