In [0]:
%matplotlib inline

import pandas as pd
from datetime import datetime
import numpy as np

In [0]:
def parse(x):
    """Parse a space-separated 'year month day hour' string into a datetime."""
    timestamp_format = '%Y %m %d %H'
    parsed = datetime.strptime(x, timestamp_format)
    return parsed
 
# Load the raw hourly records; the timestamp is spread over four separate columns.
raw = pd.read_csv('./PRSA_data_2010.1.1-2014.12.31.csv')
# Build the datetime index explicitly. The `date_parser` argument and the nested-list
# form of `parse_dates` are deprecated in recent pandas releases, whereas assembling
# timestamps from year/month/day/hour columns with `pd.to_datetime` is supported.
raw.index = pd.to_datetime(raw[['year', 'month', 'day', 'hour']])
# Drop the row counter and the date parts that are now encoded in the index.
dataset = raw.drop(columns=['No', 'year', 'month', 'day', 'hour'])
# manually specify column names
dataset.columns = ['pollution', 'dew', 'temp', 'press', 'wnd_dir', 'wnd_spd', 'snow', 'rain']
dataset.index.name = 'date'
# Replace missing pollution readings with 0. The result is assigned back because an
# in-place fillna on a column selection is chained assignment and may leave
# `dataset` unchanged.
dataset['pollution'] = dataset['pollution'].fillna(0)
# drop the first 24 hours
dataset = dataset.iloc[24:]
# summarize first 5 rows
print(dataset.head(5))
# save to file
dataset.to_csv('pollution.csv')

In [0]:
# Keep only the columns used as model inputs; 'pollution' comes first.
df = dataset.loc[:, ['pollution', 'temp', 'press', 'wnd_spd']]

In [0]:
# Preview the first five rows of the selected features.
df.head(n=5)

In [0]:
# Convert the selected features to a float32 array.
# Note: `dataset` now names this array rather than the DataFrame loaded earlier.
dataset = df.to_numpy().astype('float32')

In [0]:
from sklearn.preprocessing import MinMaxScaler

# Scale every feature with a min-max scaler fitted on the training portion only.
# Fitting on the whole series would let the test period's min/max leak into the
# values the model is trained on. As a consequence, scaled values in the test
# period may fall slightly outside [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
n_fit_rows = int(len(dataset) * 0.67)  # must match the fraction used by the train/test split
scaler.fit(dataset[:n_fit_rows])
dataset = scaler.transform(dataset)

In [0]:
# First row of the scaled feature matrix.
dataset[0, :]

In [0]:
# (rows, features) of the scaled feature matrix.
np.shape(dataset)

In [0]:
# Chronological split: the first 67% of rows are for training, the rest for testing.
train_size = int(len(dataset) * 0.67)
test_size = len(dataset) - train_size
train = dataset[:train_size, :]
test = dataset[train_size:, :]

In [0]:
# (rows, features) of the training split.
np.shape(train)

In [0]:
def create_dataset(dataset, look_back=5):
  """Slice a 2-D series into overlapping windows for supervised learning.

  Args:
    dataset: 2-D array of shape (n_rows, n_features); column 0 is the target.
    look_back: number of consecutive rows in each input window.

  Returns:
    A tuple (dataX, dataY):
      dataX has shape (n_samples, n_features, look_back); dataX[i, j] holds
        rows i .. i+look_back-1 of feature j (feature-major, not time-major).
      dataY has shape (n_samples,); dataY[i] is column 0 of the row that
        immediately follows window i.
    n_samples is max(n_rows - look_back, 0), so every row with a full window
    before it is used as a target, including the last row.
  """
  dataset = np.asarray(dataset)
  n_features = dataset.shape[1]
  n_samples = max(len(dataset) - look_back, 0)
  # Pre-allocating keeps the 3-D shape even when the input is too short for any window.
  dataX = np.empty((n_samples, n_features, look_back), dtype=dataset.dtype)
  for i in range(n_samples):
    # The slice is (look_back, n_features); transpose it to feature-major layout.
    dataX[i] = dataset[i:i + look_back].T
  dataY = dataset[look_back:look_back + n_samples, 0].copy()
  return dataX, dataY

In [0]:
# Number of past hourly rows fed to the model for each prediction.
look_back = 5
trainX, trainY = create_dataset(train, look_back=look_back)
testX, testY = create_dataset(test, look_back=look_back)

In [0]:
# Shape of the training windows as produced by create_dataset.
np.shape(trainX)

In [0]:
# First two training windows.
trainX[:2]

In [0]:
# create_dataset lays windows out as (samples, features, look_back), while the LSTM
# is configured for (samples, look_back, features). The last two axes must be swapped
# with a transpose: np.reshape keeps the flat element order, so it would produce the
# right shape but mix values from different features and time steps in each window.
trainX = np.transpose(trainX, (0, 2, 1))
testX = np.transpose(testX, (0, 2, 1))

In [0]:
# Shape of the training windows after rearranging to (samples, look_back, features).
np.shape(trainX)

In [0]:
# First two training windows after rearranging.
trainX[:2]

In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, LSTM

In [0]:
# Single LSTM layer (5 units) over windows of `look_back` steps, followed by a
# one-unit dense layer that outputs the predicted (scaled) pollution value.
model = Sequential([
    LSTM(5, input_shape=(look_back, trainX.shape[2])),
    Dense(1),
])
model.compile(optimizer='adam', loss='mean_squared_error')

In [0]:
# Train for 100 epochs in mini-batches of 100 windows; verbose=2 logs one line per epoch.
model.fit(x=trainX, y=trainY, batch_size=100, epochs=100, verbose=2)

In [0]:
import math
from sklearn.metrics import mean_squared_error

# Predictions (still in scaled units) for both splits.
trainPredict, testPredict = model.predict(trainX), model.predict(testX)

In [0]:
# Zero filler for every feature column except the first one.
pad_col = np.zeros(dataset.shape[1]-1)
def pad_array(val):
    """Widen each entry of `val` to a full feature row.

    Each entry is placed in front of the zero filler `pad_col`, so the result
    has as many columns as the scaled feature matrix and can be passed to the
    scaler's inverse_transform. Only the first column carries real values.
    """
    padded_rows = []
    for x in val:
        padded_rows.append(np.insert(pad_col, 0, x))
    return np.array(padded_rows)

# Map predictions and targets back to original units. Each series is padded to a
# full feature row first, because the scaler inverts whole rows.
# Note: the names are rebound, so running this cell twice would un-scale twice.
trainPredict, trainY, testPredict, testY = [
    scaler.inverse_transform(pad_array(scaled))
    for scaled in (trainPredict, trainY, testPredict, testY)
]

In [0]:
# Root-mean-squared error on the first (pollution) column, in original units.
trainScore = math.sqrt(
    mean_squared_error(y_true=trainY[:, 0], y_pred=trainPredict[:, 0])
)
testScore = math.sqrt(
    mean_squared_error(y_true=testY[:, 0], y_pred=testPredict[:, 0])
)
print('Train Score: %.2f RMSE' % trainScore)
print('Test Score: %.2f RMSE' % testScore)


In [0]:
# First three un-scaled training predictions.
trainPredict[:3, 0]


In [0]:
# First three un-scaled training targets, for comparison with the predictions.
trainY[:3, 0]