In [1]:
import os
import math
import random
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error

import torch
import torch.nn as nn
import torch.nn.functional as F


# Seed every random number generator the notebook imports so stochastic steps
# are repeatable from a fresh kernel. The two pre-existing seeds keep their
# original values so the `random` and NumPy streams are unchanged.
random.seed(12345)
np.random.seed(42)
# torch keeps its own generator, independent of `random` and NumPy, so it has
# to be seeded separately.
torch.manual_seed(42)

# Silence DeprecationWarning noise in cell outputs.
warnings.simplefilter("ignore", DeprecationWarning)

In [2]:
# Read the pollution table; the first CSV row is the header and the first CSV
# column becomes the index.
series = pd.read_csv('pollution.csv', header=0, index_col=0)
# NOTE(review): the frame mixes numeric columns with the string column wnd_dir,
# so `.values` presumably yields an object-dtype array (later cell outputs show
# dtype=object) — confirm whether a float cast is wanted after encoding.
raw_values = series.values

# integer encode wind direction
# Column 4 is wnd_dir in the displayed frame; the hard-coded index assumes that
# column order.
encoder = LabelEncoder()
raw_values[:,4] = encoder.fit_transform(raw_values[:,4])

In [3]:
# Display the loaded frame (pandas truncates the middle rows in the output).
series

Unnamed: 0_level_0,pollution,dew,temp,press,wnd_dir,wnd_spd,snow,rain
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-02 00:00:00,129,-16,-4.0,1020.0,SE,1.79,0,0
2010-01-02 01:00:00,148,-15,-4.0,1020.0,SE,2.68,0,0
2010-01-02 02:00:00,159,-11,-5.0,1021.0,SE,3.57,0,0
2010-01-02 03:00:00,181,-7,-5.0,1022.0,SE,5.36,1,0
2010-01-02 04:00:00,138,-7,-5.0,1022.0,SE,6.25,2,0
...,...,...,...,...,...,...,...,...
2014-12-31 19:00:00,8,-23,-2.0,1034.0,NW,231.97,0,0
2014-12-31 20:00:00,10,-22,-3.0,1034.0,NW,237.78,0,0
2014-12-31 21:00:00,10,-22,-3.0,1034.0,NW,242.70,0,0
2014-12-31 22:00:00,8,-22,-4.0,1034.0,NW,246.72,0,0


In [4]:
# Sanity check: (rows, columns) of the encoded value array.
raw_values.shape

(43800, 8)

In [8]:
# create a differenced series
def difference(dataset, interval=1):
    """Return the lag-`interval` differences of `dataset` as a pandas Series.

    Entry k of the result is ``dataset[k + interval] - dataset[k]``; positions
    are visited from `interval` up to ``len(dataset) - 1``.
    """
    deltas = [
        dataset[pos] - dataset[pos - interval]
        for pos in range(interval, len(dataset))
    ]
    return pd.Series(deltas)


# transform data to be stationary
# Loop-based result from difference(): a Series built from the row-wise
# differences of raw_values.
diff2 = difference(raw_values, 1)

# DataFrame-based first-order differences as a 2-D array; rows containing NaN
# (the first row, which diff() cannot compute) are dropped.
diff = pd.DataFrame(raw_values).diff(1).dropna().values
# `diff.shape` has to come after the assignment: evaluated before it, the name
# is undefined on a fresh kernel and the cell raises NameError.
diff.shape

In [9]:
# Hyper-parameters and windowing constants for the notebook.
# NOTE(review): hidden_layers, batch_size, dropout, epochs_pre and
# epochs_finetune are not referenced in the cells visible here — presumably
# consumed by the model/training cells; confirm their meaning there.
hidden_layers = [35,49,4]
batch_size = 219
dropout = 0.2
# Number of past time steps stacked into each input window; also the number of
# zero rows padded in front of the data.
seq_len = 25
epochs_pre  = [625,115,933]
epochs_finetune = 197
# Passed as look_back to create_dataset().
window_size = 0
# Number of columns per time step; matches raw_values.shape[1].
features = 8

In [11]:
def create_dataset(dataset, features, look_back=1):
    """Pair every value with the `look_back` values that precede it.

    `look_back` zeros are inserted at the front of the (flattened) input, so
    the first real value gets an all-zero history. Each output row is the
    window of `look_back` earlier values followed by the target, where the
    targets are reshaped to `features` columns before concatenation.
    """
    padded = np.insert(dataset, [0]*look_back, 0)
    sample_count = len(padded) - look_back
    windows = [padded[start:start + look_back] for start in range(sample_count)]
    targets = np.array([padded[start + look_back] for start in range(sample_count)])
    targets = np.reshape(targets, (targets.shape[0], features))
    return np.concatenate((windows, targets), axis=1)
    

# Windowed copy of the loop-based differences, using window_size as look_back.
dataset2 = create_dataset(diff2.values, features, window_size)

In [12]:
# Continue with the DataFrame-based differences as the working dataset.
dataset = diff

In [13]:
# Expect one row fewer than raw_values: differencing drops the first row.
dataset.shape

(43799, 8)

In [14]:
# Prepend seq_len all-zero rows so the earliest samples still get a
# full-length history window when the lagged slices are taken below.
data = np.vstack((np.zeros((seq_len, features)), dataset))
data

array([[0.0, 0.0, 0.0, ..., 0.0, 0.0, 0.0],
       [0.0, 0.0, 0.0, ..., 0.0, 0.0, 0.0],
       [0.0, 0.0, 0.0, ..., 0.0, 0.0, 0.0],
       ...,
       [0, 0, 0.0, ..., 4.9199999999999875, 0, 0],
       [-2, 0, -1.0, ..., 4.02000000000001, 0, 0],
       [4, 1, 1.0, ..., 3.1299999999999955, 0, 0]], dtype=object)

In [15]:
# Dropnan = True!
# NOTE(review): the remark above presumably means this mirrors
# series_to_supervised(..., dropnan=True) — confirm.
# Lagged inputs: for i = seq_len .. 1 take the slice data[seq_len-i:-i] (all
# slices have equal length) and place them side by side, oldest lag first.
# Row r of `history` therefore holds data[r], ..., data[r + seq_len - 1],
# i.e. seq_len * features columns.
history = np.hstack([data[seq_len-i:-i] for i in range(seq_len, 0, -1)])

In [16]:
# Number of forecast steps stacked as targets.
n_out = 1
# Target rows: with n_out == 1 this is data[seq_len:], the rows that follow the
# zero padding, so row r lines up with the window in history[r].
# NOTE(review): for n_out > 1 the slices data[seq_len+i:] have different
# lengths, so np.hstack would presumably raise — trim them to a common length
# before generalising.
future = np.hstack([data[seq_len+i:] for i in range(n_out)])

In [36]:
# Inspect the target rows.
future

array([[19, 1, 0.0, ..., 0.8900000000000001, 0, 0],
       [11, 4, -1.0, ..., 0.8899999999999997, 0, 0],
       [22, 4, 0.0, ..., 1.7900000000000005, 1, 0],
       ...,
       [0, 0, 0.0, ..., 4.9199999999999875, 0, 0],
       [-2, 0, -1.0, ..., 4.02000000000001, 0, 0],
       [4, 1, 1.0, ..., 3.1299999999999955, 0, 0]], dtype=object)

In [41]:
# Supervised matrix: all lagged inputs plus a single target column, taken from
# column 0 of `future` (pollution is the first column of the loaded frame).
agg = np.hstack([history, future[:, 0].reshape(-1, 1)])

In [44]:
# Expect seq_len * features input columns plus 1 target column.
agg.shape

(43799, 201)

In [45]:
#convert series to supervised learning
def series_to_supervised(data, features, n_in=1, n_out=1, dropnan=True):
    """Frame a multivariate series as a supervised-learning table.

    Parameters
    ----------
    data : numpy.ndarray or list
        Observations. They are flattened, left-padded with zeros and re-read
        as rows of `features` values.
    features : int
        Number of values per time step after reshaping.
    n_in : int, default 1
        Number of lagged steps (t-n_in .. t-1) used as inputs; also the number
        of all-zero rows padded in front of the data.
    n_out : int, default 1
        Number of forecast steps (t .. t+n_out-1) appended as outputs.
    dropnan : bool, default True
        Drop rows that contain NaN introduced by the shifts.

    Returns
    -------
    pandas.DataFrame
        One column per variable and shift, named ``var{j}(t-{i})``,
        ``var{j}(t)`` or ``var{j}(t+{i})``.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    # Builtin `int` replaces `np.int`, which was deprecated in NumPy 1.20 and
    # removed in 1.24 (AttributeError on current NumPy).
    front_positions = np.zeros(features, dtype=int)
    # Each pass flattens `data` (np.insert without axis) and puts `features`
    # zeros at the front, i.e. one all-zero row once reshaped below.
    for _ in range(n_in):
        data = np.insert(data, front_positions, 0)
    data = data.reshape(data.shape[0] // features, features)
    df = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ..., t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [(f"var{j+1}(t-{i})") for j in range(n_vars)]
    # forecast sequence (t, t+1, ..., t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [(f"var{j+1}(t)") for j in range(n_vars)]
        else:
            names += [(f"var{j+1}(t+{i})") for j in range(n_vars)]
    # Put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg

In [46]:
# frame as supervised learning
# The `agg` matrix built from `history` and `future` is used as the supervised
# frame; the series_to_supervised() route is kept below, commented out, for
# reference only.
# reframed = series_to_supervised(dataset, features, seq_len, 1, dropnan=False)
reframed = agg
# drop = [i for  i in  range(seq_len*features+1,((seq_len+1)*features))]
# reframed.drop(reframed.columns[drop], axis=1, inplace=True)
# reframed = reframed.values

In [47]:
# frame as supervised learning
# Same framing via series_to_supervised(). Of the time-t columns only the
# first is kept; the positions of the other features - 1 columns are listed in
# `drop` and removed before converting to a plain array.
reframed2 = series_to_supervised(dataset, features, seq_len, 1, dropnan=True)
drop = list(range(seq_len*features+1, (seq_len+1)*features))
reframed2 = reframed2.drop(columns=reframed2.columns[drop]).values

In [48]:
# scale train and test data to [-1, 1]
def scale(train, test):
    """Min-max scale `train` and `test` with a scaler fitted on `train` only.

    Returns a tuple ``(scaler, train_scaled, test_scaled)``: the fitted
    MinMaxScaler (feature range -1..1) and the two transformed arrays.
    """
    # The scaler only ever sees the training split.
    scaler = MinMaxScaler(feature_range=(-1, 1)).fit(train)

    # Both splits are forced to (rows, columns) before being transformed.
    train_2d = train.reshape(train.shape[0], train.shape[1])
    train_scaled = scaler.transform(train_2d)

    test_2d = test.reshape(test.shape[0], test.shape[1])
    test_scaled = scaler.transform(test_2d)

    return scaler, train_scaled, test_scaled

In [51]:
# split into train and test sets
# The first 365*24*4 rows (four 365-day years of hourly rows) train the model;
# everything after that is held out for testing.
train_size = 365*24*4
train = reframed[:train_size]
test = reframed[train_size:]

# transform the scale of the data
scaler, train_scaled, test_scaled = scale(train, test)

# split into input and outputs: the last column is the target, the rest inputs
x_train = train_scaled[:, :-1]
y_train = train_scaled[:, -1]
x_test = test_scaled[:, :-1]
y_test = test_scaled[:, -1]

# reshape input to be 3D [samples, timesteps, features]
x_train = x_train.reshape(len(x_train), seq_len, features)
x_test = x_test.reshape(len(x_test), seq_len, features)

print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(35040, 25, 8) (35040,) (8759, 25, 8) (8759,)
