In [1]:
import numpy as np
import pandas as pd
import math
import sklearn
import sklearn.preprocessing
import datetime
import os
import matplotlib.pyplot as plt
import tensorflow as tf
import time

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# split data in 80%/10%/10% train/validation/test sets
valid_set_size_percentage = 10 
test_set_size_percentage = 10 

import warnings
warnings.filterwarnings('ignore')

In [2]:
print(tf.__version__)

1.15.0


### Model Training 

In [6]:
num_data_df = pd.read_csv('num_data.csv')

In [7]:
num_data_df.head()

Unnamed: 0,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM
2013-03-01 00:00:00,2013,3,1,0,6.0,18.0,5.0,43.0,800.0,88.0,0.1,1021.1,-18.6,0.0,5.497787,4.4
2013-03-01 01:00:00,2013,3,1,1,6.0,15.0,5.0,43.0,800.0,88.0,-0.3,1021.5,-19.0,0.0,5.497787,4.0
2013-03-01 02:00:00,2013,3,1,2,5.0,18.0,7.0,43.0,700.0,52.0,-0.7,1021.5,-19.8,0.0,5.105088,4.6
2013-03-01 03:00:00,2013,3,1,3,6.0,20.0,6.0,43.0,900.0,45.0,-1.0,1022.7,-21.2,0.0,4.712389,2.8
2013-03-01 04:00:00,2013,3,1,4,5.0,17.0,5.0,43.0,600.0,73.0,-1.3,1023.0,-21.4,0.0,5.105088,3.6


In [18]:
standardScaler=StandardScaler()
num_data_scaled_df = standardScaler.fit_transform(num_data_df)

In [6]:
seq_len = 24

# function to create train, validation, test data given stock data and sequence length
# the training sets are the sequences (20)
# this is the methods of time series prediction 
def load_data(data_raw, seq_len):
#     data_raw = stock.as_matrix() # convert to numpy array
    data = []
    
    # create all possible sequences of length seq_len
    for index in range(len(data_raw) - seq_len): 
        data.append(data_raw[index: index + seq_len])
    
    data = np.array(data);
    valid_set_size = int(np.round(valid_set_size_percentage/100*data.shape[0]));  
    test_set_size = int(np.round(test_set_size_percentage/100*data.shape[0]));
    train_set_size = data.shape[0] - (valid_set_size + test_set_size);
    
    x_train = data[:train_set_size,:-1,:]
    y_train = data[:train_set_size,-1,:]
    
    x_valid = data[train_set_size:train_set_size+valid_set_size,:-1,:]
    y_valid = data[train_set_size:train_set_size+valid_set_size,-1,:]
    
    x_test = data[train_set_size+valid_set_size:,:-1,:]
    y_test = data[train_set_size+valid_set_size:,-1,:]
    
    return [x_train, y_train, x_valid, y_valid, x_test, y_test]

In [7]:
x_train, y_train, x_valid, y_valid, x_test, y_test = load_data(num_data_scaled_df, seq_len)

In [8]:
x_train

array([[[-1.41230431, -1.02152297, -1.67380491, ..., -0.0784963 ,
          1.45433924,  2.1433824 ],
        [-1.41230431, -1.02152297, -1.67380491, ..., -0.0784963 ,
          1.45433924,  1.82234119],
        [-1.41230431, -1.02152297, -1.67380491, ..., -0.0784963 ,
          1.24726309,  2.30390301],
        ...,
        [-1.41230431, -1.02152297, -1.67380491, ..., -0.0784963 ,
         -0.40934606,  0.13687482],
        [-1.41230431, -1.02152297, -1.67380491, ..., -0.0784963 ,
          0.00480622, -0.82624881],
        [-1.41230431, -1.02152297, -1.67380491, ..., -0.0784963 ,
         -0.61642221, -0.5854679 ]],

       [[-1.41230431, -1.02152297, -1.67380491, ..., -0.0784963 ,
          1.45433924,  1.82234119],
        [-1.41230431, -1.02152297, -1.67380491, ..., -0.0784963 ,
          1.24726309,  2.30390301],
        [-1.41230431, -1.02152297, -1.67380491, ..., -0.0784963 ,
          1.04018695,  0.85921755],
        ...,
        [-1.41230431, -1.02152297, -1.67380491, ..., -

In [9]:
## Basic Cell RNN in tensorflow
index_in_epoch = 0;
perm_array  = np.arange(x_train.shape[0])
np.random.shuffle(perm_array)

# function to get the next batch
def get_next_batch(batch_size):
    global index_in_epoch, x_train, perm_array   
    start = index_in_epoch
    index_in_epoch += batch_size
    
    if index_in_epoch > x_train.shape[0]:
        np.random.shuffle(perm_array) # shuffle permutation array
        start = 0 # start next epoch
        index_in_epoch = batch_size
        
    end = index_in_epoch
    return x_train[perm_array[start:end]], y_train[perm_array[start:end]]

In [10]:
# parameters

# We can divide the dataset of 2000 examples into batches of 500 
# then it will take 4 iterations to complete 1 epoch

n_steps = seq_len-1 
n_inputs = 16
n_neurons = 2 
n_outputs = 16
n_layers = 2
learning_rate = 0.001
batch_size = 50
n_epochs = 50 
train_set_size = x_train.shape[0]
test_set_size = x_test.shape[0]

In [11]:
tf.reset_default_graph()

# feed data into the graph through these placeholders.
X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None, n_outputs])

In [12]:
# use Basic RNN Cell
rnn_layer = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons, activation=tf.nn.elu)

# use Basic LSTM Cell 
lstm_layer = tf.contrib.rnn.BasicLSTMCell(num_units=n_neurons, activation=tf.nn.elu)

# use Basic GRU cell
gru_layer = tf.contrib.rnn.GRUCell(num_units=n_neurons, activation=tf.nn.leaky_relu)

# use LSTM Cell with peephole connections
#layers = [tf.contrib.rnn.LSTMCell(num_units=n_neurons, 
#                                  activation=tf.nn.leaky_relu, use_peepholes = True)
#          for layer in range(n_layers)]

layers = [tf.contrib.rnn.BasicRNNCell(num_units=n_neurons, activation=tf.nn.elu)
          for layer in range(n_layers)]


GRU = tf.contrib.rnn.MultiRNNCell(cells=[gru_layer])
LSTM = tf.contrib.rnn.MultiRNNCell(cells=[lstm_layer])

GRU_GRU =  tf.contrib.rnn.MultiRNNCell(cells=[gru_layer,lstm_layer])
GRU_LSTM = tf.contrib.rnn.MultiRNNCell(cells=[gru_layer, lstm_layer])
LSTM_GRU = tf.contrib.rnn.MultiRNNCell(cells=[lstm_layer,gru_layer])
LSTM_LSTM = tf.contrib.rnn.MultiRNNCell(cells=[lstm_layer,lstm_layer])


The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.SimpleRNNCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.


In [13]:
multi_layer_cell = LSTM_GRU

#Creates a recurrent neural network specified by RNNCell cell
rnn_outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32) 

stacked_rnn_outputs = tf.reshape(rnn_outputs, [-1, n_neurons]) 
stacked_outputs = tf.layers.dense(stacked_rnn_outputs, n_outputs) # Functional interface for the densely-connected layer
outputs = tf.reshape(stacked_outputs, [-1, n_steps, n_outputs])
outputs = outputs[:,n_steps-1,:] # keep only last output of sequence



Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.


In [14]:
loss = tf.reduce_mean(tf.square(outputs - y)) # loss function = mean squared error 

# Instead of adapting the parameter learning rates based on the average first moment (the mean) as in 
# RMSProp, Adam also makes use of the average of the second moments of the gradients (the uncentered variance).
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) 
training_op = optimizer.minimize(loss)

In [15]:
# run graph
start = time.process_time()

with tf.Session() as sess: 
    sess.run(tf.global_variables_initializer())
    # Total number of training examples present in a single batch.
    for iteration in range(int(n_epochs*train_set_size/batch_size)):
        x_batch, y_batch = get_next_batch(batch_size) # fetch the next training batch 
        sess.run(training_op, feed_dict={X: x_batch, y: y_batch}) 
        if iteration % int(5*train_set_size/batch_size) == 0:
            mse_train = loss.eval(feed_dict={X: x_train, y: y_train}) 
            mse_valid = loss.eval(feed_dict={X: x_valid, y: y_valid}) 
            print('%.2f epochs: MSE train/valid = %.6f/%.6f'%(
                iteration*batch_size/train_set_size, mse_train, mse_valid))

    y_train_pred = sess.run(outputs, feed_dict={X: x_train})
    y_valid_pred = sess.run(outputs, feed_dict={X: x_valid})
    y_test_pred = sess.run(outputs, feed_dict={X: x_test})
    
print('time taken for model traning: {} for epoch: {}, n_neurons: {}, batch_size: {}, learning_rate: {}, n_steps: {}'
      .format(time.process_time() - start, n_epochs, n_neurons, batch_size, learning_rate, n_steps ))

0.00 epochs: MSE train/valid = 1.033582/1.007673


KeyboardInterrupt: 

In [None]:
ft_list = []
for j in range(15):
    ft_list.append([j,  num_data_df.columns[j]])
print(ft_list)

In [None]:
ft = 4 #4 PM2.5

## show predictions
plt.figure(figsize=(35, 5));
plt.subplot(1,2,1);

plt.plot(np.arange(y_train.shape[0]), y_train[:,ft], color='blue', label='train target')

plt.plot(np.arange(y_train.shape[0], y_train.shape[0]+y_valid.shape[0]), y_valid[:,ft],
         color='gray', label='valid target')

plt.plot(np.arange(y_train.shape[0]+y_valid.shape[0],
                   y_train.shape[0]+y_test.shape[0]+y_test.shape[0]),
         y_test[:,ft], color='black', label='test target')

plt.plot(np.arange(y_train_pred.shape[0]),y_train_pred[:,ft], color='red',
         label='test prediction')

plt.plot(np.arange(y_train_pred.shape[0], y_train_pred.shape[0]+y_valid_pred.shape[0]),
         y_valid_pred[:,ft], color='orange', label='valid prediction')

plt.plot(np.arange(y_train_pred.shape[0]+y_valid_pred.shape[0],
                   y_train_pred.shape[0]+y_valid_pred.shape[0]+y_test_pred.shape[0]),
         y_test_pred[:,ft], color='green', label='test prediction')

plt.title('past and future PM2.5 Level')
plt.xlabel('time [days]')
plt.ylabel('normalized PM2.5 Level')
plt.legend(loc='best');


In [None]:
plt.figure(figsize=(30, 15));
plt.subplot(1,1,1);

plt.plot(np.arange(y_train.shape[0], y_train.shape[0]+y_test.shape[0]),
         y_test[:,ft], color='black', label='test target')

plt.plot(np.arange(y_train_pred.shape[0], y_train_pred.shape[0]+y_test_pred.shape[0]),
         y_test_pred[:,ft], color='green', label='test prediction')

plt.title('future PM2.5 ')
plt.xlabel('time [days]')
plt.ylabel('normalized PM2.5')
plt.legend(loc='best');

In [None]:
plt.figure(figsize=(30, 15));
plt.subplot(1,1,1);

plt.plot(np.arange(y_train.shape[0], y_train.shape[0]+y_test.shape[0]),
         y_test[:,ft], color='black', label='test target')

plt.plot(np.arange(y_train_pred.shape[0], y_train_pred.shape[0]+y_test_pred.shape[0]),
         y_test_pred[:,ft], color='green', label='test prediction')

plt.title('future PM2.5 ')
plt.xlabel('time [days]')
plt.ylabel('normalized PM2.5')
plt.legend(loc='best');

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def RMSE(y_actual, y_predicted):
    return sqrt(mean_squared_error(y_actual, y_predicted))

In [None]:
print('mean_squared_error',mean_squared_error(y_train[:,4], y_train_pred[:,4]))
print('r2_score', r2_score(y_train[:,4], y_train_pred[:,4]))
print('RMSE', RMSE(y_train[:,4], y_train_pred[:,4]))
print()


print('mean_squared_error',mean_squared_error(y_valid[:,4], y_valid_pred[:,4]))
print('r2_score', r2_score(y_valid[:,4], y_valid_pred[:,4]))
print('RMSE', RMSE(y_valid[:,4], y_valid_pred[:,4]))
print()

print('mean_squared_error',mean_squared_error(y_test[:,4], y_test_pred[:,4]))
print('r2_score', r2_score(y_test[:,4], y_test_pred[:,4]))
print('RMSE', RMSE(y_test[:,4], y_test_pred[:,4]))
print()

In [None]:
y_test_pred[:,4]

In [None]:
for i in range(len(y_test)):
    print(y_test[i, 4], y_test_pred[i,4])

### Linear Regression 

In [16]:

linearRegression=LinearRegression()
linearRegression.fit(X_train,y_train)

y_pred=linearRegression.predict(X_test)
linearRegression.score(X_test, y_test)


n_results=100
fig, ax=plt.subplots(2,1,figsize=(12,8))
ax[0].plot(y_test.values[:n_results], color="red")
ax[1].plot(y_pred[:n_results], color="green")

print('mean_squared_error',mean_squared_error(y_test, y_pred))
print('r2_score', r2_score(y_test, y_pred))


NameError: name 'X_train' is not defined