In [1]:
from __future__ import print_function, division
import pandas as pd
import numpy as np

import os
import math
from pandas import Series, DataFrame, Panel
import re
import random
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.decomposition import PCA

np.set_printoptions(threshold=np.nan)
np.set_printoptions(precision=3)
np.set_printoptions(suppress=True)

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data_loaded = pd.read_pickle("mumbai_6.pkl")

In [3]:
class BatchGenerator:
    
    def __init__(self, file, time_steps, scaler_type):
        print ("Loading data ...")
        
#         print('Processing CSV :', file)
#         data_temp = fill_gaps(data, "SR", 0.05)
#         data_temp = fill_gaps(data_temp, "WD", 2.0)
#         data_temp = fill_gaps(data_temp, "WS", 0.005)
#         data_temp = fill_gaps(data_temp, "VWS", 0.005)
#         data_temp = fill_gaps(data_temp, "BP", 0.1)
#         data_temp = fill_gaps(data_temp, "TEMP", 0.05)
#         data_temp = fill_gaps(data_temp, "PM2.5", 0.05)
#         data_temp = fill_gaps(data_temp, "RH", 0.05)
        
        data_loaded = pd.read_pickle(file)
        data_loaded.isnull().sum()
        data_loaded_np = data_loaded[["PM2.5","WS","RH","BP","VWS","SR","WD","TEMP"]].as_matrix()
        
        self.time_steps = time_steps
        self.scaler_type = scaler_type
        self.X_norm_pm, self.y_norm_pm, self.scaler_pm, self.min_max_scaler_pm = self.generate_batch_data(data_loaded_np[0:,0], time_steps=self.time_steps, name="pm25")
        self.X_norm_ws, self.y_norm_ws, self.scaler_ws, self.min_max_scaler_ws = self.generate_batch_data(data_loaded_np[0:,1], time_steps=self.time_steps, name="ws")
        self.X_norm_rh, self.y_norm_rh, self.scaler_rh, self.min_max_scaler_rh = self.generate_batch_data(data_loaded_np[0:,2], time_steps=self.time_steps, name="rh")
        self.X_norm_bp, self.y_norm_bp, self.scaler_bp, self.min_max_scaler_bp = self.generate_batch_data(data_loaded_np[0:,3], time_steps=self.time_steps, name="bp")
        self.X_norm_vws, self.y_norm_vws, self.scaler_vws, self.min_max_scaler_vws = self.generate_batch_data(data_loaded_np[0:,4], time_steps=self.time_steps, name="vws")
        self.X_norm_sr, self.y_norm_sr, self.scaler_sr, self.min_max_scaler_sr = self.generate_batch_data(data_loaded_np[0:,5], time_steps=self.time_steps, name="sr")
        self.X_norm_wd, self.y_norm_wd, self.scaler_wd, self.min_max_scaler_wd = self.generate_batch_data(data_loaded_np[0:,6], time_steps=self.time_steps, name="wd")
        self.X_norm_temp, self.y_norm_temp, self.scaler_temp, self.min_max_scaler_temp = self.generate_batch_data(data_loaded_np[0:,7], time_steps=self.time_steps, name="temp")

        if not (self.scaler_type is None):
            filename = "np_"+self.scaler_type+"_process_comp_"+str(self.time_steps)+".npz"
        else:
            filename = "np_process_comp_"+str(self.time_steps)+".npz"
        if os.path.isfile(filename):
            print ("Found existing file :",filename)
            print ("Loading ...")
            npzfile = np.load(filename)
            self.X_norm_pm = npzfile['arr_0']
            self.X = npzfile['arr_1']
            self.Y = npzfile['arr_2']
            print ("Complete.")
        else:
            self.X = np.array(np.zeros([1, 7]))
            for i in range(len(self.X_norm_pm)):
                temp = np.column_stack((self.X_norm_ws[i],self.X_norm_rh[i],self.X_norm_bp[i],self.X_norm_vws[i],self.X_norm_sr[i],self.X_norm_wd[i],self.X_norm_temp[i]))
                self.X = np.append(self.X, temp, axis=0)

            self.X = self.X[1:].reshape(len(self.X_norm_pm),48,7)
            self.Y = self.y_norm_pm

            print ("Input shape :",np.shape(self.X_norm_pm))
            print ("Aux Input shape :",np.shape(self.X))
            print ("Output shape :",np.shape(self.Y))
            print ("Saving file ...")
            np.savez(filename, self.X_norm_pm, self.X, self.Y)
            print ("Saved file to :", filename)
            print ("Complete.")
        
    def return_data(self):
        return self.X_norm_pm, self.X, self.Y, self.scaler_pm, self.min_max_scaler_pm
    
    def fill_gaps(self, data, col, sigma):
        temp = data.copy()
        temp["FROM"][0] = "00:00:00"

        mu, sigma = 0, sigma 
        for k in range(len(temp)):
            try:
                if (str(temp[col][k]) == str(np.nan)):
                    rolling_sum = 0
                    noise = np.random.normal(mu, sigma, 1)[0]
                    rolling_sum = rolling_sum + float(temp[col][k-24]) + float(temp[col][k-48]) + float(temp[col][k-72]) + float(temp[col][k-96]) + float(temp[col][k-120]) + float(temp[col][k-144]) + float(temp[col][k-168])
                    temp[col][k] = round(1.0*rolling_sum/7 + noise,2)
            except (IndexError, ValueError):
                print ("Break at index :", k)
                break
        return temp

    def shift(self, arr, num, fill_value=np.nan):
        result = np.empty_like(arr)
        if num > 0:
            result[:num] = fill_value
            result[num:] = arr[:-num]
        elif num < 0:
            result[num:] = fill_value
            result[:num] = arr[-num:]
        else:
            result = arr
        return result

    def generate_batch_data(self, raw_data, time_steps, name):
        series = pd.Series(raw_data, dtype=np.float32)
        # prepare data for standardization
        values = series.values
        values = values.reshape((len(values), 1))

        # train the standardization
        if self.scaler_type == 'standard':
            scaler = StandardScaler()
        if self.scaler_type == 'robust':
            scaler = RobustScaler()
        if self.scaler_type == 'min_max':
            scaler = MinMaxScaler(feature_range=(-1, 1))
        if self.scaler_type == 'robust_min_max':
            scaler = RobustScaler()
        if self.scaler_type == 'standard_min_max':
            scaler = StandardScaler()
            
        min_max_scaler = None    
        if not (self.scaler_type is None):
            scaler = scaler.fit(values)
        
        print('feature ------------ ', name.upper())
        if self.scaler_type == 'standard':
            print('Mean: %f, StandardDeviation: %f' % (scaler.mean_, math.sqrt(scaler.var_)))
            values[values > 450] = 3*scaler.mean_
            print('Data normalized... Replaced the outliers with 3 times the mean value')
        if self.scaler_type == 'robust':
            print('Data normalized... Using Robust Scaling')
        if self.scaler_type == 'min_max':
            print('Data normalized... Using Min-Max Scaling')
       
        if not (self.scaler_type is None):
            normalized = scaler.transform(values)
        
        # min_max scaling
        if self.scaler_type == 'robust_min_max' or self.scaler_type == 'standard_min_max':
            min_max_scaler = MinMaxScaler(feature_range=(-1, 1))
            min_max_scaler = min_max_scaler.fit(normalized)
            normalized = min_max_scaler.transform(normalized)
            if self.scaler_type == 'robust_min_max':
                print('Data normalized... Using Robust_Min-Max Scaling')
            if self.scaler_type == 'standard_min_max':
                print('Data normalized... Using Standard_Min-Max Scaling')
            
        # batch formation
        if (self.scaler_type is None):
            data = values
            scaler = None
        else:    
            data = normalized
        print('Max: %f, Min: %f' % (np.amax(data), np.amin(data)))
        x = data[:(len(data)-(len(data) % time_steps))]
        y = self.shift(data,-(time_steps)).astype(np.float32)

        x_batches = np.array([])
        y_batches = np.array([])

        # check if file exists
        if (self.scaler_type is None):
            seq_file_name = "np_processed_"+name+"_"+str(time_steps)+".npz"
        else:
            seq_file_name = "np_"+self.scaler_type+"_processed_"+name+"_"+str(time_steps)+".npz"           

        if os.path.isfile(seq_file_name):
            npzfile = np.load(seq_file_name)
            x_batches = npzfile['arr_0']
            y_batches = npzfile['arr_1']
            return x_batches, y_batches, scaler, min_max_scaler
        else: 
            for i in range(len(y)):
                try:
                    x_batches = np.append(x_batches, x[i:i+(time_steps)].reshape(-1,12,1))
                    y_batches = np.append(y_batches, y[i].reshape(-1))
                except ValueError:
                    break
                    
            x_batches = x_batches.reshape(-1, time_steps, 1)
            y_batches = y_batches.reshape(-1)
            np.savez(seq_file_name, x_batches, y_batches)
            return x_batches, y_batches, scaler, min_max_scaler

In [15]:
batch_generator_obj = BatchGenerator(file="mumbai_6.pkl", time_steps=48, scaler_type="standard")
X_norm, X_aux_norm, y_norm, scaler, min_max_scaler = batch_generator_obj.return_data()

Loading data ...
feature ------------  PM25
Mean: 46.526001, StandardDeviation: 34.564391
Data normalized... Replaced the outliers with 3 times the mean value
Max: 11.109526, Min: -1.345778
feature ------------  WS
Mean: 0.314379, StandardDeviation: 1.000446
Data normalized... Replaced the outliers with 3 times the mean value
Max: 43.346268, Min: -0.304243
feature ------------  RH
Mean: 69.464447, StandardDeviation: 18.077896
Data normalized... Replaced the outliers with 3 times the mean value
Max: 1.690769, Min: -3.770043
feature ------------  BP
Mean: 762.825073, StandardDeviation: 8.665857
Data normalized... Replaced the outliers with 3 times the mean value
Max: 176.052979, Min: -54.197186
feature ------------  VWS
Mean: 0.614455, StandardDeviation: 0.861468
Data normalized... Replaced the outliers with 3 times the mean value
Max: 55.690453, Min: -0.690049
feature ------------  SR
Mean: 126.844894, StandardDeviation: 204.627930
Data normalized... Replaced the outliers with 3 times t

In [16]:
print(np.shape(X_norm))

(19633, 48, 1)


In [17]:
np.shape(X_aux_norm[0])

(48, 7)

In [18]:
X_aux_norm[0]

array([[ -1.64305404e-01,  -6.04298532e-01,   1.76052979e+02,
         -1.67794507e-02,  -6.12403691e-01,  -1.98137447e-01,
         -1.89721286e+00],
       [ -1.64305404e-01,  -6.99995458e-01,   1.76052979e+02,
         -1.67794507e-02,  -6.15482390e-01,  -6.46089911e-02,
         -2.12038064e+00],
       [ -2.34274179e-01,  -6.57955289e-01,   1.76052979e+02,
         -1.67794507e-02,  -6.14944875e-01,  -1.53978348e-01,
         -2.16721845e+00],
       [ -7.43455812e-02,  -7.16037214e-01,   1.76052979e+02,
         -1.67794507e-02,  -6.15286946e-01,  -1.86571941e-01,
         -2.19477010e+00],
       [ -2.14283109e-01,  -4.54944819e-01,   1.76052979e+02,
         -1.67794507e-02,  -6.15140319e-01,  -7.53333643e-02,
         -2.47028637e+00],
       [ -2.54265249e-01,  -3.14994991e-01,   1.76052979e+02,
         -1.67794507e-02,  -6.14798248e-01,  -2.12541640e-01,
         -2.50610304e+00],
       [ -1.94292024e-01,  -9.64960605e-02,   1.76052979e+02,
         -1.67794507e-02,  -6.15

In [19]:
scaler = Normalizer().fit(X_aux_norm[0])
normalizedX = scaler.transform(X_aux_norm[0])

In [20]:
normalizedX

array([[ -9.33206173e-04,  -3.43223721e-03,   9.99928928e-01,
         -9.53023249e-05,  -3.47827212e-03,  -1.12536219e-03,
         -1.07756088e-02],
       [ -9.33191270e-04,  -3.97570400e-03,   9.99912960e-01,
         -9.53008029e-05,  -3.49570239e-03,  -3.66954130e-04,
         -1.20429435e-02],
       [ -1.33058281e-03,  -3.73692058e-03,   9.99909887e-01,
         -9.53005101e-05,  -3.49263878e-03,  -8.74534891e-04,
         -1.23089264e-02],
       [ -4.22251802e-04,  -4.06679185e-03,   9.99907272e-01,
         -9.53002609e-05,  -3.49457247e-03,  -1.05965058e-03,
         -1.24653760e-02],
       [ -1.21701897e-03,  -2.58385498e-03,   9.99891296e-01,
         -9.52987382e-05,  -3.49368387e-03,  -4.27855160e-04,
         -1.40299691e-02],
       [ -1.44409419e-03,  -1.78900749e-03,   9.99889227e-01,
         -9.52985410e-05,  -3.49173385e-03,  -1.20712582e-03,
         -1.42333601e-02],
       [ -1.10347056e-03,  -5.48043918e-04,   9.99882934e-01,
         -9.52979412e-05,  -3.49

In [28]:
pca = PCA(n_components=7, svd_solver='full')
pca = pca.fit(X_aux_norm)
normalizedX_new = pca.transform(X_aux_norm)

ValueError: Found array with dim 3. Estimator expected <= 2.

In [26]:
normalizedX_new

array([[  4.36709616e-02,   2.41054944e-01,   4.38451887e-01,
         -8.27200615e-02,   3.25770484e-02,  -4.09333449e-02,
          1.54795513e-17],
       [  1.53021716e-01,   4.04193318e-01,   6.23370242e-01,
         -2.18961417e-02,   3.88182032e-02,  -2.37266755e-02,
          1.49752066e-17],
       [  2.11992622e-01,   3.11448533e-01,   6.40357386e-01,
         -1.92229357e-02,  -2.60603264e-02,  -7.60648314e-03,
          1.47829637e-17],
       [  1.79574899e-01,   2.92440831e-01,   7.03964345e-01,
          9.70511537e-03,   1.27876665e-01,  -3.55482616e-02,
          1.60112871e-17],
       [  5.64891778e-01,   3.66188665e-01,   6.65403122e-01,
          7.95474002e-02,   1.12213854e-02,   1.08608056e-02,
          8.90275543e-18],
       [  6.73158207e-01,   2.06474495e-01,   6.12214208e-01,
          8.57524004e-02,  -2.20754461e-02,   2.01940942e-02,
          7.39730646e-18],
       [  9.00744669e-01,   3.44706375e-01,   4.82928444e-01,
          1.36925454e-01,   4.72

In [27]:
X_aux_norm[0]

array([[ -1.64305404e-01,  -6.04298532e-01,   1.76052979e+02,
         -1.67794507e-02,  -6.12403691e-01,  -1.98137447e-01,
         -1.89721286e+00],
       [ -1.64305404e-01,  -6.99995458e-01,   1.76052979e+02,
         -1.67794507e-02,  -6.15482390e-01,  -6.46089911e-02,
         -2.12038064e+00],
       [ -2.34274179e-01,  -6.57955289e-01,   1.76052979e+02,
         -1.67794507e-02,  -6.14944875e-01,  -1.53978348e-01,
         -2.16721845e+00],
       [ -7.43455812e-02,  -7.16037214e-01,   1.76052979e+02,
         -1.67794507e-02,  -6.15286946e-01,  -1.86571941e-01,
         -2.19477010e+00],
       [ -2.14283109e-01,  -4.54944819e-01,   1.76052979e+02,
         -1.67794507e-02,  -6.15140319e-01,  -7.53333643e-02,
         -2.47028637e+00],
       [ -2.54265249e-01,  -3.14994991e-01,   1.76052979e+02,
         -1.67794507e-02,  -6.14798248e-01,  -2.12541640e-01,
         -2.50610304e+00],
       [ -1.94292024e-01,  -9.64960605e-02,   1.76052979e+02,
         -1.67794507e-02,  -6.15

In [35]:
data_loaded = pd.read_pickle("mumbai_6.pkl")
data_loaded_np = data_loaded[["PM2.5","WS","RH","BP","VWS","SR","WD","TEMP"]].as_matrix()

scale = StandardScaler()

data_loaded_np_scaled = scale.fit_transform(data_loaded_np)

In [40]:
data_loaded

Unnamed: 0,FROM,TO,DATE,PM2.5,WS,RH,BP,VWS,SR,WD,TEMP
0,00:00:00,00:59:00,01/01/2015,64.31,0.15,58.54,762.13,0.60,1.53,146.87,25.07
1,01:00:00,01:59:00,01/01/2015,82.22,0.15,56.81,761.72,0.60,0.90,159.57,24.26
2,02:00:00,02:59:00,01/01/2015,81.38,0.08,57.57,761.23,0.60,1.01,151.07,24.09
3,03:00:00,03:59:00,01/01/2015,74.56,0.24,56.52,760.96,0.60,0.94,147.97,23.99
4,04:00:00,04:59:00,01/01/2015,70.49,0.10,61.24,760.99,0.60,0.97,158.55,22.99
5,05:00:00,05:59:00,01/01/2015,65.95,0.06,63.77,761.11,0.60,1.04,145.50,22.86
6,06:00:00,06:59:00,01/01/2015,77.63,0.12,67.72,761.45,0.60,0.87,162.56,22.47
7,07:00:00,07:59:00,01/01/2015,71.01,0.05,66.01,762.13,0.60,3.21,150.14,22.84
8,08:00:00,08:59:00,01/01/2015,67.88,0.10,64.83,763.03,0.60,28.28,154.49,24.06
9,09:00:00,09:59:00,01/01/2015,75.69,0.17,60.64,763.75,0.56,59.16,150.66,26.08


In [41]:
data_loaded_np


array([[   64.31,     0.15,    58.54,   762.13,     0.6 ,     1.53,
          146.87,    25.07],
       [   82.22,     0.15,    56.81,   761.72,     0.6 ,     0.9 ,
          159.57,    24.26],
       [   81.38,     0.08,    57.57,   761.23,     0.6 ,     1.01,
          151.07,    24.09],
       [   74.56,     0.24,    56.52,   760.96,     0.6 ,     0.94,
          147.97,    23.99],
       [   70.49,     0.1 ,    61.24,   760.99,     0.6 ,     0.97,
          158.55,    22.99],
       [   65.95,     0.06,    63.77,   761.11,     0.6 ,     1.04,
          145.5 ,    22.86],
       [   77.63,     0.12,    67.72,   761.45,     0.6 ,     0.87,
          162.56,    22.47],
       [   71.01,     0.05,    66.01,   762.13,     0.6 ,     3.21,
          150.14,    22.84],
       [   67.88,     0.1 ,    64.83,   763.03,     0.6 ,    28.28,
          154.49,    24.06],
       [   75.69,     0.17,    60.64,   763.75,     0.56,    59.16,
          150.66,    26.08],
       [   72.61,     0.2 ,   

In [39]:
data_loaded_np_scaled
-1.64305404e-01,  -6.04298532e-01,   1.76052979e+02,
         -1.67794507e-02,  -6.12403691e-01,  -1.98137447e-01,
         -1.89721286e+00

array([[  0.515,  -0.164,  -0.604,  -0.08 ,  -0.017,  -0.612,  -0.198,
         -1.897],
       [  1.033,  -0.164,  -0.7  ,  -0.128,  -0.017,  -0.615,  -0.065,
         -2.12 ],
       [  1.008,  -0.234,  -0.658,  -0.184,  -0.017,  -0.615,  -0.154,
         -2.167],
       [  0.811,  -0.074,  -0.716,  -0.215,  -0.017,  -0.615,  -0.187,
         -2.195],
       [  0.693,  -0.214,  -0.455,  -0.212,  -0.017,  -0.615,  -0.075,
         -2.47 ],
       [  0.562,  -0.254,  -0.315,  -0.198,  -0.017,  -0.615,  -0.213,
         -2.506],
       [  0.9  ,  -0.194,  -0.096,  -0.159,  -0.017,  -0.616,  -0.033,
         -2.614],
       [  0.708,  -0.264,  -0.191,  -0.08 ,  -0.017,  -0.604,  -0.164,
         -2.512],
       [  0.618,  -0.214,  -0.256,   0.024,  -0.017,  -0.482,  -0.118,
         -2.175],
       [  0.844,  -0.144,  -0.488,   0.107,  -0.063,  -0.331,  -0.158,
         -1.619],
       [  0.755,  -0.114,  -0.727,   0.1  ,  -0.144,  -0.194,  -1.458,
         -1.167],
       [  3.41 ,  -0.

In [42]:
pca = PCA(n_components=8, svd_solver='full')
pca = pca.fit(data_loaded_np_scaled)
data_loaded_np_new = pca.transform(data_loaded_np_scaled)

In [45]:
scaler = Normalizer().fit(data_loaded_np_scaled)
normalizedX = scaler.transform(data_loaded_np_scaled)

In [46]:
normalizedX

array([[ 0.238, -0.076, -0.279, -0.037, -0.008, -0.283, -0.092, -0.877],
       [ 0.406, -0.065, -0.275, -0.05 , -0.007, -0.242, -0.025, -0.833],
       [ 0.391, -0.091, -0.255, -0.071, -0.007, -0.239, -0.06 , -0.841],
       [ 0.319, -0.029, -0.282, -0.085, -0.007, -0.242, -0.073, -0.864],
       [ 0.257, -0.08 , -0.169, -0.079, -0.006, -0.228, -0.028, -0.916],
       [ 0.209, -0.095, -0.117, -0.074, -0.006, -0.229, -0.079, -0.932],
       [ 0.316, -0.068, -0.034, -0.056, -0.006, -0.216, -0.012, -0.919],
       [ 0.262, -0.098, -0.071, -0.03 , -0.006, -0.223, -0.061, -0.929],
       [ 0.264, -0.092, -0.11 ,  0.01 , -0.007, -0.206, -0.05 , -0.93 ],
       [ 0.436, -0.075, -0.252,  0.055, -0.033, -0.171, -0.082, -0.837],
       [ 0.349, -0.053, -0.336,  0.046, -0.067, -0.09 , -0.675, -0.54 ],
       [ 0.915, -0.047, -0.225,  0.022, -0.051,  0.017, -0.227, -0.234],
       [ 0.947,  0.02 , -0.217,  0.003, -0.036,  0.03 ,  0.032, -0.231],
       [ 0.404, -0.038, -0.494, -0.053, -0.109,  0.

In [48]:
np.shape(data_loaded_np[0:,0])

(19704,)

In [10]:
class BatchGenerator:
    
    def __init__(self, file, time_steps=48, scaler_type='standard', pca=False, pca_dim=8, normal=False):
        self.file = file
        self.time_steps = time_steps
        self.scaler_type = scaler_type
        self.pca = pca
        self.pca_dim = pca_dim
        self.normal = normal
        
        print ("Loading data ...")
        data_loaded = pd.read_pickle(self.file)
        data_loaded.isnull().sum()
       
        # Standardization
        if self.scaler_type == 'standard':
            self.scaler = StandardScaler()
            print('Data normalized... Using Standard Scaling')
        if self.scaler_type == 'robust':
            self.scaler = RobustScaler()
            print('Data normalized... Using Robust Scaling')
        if self.scaler_type == 'min_max':
            self.scaler = MinMaxScaler(feature_range=(-1, 1))
            print('Data normalized... Using Min-Max Scaling')
       
        data_loaded_np = data_loaded[["PM2.5","WS","RH","BP","VWS","SR","WD","TEMP"]].as_matrix()
        self.X_norm_ori_pm, self.y_norm_ori_pm = self.generate_batch_data(data_loaded_np[0:,0], time_steps=self.time_steps, name="pm25_ori")
        
        data_loaded_np = self.scaler.fit_transform(data_loaded_np)
       
        if self.pca == True:
            print("PCA transform")
            pca = PCA(n_components=self.pca_dim, svd_solver='full')
            pca = pca.fit(data_loaded_np)
            data_loaded_np = pca.transform(data_loaded_np)
        if self.normal == True:
            print("Normalize transform")
            self.norm_scaler = Normalizer().fit(data_loaded_np)
            data_loaded_np = self.norm_scaler.transform(data_loaded_np)

        self.X_norm_pm, self.y_norm_pm = self.generate_batch_data(data_loaded_np[0:,0], time_steps=self.time_steps, name="pm25")
        self.X_norm_ws, self.y_norm_ws = self.generate_batch_data(data_loaded_np[0:,1], time_steps=self.time_steps, name="ws")
        self.X_norm_rh, self.y_norm_rh = self.generate_batch_data(data_loaded_np[0:,2], time_steps=self.time_steps, name="rh")
        self.X_norm_bp, self.y_norm_bp = self.generate_batch_data(data_loaded_np[0:,3], time_steps=self.time_steps, name="bp")
        self.X_norm_vws, self.y_norm_vws = self.generate_batch_data(data_loaded_np[0:,4], time_steps=self.time_steps, name="vws")
        self.X_norm_sr, self.y_norm_sr = self.generate_batch_data(data_loaded_np[0:,5], time_steps=self.time_steps, name="sr")
        self.X_norm_wd, self.y_norm_wd = self.generate_batch_data(data_loaded_np[0:,6], time_steps=self.time_steps, name="wd")
        self.X_norm_temp, self.y_norm_temp = self.generate_batch_data(data_loaded_np[0:,7], time_steps=self.time_steps, name="temp")

        if not (self.scaler_type is None):
            filename = "np_"+self.scaler_type+"_process_comp_"+str(self.time_steps)+"_"+str(self.pca)+"_"+str(self.normal)+".npz"
        else:
            filename = "np_process_comp_"+str(self.time_steps)+"_"+str(self.pca)+"_"+str(self.normal)+".npz"

        if os.path.isfile("data/"+filename):
            print ("Found existing file :",filename)
            print ("Loading ...")
            npzfile = np.load("data/"+filename)
            self.X_norm_pm = npzfile['arr_0']
            self.X = npzfile['arr_1']
            self.Y = npzfile['arr_2']
            print ("Complete.")
        else:
            self.X = np.array(np.zeros([1, 7]))
            for i in range(len(self.X_norm_pm)):
                temp = np.column_stack((self.X_norm_ws[i],self.X_norm_rh[i],self.X_norm_bp[i],self.X_norm_vws[i],self.X_norm_sr[i],self.X_norm_wd[i],self.X_norm_temp[i]))
                self.X = np.append(self.X, temp, axis=0)

            self.X = self.X[1:].reshape(len(self.X_norm_pm),48,7)
            self.Y = self.y_norm_ori_pm

            print ("Input shape :",np.shape(self.X_norm_pm))
            print ("Aux Input shape :",np.shape(self.X))
            print ("Output shape :",np.shape(self.Y))
            print ("Saving file ...")
            np.savez("data/"+filename, self.X_norm_pm, self.X, self.Y)
            print ("Saved file to :", filename)
            print ("Complete.")
        
    def return_data(self):
        return self.X_norm_pm, self.X, self.Y, self.scaler

    def shift(self, arr, num, fill_value=np.nan):
        result = np.empty_like(arr)
        if num > 0:
            result[:num] = fill_value
            result[num:] = arr[:-num]
        elif num < 0:
            result[num:] = fill_value
            result[:num] = arr[-num:]
        else:
            result = arr
        return result

    def generate_batch_data(self, raw_data, time_steps, name):
        series = pd.Series(raw_data, dtype=np.float32)
        values = series.values
        values = values.reshape((len(values), 1))
        print('feature ------------ ', name.upper())
        
        # batch formation
        data = values
        print('Max: %f, Min: %f' % (np.amax(data), np.amin(data)))
        x = data[:(len(data)-(len(data) % time_steps))]
        y = self.shift(data,-(time_steps)).astype(np.float32)

        x_batches = np.array([])
        y_batches = np.array([])

        # check if file exists
        if (self.scaler_type is None):
            seq_file_name = "np_processed_"+name+"_"+str(time_steps)+"_"+str(self.pca)+"_"+str(self.normal)+".npz"
        else:
            seq_file_name = "np_"+self.scaler_type+"_processed_"+name+"_"+str(time_steps)+"_"+str(self.pca)+"_"+str(self.normal)+".npz"          

        if os.path.isfile("data/"+seq_file_name):
            npzfile = np.load("data/"+seq_file_name)
            x_batches = npzfile['arr_0']
            y_batches = npzfile['arr_1']
            return x_batches, y_batches
        else: 
            for i in range(len(y)):
                try:
                    x_batches = np.append(x_batches, x[i:i+(time_steps)].reshape(-1,12,1))
                    y_batches = np.append(y_batches, y[i].reshape(-1))
                except ValueError:
                    break
                    
            x_batches = x_batches.reshape(-1, time_steps, 1)
            y_batches = y_batches.reshape(-1)
            np.savez("data/"+seq_file_name, x_batches, y_batches)
            return x_batches, y_batches

In [13]:
batch_generator_obj = BatchGenerator(file="mumbai_6.pkl", time_steps=48, scaler_type="standard", pca=True)
X_norm, X_aux_norm, y_norm, scaler = batch_generator_obj.return_data()

Loading data ...
Data normalized... Using Standard Scaling
feature ------------  PM25_ORI
Max: 961.280029, Min: 0.010000
PCA transform
feature ------------  PM25
Max: 45.802834, Min: -2.851475
feature ------------  WS
Max: 61.763958, Min: -6.680837
feature ------------  RH
Max: 25.932737, Min: -2.787551
feature ------------  BP
Max: 16.831656, Min: -2.283291
feature ------------  VWS
Max: 35.946007, Min: -28.765430
feature ------------  SR
Max: 40.415234, Min: -26.081017
feature ------------  WD
Max: 10.734898, Min: -8.061556
feature ------------  TEMP
Max: 7.157158, Min: -6.580746
Found existing file : np_standard_process_comp_48_True_False.npz
Loading ...
Complete.


In [16]:
X_norm[0]

array([[-1.242],
       [-1.314],
       [-1.373],
       [-1.303],
       [-1.554],
       [-1.647],
       [-1.74 ],
       [-1.712],
       [-1.448],
       [-1.027],
       [-0.832],
       [-0.516],
       [-0.233],
       [-0.002],
       [-0.219],
       [-0.514],
       [-0.323],
       [-0.671],
       [-0.937],
       [-1.173],
       [-1.045],
       [-1.166],
       [-1.099],
       [-1.447],
       [-1.515],
       [-1.659],
       [-1.731],
       [-1.966],
       [-1.958],
       [-2.135],
       [-2.221],
       [-2.178],
       [-1.742],
       [-1.169],
       [-0.904],
       [-0.517],
       [-0.401],
       [-0.074],
       [-0.352],
       [-0.622],
       [-0.504],
       [-0.917],
       [-1.028],
       [-1.477],
       [-1.597],
       [-1.735],
       [-1.848],
       [-1.981]])

In [None]:
class BatchGenerator_bk:
    
    def __init__(self, file, time_steps, scaler_type):
        print ("Loading data ...")
        
#         print('Processing CSV :', file)
#         data_temp = fill_gaps(data, "SR", 0.05)
#         data_temp = fill_gaps(data_temp, "WD", 2.0)
#         data_temp = fill_gaps(data_temp, "WS", 0.005)
#         data_temp = fill_gaps(data_temp, "VWS", 0.005)
#         data_temp = fill_gaps(data_temp, "BP", 0.1)
#         data_temp = fill_gaps(data_temp, "TEMP", 0.05)
#         data_temp = fill_gaps(data_temp, "PM2.5", 0.05)
#         data_temp = fill_gaps(data_temp, "RH", 0.05)
        
        data_loaded = pd.read_pickle(file)
        data_loaded.isnull().sum()
        data_loaded_np = data_loaded[["PM2.5","WS","RH","BP","VWS","SR","WD","TEMP"]].as_matrix()
        
        self.time_steps = time_steps
        self.scaler_type = scaler_type
        self.X_norm_pm, self.y_norm_pm, self.scaler_pm, self.min_max_scaler_pm = self.generate_batch_data(data_loaded_np[0:,0], time_steps=self.time_steps, name="pm25")
        self.X_norm_ws, self.y_norm_ws, self.scaler_ws, self.min_max_scaler_ws = self.generate_batch_data(data_loaded_np[0:,1], time_steps=self.time_steps, name="ws")
        self.X_norm_rh, self.y_norm_rh, self.scaler_rh, self.min_max_scaler_rh = self.generate_batch_data(data_loaded_np[0:,2], time_steps=self.time_steps, name="rh")
        self.X_norm_bp, self.y_norm_bp, self.scaler_bp, self.min_max_scaler_bp = self.generate_batch_data(data_loaded_np[0:,4], time_steps=self.time_steps, name="bp")
        self.X_norm_vws, self.y_norm_vws, self.scaler_vws, self.min_max_scaler_vws = self.generate_batch_data(data_loaded_np[0:,3], time_steps=self.time_steps, name="vws")
        self.X_norm_sr, self.y_norm_sr, self.scaler_sr, self.min_max_scaler_sr = self.generate_batch_data(data_loaded_np[0:,5], time_steps=self.time_steps, name="sr")
        self.X_norm_wd, self.y_norm_wd, self.scaler_wd, self.min_max_scaler_wd = self.generate_batch_data(data_loaded_np[0:,6], time_steps=self.time_steps, name="wd")
        self.X_norm_temp, self.y_norm_temp, self.scaler_temp, self.min_max_scaler_temp = self.generate_batch_data(data_loaded_np[0:,7], time_steps=self.time_steps, name="temp")

        filename = "np_"+self.scaler_type+"_process_comp_"+str(self.time_steps)+".npz"
        if os.path.isfile(filename):
            print ("Found existing file :",filename)
            print ("Loading ...")
            npzfile = np.load(filename)
            self.X_norm_pm = npzfile['arr_0']
            self.X = npzfile['arr_1']
            self.Y = npzfile['arr_2']
            print ("Complete.")
        else:
            self.X = np.array(np.zeros([1, 7]))
            for i in range(len(self.X_norm_pm)):
                temp = np.column_stack((self.X_norm_ws[i],self.X_norm_rh[i],self.X_norm_bp[i],self.X_norm_vws[i],self.X_norm_sr[i],self.X_norm_wd[i],self.X_norm_temp[i]))
                self.X = np.append(self.X, temp, axis=0)

            self.X = self.X[1:].reshape(len(self.X_norm_pm),48,7)
            self.Y = self.y_norm_pm
            
            print ("Input shape :",np.shape(self.X_norm_pm))
            print ("Aux Input shape :",np.shape(self.X))
            print ("Output shape :",np.shape(self.Y))
            print ("Saving file ...")
            np.savez(filename, self.X_norm_pm, self.X, self.Y)
            print ("Saved file to :", filename)
            print ("Complete.")
        
    def return_data(self):
        return self.X_norm_pm, self.X, self.Y, self.scaler_pm, self.min_max_scaler_pm
    
    def fill_gaps(self, data, col, sigma):
        temp = data.copy()
        temp["FROM"][0] = "00:00:00"

        mu, sigma = 0, sigma 
        for k in range(len(temp)):
            try:
                if (str(temp[col][k]) == str(np.nan)):
                    rolling_sum = 0
                    noise = np.random.normal(mu, sigma, 1)[0]
                    rolling_sum = rolling_sum + float(temp[col][k-24]) + float(temp[col][k-48]) + float(temp[col][k-72]) + float(temp[col][k-96]) + float(temp[col][k-120]) + float(temp[col][k-144]) + float(temp[col][k-168])
                    temp[col][k] = round(1.0*rolling_sum/7 + noise,2)
            except (IndexError, ValueError):
                print ("Break at index :", k)
                break
        return temp

    def shift(self, arr, num, fill_value=np.nan):
        result = np.empty_like(arr)
        if num > 0:
            result[:num] = fill_value
            result[num:] = arr[:-num]
        elif num < 0:
            result[num:] = fill_value
            result[:num] = arr[-num:]
        else:
            result = arr
        return result

    def generate_batch_data(self, raw_data, time_steps, name):
        series = pd.Series(raw_data, dtype=np.float32)
        # prepare data for standardization
        values = series.values
        values = values.reshape((len(values), 1))

        # train the standardization
        if self.scaler_type == 'standard':
            scaler = StandardScaler()
        if self.scaler_type == 'robust':
            scaler = RobustScaler()
        if self.scaler_type == 'min_max':
            scaler = MinMaxScaler(feature_range=(0, 1))
        if self.scaler_type == 'robust_min_max':
            scaler = RobustScaler()
        if self.scaler_type == 'standard_min_max':
            scaler = StandardScaler()
            
        min_max_scaler = None    
        scaler = scaler.fit(values)
        print('feature ------------ ', name.upper())
        
        if self.scaler_type == 'standard':
            print('Mean: %f, StandardDeviation: %f' % (scaler.mean_, math.sqrt(scaler.var_)))
            values[values > 450] = 3*scaler.mean_
            print('Data normalized... Replaced the outliers with 3 times the mean value')
        if self.scaler_type == 'robust':
            print('Data normalized... Using Robust Scaling')
        if self.scaler_type == 'min_max':
            print('Data normalized... Using Min-Max Scaling')
       
        normalized = scaler.transform(values)
        
        # min_max scaling
        if self.scaler_type == 'robust_min_max' or self.scaler_type == 'standard_min_max':
            min_max_scaler = MinMaxScaler(feature_range=(-1, 1))
            min_max_scaler = min_max_scaler.fit(normalized)
            normalized = min_max_scaler.transform(normalized)
            if self.scaler_type == 'robust_min_max':
                print('Data normalized... Using Robust_Min-Max Scaling')
            if self.scaler_type == 'standard_min_max':
                print('Data normalized... Using Standard_Min-Max Scaling')
            
        # batch formation
        data = normalized
        print('Max: %f, Min: %f' % (np.amax(data), np.amin(data)))
        x = data[:(len(data)-(len(data) % time_steps))]
        y = self.shift(data,-(time_steps)).astype(np.float32)

        x_batches = np.array([])
        y_batches = np.array([])

        # check if file exists
        seq_file_name = "np_"+self.scaler_type+"_processed_"+name+"_"+str(time_steps)+".npz"
        if os.path.isfile(seq_file_name):
            npzfile = np.load(seq_file_name)
            x_batches = npzfile['arr_0']
            y_batches = npzfile['arr_1']
            return x_batches, y_batches, scaler, min_max_scaler
        else: 
            for i in range(len(y)):
                try:
                    x_batches = np.append(x_batches, x[i:i+time_steps].reshape(-1,12,1))
                    y_batches = np.append(y_batches, y[i].reshape(-1))
                except ValueError:
                    break
            x_batches = x_batches.reshape(-1, time_steps, 1)
            y_batches = y_batches.reshape(-1)
            np.savez(seq_file_name, x_batches, y_batches)
            return x_batches, y_batches, scaler, min_max_scaler