In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw dataset from pollution.csv (path is relative to the current
# working directory) and preview the first 10 rows.
data = pd.read_csv('pollution.csv')
data.head(10)

Unnamed: 0,date,pollution,dew,temp,press,wnd_dir,wnd_spd,snow,rain
0,2010-01-02 00:00:00,129.0,-16,-4.0,1020.0,SE,1.79,0,0
1,2010-01-02 01:00:00,148.0,-15,-4.0,1020.0,SE,2.68,0,0
2,2010-01-02 02:00:00,159.0,-11,-5.0,1021.0,SE,3.57,0,0
3,2010-01-02 03:00:00,181.0,-7,-5.0,1022.0,SE,5.36,1,0
4,2010-01-02 04:00:00,138.0,-7,-5.0,1022.0,SE,6.25,2,0
5,2010-01-02 05:00:00,109.0,-7,-6.0,1022.0,SE,7.14,3,0
6,2010-01-02 06:00:00,105.0,-7,-6.0,1023.0,SE,8.93,4,0
7,2010-01-02 07:00:00,124.0,-7,-5.0,1024.0,SE,10.72,0,0
8,2010-01-02 08:00:00,120.0,-8,-6.0,1024.0,SE,12.51,0,0
9,2010-01-02 09:00:00,132.0,-7,-5.0,1025.0,SE,14.3,0,0


In [3]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Reframe a time-ordered series as a table of lagged and forecast columns.

    Each output row places the values from the ``n_in`` preceding rows next to
    the values of the current row and of the following ``n_out - 1`` rows.

    Parameters
    ----------
    data : list, array-like, Series or DataFrame
        Observations in time order, one row per time step. Anything accepted
        by ``pd.DataFrame(data)`` works; 1-D input is treated as one variable.
    n_in : int, default 1
        Number of lagged steps ``(t-n_in, ..., t-1)`` to include.
    n_out : int, default 1
        Number of forward steps ``(t, t+1, ..., t+n_out-1)`` to include.
    dropnan : bool, default True
        If true, drop every row that contains a NaN. This removes the edge rows
        produced by shifting and also any row that was already incomplete in
        the input.

    Returns
    -------
    pandas.DataFrame
        Columns are named ``var<j>(t-<i>)``, ``var<j>(t)`` and ``var<j>(t+<i>)``
        where ``j`` is the 1-based position of the variable in ``data``.
    """
    df = pd.DataFrame(data)
    # Count variables on the constructed frame rather than on the raw input:
    # a Series or 1-D array has no shape[1], and a list of rows can hold more
    # than one variable.
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg

In [4]:
# Reframe with one lag step and one forecast step. This rebinds `data` to the
# reframed table, so reload the CSV before running this cell a second time.
data = series_to_supervised(data, n_in=1, n_out=1)

In [5]:
# Preview the reframed table: var<j>(t-1) columns hold the previous row's
# values and var<j>(t) columns hold the current row's values.
data.head(10)

Unnamed: 0,var1(t-1),var2(t-1),var3(t-1),var4(t-1),var5(t-1),var6(t-1),var7(t-1),var8(t-1),var9(t-1),var1(t),var2(t),var3(t),var4(t),var5(t),var6(t),var7(t),var8(t),var9(t)
1,2010-01-02 00:00:00,129.0,-16.0,-4.0,1020.0,SE,1.79,0.0,0.0,2010-01-02 01:00:00,148.0,-15,-4.0,1020.0,SE,2.68,0,0
2,2010-01-02 01:00:00,148.0,-15.0,-4.0,1020.0,SE,2.68,0.0,0.0,2010-01-02 02:00:00,159.0,-11,-5.0,1021.0,SE,3.57,0,0
3,2010-01-02 02:00:00,159.0,-11.0,-5.0,1021.0,SE,3.57,0.0,0.0,2010-01-02 03:00:00,181.0,-7,-5.0,1022.0,SE,5.36,1,0
4,2010-01-02 03:00:00,181.0,-7.0,-5.0,1022.0,SE,5.36,1.0,0.0,2010-01-02 04:00:00,138.0,-7,-5.0,1022.0,SE,6.25,2,0
5,2010-01-02 04:00:00,138.0,-7.0,-5.0,1022.0,SE,6.25,2.0,0.0,2010-01-02 05:00:00,109.0,-7,-6.0,1022.0,SE,7.14,3,0
6,2010-01-02 05:00:00,109.0,-7.0,-6.0,1022.0,SE,7.14,3.0,0.0,2010-01-02 06:00:00,105.0,-7,-6.0,1023.0,SE,8.93,4,0
7,2010-01-02 06:00:00,105.0,-7.0,-6.0,1023.0,SE,8.93,4.0,0.0,2010-01-02 07:00:00,124.0,-7,-5.0,1024.0,SE,10.72,0,0
8,2010-01-02 07:00:00,124.0,-7.0,-5.0,1024.0,SE,10.72,0.0,0.0,2010-01-02 08:00:00,120.0,-8,-6.0,1024.0,SE,12.51,0,0
9,2010-01-02 08:00:00,120.0,-8.0,-6.0,1024.0,SE,12.51,0.0,0.0,2010-01-02 09:00:00,132.0,-7,-5.0,1025.0,SE,14.3,0,0
10,2010-01-02 09:00:00,132.0,-7.0,-5.0,1025.0,SE,14.3,0.0,0.0,2010-01-02 10:00:00,140.0,-7,-5.0,1026.0,SE,17.43,1,0


In [6]:
# List the generated column names to see which ones to keep or drop.
data.columns

Index(['var1(t-1)', 'var2(t-1)', 'var3(t-1)', 'var4(t-1)', 'var5(t-1)',
       'var6(t-1)', 'var7(t-1)', 'var8(t-1)', 'var9(t-1)', 'var1(t)',
       'var2(t)', 'var3(t)', 'var4(t)', 'var5(t)', 'var6(t)', 'var7(t)',
       'var8(t)', 'var9(t)'],
      dtype='object')

In [7]:
# Keep every lagged input plus the single target var2(t); drop the remaining
# current-step columns. Columns are selected by name and `data` is rebound
# (no positional indices, no inplace), so re-running this cell is a no-op
# instead of an IndexError on the already-trimmed frame.
data = data.drop(
    columns=[col for col in data.columns if col.endswith('(t)') and col != 'var2(t)']
)

In [8]:
# Confirm that only the lagged inputs and the var2(t) target remain.
data.head(10)

Unnamed: 0,var1(t-1),var2(t-1),var3(t-1),var4(t-1),var5(t-1),var6(t-1),var7(t-1),var8(t-1),var9(t-1),var2(t)
1,2010-01-02 00:00:00,129.0,-16.0,-4.0,1020.0,SE,1.79,0.0,0.0,148.0
2,2010-01-02 01:00:00,148.0,-15.0,-4.0,1020.0,SE,2.68,0.0,0.0,159.0
3,2010-01-02 02:00:00,159.0,-11.0,-5.0,1021.0,SE,3.57,0.0,0.0,181.0
4,2010-01-02 03:00:00,181.0,-7.0,-5.0,1022.0,SE,5.36,1.0,0.0,138.0
5,2010-01-02 04:00:00,138.0,-7.0,-5.0,1022.0,SE,6.25,2.0,0.0,109.0
6,2010-01-02 05:00:00,109.0,-7.0,-6.0,1022.0,SE,7.14,3.0,0.0,105.0
7,2010-01-02 06:00:00,105.0,-7.0,-6.0,1023.0,SE,8.93,4.0,0.0,124.0
8,2010-01-02 07:00:00,124.0,-7.0,-5.0,1024.0,SE,10.72,0.0,0.0,120.0
9,2010-01-02 08:00:00,120.0,-8.0,-6.0,1024.0,SE,12.51,0.0,0.0,132.0
10,2010-01-02 09:00:00,132.0,-7.0,-5.0,1025.0,SE,14.3,0.0,0.0,140.0


In [9]:
def split_data(x, y, train_size):
    """Split ``x`` and ``y`` by position into a leading and a trailing part.

    The first ``int(len(seq) * train_size)`` items of each sequence form the
    train part and the remainder forms the test part. Order is preserved and
    nothing is shuffled.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``
    """
    x_cut = int(len(x) * train_size)
    y_cut = int(len(y) * train_size)
    return x[:x_cut], x[x_cut:], y[:y_cut], y[y_cut:]

In [10]:
# Features: the lagged variables var2(t-1) .. var9(t-1); var1(t-1) is left out.
x = data.loc[:, ['var%d(t-1)' % k for k in range(2, 10)]]

# Target, selected with a list so it stays a single-column DataFrame.
y = data.loc[:, ['var2(t)']]

In [12]:
# Hold out the last 20% of rows as the test set. split_data slices by
# position, so row order is preserved and nothing is shuffled.
x_train, x_test, y_train, y_test = split_data(x, y, train_size=0.8)

In [24]:
# Column labels for the exported frames: eight lagged features and one target.
x_names = ['var{}(t-1)'.format(k) for k in range(2, 10)]
y_names = 'var2(t)'

In [22]:
# Assemble the training frame from the feature split, with columns taken from x_names.
train = pd.DataFrame(x_train, columns=x_names)

In [26]:
# Append the target as the last column. Plain column assignment replaces
# insert(8, ...): insert raises ValueError when the column already exists, so
# the cell could not be re-run, and the literal 8 only worked for exactly
# eight feature columns.
train[y_names] = y_train[y_names]

In [28]:
# Assemble the test frame from the feature split, with columns taken from x_names.
test = pd.DataFrame(x_test, columns=x_names)

In [29]:
# Append the target as the last column. Plain column assignment replaces
# insert(8, ...): insert raises ValueError when the column already exists, so
# the cell could not be re-run, and the literal 8 only worked for exactly
# eight feature columns.
test[y_names] = y_test[y_names]

In [32]:
# Write both splits to the current working directory. to_csv is called with
# its defaults, so the row index is written as an extra unnamed first column.
train.to_csv('air_pollution_train.csv')
test.to_csv('air_pollution_test.csv')