In [61]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [62]:
TIMESERIES = "data/timeseries.csv"

In [63]:
df = pd.read_csv(TIMESERIES, parse_dates=['Date', 'DateIdx'])

In [64]:
len(df)

35040

In [65]:
sum(df.DateIdx == df.Date)

35040

In [66]:
cols = df.columns

In [67]:
cols

Index([u'DateIdx', u'Date', u'VAR', u'dewpti', u'hum', u'pressurei', u'rain',
       u'tempi', u'thunder', u'Day', u'Holiday', u'PCA_1'],
      dtype='object')

In [68]:
df = df[cols[1:]]

In [69]:
df.head()

Unnamed: 0,Date,VAR,dewpti,hum,pressurei,rain,tempi,thunder,Day,Holiday,PCA_1
0,2016-05-01 00:00:00,24.25751,62.3,82.0,29.96,0.0,67.5,0.0,1,0,-15.419709
1,2016-05-01 00:15:00,25.38191,49.85,84.458356,29.832557,0.0,67.257999,0.0,1,0,198.166945
2,2016-05-01 00:30:00,25.10542,62.6,88.0,29.95,0.0,66.2,0.0,1,0,104.502454
3,2016-05-01 00:45:00,24.22065,62.45,85.75,29.95,0.0,66.4,0.0,1,0,119.514737
4,2016-05-01 01:00:00,24.18378,62.3,83.5,29.95,0.0,66.6,0.0,1,0,134.532242


In [70]:
df.describe()

Unnamed: 0,VAR,dewpti,hum,pressurei,rain,tempi,thunder,Day,Holiday,PCA_1
count,34368.0,35040.0,35040.0,35040.0,35040.0,35040.0,35040.0,35040.0,35040.0,35040.0
mean,24.551156,52.720154,61.963899,29.97941,0.076163,66.614993,0.003289,0.287671,0.038356,-2.076472e-16
std,3.791351,11.68518,20.013889,0.290099,0.234463,9.309765,0.050211,0.452683,0.192057,105.9859
min,15.0059,8.8,6.0,0.0,0.0,42.4,0.0,0.0,0.0,-195.7965
25%,21.84282,43.55,47.75,29.84,0.0,60.4,0.0,0.0,0.0,-93.77721
50%,24.34449,53.8,63.25,29.985,0.0,66.6,0.0,0.0,0.0,4.146117
75%,26.85653,62.45,78.0,30.12,0.0,72.7,0.0,1.0,0.0,94.44932
max,40.49677,82.4,100.0,30.69,1.0,105.8,1.0,1.0,1.0,198.1669


Zmienne:
* rain
* thunder
* Day
* Holiday 

przyjmują wartości dyskretne (binarne).

In [102]:
BINARY_COLS = ['rain', 'thunder', 'Day', 'Holiday']
CONT_COLS = ['dewpti', 'hum', 'pressurei', 'tempi', 'PCA_1', 'month', 'day', 'hour', 'minute']

### Dodatkowe kolumny z czasem

In [103]:
df['month'] = pd.to_datetime(df['Date']).apply(lambda x: x.month)

In [104]:
df['day'] = pd.to_datetime(df['Date']).apply(lambda x: x.day)

In [105]:
df['hour'] = pd.to_datetime(df['Date']).apply(lambda x: x.hour)

In [106]:
df['minute'] = pd.to_datetime(df['Date']).apply(lambda x: x.minute)

### Normalizacja zmiennych ciągłych

In [107]:
from sklearn.preprocessing import StandardScaler

In [108]:
scaler = StandardScaler()

In [109]:
df[CONT_COLS] = scaler.fit_transform(df[CONT_COLS].values)

In [110]:
df.head()

Unnamed: 0,Date,VAR,dewpti,hum,pressurei,rain,tempi,thunder,Day,Holiday,PCA_1,month,day,hour,minute
0,2016-05-01 00:00:00,24.25751,0.81984,1.001124,-0.066908,0.0,0.095064,0.0,1,0,-0.14549,-0.442602,-1.673503,-1.661325,-1.341641
1,2016-05-01 00:15:00,25.38191,-0.245627,1.123958,-0.506223,0.0,0.069069,0.0,1,0,1.869774,-0.442602,-1.673503,-1.661325,-0.447214
2,2016-05-01 00:30:00,25.10542,0.845514,1.30092,-0.101379,0.0,-0.044577,0.0,1,0,0.986017,-0.442602,-1.673503,-1.661325,0.447214
3,2016-05-01 00:45:00,24.22065,0.832677,1.188497,-0.101379,0.0,-0.023094,0.0,1,0,1.127663,-0.442602,-1.673503,-1.661325,1.341641
4,2016-05-01 01:00:00,24.18378,0.81984,1.076073,-0.101379,0.0,-0.001611,0.0,1,0,1.269359,-0.442602,-1.673503,-1.516862,-1.341641


In [111]:
FEATURES = df.columns.drop(["Date","VAR"])
GROUND_TRUTH = ["VAR"]

### Przygotowanie danych do sieci rekurencyjnej

In [112]:
def prepare_recurrent_input(df, seq_size=3):
    '''
    
    A function to prepare sequences 
    for recurrent neural network
    
    '''
    df_ret = df.copy()
    
    pad = (seq_size - 1) / 2
    
    cols = df.columns.values.tolist()
    left_cols = []
    right_cols = []
    
    for i in range(pad):
        i += 1
        
        for col in df.columns:
            col_name = "{}_t-{}".format(col, i)
            df_ret[col_name] = df_ret[col].shift(i)
            left_cols.append(col_name)

        for col in df.columns:
            col_name = "{}_t+{}".format(col, i)
            df_ret[col_name] = df_ret[col].shift(-i)
            right_cols.append(col_name)
   
    columns = left_cols[::-1]+cols+right_cols     
    return df_ret[columns]

In [113]:
rec_df = prepare_recurrent_input(df[FEATURES], 5)

In [114]:
rec_df.dropna(inplace=True)

In [115]:
len(df)

35040

In [116]:
len(rec_df)

35036

In [117]:
rec_df.head()

Unnamed: 0,minute_t-2,hour_t-2,day_t-2,month_t-2,PCA_1_t-2,Holiday_t-2,Day_t-2,thunder_t-2,tempi_t-2,rain_t-2,...,rain_t+2,tempi_t+2,thunder_t+2,Day_t+2,Holiday_t+2,PCA_1_t+2,month_t+2,day_t+2,hour_t+2,minute_t+2
2,-1.341641,-1.661325,-1.673503,-0.442602,-0.14549,0.0,1.0,0.0,0.095064,0.0,...,0.0,-0.001611,0.0,1.0,0.0,1.269359,-0.442602,-1.673503,-1.516862,-1.341641
3,-0.447214,-1.661325,-1.673503,-0.442602,1.869774,0.0,1.0,0.0,0.069069,0.0,...,0.0,-0.023094,0.0,1.0,0.0,1.316278,-0.442602,-1.673503,-1.516862,-0.447214
4,0.447214,-1.661325,-1.673503,-0.442602,0.986017,0.0,1.0,0.0,-0.044577,0.0,...,0.0,-0.044577,0.0,1.0,0.0,1.363225,-0.442602,-1.673503,-1.516862,0.447214
5,1.341641,-1.661325,-1.673503,-0.442602,1.127663,0.0,1.0,0.0,-0.023094,0.0,...,0.0,-0.023094,0.0,1.0,0.0,1.221643,-0.442602,-1.673503,-1.516862,1.341641
6,-1.341641,-1.516862,-1.673503,-0.442602,1.269359,0.0,1.0,0.0,-0.001611,0.0,...,0.0,-0.001611,0.0,1.0,0.0,1.080067,-0.442602,-1.673503,-1.372399,-1.341641


### Przekształcanie wejścia

In [118]:
rec_df["VAR"] = df["VAR"].iloc[rec_df.index.values]

In [119]:
train_df = rec_df[~rec_df.VAR.isna()]

In [120]:
test_df = rec_df[rec_df.VAR.isna()]

In [121]:
#plt.figure(figsize=(20,10))
#plt.plot(train_df.VAR)

### Podział datasetu

In [122]:
idx = range(len(train_df))
len(idx)

34366

In [123]:
train_df.shape
231 / 21

11

In [124]:
dev_last_idx = int(0.8 * len(idx))

In [125]:
dev_idx = idx[:dev_last_idx]
dev_test_idx = idx[dev_last_idx:]

### Reshape

In [126]:
# Dane treningowe
X_train = train_df.iloc[:dev_last_idx][train_df.columns[:-1]].values
y_train = train_df.iloc[:dev_last_idx][train_df.columns[-1]].values

In [127]:
X_test = test_df.iloc[:dev_last_idx][test_df.columns[:-1]].values
#y_test = test_df.iloc[:dev_last_idx][test_df.columns[-1]].values

In [128]:
X_train.shape

(27492, 65)

In [129]:
X_train = X_train.reshape(X_train.shape[0], 13, -1).swapaxes(1, 2)
X_test = X_test.reshape(X_test.shape[0], 13, -1).swapaxes(1, 2)

In [130]:
#for c in train_df.columns:
#    print c

In [131]:
np.savez("data/rnn_input_s5.npz", X_train=X_train, X_test=X_test, y_train=y_train)

In [132]:
loaded = np.load("data/rnn_input_s5.npz")
loaded['X_train'].shape

(27492, 5, 13)