<a href="https://colab.research.google.com/github/harnalashok/deeplearning-sequences/blob/main/time_seriesWithEncoderDecoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Last amended: 20th Sep, 2021
# Ref: https://www.angioi.com/time-series-encoder-decoder-tensorflow/

# Time Series Forecasting with an LSTM Encoder/Decoder in TensorFlow 2.0

In [1]:
! wget https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip


--2021-09-20 10:18:30--  https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 279992 (273K) [application/x-httpd-php]
Saving to: ‘Bike-Sharing-Dataset.zip’


2021-09-20 10:18:31 (990 KB/s) - ‘Bike-Sharing-Dataset.zip’ saved [279992/279992]



In [40]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [4]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


In [8]:
! pwd      # /content
! ls -la

/content
total 296
drwxr-xr-x 1 root root   4096 Sep 20 10:21 .
drwxr-xr-x 1 root root   4096 Sep 20 10:14 ..
drwxr-xr-x 2 root root   4096 Sep 20 10:21 bike_data
-rw-r--r-- 1 root root 279992 Dec 20  2013 Bike-Sharing-Dataset.zip
drwxr-xr-x 4 root root   4096 Sep 16 13:39 .config
drwxr-xr-x 1 root root   4096 Sep 16 13:40 sample_data


In [7]:
! unzip Bike-Sharing-Dataset.zip -d bike_data

Archive:  Bike-Sharing-Dataset.zip
  inflating: bike_data/Readme.txt    
  inflating: bike_data/day.csv       
  inflating: bike_data/hour.csv      


In [10]:

# Load hour.csv from the extracted archive; the 'instant' column becomes the index
df = pd.read_csv('bike_data/hour.csv', index_col='instant')


In [12]:
df.head()
df.columns

Unnamed: 0_level_0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
instant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


Index(['dteday', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
       'casual', 'registered', 'cnt'],
      dtype='object')

In [15]:
# Extracting only a subset of features
def select_columns(df):
    """Return only the columns of ``df`` that the rest of the notebook uses.

    ``cnt`` comes first, followed by ``temp``, ``hum`` and ``windspeed`` and
    then the calendar fields. pandas raises ``KeyError`` if any of these
    columns is missing from ``df``.
    """
    count = ['cnt']
    measurements = ['temp', 'hum', 'windspeed']
    calendar = ['yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday']
    return df[count + measurements + calendar]
    


In [16]:
# Some of the integer features need to be onehot encoded;
# but not all of them
def onehot_encode_integers(df, excluded_cols):
    """One-hot encode the integer columns of ``df`` other than ``excluded_cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is left unmodified.
    excluded_cols : collection of str
        Names of integer columns that must stay numeric (e.g. the target).

    Returns
    -------
    pandas.DataFrame
        The non-encoded columns first, followed by one 0/1 ``uint8`` indicator
        column per distinct value, named ``<column>_<value>``.
    """
    int_cols = [col for col in df.select_dtypes(include=['int']) if col not in excluded_cols]

    # Cast through astype(), which returns a new frame, rather than assigning
    # strings back into integer columns with .loc: that assignment relies on
    # an implicit int -> object upcast which recent pandas versions deprecate.
    df = df.astype({col: str for col in int_cols})

    # pandas >= 2.0 produces boolean dummies by default; next to float columns
    # that turns ``.values`` into an object array, which cannot be fed to
    # tf.data. Pin the historical uint8 0/1 encoding explicitly.
    df_encoded = pd.get_dummies(df, dtype='uint8')
    return df_encoded

In [17]:
# cnt will be target of regression, but also a feature:
# it needs to be normalized. This is not the correct way to do it,
# as it leads to information leakage from test to training set.
def normalize_cnt(df, cnt_max=None):
    """Return a copy of ``df`` whose ``cnt`` column is divided by a maximum count.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``cnt`` column; it is left unmodified.
    cnt_max : number, optional
        Value to divide by. When omitted, the maximum of ``df['cnt']`` itself
        is used, which is the original behaviour. Passing the maximum computed
        on the training rows only avoids leaking information from the test
        rows into the scaling.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``cnt`` rescaled.
    """
    df = df.copy()
    if cnt_max is None:
        cnt_max = df['cnt'].max()
    df['cnt'] = df['cnt'] / cnt_max
    return df
    

In [18]:
# Build the model-ready frame by chaining the three preprocessing steps with
# DataFrame.pipe: keep the selected columns, one-hot encode every integer
# column except 'cnt', then rescale 'cnt'.
dataset = (df
           .pipe(select_columns)
           .pipe(onehot_encode_integers, excluded_cols=['cnt'])
           .pipe(normalize_cnt)
           )

In [19]:
dataset.head()

Unnamed: 0_level_0,cnt,temp,hum,windspeed,yr_0,yr_1,mnth_1,mnth_10,mnth_11,mnth_12,mnth_2,mnth_3,mnth_4,mnth_5,mnth_6,mnth_7,mnth_8,mnth_9,hr_0,hr_1,hr_10,hr_11,hr_12,hr_13,hr_14,hr_15,hr_16,hr_17,hr_18,hr_19,hr_2,hr_20,hr_21,hr_22,hr_23,hr_3,hr_4,hr_5,hr_6,hr_7,hr_8,hr_9,holiday_0,holiday_1,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,workingday_0,workingday_1
instant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1
1,0.016377,0.24,0.81,0.0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0
2,0.040942,0.22,0.8,0.0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0
3,0.032753,0.22,0.8,0.0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0
4,0.013306,0.24,0.75,0.0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0
5,0.001024,0.24,0.75,0.0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0


In [20]:
def create_dataset(df, n_deterministic_features, window_size, forecast_size, batch_size):
    """Turn a feature frame into shuffled, batched ``((past, future), labels)`` windows.

    Every run of ``window_size + forecast_size`` consecutive rows of ``df``
    (advancing one row at a time) becomes one example:

    * past   -- all columns of the first ``window_size`` rows;
    * future -- the last ``n_deterministic_features`` columns of the
      remaining ``forecast_size`` rows;
    * labels -- column 0 of those same ``forecast_size`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame with the value to forecast in column 0 and the
        deterministic features in the trailing columns.
    n_deterministic_features : int
        Number of trailing columns passed along for the forecast rows.
    window_size : int
        Number of rows considered before prediction time.
    forecast_size : int
        Number of rows to forecast.
    batch_size : int
        Number of examples per batch.

    Returns
    -------
    tf.data.Dataset
        Batches of ``((past, future), labels)``.
    """
    # Feel free to play with shuffle buffer size
    shuffle_buffer_size = len(df)
    # Total size of window is given by the number of steps to be considered
    # before prediction time + steps that we want to forecast
    total_size = window_size + forecast_size

    # Slice from the front instead of with negative offsets: ``-0:`` would
    # select *every* column when n_deterministic_features is 0 (leaking the
    # target into the future features) and ``[:-0]`` would be empty when
    # forecast_size is 0. For positive sizes both forms pick the same rows
    # and columns, because each window holds exactly ``total_size`` rows.
    first_deterministic_col = max(df.shape[1] - n_deterministic_features, 0)

    data = tf.data.Dataset.from_tensor_slices(df.values)

    # Selecting windows: drop_remainder=True keeps only full-length windows,
    # and flat_map/batch turns each nested window dataset into one tensor
    data = data.window(total_size, shift=1, drop_remainder=True)
    data = data.flat_map(lambda k: k.batch(total_size))

    # Shuffling data (seed=Answer to the Ultimate Question of Life, the Universe, and Everything)
    data = data.shuffle(shuffle_buffer_size, seed=42)

    def split_window(k):
        # Extracting past features + deterministic future + labels
        past = k[:window_size]
        future = k[window_size:, first_deterministic_col:]
        labels = k[window_size:, 0]
        return (past, future), labels

    data = data.map(split_window)

    return data.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

`tf.data.Dataset.window()` returns a dataset of "windows".

Each "window" is a dataset that contains a subset of elements of the input dataset. These are finite datasets of size `size` (or possibly fewer elements, if there are not enough input elements to fill the window and `drop_remainder` evaluates to `False`).

See [here](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#window)
```
window(
    size, shift=None, stride=1, drop_remainder=False
)


```

In [23]:
# NOTE(review): this rebinds `dataset`, which until now held the preprocessed
# DataFrame built by the pipe chain earlier; re-run that cell before using the
# DataFrame again.
# With only a size given, the range 0..6 is cut into windows of up to 3 elements.
dataset = tf.data.Dataset.range(7).window(3)


In [27]:
# Each element of a windowed dataset is itself a Dataset, so printing it shows
# the dataset's repr rather than the values it holds.
for window in dataset:
  print(window)

<_VariantDataset shapes: (), types: tf.int64>
<_VariantDataset shapes: (), types: tf.int64>
<_VariantDataset shapes: (), types: tf.int64>


In [29]:
# Show the values held by the first window only.
for window in dataset:
    print([element.numpy() for element in window])
    break


[0, 1, 2]


Shift<br>
The shift argument determines the number of input elements to shift between the start of each window. If windows and elements are both numbered starting at 0, the first element in window k will be element k * shift of the input dataset. In particular, the first element of the first window will always be the first element of the input dataset.

In [32]:
# Overlapping windows: 3 elements each, advancing one element per window and
# keeping only the windows that are completely filled.
dataset = tf.data.Dataset.range(7).window(size=3, shift=1, drop_remainder=True)
for window in dataset:
    print(list(window.as_numpy_iterator()))

[0, 1, 2]
[1, 2, 3]
[2, 3, 4]
[3, 4, 5]
[4, 5, 6]


In [59]:
# Seed NumPy's global generator so this toy frame, and every window printed
# from it further down, is the same on each run.
np.random.seed(42)
# 20 rows: 'a' drawn from 2..8 and 'b' from 10..19 (randint's upper bound is exclusive)
dx = pd.DataFrame({'a' : np.random.randint(2,9,20), 'b' : np.random.randint(10,20,20)})
dx

Unnamed: 0,a,b
0,3,13
1,3,14
2,6,13
3,3,10
4,6,14
5,6,11
6,5,16
7,8,19
8,5,18
9,6,11


In [61]:
# Slice the frame row by row: every dataset element is one row of dx (two values)
data1 = tf.data.Dataset.from_tensor_slices(dx.values)

In [62]:
data1

<TensorSliceDataset shapes: (2,), types: tf.int64>

In [63]:
list(data1.as_numpy_iterator())

[array([ 3, 13]),
 array([ 3, 14]),
 array([ 6, 13]),
 array([ 3, 10]),
 array([ 6, 14]),
 array([ 6, 11]),
 array([ 5, 16]),
 array([ 8, 19]),
 array([ 5, 18]),
 array([ 6, 11]),
 array([ 5, 12]),
 array([ 6, 17]),
 array([ 3, 16]),
 array([ 3, 12]),
 array([ 4, 19]),
 array([ 6, 15]),
 array([ 7, 11]),
 array([ 3, 10]),
 array([ 5, 14]),
 array([ 5, 11])]

In [66]:
# shift=1 starts a new window at every row; with drop_remainder=False the
# shorter windows at the very end are kept as well
data = data1.window(3, shift=1, drop_remainder= False)

In [67]:
# Print the rows held by each window, one window per line
for window in data:
    print([row for row in window.as_numpy_iterator()])

[array([ 3, 13]), array([ 3, 14]), array([ 6, 13])]
[array([ 3, 14]), array([ 6, 13]), array([ 3, 10])]
[array([ 6, 13]), array([ 3, 10]), array([ 6, 14])]
[array([ 3, 10]), array([ 6, 14]), array([ 6, 11])]
[array([ 6, 14]), array([ 6, 11]), array([ 5, 16])]
[array([ 6, 11]), array([ 5, 16]), array([ 8, 19])]
[array([ 5, 16]), array([ 8, 19]), array([ 5, 18])]
[array([ 8, 19]), array([ 5, 18]), array([ 6, 11])]
[array([ 5, 18]), array([ 6, 11]), array([ 5, 12])]
[array([ 6, 11]), array([ 5, 12]), array([ 6, 17])]
[array([ 5, 12]), array([ 6, 17]), array([ 3, 16])]
[array([ 6, 17]), array([ 3, 16]), array([ 3, 12])]
[array([ 3, 16]), array([ 3, 12]), array([ 4, 19])]
[array([ 3, 12]), array([ 4, 19]), array([ 6, 15])]
[array([ 4, 19]), array([ 6, 15]), array([ 7, 11])]
[array([ 6, 15]), array([ 7, 11]), array([ 3, 10])]
[array([ 7, 11]), array([ 3, 10]), array([ 5, 14])]
[array([ 3, 10]), array([ 5, 14]), array([ 5, 11])]
[array([ 5, 14]), array([ 5, 11])]
[array([ 5, 11])]


If there are not enough input elements to fill a window and `drop_remainder` evaluates to `True`, the incomplete trailing windows are dropped.

In [68]:
# Same sliding windows, but drop_remainder=True discards the incomplete ones
data = data1.window(3, shift=1, drop_remainder= True)

In [69]:
# Print the rows held by each window, one window per line
for window in data:
    print([row for row in window.as_numpy_iterator()])

[array([ 3, 13]), array([ 3, 14]), array([ 6, 13])]
[array([ 3, 14]), array([ 6, 13]), array([ 3, 10])]
[array([ 6, 13]), array([ 3, 10]), array([ 6, 14])]
[array([ 3, 10]), array([ 6, 14]), array([ 6, 11])]
[array([ 6, 14]), array([ 6, 11]), array([ 5, 16])]
[array([ 6, 11]), array([ 5, 16]), array([ 8, 19])]
[array([ 5, 16]), array([ 8, 19]), array([ 5, 18])]
[array([ 8, 19]), array([ 5, 18]), array([ 6, 11])]
[array([ 5, 18]), array([ 6, 11]), array([ 5, 12])]
[array([ 6, 11]), array([ 5, 12]), array([ 6, 17])]
[array([ 5, 12]), array([ 6, 17]), array([ 3, 16])]
[array([ 6, 17]), array([ 3, 16]), array([ 3, 12])]
[array([ 3, 16]), array([ 3, 12]), array([ 4, 19])]
[array([ 3, 12]), array([ 4, 19]), array([ 6, 15])]
[array([ 4, 19]), array([ 6, 15]), array([ 7, 11])]
[array([ 6, 15]), array([ 7, 11]), array([ 3, 10])]
[array([ 7, 11]), array([ 3, 10]), array([ 5, 14])]
[array([ 3, 10]), array([ 5, 14]), array([ 5, 11])]


In [70]:
# shift=None: consecutive windows do not overlap (each one starts 3 rows after
# the previous one); the final, shorter window is kept
data = data1.window(3, shift=None, drop_remainder= False)

In [71]:
# Print the rows held by each window, one window per line
for window in data:
    print([row for row in window.as_numpy_iterator()])

[array([ 3, 13]), array([ 3, 14]), array([ 6, 13])]
[array([ 3, 10]), array([ 6, 14]), array([ 6, 11])]
[array([ 5, 16]), array([ 8, 19]), array([ 5, 18])]
[array([ 6, 11]), array([ 5, 12]), array([ 6, 17])]
[array([ 3, 16]), array([ 3, 12]), array([ 4, 19])]
[array([ 6, 15]), array([ 7, 11]), array([ 3, 10])]
[array([ 5, 14]), array([ 5, 11])]


If there are not enough input elements to fill a window and `drop_remainder` evaluates to `True`, the incomplete trailing windows are dropped.

In [72]:
# Non-overlapping windows of 3 rows; the incomplete last window is dropped
data = data1.window(size=3, shift=None, drop_remainder=True)
for window in data:
    print([row for row in window.as_numpy_iterator()])

[array([ 3, 13]), array([ 3, 14]), array([ 6, 13])]
[array([ 3, 10]), array([ 6, 14]), array([ 6, 11])]
[array([ 5, 16]), array([ 8, 19]), array([ 5, 18])]
[array([ 6, 11]), array([ 5, 12]), array([ 6, 17])]
[array([ 3, 16]), array([ 3, 12]), array([ 4, 19])]
[array([ 6, 15]), array([ 7, 11]), array([ 3, 10])]


In [73]:
# Collapse each window (a nested dataset) into a single tensor holding its 3 rows.
# NOTE(review): this rebinds `data`, so the cell is not re-runnable on its own --
# presumably a second run fails because the elements are then tensors without
# a .batch() method; re-run the window() cell first.
data = data.flat_map(lambda k: k.batch(3))

In [77]:
print(list(data.as_numpy_iterator()))

[array([[ 3, 13],
       [ 3, 14],
       [ 6, 13]]), array([[ 3, 10],
       [ 6, 14],
       [ 6, 11]]), array([[ 5, 16],
       [ 8, 19],
       [ 5, 18]]), array([[ 6, 11],
       [ 5, 12],
       [ 6, 17]]), array([[ 3, 16],
       [ 3, 12],
       [ 4, 19]]), array([[ 6, 15],
       [ 7, 11],
       [ 3, 10]])]


In [78]:
# Shuffle buffer size: the number of rows in dx (20), which exceeds the 6
# windows currently in `data`, so the whole dataset fits in the buffer
shuffle_buffer_size = len(dx)

In [79]:
# Shuffle the windows; seed=42 fixes the random seed used for the shuffling
data = data.shuffle(shuffle_buffer_size, seed=42)

In [80]:
print(list(data.as_numpy_iterator()))

[array([[ 6, 11],
       [ 5, 12],
       [ 6, 17]]), array([[ 5, 16],
       [ 8, 19],
       [ 5, 18]]), array([[ 3, 10],
       [ 6, 14],
       [ 6, 11]]), array([[ 3, 16],
       [ 3, 12],
       [ 4, 19]]), array([[ 6, 15],
       [ 7, 11],
       [ 3, 10]]), array([[ 3, 13],
       [ 3, 14],
       [ 6, 13]])]
