## WindowGenerator

Windowing calculator - adapted from TensorFlow tutorial at:

https://www.tensorflow.org/tutorials/structured_data/time_series

Defines a windowing tool which can be applied to any given 2D DataFrame with the configured label column(s). Supports single or multi-label targets.

Transforms input into a time-stepped dataset prepared for supervised learning analysis.

Outputs a 3D dataset:  (batch, time, features)

where `time` is the number of time steps per window, e.g. when training on a 1-year lookback over monthly data, the `time` dimension is 12.

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.data import Dataset

class TfWindowGenerator():
  """
  Sliding-window generator for supervised time-series learning.

  Slices a 2D table (rows = time steps, columns = features) into overlapping
  input windows and the label windows that go with them.

  Construct a TfWindowGenerator that operates with the following params:
  * input_width - the number of time steps to use as the input window
  * label_width - the number of time steps to use as output
  * shift - offset of the first label step relative to the end of the input
  window; the default of 1 puts the first label on the step right after the
  last input step
  * batch_size - batch size applied to the tf.data datasets built by
  get_dataset() / get_ds_from_arrays()
  * debug - when True, print window shapes plus the first/last windows

  Label column name(s) are not constructor params; they are passed per call
  to generate() / get_dataset().
  """
  def __init__(self, input_width, label_width, shift=1, batch_size=32, debug=False):
    # GUARD - AssertionError (not ValueError) is kept on purpose: existing
    # callers catch this exact type.
    if input_width is None or label_width is None:
      raise AssertionError('input_width and label_width are required')

    self.debug = debug

    # Assess window parameters
    self.input_width = input_width
    self.label_width = label_width
    self.shift = shift
    self.batch_size = batch_size

    # Index within a window/frame to look for the label(s)
    self.label_start = input_width + shift - 1
    # Total number of data rows one window/frame must span. It has to reach
    # the END of the label slice, otherwise shift > 1 would slice labels past
    # the end of the data and yield ragged label windows. The max() keeps the
    # historical value (input_width + label_width) for shift <= 1.
    self.total_window_size = max(input_width + label_width,
                                 self.label_start + label_width)


  def __repr__(self):
    """
    Return a multi-line, human-readable summary of the window settings.
    """
    return '\n'.join([
        'Window Generator::',
        f'\tInput size: {self.input_width}',
        f'\tLabel size: {self.label_width}',
        f'\tShift: {self.shift}',
        f'\tBatch size: {self.batch_size}',
        f'\tTotal window size: {self.total_window_size}',
        f'\tLabel start: {self.label_start}',
        ])


  def generate(self, df, label_cols, retain_labels=False):
    """
    Given a DataFrame of (already normalized) values, create a windowed stack
    of inputs with the output labels split out.

    * df - 2D DataFrame, one row per time step
    * label_cols - label column name, or list of label column names
    * retain_labels - if set, the label columns are ALSO kept as input
    features; otherwise they are dropped from the features

    Returns (inputs, outputs) as produced by generate_from_arrays().
    """
    # A bare string is treated as a single label column so the label output
    # keeps its trailing "label column" axis.
    if isinstance(label_cols, str):
      label_cols = [label_cols]
    else:
      label_cols = list(label_cols)

    df_labels = df[label_cols]
    df_features = df if retain_labels else df.drop(columns=label_cols)

    return self.generate_from_arrays(df_features.values, df_labels.values)


  def generate_from_arrays(self, input_arr, label_arr):
    """
    Window two row-aligned arrays (or array-likes) into frames.

    * input_arr - feature rows, indexed by time step on axis 0
    * label_arr - label rows, indexed by the same time steps on axis 0

    Returns (frames, labels) as float32 arrays:
    * frames - (num_windows, input_width, ...feature axes)
    * labels - (num_windows, label_width, ...label axes)

    When there are too few rows for a single window, both arrays are returned
    with num_windows == 0.

    Raises ValueError if label_arr has too few rows to fill every window.
    """
    # Convert first so plain lists are accepted and .shape is always available
    input_arr = np.asarray(input_arr).astype('float32')
    label_arr = np.asarray(label_arr).astype('float32')
    print(f'input_arr: {input_arr.shape}')
    print(f'label_arr: {label_arr.shape}')

    # Restack data into frames the size of our total window
    num_frames = input_arr.shape[0] - self.total_window_size + 1

    if num_frames <= 0:
      # Not enough rows for one window: return empty, correctly shaped arrays
      frames = np.empty((0, self.input_width) + input_arr.shape[1:], dtype='float32')
      labels = np.empty((0, self.label_width) + label_arr.shape[1:], dtype='float32')
    else:
      # Last label row (exclusive) touched by the final window
      rows_needed = (num_frames - 1) + self.label_start + self.label_width
      if label_arr.shape[0] < rows_needed:
        raise ValueError(
            f'label_arr has {label_arr.shape[0]} rows but {rows_needed} are '
            f'needed to label {num_frames} windows')

      frames = np.stack([
          input_arr[r:r + self.input_width]
          for r in range(num_frames)
      ])
      labels = np.stack([
          label_arr[self.label_start + r:self.label_start + r + self.label_width]
          for r in range(num_frames)
      ])

    if self.debug:
      print (f'Frames: {frames.shape}')
      if num_frames > 0:
        print (f'First frame:\n{frames[0]}')
        print (f'Last frame:\n{frames[-1]}')
      print (f'Labels: {labels.shape}')
      if num_frames > 0:
        print (f'First label:\n{labels[0]}')
        print (f'Last label:\n{labels[-1]}')

      print('\n'.join([
          'Returned shapes::',
          f'\tInputs: {frames.shape}',
          f'\tLabels: {labels.shape}'
      ]))

    return frames, labels


  def _to_dataset(self, inputs, labels):
    """
    Pair up window/label arrays as (X, y) elements and batch them.
    """
    # Slicing a tuple yields the same (X, y) element structure as zipping two
    # datasets, and does not depend on the Dataset.zip calling convention.
    ds = tf.data.Dataset.from_tensor_slices((inputs, labels))
    return ds.batch(self.batch_size)


  def get_dataset(self, df, label_cols):
    """
    Window a DataFrame (see generate()) and return it as a batched
    tf.data.Dataset of (inputs, labels) pairs.
    """
    inputs, labels = self.generate(df, label_cols)
    print(f'## Inputs: {type(inputs)}')
    print(f'## Labels: {type(labels)}')

    return self._to_dataset(inputs, labels)


  def get_ds_from_arrays(self, input_arr, label_arr):
    """
    Window two arrays (see generate_from_arrays()) and return them as a
    batched tf.data.Dataset of (inputs, labels) pairs.
    """
    inputs, labels = self.generate_from_arrays(input_arr, label_arr)
    print(f'## Inputs: {inputs.shape}')
    print(f'## Labels: {labels.shape}')

    # Guarantee at least a 2D label tensor
    if (len(labels.shape) < 2):
      labels = labels.reshape(-1,1)

    return self._to_dataset(inputs, labels)


---

**Unit testing**

---

In [2]:
# Master switch for the notebook's self-test cells: each test cell is wrapped
# in `if WG_UNIT_TEST:` and is skipped while this is False.
WG_UNIT_TEST = False

In [3]:
# Test fixture: build a small 22-row, 3-column DataFrame `df` with a daily
# date index, used by the test-case cells.
if WG_UNIT_TEST:
  from datetime import datetime as dt
  import datetime
  import pandas as pd

  # Column buffers (A, B, C) and the date index
  a = []
  b = []
  c = []
  idx = []
  NUM_PTS = 22
  date_start = dt.strptime("1/1/11", "%m/%d/%y")

  for t in range(1,NUM_PTS+1):
    # chr(96+t) maps t=1..22 onto the lowercase letters 'a'..'v'
    a.append(chr(96+t))
    b.append(t)
    c.append(t * 0.125)
    # One row per day, starting the day after date_start
    idx.append(date_start + datetime.timedelta(days=t))

  df = pd.DataFrame({'A':a,
                    'B': b,
                    'C':c},
                  index=idx)

  #print (df)

  # Label encode our target vals - going to use plain old py char vals
  # (ord() turns 'a'..'v' into 97..118)
  df['A'] = df['A'].apply(lambda x: ord(x))

  # Scale values (disabled - the tests run on the raw values)
  #df_mean = df.mean()
  #df_std = df.std()
  #df = (df - df_mean) / df_std

  print (df)
  print(f'Shape: {df.shape}')

              A   B      C
2011-01-02   97   1  0.125
2011-01-03   98   2  0.250
2011-01-04   99   3  0.375
2011-01-05  100   4  0.500
2011-01-06  101   5  0.625
2011-01-07  102   6  0.750
2011-01-08  103   7  0.875
2011-01-09  104   8  1.000
2011-01-10  105   9  1.125
2011-01-11  106  10  1.250
2011-01-12  107  11  1.375
2011-01-13  108  12  1.500
2011-01-14  109  13  1.625
2011-01-15  110  14  1.750
2011-01-16  111  15  1.875
2011-01-17  112  16  2.000
2011-01-18  113  17  2.125
2011-01-19  114  18  2.250
2011-01-20  115  19  2.375
2011-01-21  116  20  2.500
2011-01-22  117  21  2.625
2011-01-23  118  22  2.750
Shape: (22, 3)


In [4]:
if WG_UNIT_TEST:
  # Case 1: FAIL: Create failed windower - raises AssertionError
  # (input_width=None must be rejected by the constructor guard)
  IN_WIDTH=None
  LAB_WIDTH=1
  SHIFT=1
  LAB_COLS=['A']
  print('--- Case 1 ------\n', f'input_width={IN_WIDTH}, label_width={LAB_WIDTH}, shift={SHIFT}, label_columns={LAB_COLS}')
  try:
    # Construct with the same values the banner above reports
    win = TfWindowGenerator(input_width=IN_WIDTH, label_width=LAB_WIDTH, shift=SHIFT)
  except AssertionError:
    print ('Correct outcome - assert error')
  else:
    # Make a missing guard visible instead of passing silently
    print ('FAILED - expected an AssertionError but none was raised')

--- Case 1 ------
 input_width=None, label_width=1, shift=1, label_columns=['A']
Correct outcome - assert error


In [5]:
if WG_UNIT_TEST:
  # Case 2: Create windower for single label timeframe, single label column
  # Calls generate() directly and inspects the raw (inputs, labels) arrays.
  IN_WIDTH=4
  LAB_WIDTH=1
  SHIFT=1
  LAB_COLS=['A']
  win = TfWindowGenerator(input_width=IN_WIDTH, label_width=LAB_WIDTH, shift=SHIFT, batch_size=4, debug=True)
  print('--- Case 2 ------\n', f'input_width={IN_WIDTH}, label_width={LAB_WIDTH}, shift={SHIFT}, label_columns={LAB_COLS}')
  print(win)

  # Split X/y
  inputs, labels = win.generate(df, LAB_COLS)

  print(f'Generated inputs: {inputs.shape} , labels: {labels.shape}')
  # Show only the first (window, label) pair
  for batch in zip(inputs,labels):
    X, y = batch
    print(f'-----X-----------\n{X}')
    print(f'-----y-----------\n{y}')
    break

--- Case 2 ------
 input_width=4, label_width=1, shift=1, label_columns=['A']
Window Generator::
	Input size: 4
	Label size: 1
	Shift: 1
	Batch size: 4
	Total window size: 5
	Label start: 4
input_arr: (22, 2)
label_arr: (22, 1)
Frames: (18, 4, 2)
First frame:
[[1.    0.125]
 [2.    0.25 ]
 [3.    0.375]
 [4.    0.5  ]]
Last frame:
[[18.     2.25 ]
 [19.     2.375]
 [20.     2.5  ]
 [21.     2.625]]
Labels: (18, 1, 1)
First label:
[[101.]]
Last label:
[[118.]]
Returned shapes::
	Inputs: (18, 4, 2)
	Labels: (18, 1, 1)
Generated inputs: (18, 4, 2) , labels: (18, 1, 1)
-----X-----------
[[1.    0.125]
 [2.    0.25 ]
 [3.    0.375]
 [4.    0.5  ]]
-----y-----------
[[101.]]


In [6]:
if WG_UNIT_TEST:
  # Case 3 Create dataset, single label column
  # Same window settings as Case 2, but goes through get_dataset() so the
  # result is a batched dataset rather than raw arrays.
  IN_WIDTH=4
  LAB_WIDTH=1
  SHIFT=1
  LAB_COLS=['A']
  win = TfWindowGenerator(input_width=IN_WIDTH, label_width=LAB_WIDTH, shift=SHIFT, batch_size=4, debug=True)
  print('--- Case 3 ------\n', f'input_width={IN_WIDTH}, label_width={LAB_WIDTH}, shift={SHIFT}, label_columns={LAB_COLS}')
  print(win)

  print('Making/iterating dataset...')
  ds = win.get_dataset(df, LAB_COLS)
  # Show only the first batch (batch_size=4 windows)
  for batch in ds:
    X, y = batch
    print(f'-----X-----------\n{X}')
    print(f'-----y-----------\n{y}')
    break

--- Case 3 ------
 input_width=4, label_width=1, shift=1, label_columns=['A']
Window Generator::
	Input size: 4
	Label size: 1
	Shift: 1
	Batch size: 4
	Total window size: 5
	Label start: 4
Making/iterating dataset...
input_arr: (22, 2)
label_arr: (22, 1)
Frames: (18, 4, 2)
First frame:
[[1.    0.125]
 [2.    0.25 ]
 [3.    0.375]
 [4.    0.5  ]]
Last frame:
[[18.     2.25 ]
 [19.     2.375]
 [20.     2.5  ]
 [21.     2.625]]
Labels: (18, 1, 1)
First label:
[[101.]]
Last label:
[[118.]]
Returned shapes::
	Inputs: (18, 4, 2)
	Labels: (18, 1, 1)
## Inputs: <class 'numpy.ndarray'>
## Labels: <class 'numpy.ndarray'>
-----X-----------
[[[1.    0.125]
  [2.    0.25 ]
  [3.    0.375]
  [4.    0.5  ]]

 [[2.    0.25 ]
  [3.    0.375]
  [4.    0.5  ]
  [5.    0.625]]

 [[3.    0.375]
  [4.    0.5  ]
  [5.    0.625]
  [6.    0.75 ]]

 [[4.    0.5  ]
  [5.    0.625]
  [6.    0.75 ]
  [7.    0.875]]]
-----y-----------
[[[101.]]

 [[102.]]

 [[103.]]

 [[104.]]]


In [7]:
if WG_UNIT_TEST:
  # Case 4 Create dataset, multi label columns
  # Two label columns (A, B) and a two-step label window; only column C is
  # left as an input feature.
  IN_WIDTH=4
  LAB_WIDTH=2
  SHIFT=1
  LAB_COLS=['A','B']
  win = TfWindowGenerator(input_width=IN_WIDTH, label_width=LAB_WIDTH, shift=SHIFT, batch_size=4,debug=True)
  print('--- Case 4 ------\n', f'input_width={IN_WIDTH}, label_width={LAB_WIDTH}, shift={SHIFT}, label_columns={LAB_COLS}')
  print(win)

  print('Making/iterating dataset...')
  ds = win.get_dataset(df, LAB_COLS)
  # Show only the first batch
  for batch in ds:
    X, y = batch
    print(f'-----X-----------\n{X}')
    print(f'-----y-----------\n{y}')
    break

--- Case 4 ------
 input_width=4, label_width=2, shift=1, label_columns=['A', 'B']
Window Generator::
	Input size: 4
	Label size: 2
	Shift: 1
	Batch size: 4
	Total window size: 6
	Label start: 4
Making/iterating dataset...
input_arr: (22, 1)
label_arr: (22, 2)
Frames: (17, 4, 1)
First frame:
[[0.125]
 [0.25 ]
 [0.375]
 [0.5  ]]
Last frame:
[[2.125]
 [2.25 ]
 [2.375]
 [2.5  ]]
Labels: (17, 2, 2)
First label:
[[101.   5.]
 [102.   6.]]
Last label:
[[117.  21.]
 [118.  22.]]
Returned shapes::
	Inputs: (17, 4, 1)
	Labels: (17, 2, 2)
## Inputs: <class 'numpy.ndarray'>
## Labels: <class 'numpy.ndarray'>
-----X-----------
[[[0.125]
  [0.25 ]
  [0.375]
  [0.5  ]]

 [[0.25 ]
  [0.375]
  [0.5  ]
  [0.625]]

 [[0.375]
  [0.5  ]
  [0.625]
  [0.75 ]]

 [[0.5  ]
  [0.625]
  [0.75 ]
  [0.875]]]
-----y-----------
[[[101.   5.]
  [102.   6.]]

 [[102.   6.]
  [103.   7.]]

 [[103.   7.]
  [104.   8.]]

 [[104.   8.]
  [105.   9.]]]


In [8]:
if WG_UNIT_TEST:
  # Case 5 Get windows input/outputs, single label column, two-step label
  # Calls generate() directly (no dataset is built despite the progress
  # message printed below).
  IN_WIDTH=4
  LAB_WIDTH=2
  SHIFT=1
  LAB_COLS=['A']
  win = TfWindowGenerator(input_width=IN_WIDTH, label_width=LAB_WIDTH, shift=SHIFT, batch_size=4, debug=True)
  print('--- Case 5 ------\n', f'input_width={IN_WIDTH}, label_width={LAB_WIDTH}, shift={SHIFT}, label_columns={LAB_COLS}')
  print(win)

  print('Making/iterating dataset...')
  ins,labs = win.generate(df, LAB_COLS)
  # Show only the first (window, label) pair
  for batch in zip(ins,labs):
    X, y = batch
    print(f'-----X-----------\n{X}')
    print(f'-----y-----------\n{y}')
    break


--- Case 5 ------
 input_width=4, label_width=2, shift=1, label_columns=['A']
Window Generator::
	Input size: 4
	Label size: 2
	Shift: 1
	Batch size: 4
	Total window size: 6
	Label start: 4
Making/iterating dataset...
input_arr: (22, 2)
label_arr: (22, 1)
Frames: (17, 4, 2)
First frame:
[[1.    0.125]
 [2.    0.25 ]
 [3.    0.375]
 [4.    0.5  ]]
Last frame:
[[17.     2.125]
 [18.     2.25 ]
 [19.     2.375]
 [20.     2.5  ]]
Labels: (17, 2, 1)
First label:
[[101.]
 [102.]]
Last label:
[[117.]
 [118.]]
Returned shapes::
	Inputs: (17, 4, 2)
	Labels: (17, 2, 1)
-----X-----------
[[1.    0.125]
 [2.    0.25 ]
 [3.    0.375]
 [4.    0.5  ]]
-----y-----------
[[101.]
 [102.]]


In [9]:
if WG_UNIT_TEST:
  # Case 6 Get windows input/outputs, single label, multi label step
  # Label window is as wide as the input window (4 steps each); calls
  # generate() directly, so no dataset is built.
  IN_WIDTH=4
  LAB_WIDTH=4
  SHIFT=1
  LAB_COLS=['A']
  win = TfWindowGenerator(input_width=IN_WIDTH, label_width=LAB_WIDTH, shift=SHIFT, batch_size=4,debug=True)
  print('--- Case 6 ------\n', f'input_width={IN_WIDTH}, label_width={LAB_WIDTH}, shift={SHIFT}, label_columns={LAB_COLS}')
  print(win)

  print('Making/iterating dataset...')
  ins,labs = win.generate(df, LAB_COLS)
  # Show only the first (window, label) pair
  for batch in zip(ins,labs):
    X, y = batch
    print(f'-----X-----------\n{X}')
    print(f'-----y-----------\n{y}')
    break

--- Case 6 ------
 input_width=4, label_width=4, shift=1, label_columns=['A']
Window Generator::
	Input size: 4
	Label size: 4
	Shift: 1
	Batch size: 4
	Total window size: 8
	Label start: 4
Making/iterating dataset...
input_arr: (22, 2)
label_arr: (22, 1)
Frames: (15, 4, 2)
First frame:
[[1.    0.125]
 [2.    0.25 ]
 [3.    0.375]
 [4.    0.5  ]]
Last frame:
[[15.     1.875]
 [16.     2.   ]
 [17.     2.125]
 [18.     2.25 ]]
Labels: (15, 4, 1)
First label:
[[101.]
 [102.]
 [103.]
 [104.]]
Last label:
[[115.]
 [116.]
 [117.]
 [118.]]
Returned shapes::
	Inputs: (15, 4, 2)
	Labels: (15, 4, 1)
-----X-----------
[[1.    0.125]
 [2.    0.25 ]
 [3.    0.375]
 [4.    0.5  ]]
-----y-----------
[[101.]
 [102.]
 [103.]
 [104.]]


In [10]:
if WG_UNIT_TEST:
  # Case 7 Create dataset, multi label columns
  # NOTE(review): same settings and same get_dataset() call as Case 4 -
  # looks like a duplicate; confirm whether generate() was intended here.
  IN_WIDTH=4
  LAB_WIDTH=2
  SHIFT=1
  LAB_COLS=['A','B']
  win = TfWindowGenerator(input_width=IN_WIDTH, label_width=LAB_WIDTH, shift=SHIFT, batch_size=4,debug=True)
  print('--- Case 7 ------\n', f'input_width={IN_WIDTH}, label_width={LAB_WIDTH}, shift={SHIFT}, label_columns={LAB_COLS}')
  print(win)

  print('Making/iterating dataset...')
  ds = win.get_dataset(df, LAB_COLS)
  # Show only the first batch
  for batch in ds:
    X, y = batch
    print(f'-----X-----------\n{X}')
    print(f'-----y-----------\n{y}')
    break


--- Case 7 ------
 input_width=4, label_width=2, shift=1, label_columns=['A', 'B']
Window Generator::
	Input size: 4
	Label size: 2
	Shift: 1
	Batch size: 4
	Total window size: 6
	Label start: 4
Making/iterating dataset...
input_arr: (22, 1)
label_arr: (22, 2)
Frames: (17, 4, 1)
First frame:
[[0.125]
 [0.25 ]
 [0.375]
 [0.5  ]]
Last frame:
[[2.125]
 [2.25 ]
 [2.375]
 [2.5  ]]
Labels: (17, 2, 2)
First label:
[[101.   5.]
 [102.   6.]]
Last label:
[[117.  21.]
 [118.  22.]]
Returned shapes::
	Inputs: (17, 4, 1)
	Labels: (17, 2, 2)
## Inputs: <class 'numpy.ndarray'>
## Labels: <class 'numpy.ndarray'>
-----X-----------
[[[0.125]
  [0.25 ]
  [0.375]
  [0.5  ]]

 [[0.25 ]
  [0.375]
  [0.5  ]
  [0.625]]

 [[0.375]
  [0.5  ]
  [0.625]
  [0.75 ]]

 [[0.5  ]
  [0.625]
  [0.75 ]
  [0.875]]]
-----y-----------
[[[101.   5.]
  [102.   6.]]

 [[102.   6.]
  [103.   7.]]

 [[103.   7.]
  [104.   8.]]

 [[104.   8.]
  [105.   9.]]]


In [12]:
# Display the test DataFrame as the cell's rich output.
# NOTE(review): `df` is only assigned inside the `if WG_UNIT_TEST:` fixture
# cell, so this presumably raises NameError while the flag is False - confirm.
df

Unnamed: 0,A,B,C
2011-01-02,97,1,0.125
2011-01-03,98,2,0.25
2011-01-04,99,3,0.375
2011-01-05,100,4,0.5
2011-01-06,101,5,0.625
2011-01-07,102,6,0.75
2011-01-08,103,7,0.875
2011-01-09,104,8,1.0
2011-01-10,105,9,1.125
2011-01-11,106,10,1.25


In [15]:
if WG_UNIT_TEST:
  # Case 8a Get ds from arrays, single label columns
  # Bypasses the DataFrame path: features and labels are handed over as
  # separate arrays via get_ds_from_arrays().
  IN_WIDTH=4
  LAB_WIDTH=1
  SHIFT=1
  LAB_COLS=['C']
  win = TfWindowGenerator(input_width=IN_WIDTH, label_width=LAB_WIDTH, shift=SHIFT, batch_size=4,debug=True)
  print('--- Case 8a ------\n', f'input_width={IN_WIDTH}, label_width={LAB_WIDTH}, shift={SHIFT}, label_columns={LAB_COLS}')
  print(win)

  # Features are columns A and B; the label is column C
  val_arr = df[['A','B']].values
  lab_arr = df[['C']].values

  # Force a single-column 2D label array
  lab_arr = lab_arr.reshape(-1,1)

  print('Get ds from arrays...')
  ds = win.get_ds_from_arrays(val_arr, lab_arr)
  # Show only the first batch
  for batch in ds:
    X, y = batch
    print(f'-----X-----------\n{X}')
    print(f'-----y-----------\n{y}')
    break

--- Case 8a ------
 input_width=4, label_width=1, shift=1, label_columns=['C']
Window Generator::
	Input size: 4
	Label size: 1
	Shift: 1
	Batch size: 4
	Total window size: 5
	Label start: 4
Get ds from arrays...
input_arr: (22, 2)
label_arr: (22, 1)
Frames: (18, 4, 2)
First frame:
[[ 97.   1.]
 [ 98.   2.]
 [ 99.   3.]
 [100.   4.]]
Last frame:
[[114.  18.]
 [115.  19.]
 [116.  20.]
 [117.  21.]]
Labels: (18, 1, 1)
First label:
[[0.625]]
Last label:
[[2.75]]
Returned shapes::
	Inputs: (18, 4, 2)
	Labels: (18, 1, 1)
## Inputs: (18, 4, 2)
## Labels: (18, 1, 1)
-----X-----------
[[[ 97.   1.]
  [ 98.   2.]
  [ 99.   3.]
  [100.   4.]]

 [[ 98.   2.]
  [ 99.   3.]
  [100.   4.]
  [101.   5.]]

 [[ 99.   3.]
  [100.   4.]
  [101.   5.]
  [102.   6.]]

 [[100.   4.]
  [101.   5.]
  [102.   6.]
  [103.   7.]]]
-----y-----------
[[[0.625]]

 [[0.75 ]]

 [[0.875]]

 [[1.   ]]]


In [13]:
if WG_UNIT_TEST:
  # Case 8b Get ds from arrays, multi label columns
  # Features and labels are handed over as separate arrays via
  # get_ds_from_arrays(); column A is deliberately present in both.
  IN_WIDTH=4
  LAB_WIDTH=2
  SHIFT=1
  LAB_COLS=['A','C']
  win = TfWindowGenerator(input_width=IN_WIDTH, label_width=LAB_WIDTH, shift=SHIFT, batch_size=4,debug=True)
  print('--- Case 8b ------\n', f'input_width={IN_WIDTH}, label_width={LAB_WIDTH}, shift={SHIFT}, label_columns={LAB_COLS}')
  print(win)

  # Features are columns A and B; labels are columns A and C
  val_arr = df[['A','B']].values
  lab_arr = df[['A','C']].values

  print('Get ds from arrays...')
  ds = win.get_ds_from_arrays(val_arr, lab_arr)
  # Show only the first batch
  for batch in ds:
    X, y = batch
    print(f'-----X-----------\n{X}')
    print(f'-----y-----------\n{y}')
    break


--- Case 8 ------
 input_width=4, label_width=2, shift=1, label_columns=['A', 'C']
Window Generator::
	Input size: 4
	Label size: 2
	Shift: 1
	Batch size: 4
	Total window size: 6
	Label start: 4
Get ds from arrays...
input_arr: (22, 2)
label_arr: (22, 2)
Frames: (17, 4, 2)
First frame:
[[ 97.   1.]
 [ 98.   2.]
 [ 99.   3.]
 [100.   4.]]
Last frame:
[[113.  17.]
 [114.  18.]
 [115.  19.]
 [116.  20.]]
Labels: (17, 2, 2)
First label:
[[101.      0.625]
 [102.      0.75 ]]
Last label:
[[117.      2.625]
 [118.      2.75 ]]
Returned shapes::
	Inputs: (17, 4, 2)
	Labels: (17, 2, 2)
## Inputs: (17, 4, 2)
## Labels: (17, 2, 2)
-----X-----------
[[[ 97.   1.]
  [ 98.   2.]
  [ 99.   3.]
  [100.   4.]]

 [[ 98.   2.]
  [ 99.   3.]
  [100.   4.]
  [101.   5.]]

 [[ 99.   3.]
  [100.   4.]
  [101.   5.]
  [102.   6.]]

 [[100.   4.]
  [101.   5.]
  [102.   6.]
  [103.   7.]]]
-----y-----------
[[[101.      0.625]
  [102.      0.75 ]]

 [[102.      0.75 ]
  [103.      0.875]]

 [[103.      0.875]