## WindowGenerator

Windowing calculator - adapted from TensorFlow tutorial at:

https://www.tensorflow.org/tutorials/structured_data/time_series

Defines a windowing tool that can be applied to any 2D DataFrame containing the configured label column(s). Supports single-label and multi-label targets.

Transforms input into a time-stepped dataset prepared for supervised learning analysis.

In [None]:
import numpy as np
import tensorflow as tf

class TfWindowGenerator():
  """
  Slice a 2D DataFrame into fixed-size sliding windows for supervised learning.

  Every window spans ``input_width + shift`` consecutive rows. The first
  ``input_width`` rows of a window are the inputs; the last ``label_width``
  rows, restricted to ``label_columns``, are the labels.

  Construct a TfWindowGenerator that operates with the following params:
  * input_width - the number of time steps to use as the input window
  * label_width - the number of time steps to use as output
  * shift - offset that places the output prediction along the window;
  intended to be equal to or greater than the label_width (not enforced)
  * label_columns - a single column name (str) or a list of column names
  * debug - when True, print slices, indices and shapes as they are computed

  Raises AssertionError when label_columns is None and ValueError when
  label_width does not fit inside a window.
  """
  def __init__(self, input_width, label_width, shift,
               label_columns, debug=False):
    # GUARDs
    if label_columns is None:
      raise AssertionError('label_columns must be specified')
    # A bare string would otherwise be iterated character by character
    # when the label columns are looked up in generate().
    if isinstance(label_columns, str):
      label_columns = [label_columns]
    # A label window longer than the frame would make label_start negative
    # and silently return fewer label rows than requested.
    if label_width > input_width + shift:
      raise ValueError(
          f'label_width ({label_width}) must not exceed '
          f'input_width + shift ({input_width + shift})')

    self.debug = debug

    # Defines expected labels in the dataset
    self.label_columns = label_columns

    # Assess window parameters
    self.input_width = input_width
    self.label_width = label_width
    self.shift = shift
    # Total number of data rows to extract per window/frame
    self.total_window_size = input_width + shift
    # Index within a window/frame to look for the label(s)
    self.label_start = self.total_window_size - self.label_width

    # Build slicers which will operate on stacked time-frames
    self.input_slice = slice(0, input_width)
    self.labels_slice = slice(self.label_start, self.total_window_size)

    # Indexes into window/frame
    self.input_indices = np.arange(self.total_window_size)[self.input_slice]
    self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

    # Results of the most recent generate() call; None until it has run
    self.inputs = None
    self.labels = None

    if debug:
      print('\n'.join([
         f'input_slice: {self.input_slice}',
         f'labels_slice: {self.labels_slice}',
         f'input_indices: {self.input_indices}',
         f'label_indices: {self.label_indices}'
      ]))

  def __repr__(self):
    """
    Return a multi-line summary of the window layout and label columns.
    """
    return '\n'.join([
        f'Total window size: {self.total_window_size}',
        f'Input indices: {self.input_indices}',
        f'Label indices: {self.label_indices}',
        f'Label column name(s): {self.label_columns}'])

  def generate(self, features):
    """
    Given a feature set of normalized values, create a windowed stack of
    inputs with output labels split out.

    features must expose ``.shape`` and ``.columns`` (e.g. a DataFrame).
    One frame is produced for every row position at which a full window
    fits, so ``features.shape[0] - total_window_size + 1`` frames in all.

    Returns (inputs, labels):
    * inputs - array of shape (frames, input_width, all columns); the label
      columns are NOT removed from the inputs
    * labels - array of shape (frames, label_width, label columns)

    The result is also stored on the instance (see get_results).

    Raises ValueError when features has too few rows for a single window
    and KeyError when a configured label column is missing.
    """
    window_size = self.total_window_size
    num_rows = features.shape[0]
    num_frames = num_rows - window_size + 1
    if num_frames < 1:
      raise ValueError(
          f'features has {num_rows} row(s) but a single window needs '
          f'{window_size}')

    # Map column name -> positional index, and fail early with a readable
    # message if a label column is absent.
    column_indices = {name: i for i, name in enumerate(features.columns)}
    missing = [name for name in self.label_columns
               if name not in column_indices]
    if missing:
      raise KeyError(f'label column(s) not found in features: {missing}')

    # Restack data into frames the size of our total window.
    # row_index[f, k] is the source row of step k in frame f, so a single
    # fancy-index builds the (frames, window, columns) stack without a
    # Python-level loop over DataFrame slices.
    values = np.asarray(features)
    row_index = np.arange(num_frames)[:, None] + np.arange(window_size)
    frames = values[row_index]

    if self.debug:
      print (f'Frames: {frames.shape}')
      print (f'First frame: {frames[0]}')
      print (f'Last frame: {frames[-1]}')

    # Extract the stack of input frames
    #   NOTE This still includes the label columns
    if self.debug:
      print(f'Slicing inputs with: {self.input_slice}')
    inputs = frames[:, self.input_slice]

    # Extract the stack of label frames (all columns for now)
    if self.debug:
      print(f'Slicing labels with: {self.labels_slice}')
    labels = frames[:, self.labels_slice]

    if self.debug:
      print(f'--- Inputs ---\n{inputs.shape}')
      print(f'--- Labels ---\n{labels.shape}')
      print(f'enumerating columns: {features.columns}')
      print(f'Column indices: {column_indices}')

    # Locate the label columns, for extraction
    label_indices = [column_indices[name] for name in self.label_columns]
    # Extract labels! Keeping rest of the shape
    labels = np.take(labels, label_indices, axis=2)

    if self.debug:
      print('\n'.join([
          'Returned shapes::',
          f'\tInputs: {inputs.shape}',
          f'\tLabels: {labels.shape}'
      ]))

    # Store the last dataset
    self.inputs = inputs
    self.labels = labels

    return inputs, labels

  def get_results(self):
    """
    Return (inputs, labels) from the most recent generate() call, or
    (None, None) if generate() has not been called yet.
    """
    return self.inputs, self.labels

  def make_dataset(self, inputs=None, labels=None, df=None):
    """
    Create a TF dataset from pre-windowed arrays or directly from a df.

    * inputs, labels - arrays as returned by generate(); used when df is None
    * df - optional DataFrame; when given, it is windowed with generate()
      (which also replaces the stored results), the arrays are cast to
      float32, and any inputs/labels arguments are ignored

    Examples:
      train_ds = win.make_dataset(inputs, labels)
      train_ds = win.make_dataset(df=train_df)

    The dataset is built unshuffled with a stride of 1 and a batch size
    of 32.

    Raises ValueError when neither df nor both inputs and labels are given.
    """
    if df is not None:
      inputs, labels = self.generate(df)
      inputs = np.asarray(inputs, dtype=np.float32)
      labels = np.asarray(labels, dtype=np.float32)
    elif inputs is None or labels is None:
      raise ValueError('provide either df or both inputs and labels')

    # NOTE(review): inputs coming from generate() are already windowed to
    # (frames, input_width, columns); per the Keras docs this call slides a
    # further window of sequence_length over the first axis. Confirm that
    # this second level of windowing is intended.
    ds = tf.keras.utils.timeseries_dataset_from_array(
        data=inputs,
        targets=labels,
        sequence_length=self.input_width,
        sequence_stride=1,
        shuffle=False,
        batch_size=32,)

    return ds


## Unit testing

In [None]:
if True:  # flip to False to skip this smoke-test cell
  from datetime import datetime as dt
  import datetime
  import pandas as pd

  # Synthetic fixture: NUM_PTS consecutive daily rows with three columns.
  NUM_PTS = 11
  date_start = dt.strptime("1/1/11", "%m/%d/%y")
  steps = range(1, NUM_PTS + 1)

  a = [chr(96 + t) for t in steps]       # 'a', 'b', 'c', ...
  b = list(steps)                        # 1, 2, 3, ...
  c = [t * 0.125 for t in steps]         # 0.125, 0.25, ...
  idx = [date_start + datetime.timedelta(days=t) for t in steps]

  df = pd.DataFrame({'A': a, 'B': b, 'C': c}, index=idx)

  # Label encode the target column using the characters' code points
  df['A'] = df['A'].apply(ord)

  # Scaling is intentionally left out so the printed values stay readable.

  print (df)
  print(f'Shape: {df.shape}')

  # Case 1 (left disabled): constructing without label_columns is rejected
  #win = TfWindowGenerator(input_width=4, label_width=1, shift=1)

  # Case 2: one label time step, one label column
  win = TfWindowGenerator(
      input_width=4,
      label_width=1,
      shift=1,
      label_columns=['A'],
      debug=True)
  print('---------\n', win)

  # Split X/y
  inputs, labels = win.generate(df)

  print(inputs.shape)
  print(labels.shape)
  print(labels)

  ds = win.make_dataset(inputs, labels)

  # Dump every batch of the resulting dataset
  for X, y in ds:
    print(f'-----X-----------\n{X}')
    print(f'-----y-----------\n{y}')


              A   B      C
2011-01-02   97   1  0.125
2011-01-03   98   2  0.250
2011-01-04   99   3  0.375
2011-01-05  100   4  0.500
2011-01-06  101   5  0.625
2011-01-07  102   6  0.750
2011-01-08  103   7  0.875
2011-01-09  104   8  1.000
2011-01-10  105   9  1.125
2011-01-11  106  10  1.250
2011-01-12  107  11  1.375
Shape: (11, 3)
input_slice: slice(0, 4, None)
labels_slice: slice(4, 5, None)
input_indices: [0 1 2 3]
label_indices: [4]
---------
 Total window size: 5
Input indices: [0 1 2 3]
Label indices: [4]
Label column name(s): ['A']
Frames: (7, 5, 3)
First frame: [[ 97.      1.      0.125]
 [ 98.      2.      0.25 ]
 [ 99.      3.      0.375]
 [100.      4.      0.5  ]
 [101.      5.      0.625]]
Last frame: [[103.      7.      0.875]
 [104.      8.      1.   ]
 [105.      9.      1.125]
 [106.     10.      1.25 ]
 [107.     11.      1.375]]
Slicing inputs with: slice(0, 4, None)
Slicing labels with: slice(4, 5, None)
--- Inputs ---
(7, 4, 3)
--- Labels ---
(7, 1, 3)
enumera

In [None]:
# Bare expression: the notebook renders the current contents of df.
df

Unnamed: 0,A,B,C
2011-01-02,97,1,0.125
2011-01-03,98,2,0.25
2011-01-04,99,3,0.375
2011-01-05,100,4,0.5
2011-01-06,101,5,0.625
2011-01-07,102,6,0.75
2011-01-08,103,7,0.875
2011-01-09,104,8,1.0
2011-01-10,105,9,1.125
2011-01-11,106,10,1.25


In [None]:
# Target vector: the values of column 'A' as a NumPy array.
y = df['A'].to_numpy()

In [None]:
# Feature matrix: every column except the target 'A', as a NumPy array.
X = df.drop(columns=['A']).to_numpy()

In [None]:
# Build a windowed dataset straight from the raw arrays, bypassing
# TfWindowGenerator: 5-row sliding windows over X with y as the targets.
# NOTE(review): per the Keras docs the target paired with a window is the
# entry of `targets` at the window's FIRST row, not a future step — confirm
# this alignment is what is wanted for forecasting.
ds = tf.keras.utils.timeseries_dataset_from_array(
    data=X,
    targets=y,
    sequence_length=5)


In [None]:
# Print the feature windows and the targets of every batch in the dataset.
for X_b, y_b in ds:
  print(f'-----X_b-----------\n{X_b}')
  print(f'-----y_b-----------\n{y_b}')


-----X_b-----------
[[[ 1.     0.125]
  [ 2.     0.25 ]
  [ 3.     0.375]
  [ 4.     0.5  ]
  [ 5.     0.625]]

 [[ 2.     0.25 ]
  [ 3.     0.375]
  [ 4.     0.5  ]
  [ 5.     0.625]
  [ 6.     0.75 ]]

 [[ 3.     0.375]
  [ 4.     0.5  ]
  [ 5.     0.625]
  [ 6.     0.75 ]
  [ 7.     0.875]]

 [[ 4.     0.5  ]
  [ 5.     0.625]
  [ 6.     0.75 ]
  [ 7.     0.875]
  [ 8.     1.   ]]

 [[ 5.     0.625]
  [ 6.     0.75 ]
  [ 7.     0.875]
  [ 8.     1.   ]
  [ 9.     1.125]]

 [[ 6.     0.75 ]
  [ 7.     0.875]
  [ 8.     1.   ]
  [ 9.     1.125]
  [10.     1.25 ]]

 [[ 7.     0.875]
  [ 8.     1.   ]
  [ 9.     1.125]
  [10.     1.25 ]
  [11.     1.375]]]
-----y_b-----------
[ 97  98  99 100 101 102 103]
