# Split Data into Features and Labels

This notebook covers how to manipulate a dataset using [tf.data.Dataset](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) and several of its transformations, such as:
- batch
- prefetch
- flat_map
- map
- shuffle
- window

In [1]:
# Open a TF1-compat interactive session whose GPU options have allow_growth
# enabled, so GPU memory is not all reserved up front.
from tensorflow.compat.v1 import ConfigProto, InteractiveSession

config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

In [2]:
# Switch the Colab runtime to TensorFlow 2.x. The magic is unavailable outside
# Colab, so any failure is deliberately ignored.
# NOTE(review): an earlier cell already imports from tensorflow before this
# magic runs - confirm the version selection still takes effect in Colab.
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Show which TensorFlow version the kernel actually loaded.
print(tf.__version__)

2.0.0


In [21]:
# Build a dataset containing the integers 0 through 9, then print each element.
# Ref: https://www.tensorflow.org/api_docs/python/tf/data/Dataset?version=stable#range
dataset = tf.data.Dataset.range(10)
for element in dataset:
    # Each element is a scalar tensor; .numpy() unwraps it to a plain value.
    print(element.numpy())

0
1
2
3
4
5
6
7
8
9


In [26]:
# Group the range dataset into sliding windows.
# window(): combines (nests of) input elements into a dataset of (nests of) windows,
# so every element of the result is itself a small dataset.
# Ref: https://www.tensorflow.org/api_docs/python/tf/data/Dataset?version=stable#window
#   size  - how many consecutive elements go into each window
#   shift - how far the window start moves between consecutive windows
dataset = tf.data.Dataset.range(10).window(size=5, shift=1)
for window_dataset in dataset:
    # One line per window: every value followed by a single space.
    print("".join(f"{val.numpy()} " for val in window_dataset))

0 1 2 3 4 
1 2 3 4 5 
2 3 4 5 6 
3 4 5 6 7 
4 5 6 7 8 
5 6 7 8 9 
6 7 8 9 
7 8 9 
8 9 
9 


In [34]:
# Window the range dataset again, this time discarding the incomplete windows.
# Ref: https://www.tensorflow.org/api_docs/python/tf/data/Dataset?version=stable#window
#
# Without drop_remainder the last four windows run past the end of the data
# and come out shorter than `size`:
#   6 7 8 9
#   7 8 9
#   8 9
#   9
dataset = tf.data.Dataset.range(10)
# drop_remainder=True drops every window that holds fewer than `size` elements.
dataset = dataset.window(size=5, shift=1, drop_remainder=True)
for window_dataset in dataset:
    for val in window_dataset:
        print(val.numpy(), end=" ")
    print()

0 1 2 3 4 
1 2 3 4 5 
2 3 4 5 6 
3 4 5 6 7 
4 5 6 7 8 
5 6 7 8 9 


In [36]:
# flat_map applies a function to every element of the dataset and flattens the
# results into a single dataset, preserving order.
# Ref: https://www.tensorflow.org/api_docs/python/tf/data/Dataset?version=stable#flat_map
# The function used here is batch(), which combines consecutive elements into
# batches. Each window is itself a dataset, and batching it with the window
# size (5) turns every window into one tensor.
dataset_flatmap = dataset.flat_map(lambda window_ds: window_ds.batch(5))
for batched_window in dataset_flatmap:
    print(batched_window.numpy())

[0 1 2 3 4]
[1 2 3 4 5]
[2 3 4 5 6]
[3 4 5 6 7]
[4 5 6 7 8]
[5 6 7 8 9]


In [37]:
# Split every batched window into a (features, label) pair.
# map() applies the given function to each element of the dataset.
# Ref: https://www.tensorflow.org/api_docs/python/tf/data/Dataset?version=stable#map
# Features are all values except the last; the label is the last value, sliced
# with [-1:] so it stays a length-1 tensor rather than a scalar.
dataset_split_feature_label = dataset_flatmap.map(
    lambda batch: (batch[:-1], batch[-1:])
)
for window_features, window_label in dataset_split_feature_label:
    print(window_features.numpy(), window_label.numpy())

[0 1 2 3] [4]
[1 2 3 4] [5]
[2 3 4 5] [6]
[3 4 5 6] [7]
[4 5 6 7] [8]
[5 6 7 8] [9]


In [40]:
# Randomise the order of the (features, label) pairs to avoid sequence bias.
# buffer_size is the number of elements held in the buffer that the shuffled
# dataset samples from.
# Ref: https://www.tensorflow.org/api_docs/python/tf/data/Dataset?version=stable#shuffle
# No seed is passed, so the printed order is expected to vary between runs.
dataset_shuffled = dataset_split_feature_label.shuffle(buffer_size=10)
for shuffled_features, shuffled_label in dataset_shuffled:
    print(shuffled_features.numpy(), shuffled_label.numpy())

[2 3 4 5] [6]
[5 6 7 8] [9]
[4 5 6 7] [8]
[0 1 2 3] [4]
[1 2 3 4] [5]
[3 4 5 6] [7]


In [52]:
# Group the shuffled (features, label) pairs into batches of 3, and prefetch
# 1 batch ahead (prefetch is applied after batch, so its unit is a batch).
# Ref: https://www.tensorflow.org/api_docs/python/tf/data/Dataset?version=stable#prefetch
dataset_by_batch = dataset_shuffled.batch(3).prefetch(1)
for features,label in dataset_by_batch:
    print("features: \n", features.numpy())
    print("labels: \n", label.numpy())

features: 
 [[5 6 7 8]
 [4 5 6 7]
 [2 3 4 5]]
labels: 
 [[9]
 [8]
 [6]]
features: 
 [[3 4 5 6]
 [1 2 3 4]
 [0 1 2 3]]
labels: 
 [[7]
 [5]
 [4]]
