In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

# The Data API
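- The Data API (`tf.data`) revolves around the `Dataset` object, which represents a sequence of data items.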

## Chaining Transformations
- While the map() method applies a transformation to each item, the apply() method applies a transformation to the dataset as a whole.
- map()
- apply()
- filter()
- take()
- shuffle()

In [3]:
# X is a 1-D tensor holding the integers 0..9.
X = tf.range(10)
# from_tensor_slices slices X along its first axis, so the dataset has one
# scalar int32 element per entry of X (see the repr displayed below).
dataset = tf.data.Dataset.from_tensor_slices(X)
dataset

<TensorSliceDataset shapes: (), types: tf.int32>

In [5]:
# A dataset is iterable: each iteration yields one element as a tf.Tensor.
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [6]:
# repeat(3) goes through the source items three times (3 x 10 = 30 items);
# batch(7) then groups them into batches of 7, leaving a final batch of 2.
# NOTE: this rebinds `dataset`, so re-running the cell stacks another
# repeat/batch on top of the already-batched dataset; re-run the cell that
# creates `dataset` first to reproduce the output below.
dataset = dataset.repeat(3).batch(7)
for item in dataset:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


In [None]:
# Display the dataset object's repr (no iteration over its items happens here).
dataset

## Interleaving lines from multiple files

In [7]:
def csv_reader_dataset(filepaths, repeat=1, n_readers=5,
                        n_read_threads=None, shuffle_buffer_size=10000,
                        n_parse_threads=5, batch_size=32):
    """Build a shuffled, batched input pipeline from line-oriented text files.

    Pipeline stages, in order: list the files, interleave their lines,
    preprocess each line, shuffle, repeat, batch, prefetch.

    Args:
        filepaths: File pattern(s) passed to ``tf.data.Dataset.list_files``.
        repeat: Passed to ``Dataset.repeat`` after shuffling.
        n_readers: ``cycle_length`` of the interleave, i.e. how many files
            are read from at the same time.
        n_read_threads: ``num_parallel_calls`` for the interleave step.
        shuffle_buffer_size: Buffer size passed to ``Dataset.shuffle``.
        n_parse_threads: ``num_parallel_calls`` for the ``map`` step.
        batch_size: Number of items per batch.

    Returns:
        The batched dataset with ``prefetch(1)`` applied.

    Note:
        ``preprocess`` is not defined in this function; it must already exist
        in the notebook namespace when this function is called.
    """
    def read_lines(filepath):
        # skip(1) drops the first line of every file
        # (presumably a CSV header row -- confirm against the data files).
        return tf.data.TextLineDataset(filepath).skip(1)

    # list_files yields the matching file paths (shuffled by default).
    path_ds = tf.data.Dataset.list_files(filepaths)

    # Read from n_readers files at a time and interleave their lines.
    line_ds = path_ds.interleave(
        read_lines,
        cycle_length=n_readers,
        num_parallel_calls=n_read_threads,
    )

    # Apply the user-defined preprocessing function to every line.
    parsed_ds = line_ds.map(preprocess, num_parallel_calls=n_parse_threads)

    # Shuffle first, then repeat.
    shuffled_ds = parsed_ds.shuffle(shuffle_buffer_size)
    repeated_ds = shuffled_ds.repeat(repeat)

    # Batch the items; prefetch(1) keeps one batch prepared ahead of the consumer.
    batched_ds = repeated_ds.batch(batch_size)
    return batched_ds.prefetch(1)

# TFRecord 
- The TFRecord format is TensorFlow’s preferred format for storing large amounts of data and reading it efficiently.

# Preprocessing the Input Features
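- The `keras.layers.Normalization` layer standardizes input features. (It was still upcoming when these notes were written; it is available in current Keras releases.)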
- With the Data API, TFRecords, the Keras preprocessing layers, and TF Transform, you can build highly scalable input pipelines for training and benefit from fast and portable data preprocessing in production.

## Encoding Categorical Features Using One-Hot Vectors
- Why use out-of-vocabulary (oov) buckets? Well, if the number of categories is large (e.g., zip codes, cities, words, products, or users) and the dataset is large as well, or it keeps changing, then getting the full list of categories may not be convenient.

```
# Initializer that pairs each key in `vocab` with the matching value in `indices`.
# NOTE(review): `vocab` and `indices` are not defined in this snippet --
# presumably the list of known categories and their integer ids; confirm.
table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices)
# Number of extra buckets reserved for out-of-vocabulary (oov) categories.
num_oov_buckets = 2
# Lookup table built from the initializer plus the oov buckets.
table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)
```



## Encoding Categorical Features Using Embeddings
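- An embedding is a trainable dense vector that represents a category. Because the embeddings are learned during training, this is called representation learning.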

## Keras Preprocessing Layers
- For preprocessing layers that must learn their state from data (e.g., Normalization and TextVectorization), you create the layer, call its adapt() method with a data sample, and then use the layer normally in your model.
- The `keras.layers.Discretization` layer chops continuous data into different bins, and each bin can then be encoded as a one-hot vector.
- TextVectorization
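    - It can output word-count vectors instead of word indices; this representation is called a bag of words.
    - It can also output TF-IDF (term frequency–inverse document frequency) weights.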

## TF Transform
- As mentioned earlier, if the dataset is small enough to fit in RAM, you can use its cache() method. But if it is too large, then tools like Apache Beam or Spark will help.
- TF Transform lets you define your preprocessing operations just once.