<center><font size="10"> 🔥The Data API🔥 </font></center>

In [39]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import os

In [40]:
X = tf.range(10)
dataset = tf.data.Dataset.from_tensor_slices(X)

In [41]:
dataset

<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

In [42]:
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [43]:
dataset = dataset.repeat(3).batch(7)

In [44]:
for item in dataset:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


In [45]:
dataset = dataset.map(lambda x: x*2)

In [46]:
dataset = dataset.apply(tf.data.experimental.unbatch())

In [47]:
dataset

<_UnbatchDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

In [48]:
dataset = dataset.filter(lambda x: x<10)

In [49]:
for item in dataset.take(3):
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)


In [50]:
dataset = tf.data.Dataset.range(10).repeat(3)
dataset = dataset.shuffle(buffer_size=5, seed=42).batch(7)

In [51]:
for item in dataset:
    print(item)

tf.Tensor([0 2 3 6 7 9 4], shape=(7,), dtype=int64)
tf.Tensor([5 0 1 1 8 6 5], shape=(7,), dtype=int64)
tf.Tensor([4 8 7 1 2 3 0], shape=(7,), dtype=int64)
tf.Tensor([5 4 2 7 8 9 9], shape=(7,), dtype=int64)
tf.Tensor([3 6], shape=(2,), dtype=int64)


#### Split the California dataset to multiple CSV files

In [52]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [53]:
housing = fetch_california_housing()

X_train_full, X_test, y_train_full, y_test = train_test_split(housing.data, housing.target.reshape(-1,1), random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full)

In [54]:
scalar = StandardScaler()
scalar.fit(X_train)
X_mean = scalar.mean_
X_std = scalar.scale_

In [55]:
def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    housing_dir = os.path.join("datasets", "housing")
    os.makedirs(housing_dir, exist_ok=True)
    path_format = os.path.join(housing_dir, "my_{}_{:02d}.csv")

    filepaths = []
    m = len(data)
    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filepaths.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(",".join([repr(col) for col in data[row_idx]]))
                f.write("\n")
    return filepaths

In [56]:
train_data = np.c_[X_train, y_train]
valid_data = np.c_[X_valid, y_valid]
test_data = np.c_[X_test, y_test]
header_cols = housing.feature_names + ["MeadianHouseValue"]
header = ",".join(header_cols)

train_filepaths = save_to_multiple_csv_files(train_data, "train", header, n_parts=20)
valid_filepaths = save_to_multiple_csv_files(valid_data, "valid", header, n_parts=10)
test_filepaths = save_to_multiple_csv_files(test_data, "test", header, n_parts=10)

In [57]:
pd.read_csv(train_filepaths[0]).head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MeadianHouseValue
0,2.9167,39.0,4.544776,1.08209,690.0,5.149254,34.03,-118.18,1.458
1,4.0278,29.0,19.86,4.74,337.0,3.37,39.34,-120.35,0.958
2,5.2305,34.0,6.092006,1.046757,2269.0,3.422323,33.92,-118.01,2.051
3,5.8625,8.0,6.724311,1.042607,1154.0,2.892231,35.64,-117.7,1.094
4,3.1111,17.0,5.690438,1.030794,1904.0,3.0859,37.4,-120.86,1.139


In [58]:
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

In [59]:
n_readers = 5
dataset = filepath_dataset.interleave(lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
            cycle_length = n_readers)

In [60]:
for line in dataset.take(5):
    print(line.numpy())

b'2.3889,44.0,4.933806146572104,1.0567375886524824,1270.0,3.0023640661938535,34.1,-117.37,0.861'
b'1.5727,26.0,2.1515711645101665,1.2458410351201479,1685.0,3.11460258780037,34.05,-118.27,2.25'
b'1.7188,17.0,4.5443645083932855,1.0167865707434052,620.0,1.486810551558753,38.5,-121.47,1.375'
b'4.0417,52.0,6.452830188679245,1.0471698113207548,1301.0,2.4547169811320755,38.56,-121.45,1.738'
b'2.9167,39.0,4.544776119402985,1.0820895522388059,690.0,5.149253731343284,34.03,-118.18,1.458'


In [61]:

n_inputs = 8
@tf.function
def preprocess(line):
    defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(fields[:-1])
    y = tf.stack(fields[-1:])
    return (x - X_mean) / X_std, y

In [62]:
preprocess(b'5.8243,13.0,5.838420107719928,1.0466786355475763,1546.0,2.7755834829443446,33.89,-117.81,2.979')

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([ 1.0132518 , -1.2280904 ,  0.16095456, -0.11020053,  0.10587666,
        -0.04032403, -0.82174623,  0.8847062 ], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.979], dtype=float32)>)

In [63]:
def csv_reader_dataset(filepaths, repeat=1, n_readers=5,
                        n_read_threads=None, shuffle_buffer_size=10000,
                        n_parse_threads=5, batch_size=32):
    dataset = tf.data.Dataset.list_files(filepaths).repeat(repeat)
    dataset = dataset.interleave(lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
                                cycle_length=n_readers, num_parallel_calls=n_read_threads)
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset.prefetch(1)

In [64]:
tf.random.set_seed(42)

train_set = csv_reader_dataset(train_filepaths, batch_size=3)
for X_batch, y_batch in train_set.take(2):
    print("X =", X_batch)
    print("y =", y_batch)
    print()

X = tf.Tensor(
[[-0.30434313 -1.9404072  -0.5010243   0.15574816  0.07647909 -0.17995383
  -0.9717552   0.939623  ]
 [ 0.9555501   0.90886     0.2674993  -0.43186745 -0.84262735 -0.05205784
  -0.6858001   0.58016884]
 [-0.271558    1.3837379  -0.07966159 -0.41066712 -0.86510783 -0.0649069
  -1.3983434   1.2441603 ]], shape=(3, 8), dtype=float32)
y = tf.Tensor(
[[1.385]
 [3.824]
 [1.363]], shape=(3, 1), dtype=float32)

X = tf.Tensor(
[[ 1.4322196   0.11739691  0.57615894 -0.2854161  -0.5270358   0.02668265
   0.77678794 -1.1771617 ]
 [-0.694355   -1.5446756  -0.4496307  -0.12867594  2.6176398   0.00634006
  -1.4405335   1.24915   ]
 [ 1.2121134  -0.5157736   0.44099787 -0.15506227 -0.1215222  -0.02674748
   1.2033762  -1.5216354 ]], shape=(3, 8), dtype=float32)
y = tf.Tensor(
[[3.325]
 [0.883]
 [2.911]], shape=(3, 1), dtype=float32)



In [65]:
train_set = csv_reader_dataset(train_filepaths, repeat=None)
valid_set = csv_reader_dataset(valid_filepaths)
test_set = csv_reader_dataset(test_filepaths)

In [66]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

model = keras.models.Sequential([
    keras.layers.Dense(30, activation="relu", input_shape=X_train.shape[1:]),
    keras.layers.Dense(1),
])

In [67]:
model.compile(loss="mse", optimizer=keras.optimizers.SGD(learning_rate=1e-3))

In [68]:
batch_size = 32
model.fit(train_set, steps_per_epoch=len(X_train) // batch_size, epochs=10,
            validation_data=valid_set)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x274948ab970>

In [69]:
model.evaluate(test_set, steps=len(X_test) // batch_size)



0.5174529552459717

In [70]:
new_set = test_set.map(lambda X, y: X) # we could instead just pass test_set, Keras would ignore the labels
X_new = X_test
model.predict(new_set, steps=len(X_new) // batch_size)



array([[3.8950562],
       [2.2644744],
       [1.09518  ],
       ...,
       [3.345035 ],
       [1.426414 ],
       [3.3756304]], dtype=float32)

#### TF Record format

In [71]:
with tf.io.TFRecordWriter("my_data.tfrecord") as f:
    f.write(b"this is the first record")
    f.write(b'and this the second')

In [72]:
filepaths = ["my_data.tfrecord"]
dataset = tf.data.TFRecordDataset(filepaths)
for item in dataset:
    print(item)

tf.Tensor(b'this is the first record', shape=(), dtype=string)
tf.Tensor(b'and this the second', shape=(), dtype=string)


In [73]:
options = tf.io.TFRecordOptions(compression_type="GZIP")
with tf.io.TFRecordWriter("my_compressed.tfrecord", options) as f:
    f.write(b"this is the first record")
    f.write(b'and this the second')

dataset = tf.data.TFRecordDataset(["my_compressed.tfrecord"],
                                compression_type="GZIP")
for item in dataset:
    print(item)

tf.Tensor(b'this is the first record', shape=(), dtype=string)
tf.Tensor(b'and this the second', shape=(), dtype=string)


#### Encoding Categorical Features Using Embeddings

In [74]:
vocab = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
indices = tf.range(len(vocab), dtype=tf.int64)
table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices)
num_oov_buckets = 2
table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)
embedding_dim = 2
embed_init = tf.random.uniform([len(vocab) + num_oov_buckets, embedding_dim])
embedding_matrix = tf.Variable(embed_init)

In [75]:
embedding_matrix

<tf.Variable 'Variable:0' shape=(7, 2) dtype=float32, numpy=
array([[0.6645621 , 0.44100678],
       [0.3528825 , 0.46448255],
       [0.03366041, 0.68467236],
       [0.74011743, 0.8724445 ],
       [0.22632635, 0.22319686],
       [0.3103881 , 0.7223358 ],
       [0.13318717, 0.5480639 ]], dtype=float32)>

In [76]:
categories = tf.constant(["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"])
cat_indices = table.lookup(categories)
cat_indices

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([0, 1, 2, 3, 4], dtype=int64)>

In [77]:
tf.nn.embedding_lookup(embedding_matrix, cat_indices)

<tf.Tensor: shape=(5, 2), dtype=float32, numpy=
array([[0.6645621 , 0.44100678],
       [0.3528825 , 0.46448255],
       [0.03366041, 0.68467236],
       [0.74011743, 0.8724445 ],
       [0.22632635, 0.22319686]], dtype=float32)>