In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
    !pip install -q -U tfx==0.21.2
    print("You can safely ignore the package incompatibility errors.")
except Exception:
    pass

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "data"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

In [18]:
X = tf.range(10)
dataset = tf.data.Dataset.from_tensor_slices(X)
dataset

<TensorSliceDataset shapes: (), types: tf.int32>

In [9]:
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [10]:
dataset = dataset.repeat(3).batch(7)
for item in dataset:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


In [20]:
dataset = dataset.map(lambda x: x *2)
dataset

<MapDataset shapes: (), types: tf.int32>

In [21]:
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(18, shape=(), dtype=int32)


In [24]:
dataset = dataset.apply(tf.data.Dataset.unbatch())
dataset

TypeError: unbatch() missing 1 required positional argument: 'self'

In [25]:
dataset = dataset.filter(lambda x: x < 10)


In [31]:
for item in dataset.take(3):
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)


In [32]:
dataset = tf.data.Dataset.range(10).repeat(3)
dataset = dataset.shuffle(buffer_size=5, seed=42).batch(7)
for item in dataset:
    print(item)

tf.Tensor([0 2 3 6 7 9 4], shape=(7,), dtype=int64)
tf.Tensor([5 0 1 1 8 6 5], shape=(7,), dtype=int64)
tf.Tensor([4 8 7 1 2 3 0], shape=(7,), dtype=int64)
tf.Tensor([5 4 2 7 8 9 9], shape=(7,), dtype=int64)
tf.Tensor([3 6], shape=(2,), dtype=int64)


In [33]:
dataset = dataset.repeat(2)
for item in dataset:
    print(item)

tf.Tensor([3 5 2 1 8 4 0], shape=(7,), dtype=int64)
tf.Tensor([1 2 3 7 9 5 0], shape=(7,), dtype=int64)
tf.Tensor([7 4 0 9 6 8 1], shape=(7,), dtype=int64)
tf.Tensor([2 6 6 4 9 8 5], shape=(7,), dtype=int64)
tf.Tensor([7 3], shape=(2,), dtype=int64)
tf.Tensor([2 1 3 5 8 9 4], shape=(7,), dtype=int64)
tf.Tensor([1 0 7 3 5 6 7], shape=(7,), dtype=int64)
tf.Tensor([2 4 9 1 0 3 6], shape=(7,), dtype=int64)
tf.Tensor([0 4 7 5 2 8 6], shape=(7,), dtype=int64)
tf.Tensor([9 8], shape=(2,), dtype=int64)


In [36]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target.reshape(-1, 1), random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

scaler = StandardScaler()
scaler.fit(X_train)
X_mean = scaler.mean_
X_std = scaler.scale_

In [35]:
def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    housing_dir = os.path.join("datasets", "housing")
    os.makedirs(housing_dir, exist_ok=True)
    path_format = os.path.join(housing_dir, "my_{}_{:02d}.csv")

    filepaths = []
    m = len(data)
    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filepaths.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(",".join([repr(col) for col in data[row_idx]]))
                f.write("\n")
    return filepaths

In [37]:
train_data = np.c_[X_train, y_train]
valid_data = np.c_[X_valid, y_valid]
test_data = np.c_[X_test, y_test]
header_cols = housing.feature_names + ["MedianHouseValue"]
header = ",".join(header_cols)

train_filepaths = save_to_multiple_csv_files(train_data, "train", header, n_parts=20)
valid_filepaths = save_to_multiple_csv_files(valid_data, "valid", header, n_parts=10)
test_filepaths = save_to_multiple_csv_files(test_data, "test", header, n_parts=10)

In [38]:
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

In [40]:
train_filepaths

['datasets\\housing\\my_train_00.csv',
 'datasets\\housing\\my_train_01.csv',
 'datasets\\housing\\my_train_02.csv',
 'datasets\\housing\\my_train_03.csv',
 'datasets\\housing\\my_train_04.csv',
 'datasets\\housing\\my_train_05.csv',
 'datasets\\housing\\my_train_06.csv',
 'datasets\\housing\\my_train_07.csv',
 'datasets\\housing\\my_train_08.csv',
 'datasets\\housing\\my_train_09.csv',
 'datasets\\housing\\my_train_10.csv',
 'datasets\\housing\\my_train_11.csv',
 'datasets\\housing\\my_train_12.csv',
 'datasets\\housing\\my_train_13.csv',
 'datasets\\housing\\my_train_14.csv',
 'datasets\\housing\\my_train_15.csv',
 'datasets\\housing\\my_train_16.csv',
 'datasets\\housing\\my_train_17.csv',
 'datasets\\housing\\my_train_18.csv',
 'datasets\\housing\\my_train_19.csv']

In [41]:
n_readers = 5
# five files
dataset = filepath_dataset.interleave(
    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
    cycle_length=n_readers
)
# testLineDataset represent one file

In [44]:
# These are the first rows (ignoring the header row) of five CSV files,
# chosen randomly
for line in dataset.take(5):
    print(line.numpy())

b'2.1856,41.0,3.7189873417721517,1.0658227848101265,803.0,2.0329113924050635,32.76,-117.12,1.205'
b'4.2708,45.0,5.121387283236994,0.953757225433526,492.0,2.8439306358381504,37.48,-122.19,2.67'
b'3.8456,35.0,5.461346633416459,0.9576059850374065,1154.0,2.8778054862842892,37.96,-122.05,1.598'
b'3.0217,22.0,4.983870967741935,1.1008064516129032,615.0,2.4798387096774195,38.76,-120.6,1.069'
b'4.2083,44.0,5.323204419889502,0.9171270718232044,846.0,2.3370165745856353,37.47,-122.2,2.782'


In [47]:
n_inputs = 8

In [46]:
def preprocess(line):
    defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
# 8 features are 0 and add target column's value. it's type is float
# but add [] empty array as a default so this will raise exception when
# the target value is missing.
    fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(fields[:-1])
    y = tf.stack(fields[-1:])
    return (x - X_mean) / X_std, y

In [48]:
preprocess(b'4.2083,44.0,5.3232,0.9171,846.0,2.3370,37.47,-122.2,2.782')

(<tf.Tensor: id=394, shape=(8,), dtype=float32, numpy=
 array([ 0.16579159,  1.216324  , -0.05204564, -0.39215982, -0.5277444 ,
        -0.2633488 ,  0.8543046 , -1.3072058 ], dtype=float32)>,
 <tf.Tensor: id=390, shape=(1,), dtype=float32, numpy=array([2.782], dtype=float32)>)

In [54]:
def csv_reader_dataset(filepaths, repeat=1, n_readers=5,
                      n_read_threads=None, shuffle_buffer_size=10000,
                      n_parse_threads=5, batch_size=32):
    dataset = tf.data.Dataset.list_files(filepaths)
    dataset = dataset.interleave(
        lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
        cycle_length=n_readers, num_parallel_calls=n_read_threads
    )
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    dataset = dataset.shuffle(shuffle_buffer_size).repeat(repeat)
    return dataset.batch(batch_size).prefetch(1)

In [55]:
train_set = csv_reader_dataset(train_filepaths)
valid_set = csv_reader_dataset(valid_filepaths)
test_set = csv_reader_dataset(test_filepaths)

In [56]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

model = keras.models.Sequential([
    keras.layers.Dense(30, activation="relu", input_shape=X_train.shape[1:]),
    keras.layers.Dense(1),
])

In [57]:
model.compile(loss="mse", optimizer=keras.optimizers.SGD(lr=1e-3))

In [58]:
model.fit(train_set, epochs=10, validation_data=valid_set)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1d9576b2708>

In [59]:
model.evaluate(test_set)



0.47523699001765546

In [60]:
new_set = test_set.take(3).map(lambda X, y: X)
model.predict(new_set)

array([[1.8386662 ],
       [6.143565  ],
       [2.567878  ],
       [2.402985  ],
       [3.4025168 ],
       [2.9983978 ],
       [2.2575357 ],
       [1.9748042 ],
       [1.6136684 ],
       [2.486779  ],
       [1.5879711 ],
       [1.2807317 ],
       [2.383127  ],
       [1.7314672 ],
       [1.7048581 ],
       [2.2575994 ],
       [1.5767802 ],
       [1.7274926 ],
       [2.2455769 ],
       [2.0362818 ],
       [1.2295339 ],
       [2.39773   ],
       [1.0965221 ],
       [1.5630789 ],
       [2.2400577 ],
       [1.9451852 ],
       [1.494256  ],
       [0.94568574],
       [2.1349065 ],
       [1.187289  ],
       [0.7869146 ],
       [2.200666  ],
       [1.9776423 ],
       [1.8899345 ],
       [2.7460988 ],
       [2.645436  ],
       [1.1945031 ],
       [0.9016325 ],
       [1.7276757 ],
       [2.8179545 ],
       [2.360052  ],
       [2.1828446 ],
       [0.77150655],
       [1.878555  ],
       [1.5641625 ],
       [2.3188696 ],
       [3.0084388 ],
       [1.534

In [62]:
for X_batch, y_batch in train_set:
    print(X_batch.shape)

(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)
(32, 8)


In [63]:
with tf.io.TFRecordWriter("my_data.tfrecord") as f:
    f.write(b"This is the first record")
    f.write(b"And this is the second record")

In [64]:
filepaths = ["my_data.tfrecord"]
dataset = tf.data.TFRecordDataset(filepaths)
for item in dataset:
    print(item)

tf.Tensor(b'This is the first record', shape=(), dtype=string)
tf.Tensor(b'And this is the second record', shape=(), dtype=string)


In [65]:
options = tf.io.TFRecordOptions(compression_type="GZIP")
with tf.io.TFRecordWriter("my_compressed.tfrecord", options) as f:
    f.write(b"hello world")

In [66]:
dataset = tf.data.TFRecordDataset(["my_compressed.tfrecord"],
                                 compression_type="GZIP")
for item in dataset:
    print(item)

tf.Tensor(b'hello world', shape=(), dtype=string)


In [78]:
eps = keras.backend.epsilon()
eps

1e-07

In [79]:
vocab = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
indices = tf.range(len(vocab), dtype=tf.int64)
table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices)
num_oov_buckets = 2
table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)

In [83]:
categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "INLAND"])
cat_indices = table.lookup(categories)
cat_indices

<tf.Tensor: id=12969, shape=(4,), dtype=int64, numpy=array([3, 5, 1, 1], dtype=int64)>

In [85]:
cat_one_hot = tf.one_hot(cat_indices, depth=len(vocab) + num_oov_buckets)
cat_one_hot

<tf.Tensor: id=12977, shape=(4, 7), dtype=float32, numpy=
array([[0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.]], dtype=float32)>

In [88]:
embedding_dim = 2
embed_init = tf.random.uniform([len(vocab) + num_oov_buckets,
embedding_dim])
embedding_matrix = tf.Variable(embed_init)
embedding_matrix

<tf.Variable 'Variable:0' shape=(7, 2) dtype=float32, numpy=
array([[0.803156  , 0.49777734],
       [0.37054038, 0.9118674 ],
       [0.637642  , 0.18209696],
       [0.63791955, 0.27701473],
       [0.04227114, 0.84219384],
       [0.90637195, 0.222556  ],
       [0.9198462 , 0.68789077]], dtype=float32)>

In [90]:
categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "INLAND"])

In [91]:
cat_indices = table.lookup(categories)
cat_indices

<tf.Tensor: id=13024, shape=(4,), dtype=int64, numpy=array([3, 5, 1, 1], dtype=int64)>

In [94]:
tf.nn.embedding_lookup(embedding_matrix, cat_indices)

<tf.Tensor: id=13026, shape=(4, 2), dtype=float32, numpy=
array([[0.63791955, 0.27701473],
       [0.90637195, 0.222556  ],
       [0.37054038, 0.9118674 ],
       [0.37054038, 0.9118674 ]], dtype=float32)>

In [99]:
# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"