In [38]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) as integers: comparing the raw version strings is
# lexicographic, so e.g. "0.9" would wrongly satisfy a ">= 0.20" check.
assert tuple(int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()) >= (0, 20)

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
    !pip install -q -U tfx==0.15.0rc0
    print("You can safely ignore the package incompatibility errors.")
except Exception:
    pass

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
# Compare the major version numerically: string comparison is lexicographic
# (e.g. "10.0" < "2.0"), so it cannot be trusted for version checks.
assert int(tf.__version__.split(".")[0]) >= 2

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "data"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    The file is named ``<fig_id>.<fig_extension>``. ``resolution`` is passed
    to ``plt.savefig`` as ``dpi``. When ``tight_layout`` is true,
    ``plt.tight_layout()`` is applied before saving.
    """
    filename = fig_id + "." + fig_extension
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(os.path.join(IMAGES_PATH, filename),
                format=fig_extension, dpi=resolution)
    
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

So it looks like this is going to be slightly more challenging material. Let's dive in.

## split the California housing dataset into multiple CSV files

In [8]:
housing = fetch_california_housing()

In [10]:
dir(housing)

['DESCR', 'data', 'feature_names', 'target']

In [12]:
print(housing.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [15]:
# so as specified we have 8 columns and 20640 entries
type(housing.data), housing.data.shape

(numpy.ndarray, (20640, 8))

In [16]:
housing.target.shape

(20640,)

In [19]:
# and those are our median house values
housing.target[:5]

array([4.526, 3.585, 3.521, 3.413, 3.422])

In [20]:
# Hold out a test set first, then carve a validation set out of the remaining
# training data. The target is reshaped to a single column, shape (n, 1).
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target.reshape(-1, 1), random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

# Fit the scaler on the training features only and keep the per-feature
# mean and scale: preprocess() uses them to standardize parsed CSV rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_mean = scaler.mean_
X_std = scaler.scale_

In [21]:
def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    """Split ``data`` row-wise into ``n_parts`` CSV files and return their paths.

    Files are written to ``datasets/housing`` (created if missing) and named
    ``my_<name_prefix>_<NN>.csv``. Rows are distributed with
    ``np.array_split``, so parts differ in length by at most one row.

    Args:
        data: 2-D array indexable by row; each row becomes one CSV line.
        name_prefix: Label inserted in every file name (e.g. ``"train"``).
        header: Optional header line (without a trailing newline) written at
            the top of every file.
        n_parts: Number of files to create.

    Returns:
        List of the file paths written, in part order.
    """
    housing_dir = os.path.join("datasets", "housing")
    os.makedirs(housing_dir, exist_ok=True)
    path_format = os.path.join(housing_dir, "my_{}_{:02d}.csv")

    filepaths = []
    m = len(data)
    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filepaths.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                # str(), not repr(): since NumPy 2.0, repr() of a scalar is
                # e.g. "np.float64(1.5)", which would corrupt the CSV. str()
                # still yields the plain shortest round-trip number.
                f.write(",".join([str(col) for col in data[row_idx]]))
                f.write("\n")
    return filepaths

In [22]:
# Append the target as the last column of each split, so every row holds
# the features followed by the median house value.
train_data = np.c_[X_train, y_train]
valid_data = np.c_[X_valid, y_valid]
test_data = np.c_[X_test, y_test]

In [24]:
X_train.shape, y_train.shape, train_data.shape

((11610, 8), (11610, 1), (11610, 9))

In [25]:
header_cols = housing.feature_names + ["MedianHouseValue"]
header = ",".join(header_cols)

In [26]:
header

'MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue'

In [27]:
# Write each split to its own set of CSV files: 20 parts for training and
# 10 each for validation and test. Every file starts with the header line.
train_filepaths = save_to_multiple_csv_files(train_data, "train", header, n_parts=20)
valid_filepaths = save_to_multiple_csv_files(valid_data, "valid", header, n_parts=10)
test_filepaths = save_to_multiple_csv_files(test_data, "test", header, n_parts=10)

In [29]:
train_filepaths[:5]

['datasets/housing/my_train_00.csv',
 'datasets/housing/my_train_01.csv',
 'datasets/housing/my_train_02.csv',
 'datasets/housing/my_train_03.csv',
 'datasets/housing/my_train_04.csv']

In [31]:
pd.read_csv(train_filepaths[0]).head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue
0,3.5214,15.0,3.049945,1.106548,1447.0,1.605993,37.63,-122.43,1.442
1,5.3275,5.0,6.49006,0.991054,3464.0,3.44334,33.69,-117.39,1.687
2,3.1,29.0,7.542373,1.591525,1328.0,2.250847,38.44,-122.98,1.621
3,7.1736,12.0,6.289003,0.997442,1054.0,2.695652,33.55,-117.7,2.621
4,2.0549,13.0,5.312457,1.085092,3297.0,2.244384,33.93,-116.93,0.956


## building an input pipeline

In [34]:
# this is already a dataset - of shuffled filepaths
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

In [35]:
next(iter(filepath_dataset))

<tf.Tensor: id=72, shape=(), dtype=string, numpy=b'datasets/housing/my_train_05.csv'>

> When we iterate over the interleave dataset, it will cycle through these five TextLineDatasets, reading one line at a time from each until all datasets are out of items.

In [39]:
n_readers = 5
# Read from n_readers files at a time, interleaving their lines.
# skip(1) drops the header row that starts each CSV file.
dataset = filepath_dataset.interleave(
    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
    cycle_length=n_readers)

In [42]:
for line in dataset.take(5):
    print(line.numpy()[:-10])

b'4.1812,52.0,5.701388888888889,0.9965277777777778,692.0,2.4027777777777777,33.73,-11'
b'3.226,52.0,5.372469635627531,0.9473684210526315,1157.0,2.3421052631578947,37.96,-12'
b'4.2708,45.0,5.121387283236994,0.953757225433526,492.0,2.8439306358381504,37.48,-1'
b'3.5214,15.0,3.0499445061043287,1.106548279689234,1447.0,1.6059933407325193,37.63,-12'
b'3.0217,22.0,4.983870967741935,1.1008064516129032,615.0,2.4798387096774195,38.76,-1'


In [43]:
n_inputs = 8 # X_train.shape[-1]

@tf.function
def preprocess(line):
    """Parse one CSV line into a (standardized features, target) pair.

    The line is decoded into ``n_inputs`` feature fields plus one target
    field. Feature fields default to 0. when missing, while the target has an
    empty default. Features are standardized with the module-level ``X_mean``
    and ``X_std``; the target is returned as a one-element tensor.
    """
    record_defaults = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    columns = tf.io.decode_csv(line, record_defaults=record_defaults)
    features = tf.stack(columns[:n_inputs])
    target = tf.stack(columns[n_inputs:])
    scaled_features = (features - X_mean) / X_std
    return scaled_features, target

In [44]:
preprocess(b'4.2083,44.0,5.3232,0.9171,846.0,2.3370,37.47,-122.2,2.782')

(<tf.Tensor: id=179, shape=(8,), dtype=float32, numpy=
 array([ 0.16579157,  1.216324  , -0.05204565, -0.39215982, -0.5277444 ,
        -0.2633488 ,  0.8543046 , -1.3072058 ], dtype=float32)>,
 <tf.Tensor: id=180, shape=(1,), dtype=float32, numpy=array([2.782], dtype=float32)>)

## putting everything together

In [46]:
def csv_reader_dataset(filepaths, repeat=1, n_readers=5,
                       n_read_threads=None, shuffle_buffer_size=10000,
                       n_parse_threads=5, batch_size=32):
    """Build a shuffled, batched tf.data pipeline over CSV files.

    Pipeline stages, in order:
      1. list the files (as a dataset of shuffled file paths) and repeat
         that list ``repeat`` times;
      2. interleave lines from ``n_readers`` files at a time, skipping each
         file's header row (``n_read_threads`` is passed as
         ``num_parallel_calls``);
      3. shuffle individual lines with a buffer of ``shuffle_buffer_size``;
      4. parse and standardize each line with ``preprocess``
         (``n_parse_threads`` is passed as ``num_parallel_calls``);
      5. group into batches of ``batch_size``;
      6. prefetch one batch.

    Returns the resulting ``tf.data.Dataset`` of ``(X, y)`` batches.
    """
    def read_lines(filepath):
        # One line-oriented dataset per file, minus its header row.
        return tf.data.TextLineDataset(filepath).skip(1)

    return (
        tf.data.Dataset.list_files(filepaths)
        .repeat(repeat)
        .interleave(read_lines, cycle_length=n_readers,
                    num_parallel_calls=n_read_threads)
        .shuffle(shuffle_buffer_size)
        .map(preprocess, num_parallel_calls=n_parse_threads)
        .batch(batch_size)
        .prefetch(1)
    )

## using the dataset with `tf.keras`

In [47]:
train_set = csv_reader_dataset(train_filepaths, batch_size=3)
for X_batch, y_batch in train_set.take(2):
    print("X =", X_batch)
    print("y =", y_batch)
    print()

X = tf.Tensor(
[[-0.46256274 -0.12851503 -0.07901246  0.18995905 -0.5122301  -0.444475
   1.3883705  -0.8973971 ]
 [-1.0485971   1.1372159  -0.38121402  0.11287564 -0.55147225  0.6342285
  -0.7853724   0.6918617 ]
 [ 0.66949105  0.82078314  0.28814054 -0.0016847  -0.18734144 -0.06630351
  -0.69636196  0.53193647]], shape=(3, 8), dtype=float32)
y = tf.Tensor(
[[1.269]
 [1.743]
 [3.354]], shape=(3, 1), dtype=float32)

X = tf.Tensor(
[[-0.6634769  -0.8404887   0.06444698  0.25439417 -0.13167231 -0.23412223
  -0.79474217  1.5264715 ]
 [-0.6770422  -0.04940685 -0.7711813  -0.0225677  -0.44560966 -0.39133206
  -1.3522334   1.2316071 ]
 [ 1.2840179  -1.0778131   0.7499082  -0.06391343  0.811965   -0.00787129
  -0.7994279   0.83679074]], shape=(3, 8), dtype=float32)
y = tf.Tensor(
[[0.682]
 [1.475]
 [3.505]], shape=(3, 1), dtype=float32)



In [48]:
# repeat=None makes the training set repeat indefinitely, which is why
# model.fit() is given steps_per_epoch to delimit epochs. The validation
# and test sets use the default repeat=1.
train_set = csv_reader_dataset(train_filepaths, repeat=None)
valid_set = csv_reader_dataset(valid_filepaths)
test_set = csv_reader_dataset(test_filepaths)

In [49]:
# A small regression network: one hidden layer of 30 ReLU units followed by
# a single linear output unit for the predicted house value.
model = keras.models.Sequential([
    keras.layers.Dense(30, activation="relu", input_shape=X_train.shape[1:]),
    keras.layers.Dense(1),
])

In [50]:
# `learning_rate` is the supported argument name; the `lr` alias is
# deprecated and rejected by recent Keras releases.
model.compile(loss="mse", optimizer=keras.optimizers.SGD(learning_rate=1e-3))

In [51]:
# batch_size matches the default batch size of csv_reader_dataset (32), so
# steps_per_epoch corresponds to roughly one pass over the training rows.
batch_size = 32
model.fit(train_set, steps_per_epoch=len(X_train) // batch_size, epochs=10,
          validation_data=valid_set)

Train for 362 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x13f1bb748>

In [52]:
model.evaluate(test_set, steps=len(X_test) // batch_size)



0.5263732366310143

In [54]:
# Drop the labels so predict() receives features only. X_new stands in for
# new instances and is used here only for its length.
new_set = test_set.map(lambda X, y: X) 
X_new = X_test
model.predict(new_set, steps=len(X_new) // batch_size)

array([[2.8217847],
       [2.4556892],
       [1.2757046],
       ...,
       [2.0653024],
       [1.4653986],
       [2.0322335]], dtype=float32)