In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Record the interpreter and library versions this notebook was executed with.
print(tf.__version__)
print(sys.version_info)
for lib in (mpl, np, pd, sklearn, tf, keras):
    print(lib.__name__, lib.__version__)

2.0.0-alpha0
sys.version_info(major=3, minor=6, micro=8, releaselevel='final', serial=0)
matplotlib 3.1.1
numpy 1.16.2
pandas 0.25.0
sklearn 0.21.3
tensorflow 2.0.0-alpha0
tensorflow.python.keras.api._v2.keras 2.2.4-tf


In [2]:
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset; `housing.data`, `housing.target` and
# `housing.feature_names` are used by the cells below.
housing = fetch_california_housing()


In [3]:
from sklearn.model_selection import train_test_split

# Hold out a test set first, then carve a validation set out of the remainder.
# Fixed random_state values keep both splits reproducible across runs.
x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state=7
)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state=11
)

# Print (features, targets) shapes for each split: train, valid, test.
for features, targets in ((x_train, y_train), (x_valid, y_valid), (x_test, y_test)):
    print(features.shape, targets.shape)

(11610, 8) (11610,)
(3870, 8) (3870,)
(5160, 8) (5160,)


In [4]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# transform to every split so validation/test statistics never leak in.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)


In [5]:
# Directory that receives the generated CSV part files.
output_dir = "generate_csv"
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
# check-then-create race, is safe to re-run, and also creates missing parents.
os.makedirs(output_dir, exist_ok=True)
    
def save_to_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    """Write the rows of ``data`` to ``n_parts`` CSV files and return their paths.

    Row indices are divided into contiguous, nearly equal chunks with
    ``np.array_split``; chunk ``i`` is written to
    ``<output_dir>/<name_prefix>_<ii>.csv`` (two-digit, zero-padded index).

    Args:
        output_dir: Existing directory to write the part files into.
        data: Indexable sequence of rows (e.g. a 2-D array); each row is
            iterated to obtain its column values.
        name_prefix: Filename prefix shared by every part.
        header: Optional header line (without trailing newline) written as
            the first line of every part.
        n_parts: Number of files to produce.

    Returns:
        List of the file paths written, in part order.
    """
    def format_field(value):
        # NumPy >= 2.0 renders scalar repr as e.g. "np.float64(1.5)", which
        # would corrupt the CSV; str() still yields the plain numeric text.
        # Non-NumPy values keep the original repr() formatting.
        return str(value) if isinstance(value, np.generic) else repr(value)

    filenames = []

    for file_idx, row_indices in enumerate(
            np.array_split(np.arange(len(data)), n_parts)):
        # Format only the file name, then join, so braces in output_dir
        # cannot be misread as format placeholders.
        part_csv = os.path.join(
            output_dir, "{}_{:02d}.csv".format(name_prefix, file_idx))
        filenames.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header + "\n")
            f.writelines(
                ",".join(format_field(col) for col in data[row_index]) + "\n"
                for row_index in row_indices)
    return filenames

# Append the target as the last column of each split so that one CSV row
# holds the scaled features followed by the label.
train_data = np.c_[x_train_scaled, y_train]
valid_data = np.c_[x_valid_scaled, y_valid]
test_data = np.c_[x_test_scaled, y_test]
# Header row: the dataset's feature names plus the target column name
# (spelling fixed from "MidianHouseValue").
header_cols = housing.feature_names + ["MedianHouseValue"]
header_str = ",".join(header_cols)

# Write each split as several part files; the returned path lists feed the
# tf.data pipelines below.
train_filenames = save_to_csv(output_dir, train_data, "train", header_str, n_parts=20)
valid_filenames = save_to_csv(output_dir, valid_data, "valid", header_str, n_parts=10)
test_filenames = save_to_csv(output_dir, test_data, "test", header_str, n_parts=10)

In [7]:
# Turn the list of training CSV paths into a dataset of string tensors.
# The printed order differs from the order of `train_filenames`.
filename_dataset = tf.data.Dataset.list_files(train_filenames)
for path_tensor in filename_dataset:
    print(path_tensor)

tf.Tensor(b'generate_csv/train_07.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_19.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_18.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_15.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_16.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_10.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_09.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_03.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_08.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_05.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_02.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_04.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_01.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_06.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_14.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_11.csv', 

In [9]:
# Map every file path to a dataset of its text lines (minus the header row)
# and merge those per-file datasets into a single stream of raw CSV lines.
n_readers = 1
dataset = filename_dataset.interleave(
    lambda path: tf.data.TextLineDataset(path).skip(1),
    cycle_length=n_readers,
)
# Peek at the first 15 raw lines.
for line in dataset.take(15):
    print(line.numpy())

b'0.09734603446040174,0.7527628439249472,-0.20218964416999152,-0.1954700015215477,-0.4060513603629498,0.006785531677655949,-0.813715166526018,0.656614793197258,1.119'
b'-1.4803330571456954,-0.6890414153725881,-0.35624704887282904,-0.1725588908792445,-0.8215884329530113,-0.1382309124854157,1.9157132913404298,-1.0211904224385344,0.928'
b'0.21174628471128154,1.1532640270631513,-0.2507761334605016,-0.2564987121705146,-0.6473894854916754,0.017590216427099285,0.7959477701644521,-1.1510205879341566,1.935'
b'-1.1664916626521322,0.6726626072973063,-0.5583206747658317,0.17064335435039116,-0.5866013111171469,-0.053949832270862016,-0.3471461993693604,1.2758048132533288,0.616'
b'-0.5907072638579991,-1.6502442549042784,-0.11462618259062662,-0.006497285865722548,0.4812745282682295,-0.07270136882136606,1.010569495056515,-0.806471302580379,1.75'
b'2.366275450474818,0.031860714276179515,0.5277562229741544,-0.08874241111281875,0.6854139198244824,-0.10424920030178002,-0.7110699937515539,0.5916997104494468

In [11]:
# Demonstrate how `record_defaults` determines the dtype of each parsed field
# (compare with the dtypes in the printed output below).
sample_str = '1,2,3,4,5'
record_defaults = [
    tf.constant(0, dtype=tf.int32),  # explicit int32 default -> int32 field
    0,  # plain Python int -> int32 field
    np.nan,  # float default -> float32 field
    "hello",  # string default -> string field (b'4')
    tf.constant([])  # empty tensor, no default value given -> float32 field
]
parsed_fields = tf.io.decode_csv(sample_str, record_defaults)
print(parsed_fields)

[<tf.Tensor: id=182, shape=(), dtype=int32, numpy=1>, <tf.Tensor: id=183, shape=(), dtype=int32, numpy=2>, <tf.Tensor: id=184, shape=(), dtype=float32, numpy=3.0>, <tf.Tensor: id=185, shape=(), dtype=string, numpy=b'4'>, <tf.Tensor: id=186, shape=(), dtype=float32, numpy=5.0>]


In [15]:
def parse_csv_line(line, n_fields=9):
    """Parse one CSV line of floats into a (features, target) tensor pair.

    Args:
        line: A single CSV record (string tensor or bytes) with ``n_fields``
            comma-separated numeric values.
        n_fields: Total number of columns in the record; the last one is
            treated as the target.

    Returns:
        Tuple ``(x, y)`` where ``x`` stacks the first ``n_fields - 1`` values
        and ``y`` is a length-1 tensor holding the last value.
    """
    # A NaN float default marks every column as float and fills missing values.
    nan_default = tf.constant(np.nan)
    columns = tf.io.decode_csv(
        line, record_defaults=[nan_default for _ in range(n_fields)])
    features = tf.stack(columns[:-1])
    # Slice (not index) the last column so the target keeps a length-1 axis.
    target = tf.stack(columns[-1:])
    return features, target

# Sanity check on one hand-written record: expect 8 features and a 1-element target.
parse_csv_line(b'2.8294179633675265,1.1532640270631513,0.657967217553772,-0.13548259707636043,-0.30897173860064286,-0.06682073986323142,0.8379389772085495,-1.2758572855261054,5.00001', n_fields=9)

(<tf.Tensor: id=303, shape=(8,), dtype=float32, numpy=
 array([ 2.829418  ,  1.153264  ,  0.6579672 , -0.1354826 , -0.30897173,
        -0.06682074,  0.83793896, -1.2758573 ], dtype=float32)>,
 <tf.Tensor: id=304, shape=(1,), dtype=float32, numpy=array([5.00001], dtype=float32)>)

In [16]:
import pprint
def csv_reader_dataset(filenames, n_readers=5, batch_size=32, n_parse_threads=5,
                       shuffle_buffer_size=10000):
    """Build a repeating, shuffled, batched (features, target) dataset from CSV files.

    The pipeline lists the files, repeats them indefinitely, interleaves the
    lines of ``n_readers`` files at a time (skipping each file's header row),
    shuffles the lines, parses them with ``parse_csv_line`` and batches them.

    Because the dataset repeats without end, callers must bound iteration
    themselves (e.g. ``take(n)`` or ``steps_per_epoch``).

    Args:
        filenames: CSV file paths (or patterns) to read.
        n_readers: Number of files whose lines are interleaved concurrently.
        batch_size: Number of records per emitted batch.
        n_parse_threads: Parallelism used when parsing lines.
        shuffle_buffer_size: Size of the buffer used to shuffle lines.

    Returns:
        A ``tf.data.Dataset`` yielding ``(x_batch, y_batch)`` pairs.
    """
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat()
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length=n_readers
    )
    # Dataset transformations return a new dataset rather than mutating in
    # place; without the reassignment the shuffle was silently discarded.
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_csv_line, num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset

# Preview two tiny batches to check the shapes produced by the pipeline.
train_set = csv_reader_dataset(train_filenames, batch_size=3)
for x_batch, y_batch in train_set.take(2):
    for label, batch in (("x:", x_batch), ("y:", y_batch)):
        print(label)
        pprint.pprint(batch)
    

x:
<tf.Tensor: id=379, shape=(3, 8), dtype=float32, numpy=
array([[-8.2195884e-01,  1.8741661e+00,  1.8212350e-01, -3.1700194e-02,
        -6.0111791e-01, -1.4337493e-01,  1.0852206e+00, -8.6139947e-01],
       [-3.2652634e-01,  4.3236190e-01, -9.3454592e-02, -8.4029920e-02,
         8.4600359e-01, -2.6631648e-02, -5.6176794e-01,  1.4228760e-01],
       [ 4.9710345e-02, -8.4924191e-01, -6.2146995e-02,  1.7878747e-01,
        -8.0253541e-01,  5.0660671e-04,  6.4664572e-01, -1.1060793e+00]],
      dtype=float32)>
y:
<tf.Tensor: id=380, shape=(3, 1), dtype=float32, numpy=
array([[1.054],
       [2.431],
       [2.286]], dtype=float32)>
x:
<tf.Tensor: id=383, shape=(3, 8), dtype=float32, numpy=
array([[ 2.5150437 ,  1.0731637 ,  0.5574401 , -0.17273512, -0.6129126 ,
        -0.01909157, -0.5710993 , -0.02749031],
       [ 0.63636464, -1.0895426 ,  0.09260903, -0.20538124,  1.2025671 ,
        -0.03630123, -0.6784102 ,  0.18223535],
       [-0.46794146, -0.92934215,  0.11909926, -0.06047011

In [18]:
# Build the three input pipelines with a common batch size.
batch_size = 32
train_set = csv_reader_dataset(train_filenames, batch_size=batch_size)
valid_set = csv_reader_dataset(valid_filenames, batch_size=batch_size)
# batch_size must be passed by keyword: the second positional parameter of
# csv_reader_dataset is n_readers, so the old positional call set n_readers=32.
test_set = csv_reader_dataset(test_filenames, batch_size=batch_size)


In [19]:
# Small regression network: 8 input features -> 30 ReLU units -> 1 output.
model = keras.models.Sequential()
model.add(keras.layers.Dense(30, activation="relu", input_shape=[8]))
model.add(keras.layers.Dense(1))
model.compile(optimizer="sgd", loss="mean_squared_error")

# Stop once the monitored loss has not improved by at least 1e-2 for 5 epochs.
early_stopping = keras.callbacks.EarlyStopping(patience=5, min_delta=1e-2)
callbacks = [early_stopping]

# The datasets repeat forever, so the number of batches per epoch must be
# given explicitly. Derive it from the split sizes instead of hard-coding
# them: the previous literal 11160 was a typo for the 11610 training rows.
history = model.fit(train_set,
                    validation_data=valid_set,
                    steps_per_epoch=len(x_train) // batch_size,
                    validation_steps=len(x_valid) // batch_size,
                    epochs=100,
                    callbacks=callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100


In [20]:
# test_set repeats indefinitely, so bound the evaluation to about one pass
# over the 5160 test rows.
model.evaluate(test_set, steps = 5160 // batch_size)



0.4463289657272167