In [108]:
from tensorflow.data import Dataset
import tensorflow as tf

In [20]:
# Toy dataset holding the integers 0 through 9, used to try out the Dataset API.
dataset = Dataset.range(10)

In [21]:
# A shuffle buffer of one element cannot reorder anything, so the batches
# below come out in the original order (see the printed output).
dataset = (
    dataset
    .shuffle(buffer_size=1)
    .batch(5)
)

for batch in dataset:
    print(batch)

tf.Tensor([0 1 2 3 4], shape=(5,), dtype=int64)
tf.Tensor([5 6 7 8 9], shape=(5,), dtype=int64)


In [103]:
class RunName:
    """Callable that hands out sequentially numbered file names.

    Every call bumps an internal counter and returns
    ``<fn>_<counter zero-padded to 4 digits><suffix>``; the first call
    therefore yields number 0001.
    """

    def __init__(self):
        # Number of names issued so far.
        self.num = 0

    def __call__(self, fn, suffix):
        """Return the next numbered name built from `fn` and `suffix`."""
        self.num = self.num + 1
        counter = self.num
        return f'{fn}_{counter:04d}{suffix}'

## Prepare the housing data by splitting it into multiple CSV files

In [104]:
## Prepare housing split dataset to test Dataset API
import os
import csv
from itertools import islice

FP = './datasets/housing_split/housing.csv'
SPLIT_PATH = './datasets/housing_split/train_datasets'
os.makedirs(SPLIT_PATH, exist_ok=True)

SPLIT_FN = 'housing'

# Number of data rows stored in each split file.
n_rows = 8

# The csv module expects files opened with newline='' so that it controls
# line endings itself (avoids doubled carriage returns on some platforms).
with open(FP, 'r', newline='') as f:
    reader = csv.reader(f)
    attrib_name = next(reader)
    run_name = RunName()

    while True:
        # Pull the next group of at most n_rows rows from the source file.
        chunk = list(islice(reader, n_rows))
        if not chunk:
            break

        run_split_fn = run_name(SPLIT_FN, '.csv')
        run_split_fp = os.path.join(SPLIT_PATH, run_split_fn)

        # Open each split file once and write it in one go, rather than
        # re-opening it in append mode for every single row.
        with open(run_split_fp, 'w', newline='') as fp:
            writer = csv.writer(fp)
            # The last column is dropped from every data row, so drop it
            # from the header too; otherwise the header names one more
            # column than the rows contain.
            writer.writerow(attrib_name[:-1])
            writer.writerows(row[:-1] for row in chunk)

## Using the TensorFlow Dataset API to process data from disk

In [140]:
# Fix the global TensorFlow seed before building the file listing.
tf.random.set_seed(42)

# Dataset of the paths of every split CSV file under SPLIT_PATH.
list_files = Dataset.list_files(SPLIT_PATH + '/*.csv')

# Peek at the first few file paths.
for file_path in list_files.take(5):
    print(file_path)

tf.Tensor(b'./datasets/housing_split/train_datasets/housing_2356.csv', shape=(), dtype=string)
tf.Tensor(b'./datasets/housing_split/train_datasets/housing_0449.csv', shape=(), dtype=string)
tf.Tensor(b'./datasets/housing_split/train_datasets/housing_0733.csv', shape=(), dtype=string)
tf.Tensor(b'./datasets/housing_split/train_datasets/housing_0601.csv', shape=(), dtype=string)
tf.Tensor(b'./datasets/housing_split/train_datasets/housing_2031.csv', shape=(), dtype=string)


In [174]:
def parse_text_line(fp):
    """Return a dataset of the text lines of file `fp`, minus its first line.

    The first line is skipped because each split file starts with a
    header row.
    """
    lines = tf.data.TextLineDataset(fp)
    return lines.skip(1)

# Read from 5 files at a time, pulling lines from them in turn; each file is
# turned into a dataset of its lines by parse_text_line.
interleave = list_files.interleave(parse_text_line, cycle_length=5)

for text_line in interleave.take(19):
    print(text_line)

tf.Tensor(b'-119.22,35.68,16.0,2874.0,677.0,3078.0,651.0,1.8843,55200.0', shape=(), dtype=string)
tf.Tensor(b'-117.89,33.71,23.0,1422.0,260.0,1092.0,263.0,4.7422,202400.0', shape=(), dtype=string)


NotFoundError: ./datasets/housing_split/train_datasets/housing_1924.csv; No such file or directory

In [158]:
def parse_bin_to_float(tensor):
    """Decode one CSV text line into a ``(features, label)`` pair.

    The line is expected to carry nine comma-separated numeric fields:
    the first eight are returned together as the features and the ninth
    as the label.
    """
    # The eight feature columns fall back to 0.0 when a field is empty.
    feature_defaults = [0.] * 8
    # An empty float32 default carries only the dtype and no fallback value
    # for the last column.
    label_default = tf.constant([], dtype=tf.float32)

    fields = tf.io.decode_csv(
        tensor,
        record_defaults=feature_defaults + [label_default],
    )
    features, label = fields[:-1], fields[-1]
    return features, label

# Turn every raw text line into a (features, label) pair.
mapped_dataset = interleave.map(parse_bin_to_float)

for sample in mapped_dataset.take(2):
    print(sample)

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
array([-120.42  ,   34.89  ,   24.    , 2020.    ,  307.    ,  855.    ,
        283.    ,    5.0099], dtype=float32)>, <tf.Tensor: shape=(), dtype=float32, numpy=162500.0>)
(<tf.Tensor: shape=(8,), dtype=float32, numpy=
array([-1.2197e+02,  3.7310e+01,  2.1000e+01,  7.6280e+03,  2.1660e+03,
        3.6370e+03,  1.7490e+03,  3.6401e+00], dtype=float32)>, <tf.Tensor: shape=(), dtype=float32, numpy=267500.0>)


In [162]:
# Shuffle within a 100-element buffer, group into batches of 32 and keep
# 2 batches prefetched.
shuffled_dataset = (
    mapped_dataset
    .shuffle(buffer_size=100)
    .batch(32)
    .prefetch(2)
)

# Each batch is a 2-tuple, hence the printed length of 2.
for batch in shuffled_dataset.take(1):
    print(len(batch))

2


In [175]:
## Aggregate to one function
def csv_generator(fp, cycle_length=5, shuffle_buffer_size=100,
                  batch_size=32, prefetch_size=2, seed=42):
    """Build a shuffled, batched input pipeline from the CSV files in `fp`.

    The defaults reproduce the values that used to be hard-coded, so
    ``csv_generator(path)`` behaves as before.

    Args:
        fp: Directory containing the ``*.csv`` files to read.
        cycle_length: Number of files read concurrently by the interleave.
        shuffle_buffer_size: Size of the shuffle buffer, in elements.
        batch_size: Number of elements per batch.
        prefetch_size: Number of batches to prefetch.
        seed: Global TensorFlow seed set before the pipeline is built;
            pass ``None`` to leave the global seed untouched.

    Returns:
        A dataset of batched ``(features, label)`` pairs as produced by
        `parse_bin_to_float`.
    """
    if seed is not None:
        # NOTE: this sets the *global* TensorFlow seed as a side effect.
        tf.random.set_seed(seed)

    file_paths = Dataset.list_files(fp + '/*.csv')
    text_lines = file_paths.interleave(
        parse_text_line, cycle_length=cycle_length
    )
    samples = text_lines.map(parse_bin_to_float)

    return (
        samples
        .shuffle(buffer_size=shuffle_buffer_size)
        .batch(batch_size)
        .prefetch(prefetch_size)
    )

## Now we are ready to train the model!

In [176]:
from tensorflow.keras.layers import BatchNormalization, Dense
from tensorflow.keras.models import Sequential

In [188]:
# Reset Keras' global state before building the model.
tf.keras.backend.clear_session()

# Batch-normalise the 8 input features, then two hidden ELU layers and a
# single linear output unit for the regression target.
model = Sequential()
model.add(BatchNormalization(input_shape=[8]))
model.add(Dense(100, activation='elu'))
model.add(Dense(100, activation='elu'))
model.add(Dense(1))

model.compile(optimizer='adam', loss='huber', metrics=['mae'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization (BatchNo (None, 8)                 32        
_________________________________________________________________
dense (Dense)                (None, 100)               900       
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 11,133
Trainable params: 11,117
Non-trainable params: 16
_________________________________________________________________


In [189]:
TRAIN_PATH = './datasets/housing_split/train_datasets'
# NOTE(review): the split cell in this notebook only writes files into the
# train directory; the valid/test directories are presumably populated
# elsewhere -- confirm before re-running.
VALID_PATH = './datasets/housing_split/valid_datasets'
TEST_PATH = './datasets/housing_split/test_datasets'

# One input pipeline per split, built in train -> valid -> test order.
train_dataset, valid_dataset, test_dataset = map(
    csv_generator, (TRAIN_PATH, VALID_PATH, TEST_PATH)
)

model.fit(
    train_dataset,
    epochs=10,
    validation_data=valid_dataset,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fbaa35ce3d0>

In [185]:
# Inspect the gamma variable of the model's first layer (the BatchNormalization layer).
model.layers[0].gamma

<tf.Variable 'batch_normalization/gamma:0' shape=(8,) dtype=float32, numpy=
array([-0.80142474,  1.4805186 , -0.62859046,  1.6189218 ,  0.7868619 ,
        1.8292588 ,  1.080133  , -2.22362   ], dtype=float32)>