In [None]:
# 2020-10-27 created by Akson

In [None]:
# Code13.1
# Toy dataset

import tensorflow as tf

# A rank-1 tensor holding the integers 0..9.
X = tf.range(10)
print(X)

# Slice X along its first axis so that every entry becomes one dataset element.
dataset = tf.data.Dataset.from_tensor_slices(X)
print(dataset)

# Walk the dataset and show each element in turn.
for element in dataset:
    print(element)

In [None]:
# Code13.2
# Chained transformations

# Three passes over 0..9 give 30 elements; batching by 7 with
# drop_remainder=True keeps only the full batches.
dataset = tf.data.Dataset.range(10).repeat(3).batch(7, drop_remainder=True)

for batch in dataset:
    print(batch)

In [None]:
# Code13.3
# map

# Double every element of the 0..9 range.
dataset = tf.data.Dataset.range(10).map(lambda value: value * 2)

for element in dataset:
    print(element)

In [None]:
# Code13.4
# apply() / unbatch()

# Three passes over 0..9, grouped into full batches of 7.
dataset = tf.data.Dataset.range(10).repeat(3).batch(7, drop_remainder=True)

# Original author's note: the experimental form below stopped working for them
# on TensorFlow 2.3.0, so the built-in Dataset.unbatch() is used instead.
# dataset = dataset.apply(tf.data.experimental.unbatch())

# Split the batches back into individual elements.
dataset = dataset.unbatch()

for element in dataset:
    print(element)

In [None]:
# Code13.5
# filter

# Build the input inside this cell instead of transforming whatever `dataset`
# the previous cell left behind. Previously, re-running the cell doubled and
# filtered the already-processed values again and printed different results.
# The pipeline below reproduces the unbatched dataset of Code13.4.
source = (
    tf.data.Dataset.range(10)
    .repeat(3)
    .batch(7, drop_remainder=True)
    .unbatch()
)

# Double every element, then keep only the results below 10.
dataset = source.map(lambda x: x * 2).filter(lambda x: x < 10)

# Show the first five surviving elements.
for item in dataset.take(5):
    print(item)

In [None]:
# Code13.6
# shuffle

# Three passes over 0..9 (30 elements).
dataset = tf.data.Dataset.range(10).repeat(3)

# Shuffle through a 5-element buffer with a fixed seed, then group into
# batches of 7 (no drop_remainder here, so a final short batch is kept).
# The former no-op loop over the unshuffled dataset was removed: it iterated
# every element only to `pass`.
dataset = dataset.shuffle(buffer_size=5, seed=42).batch(7)
for item in dataset:
    print(item)

In [None]:
# Code13.7
# Load the California housing data

import os
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fetch the dataset through the scikit-learn API.
housing = fetch_california_housing()

# First split: hold out a test set. The target is reshaped into one column.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data,
    housing.target.reshape(-1, 1),
    random_state=42,
)

# Second split: carve a validation set out of the remaining training data.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full,
    y_train_full,
    random_state=42,
)

# Fit a standard scaler on the training features only and keep the per-feature
# statistics it learned (mean and scale of the *training* data, i.e. the values
# used to standardize, not statistics of already-standardized data).
scaler = StandardScaler().fit(X_train)
X_mean, X_std = scaler.mean_, scaler.scale_

# Helper that stores a table of rows as several CSV files
def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    """Split ``data`` row-wise into ``n_parts`` CSV files under ./dataset/house.

    Args:
        data: Indexable sequence of rows (e.g. a 2-D NumPy array); every row
            is an iterable of column values.
        name_prefix: Tag embedded in each file name (``my_<prefix>_<NN>.csv``).
        header: Optional header line (without trailing newline) written at the
            top of every file; nothing is written when it is ``None``.
        n_parts: Number of files the rows are distributed over.

    Returns:
        The list of file paths that were written, in part order.
    """
    # Target directory; created on demand.
    housing_dir = os.path.join("./dataset", "house")
    os.makedirs(housing_dir, exist_ok=True)
    # File name pattern: prefix plus zero-padded part index.
    path_format = os.path.join(housing_dir, "my_{}_{:02d}.csv")

    def _format_cell(value):
        # NumPy >= 2.0 renders scalars as e.g. "np.float64(1.5)", which would
        # corrupt the CSV. Convert NumPy scalars to plain Python scalars first;
        # repr() of a Python float is still a lossless round-trip string.
        if isinstance(value, np.generic):
            value = value.item()
        return repr(value)

    filepaths = []
    m = len(data)
    # Split the row indices into n_parts chunks and write one file per chunk.
    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filepaths.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(",".join(_format_cell(col) for col in data[row_idx]))
                f.write("\n")
    return filepaths

# Append the target as the last column of each feature matrix (column-wise
# concatenation of two 2-D arrays; the targets were reshaped to one column).
train_data = np.concatenate([X_train, y_train], axis=1)
valid_data = np.concatenate([X_valid, y_valid], axis=1)
test_data = np.concatenate([X_test, y_test], axis=1)

# CSV header: the feature names followed by the target column name.
header_cols = housing.feature_names + ["MedianHouseValue"]
header = ",".join(header_cols)

# Write each split as several CSV files and remember the resulting paths.
train_filepaths = save_to_multiple_csv_files(train_data, "train", header=header, n_parts=20)
valid_filepaths = save_to_multiple_csv_files(valid_data, "valid", header=header, n_parts=10)
test_filepaths = save_to_multiple_csv_files(test_data, "test", header=header, n_parts=10)

In [None]:
# Code13.8
# Interleaving lines from several files

print(train_filepaths)

# Dataset of the training file paths, created with a fixed seed.
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

n_readers = 5

# Turn every file path into a dataset of its text lines (minus the header row)
# and interleave lines from n_readers of those datasets at a time.
dataset = filepath_dataset.interleave(
    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
    cycle_length=n_readers,
)

# Peek at the first five raw CSV lines.
for line in dataset.take(5):
    print(line)

In [None]:
# Code13.9
# Some preprocessing

n_inputs = 8

def preprocess(line):
    """Parse one CSV line into a standardized feature vector and its label.

    Args:
        line: A single CSV record (string tensor) with ``n_inputs`` feature
            columns followed by one label column.

    Returns:
        A tuple ``(x, y)``: ``x`` is the stacked features standardized with the
        module-level ``X_mean`` / ``X_std``; ``y`` is the label stacked into a
        one-element tensor.
    """
    # Column defaults: 0.0 for each feature; the label gets an empty float32
    # tensor, i.e. no fallback value is supplied for it.
    # NOTE(review): per the tf.io.decode_csv docs an empty default marks the
    # column as required; confirm for the TF version in use.
    record_defaults = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    *feature_fields, label_field = tf.io.decode_csv(line, record_defaults=record_defaults)

    # Stack the per-column scalars into 1-D tensors.
    x = tf.stack(feature_fields)
    y = tf.stack([label_field])

    # Standardize the features with the training-set statistics.
    return (x - X_mean) / X_std, y

In [None]:
# Code13.10
# Putting the previous steps together

def csv_reader_dataset(
    filepaths,
    repeat=1,
    n_readers=5,
    n_read_threads=None,
    shuffle_buffer_size=10000,
    n_parse_threads=5,
    batch_size=32,
):
    """Build a batched, shuffled dataset of preprocessed records from CSV files.

    Args:
        filepaths: File paths (or pattern) handed to ``Dataset.list_files``.
        repeat: Value forwarded to ``Dataset.repeat``.
        n_readers: ``cycle_length`` of the interleave step.
        n_read_threads: ``num_parallel_calls`` of the interleave step.
        shuffle_buffer_size: Buffer size of the shuffle step.
        n_parse_threads: ``num_parallel_calls`` of the ``preprocess`` map step.
        batch_size: Number of records per batch.

    Returns:
        A ``tf.data.Dataset`` of ``preprocess`` outputs, batched and with a
        prefetch of one batch.
    """
    def read_lines(filepath):
        # Lines of one CSV file, without its header row.
        return tf.data.TextLineDataset(filepath).skip(1)

    # Dataset of file paths.
    files = tf.data.Dataset.list_files(filepaths)
    # Interleave the lines of the individual files.
    lines = files.interleave(
        read_lines,
        cycle_length=n_readers,
        num_parallel_calls=n_read_threads,
    )
    # Parse and standardize every line.
    records = lines.map(preprocess, num_parallel_calls=n_parse_threads)
    # Shuffle first, then repeat.
    shuffled = records.shuffle(shuffle_buffer_size)
    repeated = shuffled.repeat(repeat)

    batched = repeated.batch(batch_size)
    return batched.prefetch(1)

In [None]:
# Code13.11
# Build the datasets

# repeat=None is forwarded to Dataset.repeat(); the training cell therefore
# passes steps_per_epoch to fit(). Validation and test sets use the default
# single pass.
train_set = csv_reader_dataset(filepaths=train_filepaths, repeat=None)
valid_set = csv_reader_dataset(filepaths=valid_filepaths)
test_set = csv_reader_dataset(filepaths=test_filepaths)

In [None]:
# Code13.12
# Training

import tensorflow.keras as keras

# Small regression network: one hidden ReLU layer and a single linear output.
model = keras.models.Sequential([
    keras.layers.Dense(30, activation='relu', input_shape=X_train.shape[1:]),
    keras.layers.Dense(1)
])

# Use `learning_rate`: the former `lr` keyword is a deprecated alias that newer
# Keras releases reject.
optimizer = keras.optimizers.SGD(learning_rate=1e-3)
model.compile(loss='mae', optimizer=optimizer)

# train_set was built with repeat=None, so fit() is told how many batches make
# up one epoch. batch_size mirrors csv_reader_dataset's default of 32.
batch_size = 32
model.fit(train_set, steps_per_epoch=len(X_train) // batch_size, epochs=10, validation_data=valid_set)

In [None]:
# Code13.13
# TFRecord

filepaths = ['my_data.tfrecord']

# Write two raw byte-string records into a TFRecord file.
with tf.io.TFRecordWriter(filepaths[0]) as f:
    for record in (b'This is my first record', b'And this is my second record'):
        f.write(record)

# Read the records back and print them.
dataset = tf.data.TFRecordDataset(filepaths)
for element in dataset:
    print(element)

In [None]:
# Code13.14
# Compressed TFRecord file

# Same two records as before, written with GZIP compression.
options = tf.io.TFRecordOptions(compression_type='GZIP')
with tf.io.TFRecordWriter('my_compressed.tfrecord', options) as f:
    for record in (b'This is my first record', b'And this is my second record'):
        f.write(record)

# The reader is given the same compression type used when writing.
dataset = tf.data.TFRecordDataset(
    ['my_compressed.tfrecord'],
    compression_type='GZIP',
)

for element in dataset:
    print(element)

In [None]:
# Code13.15
# Build a tf.train.Example protobuf

# Short aliases for the protobuf classes used below.
BytesList = tf.train.BytesList
FloatList = tf.train.FloatList
Int64List = tf.train.Int64List
Feature = tf.train.Feature
Features = tf.train.Features
Example = tf.train.Example

# One contact: a name, a numeric id and a list of e-mail addresses.
person_features = Features(feature={
    'name': Feature(bytes_list=BytesList(value=[b'Alice'])),
    'id': Feature(int64_list=Int64List(value=[123])),
    'emails': Feature(bytes_list=BytesList(value=[b'a@b.com', b'c@d.com'])),
})
person_example = Example(features=person_features)

In [None]:
# Code13.16
# Serialize the Example object and store it in a file

# Turn the protobuf into its binary string form, then write it as one record.
serialized_person = person_example.SerializeToString()
with tf.io.TFRecordWriter('my_contacts.tfrecord') as f:
    f.write(serialized_person)

In [None]:
# Code13.17
# Load the Example back from the file

# Description dictionary: how every feature of a serialized Example is parsed.
# 'name' and 'id' are fixed-length scalars with defaults; 'emails' is declared
# variable-length.
feature_description = {
    'name': tf.io.FixedLenFeature([], tf.string, default_value=''),
    'id': tf.io.FixedLenFeature([], tf.int64, default_value=0),
    'emails': tf.io.VarLenFeature(tf.string),
}

# Parse the records one at a time.
contacts = tf.data.TFRecordDataset(['my_contacts.tfrecord'])
for serialized_example in contacts:
    parsed_example = tf.io.parse_single_example(serialized_example, feature_description)

In [None]:
# Code13.18
# Records can also be parsed a whole batch at a time

# Group up to 10 serialized records per batch and parse each batch in one call.
dataset = tf.data.TFRecordDataset(['my_contacts.tfrecord']).batch(10)

for serialized_examples in dataset:
    parsed_examples = tf.io.parse_example(serialized_examples, feature_description)

In [None]:
# Code13.19
# Using tensorflow_datasets

# Install the package into the current Python environment first:
# pip install tensorflow_datasets

import tensorflow_datasets as tfds

# Load MNIST as (image, label) pairs, already batched by 32.
dataset = tfds.load(name='mnist', batch_size=32, as_supervised=True)
# Repeat the training split without a count and prefetch one batch.
mnist_train = dataset['train'].repeat().prefetch(1)

# Flatten the 28x28x1 images, cast them to float32 and classify with a single
# softmax layer.
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28, 1]),
    keras.layers.Lambda(lambda images: tf.cast(images, tf.float32)),
    keras.layers.Dense(10, activation='softmax')
])

# Use `learning_rate`: the former `lr` keyword is a deprecated alias that newer
# Keras releases reject.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    metrics=['accuracy'],
)
# The training data repeats, so the number of batches per epoch is given
# explicitly: 60000 // 32.
model.fit(mnist_train, steps_per_epoch=60000 // 32, epochs=5)