## TensorFlow Dataset 基础 API


- tf.train.FloatList
- tf.train.Int64List
- tf.train.BytesList
- tf.train.Feature
- tf.train.Features
- tf.train.Example
- example.SerializeToString
- tf.io.parse_single_example
- tf.io.VarLenFeature
- tf.io.FixedLenFeature
- tf.data.TFRecordDataset
- tf.io.TFRecordOptions

In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Record the TensorFlow and Python versions, then the version of every library
# imported above, so the outputs recorded in this notebook can be reproduced.
print(tf.__version__)
print(sys.version_info)
for module in (mpl, np, pd, sklearn, tf, keras):
    print(f"{module.__name__} {module.__version__}")

2.3.0
sys.version_info(major=3, minor=7, micro=11, releaselevel='final', serial=0)
matplotlib 3.4.2
numpy 1.18.5
pandas 1.3.3
sklearn 1.0
tensorflow 2.3.0
tensorflow.keras 2.4.0


### 1. 创建数据集

#### 1.1 使用 from_tensor_slices 创建数据集

In [3]:
# Build a dataset whose elements are the scalars 0..5, one per slice of the array.
# NOTE(review): np.arange(6) uses NumPy's default integer type, which depends on
# the platform; the output recorded for this cell shows tf.int32 -- expect
# tf.int64 where NumPy defaults to 64-bit integers.
dataset = tf.data.Dataset.from_tensor_slices(np.arange(6))
print(dataset)

<TensorSliceDataset shapes: (), types: tf.int32>


In [8]:
# Iterating the dataset eagerly yields one scalar tf.Tensor per element.
for element in dataset:
    print(element)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)


#### 1.2 repeat 和 batch

- repeat(n) : 将 dataset 中的全部元素重复 n 遍, 返回一个新的 dataset (注意: 返回的是一个 dataset, 而不是 n 个; 例如 repeat(3) 得到的 dataset 中, 原有元素依次出现 3 遍)
- batch(size): 将连续的 size 个元素合并为一个批次, 最后一个批次可能不足 size 个; 在模型训练的时候, 不会一下子将所有的数据读取进模型训练, 而是分批次训练

In [17]:
# repeat(3) replays the two source elements three times as one flat dataset,
# so the loop prints 0, 1, 0, 1, 0, 1.
dataset = tf.data.Dataset.from_tensor_slices(np.arange(2)).repeat(3)
for element in dataset:
    print(element)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)


In [22]:
# Group the data into fixed-size batches.
# Six elements repeated three times give 18 values; batching by 5 produces
# three full batches followed by a final batch of the 3 leftover values.
dataset = tf.data.Dataset.from_tensor_slices(np.arange(6))
dataset = dataset.repeat(3)
dataset = dataset.batch(5)
for batch in dataset:
    print(batch)

tf.Tensor([0 1 2 3 4], shape=(5,), dtype=int32)
tf.Tensor([5 0 1 2 3], shape=(5,), dtype=int32)
tf.Tensor([4 5 0 1 2], shape=(5,), dtype=int32)
tf.Tensor([3 4 5], shape=(3,), dtype=int32)


#### 1.3 interleave 生成数据集

- dataset.interleave :
    1. 对数据集中的每个元素调用 map_func, 为每个元素生成一个新的数据集
    2. 将这些数据集中的元素交错合并, 生成新的数据集
- 参数 :
    1. cycle_length : 同时参与交错读取的数据集个数
    2. block_length : 每一轮从每个数据集中连续取出的元素个数

In [29]:
# Flatten the batched dataset with interleave: map_func turns every incoming
# batch into its own dataset of scalars. Up to cycle_length of those datasets
# are read in rotation, taking block_length consecutive elements from each
# before moving on to the next.
dataset_inter = dataset.interleave(
    map_func=lambda batch: tf.data.Dataset.from_tensor_slices(batch),
    cycle_length=5,
    block_length=3,
)

for value in dataset_inter:
    print(value)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)


#### 1.4 使用元组创建数据集

In [36]:
# Three feature rows and their three string labels.
x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
y = np.array(["dog", "cat", "block"])

# Slicing the (x, y) tuple along the first axis pairs each feature row with
# its label, so every dataset element is a (features, label) tuple.
dataset = tf.data.Dataset.from_tensor_slices((x, y))
print(dataset)
print()

# Walk the pairs, showing each first as tensors and then as NumPy values.
for features, label in dataset:
    print(features, label)
    print(features.numpy(), label.numpy())
    print()

<TensorSliceDataset shapes: ((3,), ()), types: (tf.int32, tf.string)>

tf.Tensor([1 2 3], shape=(3,), dtype=int32) tf.Tensor(b'dog', shape=(), dtype=string)
[1 2 3] b'dog'

tf.Tensor([4 5 6], shape=(3,), dtype=int32) tf.Tensor(b'cat', shape=(), dtype=string)
[4 5 6] b'cat'

tf.Tensor([7 8 9], shape=(3,), dtype=int32) tf.Tensor(b'block', shape=(), dtype=string)
[7 8 9] b'block'



#### 1.5 使用字典创建数据集

In [41]:
# Slicing a dict of arrays keeps the keys: every element of the dataset is a
# dict holding one feature row under 'feature' and its label under 'label'.
features_and_labels = {'feature': x, 'label': y}
dataset_dict = tf.data.Dataset.from_tensor_slices(features_and_labels)

for record in dataset_dict:
    print(record)
    print(record['feature'], record['label'], '\n')


{'feature': <tf.Tensor: shape=(3,), dtype=int32, numpy=array([1, 2, 3])>, 'label': <tf.Tensor: shape=(), dtype=string, numpy=b'dog'>}
tf.Tensor([1 2 3], shape=(3,), dtype=int32) tf.Tensor(b'dog', shape=(), dtype=string) 

{'feature': <tf.Tensor: shape=(3,), dtype=int32, numpy=array([4, 5, 6])>, 'label': <tf.Tensor: shape=(), dtype=string, numpy=b'cat'>}
tf.Tensor([4 5 6], shape=(3,), dtype=int32) tf.Tensor(b'cat', shape=(), dtype=string) 

{'feature': <tf.Tensor: shape=(3,), dtype=int32, numpy=array([7, 8, 9])>, 'label': <tf.Tensor: shape=(), dtype=string, numpy=b'block'>}
tf.Tensor([7 8 9], shape=(3,), dtype=int32) tf.Tensor(b'block', shape=(), dtype=string) 



### 2. 生成 CSV 文件

#### 2.1 读取数据集

In [44]:
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset through scikit-learn and report the
# shapes of its feature matrix and target vector.
housing = fetch_california_housing()

print(housing['data'].shape, housing['target'].shape)

(20640, 8) (20640,)


In [45]:
# Split the data into a training portion and a held-out test portion.
from sklearn.model_selection import train_test_split

# random_state: seed for the shuffle, so the split is reproducible.
# test_size: fraction of the samples assigned to the test set, between 0 and 1.
x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data,
    housing.target,
    test_size=0.2,
    random_state=22,
)

print(x_train_all.shape, y_train_all.shape)
print(x_test.shape, y_test.shape)

(16512, 8) (16512,)
(4128, 8) (4128,)


####  2.2 生成CSV文件