In [1]:
import tensorflow as tf
import numpy as np

tf.enable_eager_execution()

In [84]:
def _bytes_feature(value):
    """string / byte 型から byte_listを返す"""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))

def _float_feature(value):
    """float / double 型から float_listを返す"""
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))

def _int64_feature(value):
    """bool / enum / int / uint 型から Int64_listを返す"""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

In [85]:
# データセットに含まれる観測結果の件数
n_observations = int(1e4)

# ブール特徴量 FalseまたはTrueとしてエンコードされている
feature0 = np.random.choice([False, True], n_observations)

# 整数特徴量  0以上 5未満の乱数
feature1 = np.random.randint(0, 5, n_observations)

# バイト文字列特徴量
strings = np.array([b'cat', b'dog', b'chicken', b'horse', b'goat'])
feature2 = strings[feature1]

# 浮動小数点数特徴量 標準正規分布から発生
feature3 = np.random.randn(n_observations)

In [86]:
def serialize_example(feature0, feature1, feature2, feature3):
    """
    Creates a tf.Example message ready to be written to a file.
    ファイル出力可能なtf.Exampleメッセージを作成する
    """

    # 特徴量名とtf.Example互換データ型との対応ディクショナリを作成

    feature = {
        'feature0': _int64_feature(feature0),
        'feature1': _int64_feature(feature1),
        'feature2': _bytes_feature(feature2),
        'feature3': _float_feature(feature3),
    }

    # tf.train.Exampleを用いて特徴メッセージを作成

    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

In [108]:
feature3

array([ 0.77358925, -0.96843827,  0.13272646, ..., -0.35716567,
        0.37907058, -1.41676869])

In [116]:
features_dataset = tf.data.Dataset.from_tensor_slices((feature0, feature1, feature2, feature3))
features_dataset

<DatasetV1Adapter shapes: ((), (), (), ()), types: (tf.bool, tf.int64, tf.string, tf.float64)>

In [117]:
def tf_serialize_example(f0,f1,f2,f3):
    tf_string = tf.py_func(
        serialize_example, 
        (f0,f1,f2,f3),  # pass these args to the above function.
        tf.string)      # the return type is `tf.string`.
    return tf.reshape(tf_string, ()) # The result is a scalar

In [118]:
serialized_features_dataset = features_dataset.map(tf_serialize_example)
serialized_features_dataset

Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, use
    tf.py_function, which takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    


<DatasetV1Adapter shapes: (), types: tf.string>

In [122]:
writer = tf.data.experimental.TFRecordWriter('/data/elsa/test.tfrecords')
writer.write(serialized_features_dataset)

InvalidArgumentError: TypeError: Value must be iterable
Traceback (most recent call last):

  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/script_ops.py", line 207, in __call__
    ret = func(*args)

  File "<ipython-input-86-925091f10cef>", line 11, in serialize_example
    'feature1': _int64_feature(feature1),

  File "<ipython-input-84-d65041e40cb5>", line 11, in _int64_feature
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

TypeError: Value must be iterable


	 [[{{node PyFunc}}]] [Op:ExperimentalDatasetToTFRecord]

In [110]:
raw_dataset = tf.data.TFRecordDataset('/data/elsa/test.tfrecords')
raw_dataset

<TFRecordDatasetV1 shapes: (), types: tf.string>

In [None]:
for record in raw_dataset.take(1):
    print(repr(record))

In [114]:
feature_description = {
    'feature0': tf.FixedLenFeature([1000], tf.int64),
    'feature1': tf.FixedLenFeature([1000], tf.int64),
    'feature2': tf.FixedLenFeature([1000], tf.string),
    'feature3': tf.FixedLenFeature([1000], tf.float32),
}

def _parse_function(example_proto):
    return tf.parse_single_example(example_proto, feature_description)

In [115]:
parsed_dataset = raw_dataset.map(_parse_function)
parsed_dataset 

TypeError: Value passed to parameter 'dense_defaults' has DataType bool not in list of allowed values: float32, int64, string

In [113]:
iterator = parsed_dataset.make_one_shot_iterator()
data = iterator.next()

InvalidArgumentError: Key: feature3.  Can't parse serialized Example.
	 [[{{node ParseSingleExample/ParseSingleExample}}]] [Op:IteratorGetNextSync]

In [2]:
def np_to_tfrecords(X, Y, file_path_prefix, verbose=True):
    """
    Converts a Numpy array (or two Numpy arrays) into a tfrecord file.
    For supervised learning, feed training inputs to X and training labels to Y.
    For unsupervised learning, only feed training inputs to X, and feed None to Y.
    The length of the first dimensions of X and Y should be the number of samples.
    
    Parameters
    ----------
    X : numpy.ndarray of rank 2
        Numpy array for training inputs. Its dtype should be float32, float64, or int64.
        If X has a higher rank, it should be rshape before fed to this function.
    Y : numpy.ndarray of rank 2 or None
        Numpy array for training labels. Its dtype should be float32, float64, or int64.
        None if there is no label array.
    file_path_prefix : str
        The path and name of the resulting tfrecord file to be generated, without '.tfrecords'
    verbose : bool
        If true, progress is reported.
    
    Raises
    ------
    ValueError
        If input type is not float (64 or 32) or int.
    
    """
    def _dtype_feature(ndarray):
        """match appropriate tf.train.Feature class with dtype of ndarray. """
        assert isinstance(ndarray, np.ndarray)
        dtype_ = ndarray.dtype
        if dtype_ == np.float64 or dtype_ == np.float32:
            return lambda array: tf.train.Feature(float_list=tf.train.FloatList(value=array))
        elif dtype_ == np.int64 or dtype_ == np.int32 or dtype_ == np.uint64 or dtype_ == np.uint32:
            return lambda array: tf.train.Feature(int64_list=tf.train.Int64List(value=array))
        else:  
            raise ValueError("The input should be numpy ndarray. \
                               Instaed got {}".format(ndarray.dtype))
            
    assert isinstance(X, np.ndarray)
    assert len(X.shape) == 2  # If X has a higher rank, 
                               # it should be rshape before fed to this function.
    assert isinstance(Y, np.ndarray) or Y is None
    
    # load appropriate tf.train.Feature class depending on dtype
    dtype_feature_x = _dtype_feature(X)
    if Y is not None:
        assert X.shape[0] == Y.shape[0]
        assert len(Y.shape) == 2
        dtype_feature_y = _dtype_feature(Y)            
    
    # Generate tfrecord writer
    result_tf_file = file_path_prefix + '.tfrecords'
    writer = tf.python_io.TFRecordWriter(result_tf_file)
    if verbose:
        print("Serializing {:d} examples into {}".format(X.shape[0], result_tf_file))
        
    # iterate over each sample,
    # and serialize it as ProtoBuf.
    for idx in range(X.shape[0]):
        x = X[idx]
        if Y is not None:
            y = Y[idx]
        
        d_feature = {}
        d_feature['X'] = dtype_feature_x(x)
        if Y is not None:
            d_feature['Y'] = dtype_feature_y(y)
            
        features = tf.train.Features(feature=d_feature)
        example = tf.train.Example(features=features)
        serialized = example.SerializeToString()
        writer.write(serialized)
    
    if verbose:
        print("Writing {} done!".format(result_tf_file))

In [15]:
X = np.load("/data/elsa/elsa_ja_X.npy", allow_pickle=True).astype(np.uint32)
y = np.load("/data/elsa/elsa_ja_y.npy", allow_pickle=True).astype(np.uint32)

In [53]:
with tf.python_io.TFRecordWriter("/data/elsa/test.tfrecords") as writer:
    for idx in range(len(X)):
        feature = {
            "X": tf.train.Feature(int64_list=tf.train.Int64List(value=X[idx])),
            "Y": tf.train.Feature(int64_list=tf.train.Int64List(value=y[idx]))
        }
        features = tf.train.Features(feature=feature)
        example = tf.train.Example(features=features)
        serialized = example.SerializeToString()
        writer.write(serialized)

In [18]:
#np_to_tfrecords(X, y, "/data/elsa/test")

Serializing 1886697 examples into /data/elsa/test.tfrecords
Writing /data/elsa/test.tfrecords done!


In [54]:
for serialized_example in tf.python_io.tf_record_iterator('/data/elsa/test.tfrecords'):
    example = tf.train.Example()
    example.ParseFromString(serialized_example)
    x_1 = np.array(example.features.feature['X'].int64_list.value)
    y_1 = np.array(example.features.feature['Y'].int64_list.value)
    print(example)
    break

features {
  feature {
    key: "X"
    value {
      int64_list {
        value: 191
        value: 68
        value: 30
        value: 32
        value: 74
        value: 77
        value: 254
        value: 237
        value: 75
        value: 24
        value: 62
        value: 36
        value: 34
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
        value: 0
      }
    }
  }
  feature {
    key: "Y"
    value {
      int64_

In [67]:
raw_dataset = tf.data.TFRecordDataset('/data/elsa/test.tfrecords')
raw_dataset

<TFRecordDatasetV1 shapes: (), types: tf.string>

In [68]:
for record in raw_dataset.take(3):
    print(repr(record))

<tf.Tensor: id=357, shape=(), dtype=string, numpy=b'\n\x8b\x01\n>\n\x01X\x129\x1a7\n5\xbf\x01D\x1e JM\xfe\x01\xed\x01K\x18>$"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\nI\n\x01Y\x12D\x1aB\n@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'>
<tf.Tensor: id=359, shape=(), dtype=string, numpy=b'\n\x8e\x01\nA\n\x01X\x12<\x1a:\n8\xbe\x01\n\xc0\x06\x9c\x1c\xf0\x17\x98$N\xe3\x0b5\x16\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\nI\n\x01Y\x12D\x1aB\n@\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00

In [69]:
feature_description = {
    'X': tf.FixedLenFeature([], tf.int64, default_value=0),
    'Y': tf.FixedLenFeature([], tf.int64, default_value=0),
}

def _parse_function(example_proto):
    features = tf.parse_single_example(example_proto, feature_description)
    X = tf.cast(features['X'], tf.int32)
    Y = tf.cast(features['Y'], tf.int32)
    return X, Y

In [70]:
parsed_dataset = raw_dataset.map(_parse_function)
parsed_dataset 

<DatasetV1Adapter shapes: ((), ()), types: (tf.int32, tf.int32)>

In [71]:
iterator = parsed_dataset.make_one_shot_iterator()
data = iterator.next()

InvalidArgumentError: Key: Y.  Can't parse serialized Example.
	 [[{{node ParseSingleExample/ParseSingleExample}}]] [Op:IteratorGetNextSync]

In [72]:
for record in parsed_dataset.take(3):
    print(record)

InvalidArgumentError: Key: Y.  Can't parse serialized Example.
	 [[{{node ParseSingleExample/ParseSingleExample}}]] [Op:IteratorGetNextSync]