In [1]:
import os
import numpy as np
import tensorflow as tf
import pandas as pd
from dask import dataframe as dd

In [2]:
COLUMNS = list('abcdefghij')
LABELS = 'labels'

In [3]:
base_dir = 'data'
if not os.path.isdir(base_dir):
    os.mkdir(base_dir)
sharded_dir = os.path.join(base_dir, 'sharded')
if not os.path.isdir(sharded_dir):
    os.mkdir(sharded_dir)
tfrecords_dir = os.path.join(base_dir, 'tfrecords')
if not os.path.isdir(tfrecords_dir):
    os.mkdir(tfrecords_dir)

## TFRecords writer

In [4]:
def _float_feature(value):
    """Returns a float_list from a float / double."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

In [4]:
def serialize_example(features, labels):
    """
    Creates a tf.Example message ready to be written to a file.
    """
    # Create a dictionary mapping the feature name to the tf.Example-compatible
    # data type.
    # Equivalent to:
    # feature = {
    #     'feature0': _int64_feature(feature0),
    #     'feature1': _int64_feature(feature1),
    #     'feature2': _bytes_feature(feature2),
    #     'feature3': _float_feature(feature3),
    # }
    feats = [_float_feature(x) for x in features]
    feature = dict(zip(COLUMNS, feats))
    feature.update({LABELS: _int64_feature(labels)})

    # Create a Features message using tf.train.Example.

    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

In [5]:
def write_tfrecord(filename, features, labels):
    with tf.io.TFRecordWriter(filename) as writer:
        for idi in range(len(features)):
            example = serialize_example(features[idi], labels[idi])
            writer.write(example)

## Generating synthetic dataset

In [6]:
total = []
for idi in range(1000):
    if not idi%100:
        print(idi)
    features = np.random.rand(1000,10)
    labels = np.random.randint(0, 2, size=(1000))
    df = pd.DataFrame(features, columns=COLUMNS)
    df[LABELS] = labels
    df.to_csv('data/sharded/teste-{}.csv'.format(str(idi).zfill(6)), index=False)
    write_tfrecord(
        'data/tfrecords/teste-{}.tfrecord'.format(str(idi).zfill(6)),
        df.iloc[:,:-1].values,
        df.iloc[:,-1].values,
    )
    total.extend(df.values.tolist())

0
100
200
300
400
500
600
700
800
900


## Visualizing pandas data frame 

In [7]:
df_total = pd.DataFrame(total, columns=COLUMNS+[LABELS])
df_total.to_csv('data/teste-integral.csv', index=False)

In [8]:
df_total.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,labels
0,0.349554,0.541343,0.607645,0.442531,0.020099,0.94921,0.967931,0.564398,0.682192,0.37131,0.0
1,0.590018,0.371522,0.904488,0.967802,0.539744,0.254605,0.928243,0.366421,0.340372,0.117526,1.0
2,0.87726,0.748541,0.143165,0.366931,0.305194,0.502168,0.427763,0.820421,0.12681,0.063527,1.0
3,0.339867,0.593165,0.347725,0.760159,0.622874,0.161114,0.805181,0.008477,0.370492,0.336214,1.0
4,0.672775,0.653081,0.091219,0.939693,0.880691,0.515646,0.979199,0.201749,0.871451,0.683567,1.0


In [9]:
df_total.shape

(1000000, 11)

## Reading a single TFRecords file

In [10]:
raw_dataset = tf.data.TFRecordDataset('data/tfrecords/teste-000000.tfrecord')

In [11]:
for raw_record in raw_dataset.take(1):
    print(repr(raw_record))

<tf.Tensor: shape=(), dtype=string, numpy=b'\n\xa7\x01\n\r\n\x01a\x12\x08\x12\x06\n\x04\xb6\xf8\xb2>\n\r\n\x01b\x12\x08\x12\x06\n\x04{\x95\n?\n\x0f\n\x06labels\x12\x05\x1a\x03\n\x01\x00\n\r\n\x01c\x12\x08\x12\x06\n\x04\x9a\x8e\x1b?\n\r\n\x01d\x12\x08\x12\x06\n\x04u\x93\xe2>\n\r\n\x01e\x12\x08\x12\x06\n\x04\xdb\xa5\xa4<\n\r\n\x01f\x12\x08\x12\x06\n\x04e\xffr?\n\r\n\x01g\x12\x08\x12\x06\n\x04M\xcaw?\n\r\n\x01h\x12\x08\x12\x06\n\x04`|\x10?\n\r\n\x01i\x12\x08\x12\x06\n\x04%\xa4.?\n\r\n\x01j\x12\x08\x12\x06\n\x04[\x1c\xbe>'>


In [12]:
feature_description = dict(zip(COLUMNS, len(COLUMNS)*[tf.io.FixedLenFeature([], tf.float32)]))
feature_description.update({LABELS: tf.io.FixedLenFeature([], tf.int64)})

def _parse_function(example_proto):
    # Parse the input `tf.Example` proto using the dictionary above.
    return tf.io.parse_single_example(example_proto, feature_description)

In [14]:
parsed_dataset = raw_dataset.map(_parse_function)
for parsed_record in parsed_dataset.take(1):
    data = dict([(k,v.numpy()) for k,v in parsed_record.items()])
    print(data)

{'a': 0.34955376, 'b': 0.5413434, 'c': 0.6076447, 'd': 0.44253126, 'e': 0.020098617, 'f': 0.9492095, 'g': 0.9679306, 'h': 0.5643978, 'i': 0.68219215, 'j': 0.37131009, 'labels': 0}
