### tf.data.Dataset으로 CSV 데이터를 로드하는 예제

In [18]:
import functools

import numpy as np
import tensorflow as tf
import pandas as pd

In [19]:
# Remote locations of the Titanic train / eval CSV files.
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

# Fetch each file through the Keras download helper; the returned value is the
# local path of the file (displayed in the next cell).
train_file_path = tf.keras.utils.get_file(fname="train.csv", origin=TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file(fname="eval.csv", origin=TEST_DATA_URL)

In [20]:
# Print numpy values with 3 decimal places and without scientific notation.
np.set_printoptions(suppress=True, precision=3)

In [21]:
# Display the local path returned by get_file for the training CSV.
train_file_path

'/Users/jk/.keras/datasets/train.csv'

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.2500,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.9250,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1000,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y
...,...,...,...,...,...,...,...,...,...,...
622,0,male,28.0,0,0,10.5000,Second,unknown,Southampton,y
623,0,male,25.0,0,0,7.0500,Third,unknown,Southampton,y
624,1,female,19.0,0,0,30.0000,First,B,Southampton,y
625,0,female,28.0,1,2,23.4500,Third,unknown,Southampton,n


In [31]:
# Label values; the "survived" column in the CSV preview above holds 0 or 1.
LABELS = [0, 1]
# Name of the CSV column that get_dataset passes as `label_name`.
LABEL_COLUMN = "survived"

def get_dataset(file_path, **kwargs):
    """Build a batched dataset of (features, label) pairs from a CSV file.

    Thin wrapper around ``tf.data.experimental.make_csv_dataset`` that fills
    in this notebook's default options.

    Args:
        file_path: Path of the CSV file, passed as the first argument to
            ``make_csv_dataset``.
        **kwargs: Extra keyword arguments forwarded to ``make_csv_dataset``.
            They take precedence over the defaults below, so for example
            ``batch_size=32`` overrides the default of 5 instead of raising
            ``TypeError`` for a duplicated keyword.

    Returns:
        The dataset object returned by ``make_csv_dataset``.

    Note:
        ``num_epochs`` is not set here, so the library default applies; per
        the TensorFlow documentation the dataset then repeats indefinitely.
        Pass ``num_epochs=1`` for a single pass over the file.
    """
    options = {
        "batch_size": 5,  # small default; the cells below print whole batches
        "label_name": LABEL_COLUMN,
        "na_value": "?",
        "ignore_errors": True,
    }
    # Caller-supplied options win over the notebook defaults.
    options.update(kwargs)
    return tf.data.experimental.make_csv_dataset(file_path, **options)

# One dataset per split, both built with get_dataset's default options.
raw_train_data = get_dataset(file_path=train_file_path)
raw_test_data = get_dataset(file_path=test_file_path)

In [32]:
def show_batch(dataset):
    """Print every feature column of one batch taken from ``dataset``.

    Each printed line holds the column name padded to 20 characters, then
    `` : `` and the result of calling ``.numpy()`` on the column's values.
    The label half of the batch is not printed.
    """
    for features, _label in dataset.take(1):
        for name, column in features.items():
            values = column.numpy()
            print(f"{name:20s} : {values}")

# Print one batch of the raw training data.
show_batch(dataset=raw_train_data)

sex                  : [b'female' b'male' b'male' b'male' b'female']
age                  : [28. 28. 28. 44. 35.]
n_siblings_spouses   : [0 0 0 0 0]
parch                : [0 0 0 0 0]
fare                 : [  7.75    8.05    6.95    7.925 512.329]
class                : [b'Third' b'Third' b'Third' b'Third' b'First']
deck                 : [b'unknown' b'unknown' b'unknown' b'unknown' b'unknown']
embark_town          : [b'Queenstown' b'Southampton' b'Queenstown' b'Southampton' b'Cherbourg']
alone                : [b'y' b'y' b'y' b'y' b'y']


In [33]:
# Pass the column names explicitly through `column_names`.
CSV_COLUMNS = [
    'survived',
    'sex',
    'age',
    'n_siblings_spouses',
    'parch',
    'fare',
    'class',
    'deck',
    'embark_town',
    'alone',
]

temp_dataset = get_dataset(
    file_path=train_file_path,
    column_names=CSV_COLUMNS,
)

show_batch(dataset=temp_dataset)

sex                  : [b'male' b'male' b'male' b'male' b'female']
age                  : [65. 44. 22. 32. 40.]
n_siblings_spouses   : [0 1 0 0 1]
parch                : [1 0 0 0 1]
fare                 : [ 61.979  26.      9.35    7.925 134.5  ]
class                : [b'First' b'Second' b'Third' b'Third' b'First']
deck                 : [b'B' b'unknown' b'unknown' b'unknown' b'E']
embark_town          : [b'Cherbourg' b'Southampton' b'Southampton' b'Southampton' b'Cherbourg']
alone                : [b'n' b'n' b'y' b'y' b'n']


In [34]:
# Load only a subset of the CSV columns through `select_columns`.
SELECT_COLUMNS = [
    'survived',
    'age',
    'n_siblings_spouses',
    'class',
    'deck',
    'alone',
]

temp_dataset = get_dataset(
    file_path=train_file_path,
    select_columns=SELECT_COLUMNS,
)

show_batch(dataset=temp_dataset)

age                  : [30.5 47.  24.  28.   2. ]
n_siblings_spouses   : [0 1 2 0 4]
class                : [b'Third' b'Third' b'Second' b'First' b'Third']
deck                 : [b'unknown' b'unknown' b'unknown' b'A' b'unknown']
alone                : [b'y' b'n' b'n' b'y' b'n']


In [35]:
# Select the label plus four numeric columns, and give one default per selected
# column: an int for the label, floats for the features (the printed
# n_siblings_spouses / parch values come out as floats).
SELECT_COLUMNS = [
    'survived',
    'age',
    'n_siblings_spouses',
    'parch',
    'fare',
]
DEFAULTS = [0, 0.0, 0.0, 0.0, 0.0]

temp_dataset = get_dataset(
    file_path=train_file_path,
    select_columns=SELECT_COLUMNS,
    column_defaults=DEFAULTS,
)

show_batch(dataset=temp_dataset)

age                  : [29. 23. 31. 28. 28.]
n_siblings_spouses   : [0. 0. 1. 0. 1.]
parch                : [0. 0. 0. 0. 1.]
fare                 : [ 9.5   13.792 18.     7.75  15.246]


In [37]:
# Pull a single (features, labels) batch out of temp_dataset.
example_batch, labels_batch = next(iter(temp_dataset))

In [38]:
def pack(features, label):
    """Combine all feature columns into one tensor; pass the label through.

    Args:
        features: Mapping whose values are the per-column tensors of a batch.
        label: Label tensor of the batch, returned unchanged.

    Returns:
        A ``(stacked, label)`` tuple where ``stacked`` is ``tf.stack`` of the
        feature values along a new last axis (``axis=-1``).
    """
    columns = list(features.values())
    stacked = tf.stack(columns, axis=-1)
    return stacked, label

In [39]:
# Apply pack() to every batch so the feature columns become a single tensor.
packed_dataset = temp_dataset.map(pack)

# Per the printed output: 5 rows per batch, 4 feature columns each.
for features, labels in packed_dataset.take(1):
    print(features.numpy(), end="\n\n")
    print(labels.numpy())

[[ 51.      0.      0.     12.525]
 [ 32.5     1.      0.     30.071]
 [ 17.      0.      2.    110.883]
 [ 24.      0.      0.      7.496]
 [ 18.      0.      0.      7.796]]

[0 0 1 0 0]


In [40]:
# Draw a batch from temp_dataset again, rebinding example_batch / labels_batch.
example_batch, labels_batch = next(iter(temp_dataset))