In [1]:
import functools

import numpy as np
import tensorflow as tf

In [3]:
# Remote locations of the Titanic training and evaluation CSV files.
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

# Fetch both files with tf.keras.utils.get_file; the results are later handed to
# get_dataset as file paths.
train_file_path = tf.keras.utils.get_file("train.csv", TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file("eval.csv", TEST_DATA_URL)

Downloading data from https://storage.googleapis.com/tf-datasets/titanic/train.csv
Downloading data from https://storage.googleapis.com/tf-datasets/titanic/eval.csv


In [4]:
# Make printed NumPy values easier to read (3 decimal places, no scientific notation).
np.set_printoptions(precision=3, suppress=True)

In [5]:
# LABELS is not referenced in the cells shown here; presumably the possible
# values of the label column -- TODO confirm.
LABELS = [0, 1]
# Name of the CSV column that get_dataset passes to make_csv_dataset as label_name.
LABEL_COLUMN = "survived"

In [8]:
def get_dataset(file_path, **kwargs):
    """Build a batched CSV dataset with ``tf.data.experimental.make_csv_dataset``.

    Args:
        file_path: Passed through unchanged as the first argument of
            ``make_csv_dataset``.
        **kwargs: Extra keyword arguments forwarded to ``make_csv_dataset``
            (the notebook uses ``column_names``, ``select_columns`` and
            ``column_defaults``). A key that matches one of the defaults below
            overrides that default instead of raising ``TypeError`` for a
            duplicated keyword argument.

    Returns:
        Whatever ``make_csv_dataset`` returns for the merged options.
    """
    # Notebook-wide defaults. LABEL_COLUMN is looked up at call time.
    options = {
        "batch_size": 5,  # small batches: the notebook prints whole batches
        "label_name": LABEL_COLUMN,
        "na_value": "?",
        "num_epochs": 1,
        "ignore_errors": True,
    }
    # Caller-supplied options win over the defaults.
    options.update(kwargs)
    return tf.data.experimental.make_csv_dataset(file_path, **options)

def show_batch(dataset):
    """Print each feature column of the first element of ``dataset``.

    Elements are unpacked as ``(features, label)`` pairs; ``features`` must
    provide ``.items()`` and each value must provide ``.numpy()``. The label
    half of the pair is not used here, so it never appears in the output.
    """
    line_template = "{:20s}: {}"
    for feature_columns, _unused_label in dataset.take(1):
        for column_name, column_tensor in feature_columns.items():
            print(line_template.format(column_name, column_tensor.numpy()))

In [9]:
# Build the raw train / test datasets from the downloaded CSV files.
# (Original note: "I don't understand the shape.") Judging by how show_batch
# consumes it, each element is a (features, label) pair in which features maps
# a column name to that column's values for one batch (batch_size=5 in get_dataset).
raw_train_data = get_dataset(train_file_path)
raw_test_data = get_dataset(test_file_path)

In [10]:
# Display the feature columns of one batch of the raw training data.
show_batch(raw_train_data)

sex                 : [b'male' b'female' b'male' b'female' b'male']
age                 : [28. 28. 31. 27. 36.]
n_siblings_spouses  : [0 0 0 1 1]
parch               : [0 0 0 0 2]
fare                : [ 26.55   7.75  10.5   21.   120.  ]
class               : [b'First' b'Third' b'Second' b'Second' b'First']
deck                : [b'C' b'unknown' b'unknown' b'unknown' b'B']
embark_town         : [b'Southampton' b'Queenstown' b'Southampton' b'Southampton' b'Southampton']
alone               : [b'y' b'y' b'y' b'n' b'n']


In [13]:
# Dataset loading, variant 2: pass the column names explicitly via column_names.
CSV_COLUMNS = ['survived', 'sex', 'age', 'n_siblings_spouses', 'parch', 'fare', 'class', 'deck', 'embark_town', 'alone']
# NOTE: temp_dataset is reassigned by several later cells, so cell order matters.
temp_dataset = get_dataset(train_file_path, column_names=CSV_COLUMNS)
show_batch(temp_dataset)

sex                 : [b'female' b'male' b'male' b'female' b'male']
age                 : [58. 36. 28.  9. 30.]
n_siblings_spouses  : [0 0 0 2 0]
parch               : [0 0 0 2 0]
fare                : [146.521  26.388  13.     34.375   7.225]
class               : [b'First' b'First' b'Second' b'Third' b'Third']
deck                : [b'B' b'E' b'unknown' b'unknown' b'unknown']
embark_town         : [b'Cherbourg' b'Southampton' b'Southampton' b'Southampton' b'Cherbourg']
alone               : [b'y' b'y' b'y' b'n' b'y']


In [14]:
# How to load only specific columns.
SELECT_COLUMNS = ['survived', 'age', 'n_siblings_spouses', 'class', 'deck', 'alone']
temp_dataset = get_dataset(train_file_path, select_columns=SELECT_COLUMNS)  # select_columns extracts only the listed columns
show_batch(temp_dataset)

age                 : [19. 71. 21. 23. 30.]
n_siblings_spouses  : [0 0 0 2 0]
class               : [b'Third' b'First' b'Third' b'Second' b'Third']
deck                : [b'unknown' b'A' b'unknown' b'unknown' b'unknown']
alone               : [b'y' b'y' b'y' b'n' b'y']


In [15]:
# Original question: "'survived' has gone missing from the output -- why?"
# Answer: get_dataset passes label_name=LABEL_COLUMN ("survived"), so that column
# is delivered as the label half of each (features, label) element, and
# show_batch prints only the features half.
SELECT_COLUMNS = ['survived', 'age', 'n_siblings_spouses', 'parch', 'fare']
# One default per selected column, in the same order as SELECT_COLUMNS.
# Presumably the Python type of each default also fixes the column dtype (the
# recorded output shows n_siblings_spouses / parch as floats here) -- confirm
# against the make_csv_dataset documentation.
DEFAULTS = [0, 0.0, 0.0, 0.0, 0.0]
temp_dataset = get_dataset(train_file_path, 
                           select_columns=SELECT_COLUMNS,
                           column_defaults = DEFAULTS)
show_batch(temp_dataset)

age                 : [11. 44. 24. 36. 35.]
n_siblings_spouses  : [0. 0. 0. 1. 0.]
parch               : [0. 1. 0. 2. 0.]
fare                : [ 18.788  57.979  13.    120.     10.5  ]


In [17]:
# Vectorize the numeric columns.
def pack(features, label):
    """Stack all feature values along a new last axis.

    Args:
        features: Mapping whose ``.values()`` are handed to ``tf.stack``.
        label: Returned unchanged.

    Returns:
        A ``(stacked_features, label)`` tuple.
    """
    columns = list(features.values())
    stacked = tf.stack(columns, axis=-1)
    return stacked, label

In [18]:
# Apply pack to every element of temp_dataset. This relies on temp_dataset being
# the numeric-only dataset built in the select_columns / column_defaults cell
# (temp_dataset is reassigned several times above, so cell order matters).
packed_dataset = temp_dataset.map(pack)
# Show the stacked features and the labels of the first batch.
for features, labels in packed_dataset.take(1):
    print(features.numpy())
    print()
    print(labels.numpy())

[[ 49.      1.      0.     89.104]
 [ 24.      0.      3.     19.258]
 [ 50.      0.      1.    247.521]
 [ 28.      0.      0.      7.879]
 [ 49.      1.      1.    110.883]]

[1 1 1 1 0]


In [19]:
# Display a batch of the raw training data again. The recorded output differs
# from the earlier call; presumably make_csv_dataset shuffles by default -- confirm.
show_batch(raw_train_data)

sex                 : [b'male' b'male' b'male' b'male' b'female']
age                 : [28. 28. 24. 23. 19.]
n_siblings_spouses  : [0 0 0 0 1]
parch               : [0 0 0 0 0]
fare                : [ 7.25   8.113  8.05  10.5   26.   ]
class               : [b'Third' b'Third' b'Third' b'Second' b'Second']
deck                : [b'unknown' b'unknown' b'unknown' b'unknown' b'unknown']
embark_town         : [b'Southampton' b'Southampton' b'Southampton' b'Southampton'
 b'Southampton']
alone               : [b'y' b'y' b'y' b'y' b'n']


In [21]:
# Pull one (features, labels) element from temp_dataset for inspection.
# NOTE(review): both names are reassigned from packed_train_data in a later cell.
example_batch, labels_batch = next(iter(temp_dataset))

In [23]:
class PackNumericFeatures(object):
    """Callable that merges the named columns into a single ``'numeric'`` feature.

    Intended for use with ``dataset.map(...)``: each call receives a
    ``(features, labels)`` pair and returns a new pair in which the columns
    listed in ``names`` have been removed from ``features`` and replaced by one
    ``'numeric'`` entry.
    """

    def __init__(self, names):
        """Remember the column names to pack, in stacking order."""
        self.names = names

    def __call__(self, features, labels):
        """Return ``(packed_features, labels)`` without modifying ``features``.

        Args:
            features: Mapping from column name to values. It must provide
                ``.copy()`` and ``.pop()`` (``dict`` / ``OrderedDict`` do).
            labels: Returned unchanged.

        Raises:
            KeyError: If a name in ``self.names`` is missing from ``features``.
        """
        # Work on a shallow copy: popping from the caller's mapping would
        # mutate their batch and make a second call on it fail with KeyError.
        features = features.copy()
        # Remove each named column, cast it to float32, then stack the
        # results along a new last axis.
        numeric_features = [
            tf.cast(features.pop(name), tf.float32) for name in self.names
        ]
        features['numeric'] = tf.stack(numeric_features, axis=-1)

        return features, labels

In [24]:
# Columns that PackNumericFeatures pulls out of each batch and stacks, in this
# order, into the single 'numeric' feature.
NUMERIC_FEATURES = ['age','n_siblings_spouses','parch', 'fare']

# Apply the same packing to the training data ...
packed_train_data = raw_train_data.map(
    PackNumericFeatures(NUMERIC_FEATURES))

# ... and to the test data.
packed_test_data = raw_test_data.map(
    PackNumericFeatures(NUMERIC_FEATURES))

In [25]:
# Display one packed training batch: the numeric columns now appear under 'numeric'.
show_batch(packed_train_data)

sex                 : [b'male' b'male' b'female' b'male' b'female']
class               : [b'Third' b'Third' b'Third' b'Third' b'Third']
deck                : [b'unknown' b'unknown' b'unknown' b'unknown' b'unknown']
embark_town         : [b'Cherbourg' b'Southampton' b'Southampton' b'Southampton' b'Cherbourg']
alone               : [b'y' b'y' b'n' b'y' b'y']
numeric             : [[20.     0.     0.     7.229]
 [24.     0.     0.     7.896]
 [18.     0.     1.     9.35 ]
 [55.5    0.     0.     8.05 ]
 [15.     0.     0.     7.225]]


In [26]:
# Pull one (features, labels) element from the packed training data.
# This overwrites the example_batch / labels_batch taken earlier from temp_dataset.
example_batch, labels_batch = next(iter(packed_train_data))

## データの正規化