In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [2]:
tf.enable_eager_execution()

### Load data

In [3]:
URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
dataframe = pd.read_csv(URL)
dataframe.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0


###  Split train, valid, test

In [4]:
train, test = train_test_split(dataframe, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

193 train examples
49 validation examples
61 test examples


### Create Dataset

In [5]:
def df_to_dataset(df, label, shuffle=True, batch_size=32):
    df = df.copy()
    target = df.pop(label)
    ds = tf.data.Dataset.from_tensor_slices((dict(df), target))
    ds = ds.cache()
    if shuffle:
        ds = ds.shuffle(buffer_size=len(df))
    ds = ds.batch(batch_size)
    return ds

In [6]:
tr_ds = df_to_dataset(train, 'target')
vl_ds = df_to_dataset(train, 'target', shuffle=False)
ts_ds = df_to_dataset(train, 'target', shuffle=False)

### Create Feature Columns

In [7]:
feature_columns = []

# 数值列
for header in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca']:
    feature_columns.append(feature_column.numeric_column(header))

# 分桶列
age = feature_column.numeric_column("age")
age_buckets = feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
feature_columns.append(age_buckets)

# 分类列
thal = feature_column.categorical_column_with_vocabulary_list(
      'thal', ['fixed', 'normal', 'reversible'])
thal_one_hot = feature_column.indicator_column(thal)
feature_columns.append(thal_one_hot)

# 嵌入列
thal_embedding = feature_column.embedding_column(thal, dimension=8)
feature_columns.append(thal_embedding)

# 组合列
crossed_feature = feature_column.crossed_column([age_buckets, thal], hash_bucket_size=1000)
crossed_feature = feature_column.indicator_column(crossed_feature)
feature_columns.append(crossed_feature)

### Model

#### feature column 进 keras

In [8]:
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [9]:
model = tf.keras.Sequential([
  feature_layer,
    layers.Dense(128, activation='relu'),
  layers.Dense(1, activation='sigmoid')
])

In [10]:
loss_object = tf.keras.losses.BinaryCrossentropy()

optimizer = tf.keras.optimizers.Adam()

In [22]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_accuracy')

test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.BinaryAccuracy(name='test_accuracy')

In [23]:
@tf.function
def train_step(images, labels):
    with tf.GradientTape() as tape:
        predictions = model(images)
        labels = labels[:, tf.newaxis]
        loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)
    train_accuracy(labels, predictions)
    
@tf.function
def test_step(images, labels):
    predictions = model(images)
    labels = labels[:, tf.newaxis]
    t_loss = loss_object(labels, predictions)

    test_loss(t_loss)
    test_accuracy(labels, predictions)    

In [24]:
EPOCHS = 5

for epoch in range(EPOCHS):
    # 在下一个epoch开始时，重置评估指标
    train_loss.reset_states()
    train_accuracy.reset_states()
    test_loss.reset_states()
    test_accuracy.reset_states()

    for images, labels in tr_ds:
        train_step(images, labels)
    
    for images, labels in vl_ds:
        test_step(images, labels) 

    template = 'Epoch {}, Loss: {}, Accuracy: {}, Test Loss: {}, Test Accuracy: {}'
    print (template.format(epoch+1,
                         train_loss.result(),
                         train_accuracy.result()*100,
                         test_loss.result(),
                         test_accuracy.result()*100))

Epoch 1, Loss: 0.396497905254364, Accuracy: 78.75647735595703, Test Loss: 0.4110126793384552, Test Accuracy: 81.86528778076172
Epoch 2, Loss: 0.38033920526504517, Accuracy: 79.27461242675781, Test Loss: 0.3959250748157501, Test Accuracy: 81.34715270996094
Epoch 3, Loss: 0.3548123240470886, Accuracy: 80.31088256835938, Test Loss: 0.39764806628227234, Test Accuracy: 81.86528778076172
Epoch 4, Loss: 0.34973788261413574, Accuracy: 80.82901763916016, Test Loss: 0.37926673889160156, Test Accuracy: 82.90155792236328
Epoch 5, Loss: 0.3433705270290375, Accuracy: 82.90155792236328, Test Loss: 0.36969998478889465, Test Accuracy: 82.90155792236328
