In [12]:
from __future__ import absolute_import, division, print_function, unicode_literals

In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters


### Use pandas to download the dataset and load it into a dataframe

In [5]:
# Location of the CSV used throughout this notebook.
URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
# Read the CSV straight from the URL into a DataFrame.
dataframe = pd.read_csv(URL)
# Last expression of the cell, so the first rows are rendered as the cell output.
dataframe.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0


#### Split dataframe into training, validation and test sets.

In [45]:
# Fixed seed so the same rows land in each split on every Restart & Run All.
SPLIT_SEED = 42

# Hold out 20% of all rows for testing, then 20% of the remaining rows for
# validation; whatever is left is the training set.
train, test = train_test_split(dataframe, test_size=0.2, random_state=SPLIT_SEED)
train, val = train_test_split(train, test_size=0.2, random_state=SPLIT_SEED)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

193 train examples
49 validation examples
61 test examples


#### Input pipeline

In [46]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32, label_column='target'):
    """Build a batched ``tf.data.Dataset`` of ``(features, labels)`` from a DataFrame.

    Args:
        dataframe: pandas DataFrame holding the feature columns plus the label
            column. It is copied first, so the caller's frame is not modified.
        shuffle: If true, shuffle the examples using a buffer as large as the
            number of rows.
        batch_size: Number of examples per batch.
        label_column: Name of the column to split off as the label.
            Defaults to ``'target'``.

    Returns:
        The dataset built from ``(dict(features), labels)``, optionally
        shuffled, then batched.

    Raises:
        KeyError: If ``label_column`` is not a column of ``dataframe``.
    """
    features = dataframe.copy()
    labels = features.pop(label_column)
    ds = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    # shuffle() rejects a zero-sized buffer, so skip it for an empty frame.
    if shuffle and len(features) > 0:
        ds = ds.shuffle(buffer_size=len(features))
    return ds.batch(batch_size)

##### Using a small batch size for demonstration

In [47]:
# Five rows per batch keeps the printed examples short.
batch_size = 5

# The training split is shuffled; validation and test keep their row order.
train_ds = df_to_dataset(train, shuffle=True, batch_size=batch_size)
val_ds, test_ds = (
    df_to_dataset(split, shuffle=False, batch_size=batch_size)
    for split in (val, test)
)

In [48]:
# Pull a single batch to show what the input pipeline yields.
for feature_batch, label_batch in train_ds.take(1):
    print('Every feature:', list(feature_batch))
    print('A batch of ages:', feature_batch['age'])
    print('A batch of targets:', label_batch)

Every feature: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
A batch of ages: tf.Tensor([35 60 64 52 65], shape=(5,), dtype=int32)
A batch of targets: tf.Tensor([0 0 0 0 0], shape=(5,), dtype=int32)


##### Using the small batch to demo several types of feature columns

In [49]:
# Take the first element produced by the training dataset and keep only its
# item 0 -- the features half of the (features, labels) pair.
example_batch = next(iter(train_ds))[0]

##### Util func for creating feature column and transform batch of data

In [50]:
def demo(feature_column):
    """Print what a feature column produces for the shared example batch.

    Wraps ``feature_column`` in a ``DenseFeatures`` layer, applies that layer
    to the notebook-level ``example_batch``, and prints the ``.numpy()`` value
    of the result.

    Note: inside this function the parameter shadows the imported
    ``feature_column`` module.
    """
    dense_layer = layers.DenseFeatures(feature_column)
    transformed = dense_layer(example_batch)
    print(transformed.numpy())

##### Numeric column. The simplest type: it represents real-valued features and receives the value from the dataframe unchanged.

In [51]:
# Numeric column: the 'age' value is passed through as-is.
age = feature_column.numeric_column(key="age")
demo(age)

[[35.]
 [60.]
 [64.]
 [52.]
 [65.]]


##### Bucketized column. Splits a numeric feature into ranges and one-hot encodes the range each value falls into.

In [52]:
# Ten boundaries split age into eleven ranges; each output row is one-hot
# over those ranges.
age_buckets = feature_column.bucketized_column(
    source_column=age,
    boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65],
)
demo(age_buckets)

[[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


##### Categorical column.  "thal" is represented as a string.  We need to map it to a numerical value in order to pass it to the model. 

In [53]:
# Categorical column over the three known 'thal' strings.
thal = feature_column.categorical_column_with_vocabulary_list(
    key='thal',
    vocabulary_list=['fixed', 'normal', 'reversible'],
)

# Indicator column: a one-hot view of the categorical column, one output
# column per vocabulary entry.
thal_one_hot = feature_column.indicator_column(categorical_column=thal)
demo(thal_one_hot)

[[0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]


##### Embedding. One-hot encoding becomes impractical when a feature has a large number of categories; an embedding column is a good fit in that situation.

In [54]:
# Reuses the `thal` categorical column defined above and represents each
# category as an 8-dimensional dense vector.
thal_embedding = feature_column.embedding_column(
    categorical_column=thal,
    dimension=8,
)
demo(thal_embedding)

[[-0.3373227  -0.2863546   0.4565269  -0.2883136  -0.35964465  0.60509735
   0.19051819 -0.11198549]
 [ 0.3694477   0.50644326 -0.52974164  0.32144442  0.42678428 -0.01582981
  -0.36653075 -0.17926893]
 [ 0.3694477   0.50644326 -0.52974164  0.32144442  0.42678428 -0.01582981
  -0.36653075 -0.17926893]
 [ 0.3694477   0.50644326 -0.52974164  0.32144442  0.42678428 -0.01582981
  -0.36653075 -0.17926893]
 [ 0.3694477   0.50644326 -0.52974164  0.32144442  0.42678428 -0.01582981
  -0.36653075 -0.17926893]]


##### Hashed column 

In [55]:
# Downside: different strings can collide by being mapped to the same bucket.
# This can still work well regardless.
# Make the bucket count well larger than the number of categories.
thal_hashed = feature_column.categorical_column_with_hash_bucket(
    key='thal',
    hash_bucket_size=1000,
)
demo(feature_column.indicator_column(categorical_column=thal_hashed))

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


##### Feature crosses.  Combine features into a single feature.  Model can learn separate weights for each combo.

In [56]:
# Cross the bucketized age column with the 'thal' categorical column, hashing
# the combinations into 1000 buckets.
crossed_feature = feature_column.crossed_column([age_buckets, thal], hash_bucket_size=1000)
# NOTE(review): the recorded run of this cell ended in
# "SystemError: <built-in function TFE_Py_FastPathExecute> returned a result
# with an error set". A commented-out `import sys` / `print(sys.maxsize)` probe
# was left here, which suggests an integer-size limit of the platform was
# suspected -- TODO confirm against the TensorFlow version and OS in use.
demo(feature_column.indicator_column(crossed_feature))

SystemError: <built-in function TFE_Py_FastPathExecute> returned a result with an error set