In [2]:
# !pip install -q numpy pandas tensorflow

## references
# https://keras.io/examples/structured_data/structured_data_classification_from_scratch/
# https://www.kaggle.com/datasets/pschale/mlb-pitch-data-20152018/code
# https://www.kaggle.com/code/ryancmcv/mlb-pitch-data
# https://stackoverflow.com/questions/64689483/how-to-do-multiclass-classification-with-keras

# to_ordinal https://www.tensorflow.org/api_docs/python/tf/keras/utils/to_ordinal
# to_categorical https://www.tensorflow.org/api_docs/python/tf/keras/utils/to_categorical

# TODO filter features that fewer than n instances (ie pitchers who only pitched a game or so.) n=5 maybe?


import os

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras import layers
from tensorflow.keras.layers import Normalization
from tensorflow.keras.layers import IntegerLookup
from tensorflow.keras.layers import StringLookup

tf.__version__

2023-08-22 18:58:35.167368: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-22 18:58:35.209105: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-22 18:58:35.210374: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'2.13.0'

In [3]:
file_url = "http://storage.googleapis.com/download.tensorflow.org/data/heart.csv"
df = pd.read_csv(file_url)

val_dataframe = df.sample(frac=0.2, random_state=1337)
train_dataframe = df.drop(val_dataframe.index)

print(
    "Using %d samples for training and %d for validation"
    % (len(train_dataframe), len(val_dataframe))
)

Using 242 samples for training and 61 for validation


In [4]:

def dataframe_to_dataset(dataframe):
    copied_dataframe = dataframe.copy()
    labels = copied_dataframe.pop("target")
    ds = tf.data.Dataset.from_tensor_slices((dict(copied_dataframe), labels))
    ds = ds.shuffle(buffer_size=len(copied_dataframe))
    return ds


train_ds = dataframe_to_dataset(train_dataframe)
val_ds = dataframe_to_dataset(val_dataframe)


for x, y in train_ds.take(1):
    print("Input:", x)
    print("Target:", y)

    

Input: {'age': <tf.Tensor: shape=(), dtype=int64, numpy=57>, 'sex': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'cp': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'trestbps': <tf.Tensor: shape=(), dtype=int64, numpy=130>, 'chol': <tf.Tensor: shape=(), dtype=int64, numpy=236>, 'fbs': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'restecg': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'thalach': <tf.Tensor: shape=(), dtype=int64, numpy=174>, 'exang': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'oldpeak': <tf.Tensor: shape=(), dtype=float64, numpy=0.0>, 'slope': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'ca': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'thal': <tf.Tensor: shape=(), dtype=string, numpy=b'2'>}
Target: tf.Tensor(0, shape=(), dtype=int64)


In [5]:
def dataframe_to_dataset(dataframe):
    dataframe = dataframe.copy()
    labels = dataframe.pop("target")
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    ds = ds.shuffle(buffer_size=len(dataframe))
    return ds

train_ds = dataframe_to_dataset(train_dataframe)
val_ds = dataframe_to_dataset(val_dataframe)

for x, y in train_ds.take(1):
    print("Input:", x)
    print("Target:", y)


train_ds = train_ds.batch(32)
val_ds = val_ds.batch(32)

Input: {'age': <tf.Tensor: shape=(), dtype=int64, numpy=57>, 'sex': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'cp': <tf.Tensor: shape=(), dtype=int64, numpy=4>, 'trestbps': <tf.Tensor: shape=(), dtype=int64, numpy=140>, 'chol': <tf.Tensor: shape=(), dtype=int64, numpy=192>, 'fbs': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'restecg': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'thalach': <tf.Tensor: shape=(), dtype=int64, numpy=148>, 'exang': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'oldpeak': <tf.Tensor: shape=(), dtype=float64, numpy=0.4>, 'slope': <tf.Tensor: shape=(), dtype=int64, numpy=2>, 'ca': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'thal': <tf.Tensor: shape=(), dtype=string, numpy=b'fixed'>}
Target: tf.Tensor(0, shape=(), dtype=int64)


In [6]:

from tensorflow.keras.layers import IntegerLookup
from tensorflow.keras.layers import Normalization
from tensorflow.keras.layers import StringLookup


def encode_feature_with_normalizer(feature, name, dataset, normalizer):
    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))
    
    normalizer.adapt(feature_ds)
    encoded_feature = normalizer(feature)
    return encoded_feature



In [10]:

all_feature_layers, all_inputs = [], []

feature_types = {
    "int64" : {
        "features" : ["sex", "cp", "fbs", "restecg", "exang", "ca",],
        "normalizer_provider" : lambda : IntegerLookup(output_mode="binary")
    },
    "string" : {
        "features" : ["thal",],
        "normalizer_provider" : lambda : StringLookup(output_mode="binary")
    },
    "float32" : {
        "features" : ["age", "trestbps", "chol", "thalach", "oldpeak", "slope", ],
        "normalizer_provider" : lambda : Normalization()
    },
}

feature_inputs_combined, feature_layers_combined = [], []

for feature_type, feature_info in feature_types.items():
    for feature_name in feature_info["features"]:
        feature_input = keras.Input(shape=(1,), name=feature_name, dtype=feature_type)
        feature_inputs_combined.append(feature_input)
        normalizer = feature_info["normalizer_provider"]()
        print("encoding %s: %s using %s" %(feature_type, feature_name, normalizer))        
        feature_encoded = encode_feature_with_normalizer(feature_input, feature_name, train_ds, normalizer)
        feature_layers_combined.append(feature_encoded)

# for feature_name in ["sex", "cp", "fbs", "restecg", "exang", "ca",]:
#     feature_input = keras.Input(shape=(1,), name=feature_name, dtype="int64")
#     all_inputs.append(feature_input)
#     normalizer = IntegerLookup(output_mode="binary")
#     feature_encoded = encode_feature_with_normalizer(feature_input, feature_name, train_ds, normalizer)
#     all_feature_layers.append(feature_encoded)

# string_categorial_features = ["thal"]
# for feature_name in string_categorial_features:
#     feature_input = keras.Input(shape=(1,), name=feature_name, dtype="string")
#     all_inputs.append(feature_input)
#     normalizer = StringLookup(output_mode="binary")
#     feature_encoded = encode_feature_with_normalizer(feature_input, feature_name, train_ds, normalizer)
#     all_feature_layers.append(feature_encoded)

# numeric_categorial_features = ["age", "trestbps", "chol", "thalach", "oldpeak", "slope", ]
# for feature_name in numeric_categorial_features:
#     feature_input = keras.Input(shape=(1,), name=feature_name, dtype="float32")
#     all_inputs.append(feature_input)
#     normalizer = Normalization()
#     feature_encoded = encode_feature_with_normalizer(feature_input, feature_name, train_ds, normalizer)
#     all_feature_layers.append(feature_encoded)


all_features = layers.concatenate(feature_layers_combined)
x = layers.Dense(32, activation="relu")(all_features)
x = layers.Dropout(0.5)(x)
output = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(all_inputs, output)
model.compile("adam", "binary_crossentropy", metrics=["accuracy"])



encoding int64: sex using <keras.src.layers.preprocessing.integer_lookup.IntegerLookup object at 0x7c85e42ed250>
encoding int64: cp using <keras.src.layers.preprocessing.integer_lookup.IntegerLookup object at 0x7c85e4599730>
encoding int64: fbs using <keras.src.layers.preprocessing.integer_lookup.IntegerLookup object at 0x7c85ec618850>
encoding int64: restecg using <keras.src.layers.preprocessing.integer_lookup.IntegerLookup object at 0x7c85ccebb1f0>
encoding int64: exang using <keras.src.layers.preprocessing.integer_lookup.IntegerLookup object at 0x7c85cced5e20>
encoding int64: ca using <keras.src.layers.preprocessing.integer_lookup.IntegerLookup object at 0x7c85cceeb730>
encoding string: thal using <keras.src.layers.preprocessing.string_lookup.StringLookup object at 0x7c85ccef4910>
encoding float32: age using <keras.src.layers.preprocessing.normalization.Normalization object at 0x7c85cce8e1f0>
encoding float32: trestbps using <keras.src.layers.preprocessing.normalization.Normalizatio

ValueError: A `Concatenate` layer should be called on a list of at least 1 input. Received: input_shape=()

In [None]:
model.fit(train_ds, epochs=50, validation_data=val_ds)


In [None]:
sample = {
    "age": 55,
    "sex": 1,
    "cp": 1,
    "trestbps": 145,
    "chol": 210,
    "fbs": 1,
    "restecg": 2,
    "thalach": 150,
    "exang": 0,
    "oldpeak": 2.3,
    "slope": 3,
    "ca": 0,
    "thal": "fixed",
}

input_dict = {name: tf.convert_to_tensor([value]) for name, value in sample.items()}
predictions = model.predict(input_dict)

print(
    "This particular patient had a %.1f percent probability "
    "of having a heart disease, as evaluated by our model." % (100 * predictions[0][0],)
)

predictions
