In [None]:
# !pip install -q numpy pandas tensorflow

## references
# https://keras.io/examples/structured_data/structured_data_classification_from_scratch/
# https://www.kaggle.com/datasets/pschale/mlb-pitch-data-20152018/code
# https://www.kaggle.com/code/ryancmcv/mlb-pitch-data
# https://stackoverflow.com/questions/64689483/how-to-do-multiclass-classification-with-keras

# to_ordinal https://www.tensorflow.org/api_docs/python/tf/keras/utils/to_ordinal
# to_categorical https://www.tensorflow.org/api_docs/python/tf/keras/utils/to_categorical

# TODO: filter out feature values with fewer than n instances (i.e., pitchers who only pitched a game or so); n=5 maybe?


import os

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

from keras import layers
from keras.layers import Normalization
from keras.layers import IntegerLookup
from keras.layers import StringLookup

tf.__version__

In [None]:
# Download the heart-disease CSV over HTTPS rather than plain HTTP so the
# training data is integrity-protected in transit.
file_url = "https://storage.googleapis.com/download.tensorflow.org/data/heart.csv"
df = pd.read_csv(file_url)

# Hold out a reproducible 20% sample of the rows for validation (fixed seed);
# every row that was not sampled is used for training.
val_dataframe = df.sample(frac=0.2, random_state=1337)
train_dataframe = df.drop(val_dataframe.index)

print(
    "Using %d samples for training and %d for validation"
    % (len(train_dataframe), len(val_dataframe))
)

In [None]:

# NOTE(review): a later cell defines `dataframe_to_dataset` again; after a
# top-to-bottom run that later definition is the one in effect.
def dataframe_to_dataset(dataframe):
    """Turn a DataFrame into a shuffled ``tf.data.Dataset`` of ``(features, label)`` pairs.

    The ``"target"`` column is split off as the label and the remaining
    columns become a dict keyed by column name. The work is done on a copy,
    so the frame passed in is not modified.
    """
    feature_frame = dataframe.copy()
    target_values = feature_frame.pop("target")
    # Shuffle buffer sized to the whole frame.
    row_count = len(feature_frame)
    return tf.data.Dataset.from_tensor_slices(
        (dict(feature_frame), target_values)
    ).shuffle(buffer_size=row_count)


# Build the training and validation datasets (in that order) from their frames.
train_ds, val_ds = map(dataframe_to_dataset, (train_dataframe, val_dataframe))

# Peek at a single training example to sanity-check the element structure.
for x, y in train_ds.take(1):
    print("Input:", x)
    print("Target:", y)

    

In [None]:
def dataframe_to_dataset(dataframe, label_column="target", shuffle=True):
    """Convert a pandas DataFrame into a ``tf.data.Dataset`` of ``(features, label)`` pairs.

    The caller's frame is left untouched: the label column is popped from a
    copy.

    Args:
        dataframe: Frame holding the feature columns plus the label column.
        label_column: Name of the column to split off as the label.
            Defaults to ``"target"``.
        shuffle: When true (the default), shuffle the dataset with a buffer
            as large as the frame. Pass ``False`` to keep row order.

    Returns:
        An unbatched dataset whose elements are ``(feature_dict, label)``,
        where ``feature_dict`` maps each remaining column name to its value.

    Raises:
        KeyError: If ``label_column`` is not a column of ``dataframe``.
    """
    dataframe = dataframe.copy()
    labels = dataframe.pop(label_column)
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    # `shuffle` rejects a zero-sized buffer, so an empty frame is returned
    # unshuffled instead of producing a dataset that fails when iterated.
    if shuffle and len(dataframe) > 0:
        ds = ds.shuffle(buffer_size=len(dataframe))
    return ds

# Rebuild both datasets from their frames (training first, then validation).
train_ds, val_ds = (
    dataframe_to_dataset(frame) for frame in (train_dataframe, val_dataframe)
)

# Show one unbatched training example before batching is applied.
for x, y in train_ds.take(1):
    print("Input:", x)
    print("Target:", y)

# Group both datasets into batches of 32 elements.
train_ds, val_ds = train_ds.batch(32), val_ds.batch(32)

In [None]:


def encode_feature_with_normalizer(feature, name, dataset, normalizer):
    """Adapt ``normalizer`` to one feature of ``dataset`` and apply it to ``feature``.

    Args:
        feature: Input the adapted layer is called on (a ``keras.Input`` in
            this notebook).
        name: Key of the feature inside each element's feature dict.
        dataset: Dataset yielding ``(features, label)`` pairs, where
            ``features`` can be indexed by ``name``.
        normalizer: Preprocessing layer exposing ``adapt``; it is adapted
            in place.

    Returns:
        The result of calling the adapted ``normalizer`` on ``feature``.
    """

    def select_feature(features, _label):
        # Keep only the requested feature and append a trailing axis of size 1.
        return tf.expand_dims(features[name], -1)

    normalizer.adapt(dataset.map(select_feature))
    return normalizer(feature)



In [None]:

# Feature columns grouped by the dtype of their Keras input. Each group also
# carries a zero-argument factory that builds a fresh preprocessing layer, so
# every feature gets its own independently adapted layer.
# NOTE(review): newer Keras releases call this output mode "multi_hot";
# confirm that "binary" is still accepted by the installed version.
feature_types = {
    "int64": {
        "features": ["sex", "cp", "fbs", "restecg", "exang", "ca"],
        "normalizer_provider": lambda: IntegerLookup(output_mode="binary"),
    },
    "string": {
        "features": ["thal"],
        "normalizer_provider": lambda: StringLookup(output_mode="binary"),
    },
    "float32": {
        "features": ["age", "trestbps", "chol", "thalach", "oldpeak", "slope"],
        "normalizer_provider": Normalization,
    },
}

feature_inputs_combined = []
feature_layers_combined = []

# One named input per feature, each paired with a preprocessing layer that is
# adapted on the training set.
for feature_type, feature_info in feature_types.items():
    for feature_name in feature_info["features"]:
        feature_input = keras.Input(shape=(1,), name=feature_name, dtype=feature_type)
        normalizer = feature_info["normalizer_provider"]()
        feature_encoded = encode_feature_with_normalizer(
            feature_input, feature_name, train_ds, normalizer
        )
        feature_inputs_combined.append(feature_input)
        feature_layers_combined.append(feature_encoded)

# Classification head: concatenate all encoded features, then one hidden
# layer with dropout, then a single sigmoid output unit.
all_features = layers.concatenate(feature_layers_combined)
x = layers.Dense(32, activation="relu")(all_features)
x = layers.Dropout(0.5)(x)
output = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs=feature_inputs_combined, outputs=output)
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)



In [None]:
# Train on the batched training set for 50 epochs, using the batched
# validation set as validation data.
model.fit(
    x=train_ds,
    validation_data=val_ds,
    epochs=50,
)


In [None]:
# A single hand-written record, keyed by the model's input names.
sample = {
    "age": 55,
    "sex": 1,
    "cp": 1,
    "trestbps": 145,
    "chol": 210,
    "fbs": 1,
    "restecg": 2,
    "thalach": 150,
    "exang": 0,
    "oldpeak": 2.3,
    "slope": 3,
    "ca": 0,
    "thal": "fixed",
}

# Wrap every value in a one-element list so each input becomes a batch of one.
input_dict = dict(
    (name, tf.convert_to_tensor([value])) for name, value in sample.items()
)
predictions = model.predict(input_dict)

# The first (and only) prediction, scaled from a fraction to a percentage.
probability_percent = 100 * predictions[0][0]
print(
    "This particular patient had a %.1f percent probability "
    "of having a heart disease, as evaluated by our model." % (probability_percent,)
)

predictions
