# Heart Attack Risk Prediction

In [None]:
#import library
import tensorflow as tf
from tfx.components import CsvExampleGen, StatisticsGen, SchemaGen, ExampleValidator, Transform, Trainer, Tuner
from tfx.proto import example_gen_pb2
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
import os
import pandas as pd

## Clean Dataset

In [None]:
# Load the raw dataset.
df = pd.read_csv('dataset/heart_attack_dataset.csv')

# Drop the columns that are not used as model features.
# errors='ignore' skips columns that are already absent, so the cell no longer
# relies on the presence of 'Patient ID' alone to decide whether the other
# nine columns can be dropped.
df = df.drop(
    columns=[
        'Patient ID',
        'Sex',
        'Blood Pressure',
        'Sedentary Hours Per Day',
        'BMI',
        'Country',
        'Continent',
        'Hemisphere',
        'Exercise Hours Per Week',
        'Diet',
    ],
    errors='ignore',
)

df.head()

In [None]:
# Create the output directory first: to_csv does not create missing parent
# directories and fails if 'clean_dataset' does not exist yet.
os.makedirs('clean_dataset', exist_ok=True)
df.to_csv('clean_dataset/heart_attack_clean.csv', index=False)

In [None]:
# Print a summary of the remaining columns of the cleaned frame.
df.info()

## Set Variable

In [None]:
# Names used to key every pipeline-related path below.
PIPELINE_NAME = 'hanhanhanny-pipeline'
SCHEMA_PIPELINE_NAME = 'heart-attack-schema'

# Locations derived from the pipeline name:
#   PIPELINE_ROOT      - directory that stores pipeline artifacts
#   METADATA_PATH      - SQLite DB file for MLMD storage
#   SERVING_MODEL_DIR  - directory that receives exported models
PIPELINE_ROOT = os.path.join("pipelines", PIPELINE_NAME)
METADATA_PATH = os.path.join("metadata", PIPELINE_NAME, "metadata.db")
SERVING_MODEL_DIR = os.path.join("serving_model", PIPELINE_NAME)

In [None]:
# Directory that holds the cleaned CSV; it is passed to CsvExampleGen as input_base.
DATA_ROOT = "clean_dataset"  # the data is placed in the "clean_dataset" directory
# Initialize the interactive context with PIPELINE_ROOT as its artifact root.
# NOTE(review): METADATA_PATH is defined earlier but is not passed here —
# confirm that relying on the context's default metadata store is intended.
interactive_context = InteractiveContext(pipeline_root=PIPELINE_ROOT)

## Data Ingestion

In [None]:
# Split the ingested examples by hash bucket: 8 buckets for "train" and
# 2 buckets for "eval".
output = example_gen_pb2.Output(
    split_config=example_gen_pb2.SplitConfig(
        splits=[
            example_gen_pb2.SplitConfig.Split(name=split_name, hash_buckets=bucket_count)
            for split_name, bucket_count in (("train", 8), ("eval", 2))
        ]
    )
)

# Read the CSV files under DATA_ROOT using the split configuration above.
example_gen = CsvExampleGen(output_config=output, input_base=DATA_ROOT)

In [None]:
# Execute the CsvExampleGen component inside the interactive context.
interactive_context.run(example_gen)

## Data Validation

In [None]:
# Compute statistics over the examples produced by the ingestion step, then
# execute the component.
statistics_gen = StatisticsGen(examples=example_gen.outputs["examples"])
interactive_context.run(statistics_gen)

In [None]:
# Display the "statistics" output of StatisticsGen.
interactive_context.show(statistics_gen.outputs["statistics"])

In [None]:
# Derive a schema from the computed statistics, then execute the component.
schema_gen = SchemaGen(statistics=statistics_gen.outputs["statistics"])
interactive_context.run(schema_gen)

In [None]:
# Display the "schema" output of SchemaGen.
interactive_context.show(schema_gen.outputs["schema"])

In [None]:
# Check the computed statistics against the schema, then execute the
# component.
example_validator = ExampleValidator(
    schema=schema_gen.outputs["schema"],
    statistics=statistics_gen.outputs["statistics"],
)
interactive_context.run(example_validator)

In [None]:
# Display the "anomalies" output of ExampleValidator.
interactive_context.show(example_validator.outputs["anomalies"])

## Data Preprocessing

In [None]:
# File name of the transform module; the next cell writes it with %%writefile
# and the Transform component receives its absolute path as module_file.
TRANSFORM_MODULE_FILE = "heart_attack_transform.py"

In [None]:
%%writefile {TRANSFORM_MODULE_FILE}

import tensorflow as tf
import tensorflow_transform as tft
import os

# Set a custom temporary directory
# NOTE(review): '/path/to/your/temp/dir' looks like an unfilled placeholder —
# replace it with a real directory, or confirm this variable is needed at all.
os.environ['TF_TFT_TMP_DIR'] = '/path/to/your/temp/dir'

def transformed_key(key):
    """Return ``key`` with the "_xf" suffix that marks a transformed feature."""
    return "".join((key, "_xf"))

def preprocessing_fn(inputs):
    """
    Build the transformed feature map from the raw feature map.

    Args:
        inputs: map from raw feature keys to raw features.

    Return:
        map from output feature keys to transformed features.

    Description:
        - "Age" is cast to int64 and stored under its "_xf" name
        - the other numeric features go through tft.scale_to_0_1 and are
          stored under their "_xf" names
        - binary features are passed through unchanged, without the suffix
        - the label "Heart Attack Risk" is cast to int64
        - output keys use underscores where the raw keys contain spaces
    """
    # raw key -> output base name, for features scaled with tft.scale_to_0_1
    scaled_columns = {
        "Cholesterol": "Cholesterol",
        "Triglycerides": "Triglycerides",
        "Income": "Income",
        "Heart Rate": "Heart_Rate",
        "Stress Level": "Stress_Level",
        "Physical Activity Days Per Week": "Physical_Activity_Days_Per_Week",
        "Sleep Hours Per Day": "Sleep_Hours_Per_Day",
    }
    # raw key -> output key, for binary features copied as-is
    passthrough_columns = {
        "Smoking": "Smoking",
        "Diabetes": "Diabetes",
        "Family History": "Family_History",
        "Obesity": "Obesity",
        "Alcohol Consumption": "Alcohol_Consumption",
        "Previous Heart Problems": "Previous_Heart_Problems",
        "Medication Use": "Medication_Use",
    }

    features = {transformed_key("Age"): tf.cast(inputs["Age"], tf.int64)}

    for raw_key, base_name in scaled_columns.items():
        features[transformed_key(base_name)] = tft.scale_to_0_1(inputs[raw_key])

    for raw_key, output_key in passthrough_columns.items():
        features[output_key] = inputs[raw_key]

    # Target feature
    features["Heart_Attack_Risk"] = tf.cast(inputs["Heart Attack Risk"], tf.int64)

    return features

In [None]:
transform = Transform(
    examples=example_gen.outputs["examples"],
    schema=schema_gen.outputs["schema"],
    module_file=os.path.abspath(TRANSFORM_MODULE_FILE)
)
interactive_context.run(transform)

## Model Training

In [None]:
# File name of the module shared by the Tuner and Trainer components; the next
# cell writes it with %%writefile.
TRAINER_TUNER_MODULE_FILE = "heart_attack_trainer.py"

In [None]:
%%writefile {TRAINER_TUNER_MODULE_FILE}

import tensorflow as tf
import tensorflow_transform as tft 
from tensorflow.keras import layers
import os
from keras_tuner.engine import base_tuner
from keras_tuner import RandomSearch, HyperParameters
from tfx.components.trainer.fn_args_utils import FnArgs
from typing import NamedTuple, Dict, Text, Any

# Label column name as it appears in the raw (pre-transform) data. The
# transform module emits the label under "Heart_Attack_Risk" instead.
LABEL_KEY = "Heart Attack Risk"

def transformed_name(key):
    """Return ``key`` with the "_xf" suffix that marks a transformed feature."""
    return "".join((key, "_xf"))

def gzip_reader_fn(filenames):
    """Open ``filenames`` as a GZIP-compressed TFRecord dataset."""
    reader_options = {'compression_type': 'GZIP'}
    return tf.data.TFRecordDataset(filenames, **reader_options)

def get_hyperparameters() -> HyperParameters:
    """Create the hyperparameter space (with defaults) used to build the model."""
    search_space = HyperParameters()

    # Width of the first dense layer.
    search_space.Int('units', default=128, min_value=32, max_value=512, step=32)
    # Total number of hidden dense layers.
    search_space.Int('num_layers', default=3, min_value=1, max_value=4, step=1)
    # Optimizer learning rate, sampled on a log scale.
    search_space.Float(
        'learning_rate',
        default=1e-2,
        min_value=1e-2,
        max_value=1e-1,
        sampling='LOG',
    )

    return search_space

def input_fn(file_pattern, 
             tf_transform_output,
             num_epochs=None,
             batch_size=64)->tf.data.Dataset:
    """Build a batched dataset of (features, label) from transformed examples.

    Args:
        file_pattern: files (or file patterns) with the transformed examples,
            read through ``gzip_reader_fn``.
        tf_transform_output: object providing ``transformed_feature_spec()``.
        num_epochs: forwarded to ``make_batched_features_dataset``.
        batch_size: number of examples per batch.

    Returns:
        The batched dataset, with an extra shuffle applied on top.

    Raises:
        KeyError: if the label cannot be found in the transformed feature spec.
    """

    # Get post_transform feature spec
    transform_feature_spec = (
        tf_transform_output.transformed_feature_spec().copy())

    # LABEL_KEY holds the raw column name ("Heart Attack Risk"), but the
    # transform module writes the label with underscores
    # ("Heart_Attack_Risk"). label_key must be one of the feature spec keys,
    # so resolve whichever spelling the transformed spec actually contains.
    label_candidates = (LABEL_KEY, LABEL_KEY.replace(" ", "_"))
    label_key = next(
        (name for name in label_candidates if name in transform_feature_spec),
        None)
    if label_key is None:
        raise KeyError(
            f"Label {LABEL_KEY!r} not found in transformed feature spec; "
            f"available keys: {sorted(transform_feature_spec)}")

    # create batches of data
    dataset = tf.data.experimental.make_batched_features_dataset(
        file_pattern=file_pattern,
        batch_size=batch_size,
        features=transform_feature_spec,
        reader=gzip_reader_fn,
        num_epochs=num_epochs,
        label_key=label_key)
    # Applied after batching, so this shuffles whole batches.
    dataset = dataset.shuffle(buffer_size=10000)
    return dataset

def model_builder(hparams: HyperParameters):
    """Build and compile the binary classification model.

    Args:
        hparams: hyperparameters providing 'units', 'num_layers' and
            'learning_rate'.

    Returns:
        A compiled ``tf.keras.Model`` taking one (batch, 1) input per feature
        and producing a single sigmoid output.
    """
    # Features produced by tft.scale_to_0_1 in the transform module. They hold
    # fractional values, so they must be declared as float inputs; declaring
    # them as int64 mismatches the transformed data.
    scaled_features = [
        transformed_name(name) for name in (
            'Cholesterol',
            'Triglycerides',
            'Income',
            'Heart_Rate',
            'Stress_Level',
            'Physical_Activity_Days_Per_Week',
            'Sleep_Hours_Per_Day',
        )
    ]
    # Binary flags passed through the transform module unchanged.
    binary_features = [
        'Smoking',
        'Diabetes',
        'Family_History',
        'Obesity',
        'Alcohol_Consumption',
        'Previous_Heart_Problems',
        'Medication_Use',
    ]

    # Age is cast to int64 (not scaled) by the transform module.
    feature_dtypes = {transformed_name('Age'): tf.int64}
    feature_dtypes.update({name: tf.float32 for name in scaled_features})
    feature_dtypes.update({name: tf.int64 for name in binary_features})

    inputs = {
        name: tf.keras.Input(shape=(1,), name=name, dtype=dtype)
        for name, dtype in feature_dtypes.items()
    }

    # Combine all inputs into a single float tensor; the integer inputs are
    # cast so every tensor handed to Concatenate has the same dtype.
    concatenated_inputs = layers.Concatenate()(
        [tf.cast(tensor, tf.float32) for tensor in inputs.values()])

    # First hidden layer uses the full width; each additional hidden layer
    # uses half the width and is followed by dropout.
    x = layers.Dense(hparams.get('units'), activation='relu')(concatenated_inputs)
    for _ in range(hparams.get('num_layers') - 1):
        x = layers.Dense(hparams.get('units') // 2, activation='relu')(x)
        x = layers.Dropout(0.5)(x)
    x = layers.BatchNormalization()(x)
    outputs = layers.Dense(1, activation='sigmoid')(x)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)

    model.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=hparams.get('learning_rate')),
        metrics=[tf.keras.metrics.BinaryAccuracy()]
    )

    model.summary()
    return model

# Return type of tuner_fn: the tuner instance together with the keyword
# arguments to use when fitting.
TunerFnResult = NamedTuple('TunerFnResult', [('tuner', RandomSearch),
                                             ('fit_kwargs', Dict[Text, Any])])

def tuner_fn(fn_args: FnArgs) -> TunerFnResult:
    """
    Build the tuner using the KerasTuner API.
    Args:
        fn_args: Holds args as name/value pairs.
        - working_dir: working dir for tuning.
        - train_files: List of file paths containing training tf.Example data.
        - eval_files: List of file paths containing eval tf.Example data.
        - train_steps: number of train steps.
        - eval_steps: number of eval steps.
        - schema_path: optional schema of the input data.
        - transform_graph_path: optional transform graph produced by TFT.
    Returns:
        A namedtuple contains the following:
        - tuner: A RandomSearch tuner that will be used for tuning.
        - fit_kwargs: Args to pass to tuner's run_trial function for fitting the
                        model, e.g., the training and validation dataset. Required
                        args depend on the above tuner's implementation.
    """
    hp = get_hyperparameters()
    # Define tuner
    tuner = RandomSearch(
        model_builder,
        objective='val_binary_accuracy',
        max_trials=30,
        directory=fn_args.working_dir,
        project_name='heart_attack_risk_classification',
        hyperparameters=hp
    )

    tf_transform_output = tft.TFTransformOutput(fn_args.transform_graph_path)

    train_set = input_fn(fn_args.train_files, tf_transform_output, num_epochs=10)
    eval_set = input_fn(fn_args.eval_files, tf_transform_output, num_epochs=10)

    return TunerFnResult(
        tuner=tuner,
        fit_kwargs={
            'x': train_set,
            'validation_data': eval_set,
            # Honor the step counts configured on the Tuner component
            # instead of silently ignoring them; a missing/zero value falls
            # back to None (run through the whole dataset).
            'steps_per_epoch': fn_args.train_steps or None,
            'validation_steps': fn_args.eval_steps or None,
        }
    )

def _get_serve_tf_examples_fn(model, tf_transform_output):
    """Return a tf.function that serves predictions for serialized tf.Examples.

    The returned function parses the serialized examples with the raw feature
    spec (label removed), applies the transform layer and calls the model.
    """
    # Stored on the model; the serving function reads it back from there.
    model.tft_layer = tf_transform_output.transform_features_layer()

    @tf.function
    def serve_tf_examples_fn(serialized_tf_examples):
        # Serving requests carry no label, so drop it from the parsing spec.
        raw_spec = tf_transform_output.raw_feature_spec()
        del raw_spec[LABEL_KEY]

        raw_features = tf.io.parse_example(serialized_tf_examples, raw_spec)
        return model(model.tft_layer(raw_features))

    return serve_tf_examples_fn

def run_fn(fn_args: FnArgs) -> None:
    """Train the model and save it with a serving signature.

    Args:
        fn_args: Holds args as name/value pairs. This function reads
            serving_model_dir, transform_graph_path, train_files, eval_files
            and hyperparameters.
    """
    log_dir = os.path.join(os.path.dirname(fn_args.serving_model_dir), 'logs')

    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=log_dir, update_freq='batch'
    )

    es = tf.keras.callbacks.EarlyStopping(monitor='val_binary_accuracy', mode='max', verbose=1, patience=10)
    mc = tf.keras.callbacks.ModelCheckpoint(fn_args.serving_model_dir, monitor='val_binary_accuracy', mode='max', verbose=1, save_best_only=True)

    # Load the transform output
    tf_transform_output = tft.TFTransformOutput(fn_args.transform_graph_path)

    # Create batches of data
    train_set = input_fn(fn_args.train_files, tf_transform_output, num_epochs=10)
    eval_set = input_fn(fn_args.eval_files, tf_transform_output, num_epochs=10)

    # Use the hyperparameters handed over by the pipeline (the Trainer is
    # wired to the Tuner's best_hyperparameters); fall back to the default
    # values only when none were supplied.
    tuned_config = getattr(fn_args, 'hyperparameters', None)
    if tuned_config:
        hp = HyperParameters.from_config(tuned_config)
    else:
        hp = get_hyperparameters()
    model = model_builder(hp)

    # Train the model
    model.fit(
        x=train_set,
        validation_data=eval_set,
        callbacks=[tensorboard_callback, es, mc],
        steps_per_epoch=100, 
        validation_steps=100,
        epochs=10
    )

    # Serving signature: accepts a batch of serialized tf.Example strings.
    signatures = {
        'serving_default': _get_serve_tf_examples_fn(model, tf_transform_output).get_concrete_function(
            tf.TensorSpec(
                shape=[None],
                dtype=tf.string,
                name='examples'))
    }

    model.save(fn_args.serving_model_dir, save_format='tf', signatures=signatures)


In [None]:
from tfx.components import Tuner
from tfx.proto import trainer_pb2

# Configure the Tuner with the trainer/tuner module, the transformed examples,
# the transform graph and the schema; 500 train steps and 100 eval steps.
tuner = Tuner(
    examples=transform.outputs['transformed_examples'],
    transform_graph=transform.outputs['transform_graph'],
    schema=schema_gen.outputs['schema'],
    module_file=os.path.abspath(TRAINER_TUNER_MODULE_FILE),
    train_args=trainer_pb2.TrainArgs(num_steps=500, splits=['train']),
    eval_args=trainer_pb2.EvalArgs(num_steps=100, splits=['eval']),
)

interactive_context.run(tuner)

In [None]:
from tfx.proto import trainer_pb2

# Configure the Trainer with the trainer/tuner module, the transformed
# examples, the transform graph, the schema and the Tuner's best
# hyperparameters.
trainer = Trainer(
    examples=transform.outputs['transformed_examples'],
    transform_graph=transform.outputs['transform_graph'],
    schema=schema_gen.outputs['schema'],
    hyperparameters=tuner.outputs['best_hyperparameters'],
    module_file=os.path.abspath(TRAINER_TUNER_MODULE_FILE),
    train_args=trainer_pb2.TrainArgs(splits=['train']),
    eval_args=trainer_pb2.EvalArgs(splits=['eval']),
)

interactive_context.run(trainer)