# Evaluation

Evaluate both the baseline MLP and the TabTransformer models on the test set.

In [3]:
import keras_preprocessing, keras
from keras import layers
import tensorflow as tf

from pathlib import Path
import pandas as pd

In [4]:
def split_label(data: pd.DataFrame, target: str = "stroke"):
    """Split a table into its feature columns and its label column.

    Args:
        data: DataFrame holding the label column alongside the features.
            It is not modified.
        target: Name of the label column. Defaults to ``"stroke"``.

    Returns:
        A tuple ``(x, y)`` where ``x`` is a new DataFrame with every column
        except ``target`` and ``y`` is the ``target`` column as a Series.

    Raises:
        KeyError: If ``target`` is not a column of ``data``.
    """
    # ``drop`` already returns a new frame, so copying ``data`` first would
    # only duplicate the whole table for nothing.
    x = data.drop(columns=target)
    y = data[target]  # labels

    return x, y

In [5]:
# Column names, in file order, applied to the CSV data when it is read.
CSV_HEADER = [
    "gender", "age", "hypertension", "heart_disease", "ever_married",
    "work_type", "Residence_type", "avg_glucose_level", "bmi",
    "smoking_status", "stroke",
]

# Every column except the last is an input feature; the last is the label.
*FEATURES, TARGET = CSV_HEADER

# Locate the test split relative to the current working directory.
# ``resolve()`` already yields an absolute path, so the string can be taken
# from it directly.
test_data_path = Path().resolve().joinpath("dataset/test_data.csv")
test_data_file = str(test_data_path)

# Column names come from CSV_HEADER rather than from a header row in the file.
test_data = pd.read_csv(test_data_file, names=CSV_HEADER)

x_test, y_test = split_label(test_data)

# Encode the string labels as integers ("No" -> 0, "Yes" -> 1); values that
# are not in the mapping are left unchanged by ``replace``.
# NOTE(review): the StringLookup built from TARGET_LABELS = ["Yes", "No"] in
# the dataset pipeline appears to assign the opposite indices
# ("Yes" -> 0, "No" -> 1). Confirm which encoding the saved models were
# trained with before comparing ``y_test`` against model outputs.
y_test = y_test.replace({"No": 0, 'Yes': 1})


In [6]:
# Run settings. In the code shown in this notebook only BATCH_SIZE is
# referenced; the others are kept for parity with the rest of the project.
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-4
DROPOUT_RATE = 0.1
BATCH_SIZE = 32
NUM_EPOCHS = 100

# Saved-model locations, resolved against the current working directory.
MLP_MODEL_PATH = str(Path().resolve() / 'model/mlp_model')
TABTRANSFORMER_MODEL_PATH = str(Path().resolve() / 'model/tabtransformer_model')

# Label column and the label strings used as the lookup vocabulary.
TARGET_FEATURE_NAME = 'stroke'
TARGET_LABELS = ["Yes", "No"]

In [7]:
# Data processing pipeline

# Converts a label string into an integer index. No mask token and no
# out-of-vocabulary slots are reserved, so the index is expected to be the
# string's position in TARGET_LABELS.
target_label_lookup = layers.StringLookup(
    num_oov_indices=0,
    mask_token=None,
    vocabulary=TARGET_LABELS,
)


def prepare_example(features, target):
    """Replace the string label(s) of an example with integer indices.

    Args:
        features: Feature values; passed through untouched.
        target: Label string(s) to run through ``target_label_lookup``.

    Returns:
        A ``(features, target_index)`` tuple.
    """
    return features, target_label_lookup(target)


def get_dataset_from_csv(csv_file_path, batch_size=128, shuffle=False):
    """Build a cached, batched ``tf.data`` dataset from a CSV file.

    The file is read without a header row (columns are named by CSV_HEADER),
    the TARGET_FEATURE_NAME column is split off as the label, and every batch
    is passed through ``prepare_example``.

    Args:
        csv_file_path: Path (or file pattern) handed to ``make_csv_dataset``.
        batch_size: Number of rows per batch.
        shuffle: Whether ``make_csv_dataset`` should shuffle the rows.

    Returns:
        The mapped dataset wrapped in ``cache()``; it makes a single pass
        over the file (``num_epochs=1``).
    """
    raw_batches = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        label_name=TARGET_FEATURE_NAME,
        num_epochs=1,
        header=False,
        na_value="?",
        shuffle=shuffle,
    )
    # The map step runs in parallel and is not forced to be deterministic.
    indexed_batches = raw_batches.map(
        prepare_example,
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=False,
    )
    return indexed_batches.cache()


  return bool(asarray(a1 == a2).all())
2022-09-01 20:20:21.960019: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-01 20:20:21.960706: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-01 20:20:21.961617: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-01 20:20:21.961874: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2022-09-01 20:20:21.962042: W tensorflow/stream_executor/platform

In [8]:
def evalate_model(model: keras.Model, test_data_file):
    """Compile ``model`` for evaluation and score it on a CSV test file.

    Note that this calls ``model.compile``, so the compile configuration of
    the model passed in is replaced.

    Args:
        model: Keras model to evaluate.
        test_data_file: Path of the CSV file, forwarded to
            ``get_dataset_from_csv``.

    Returns:
        The result of ``model.evaluate(..., return_dict=True)``: a dict that
        maps each metric name ("loss", "auc", "accuracy") to its value.
    """
    # Batching has to happen in the dataset: Keras documents that
    # ``batch_size`` must not be given to ``evaluate`` for dataset inputs, so
    # BATCH_SIZE is applied here instead of being passed to ``evaluate``.
    test_data = get_dataset_from_csv(test_data_file, batch_size=BATCH_SIZE)

    # NOTE(review): BinaryCrossentropy() defaults to from_logits=False, which
    # presumes the model outputs probabilities — confirm against the models.
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[
            tf.keras.metrics.AUC(
                num_thresholds=200,
                curve="ROC",
                summation_method="interpolation",
                name="auc",
            ),
            tf.keras.metrics.BinaryAccuracy(name="accuracy"),
        ],
    )

    # Return the metrics instead of discarding them so callers can compare
    # models programmatically; progress is still printed via ``verbose``.
    return model.evaluate(test_data, verbose="auto", return_dict=True)


In [9]:
# Restore both saved models first, then score each one on the same test file.
baseline_model, tt_model = (
    keras.models.load_model(saved_path)
    for saved_path in (MLP_MODEL_PATH, TABTRANSFORMER_MODEL_PATH)
)

for loaded_model in (baseline_model, tt_model):
    evalate_model(loaded_model, test_data_file)



### MLP:
loss: 0.7477 - auc: 0.6128 - accuracy: 0.9127

### TabTransformer:
loss: 0.7546 - auc: 0.5282 - accuracy: 0.9044