# evaluation

Evaluate both baseline and TabTransformer models with test set


In [1]:
import keras_preprocessing
from keras import layers
import tensorflow as tf
from tensorflow import keras
import tensorflow_addons as tfa

from pathlib import Path
import pandas as pd
import numpy as np
import os

import pandas as pd
import random
import sklearn

from pathlib import Path
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit

from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTENC



In [2]:
# preproccessing

CSV_HEADER = [
    "gender",
    "age",
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "residence_type",
    "avg_glucose_level",
    "bmi",
    "smoking_status",
    "stroke",
]


CATEGORICAL_FEATURE_NAMES = [
    "gender",
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "residence_type",
    "smoking_status",
]

NUMERIC_FEATURE_NAMES = ["age", "avg_glucose_level", "bmi"]


def encode_int(data: pd.DataFrame, categorical_features: list[str]):
    return pd.get_dummies(data, columns=categorical_features, drop_first=True)


def split_label(data: pd.DataFrame):
    x = data.copy().drop("stroke", axis=1)
    y = data["stroke"]  # labels

    return x, y


def resample(data: pd.DataFrame, seed: int, categorical_features: list[str]):
    """oversample positive cases with SMOTE and undersample negative with EEN"""
    # encode categorical features first
    enc = OrdinalEncoder()
    data[categorical_features] = enc.fit_transform(data[categorical_features])

    X = data.drop(columns=["stroke"], axis=1)
    Y = data["stroke"]

    cat_features_indices = [
        data.columns.get_loc(label) for label in categorical_features
    ]

    smote_nc = SMOTENC(categorical_features=cat_features_indices, random_state=seed)
    smote_een = SMOTEENN(smote=smote_nc, random_state=seed, sampling_strategy="auto")

    x_resampled, y_resampled = smote_een.fit_resample(X, Y)
    x_resampled["stroke"] = y_resampled

    x_resampled[categorical_features] = enc.inverse_transform(
        x_resampled[categorical_features]
    )

    return pd.DataFrame(x_resampled, columns=data.columns)


def scale(df):
    X_num = df[NUMERIC_FEATURE_NAMES]
    X_cat = df[CATEGORICAL_FEATURE_NAMES]

    scaler = StandardScaler()
    scaler.fit(X_num)

    X_scaled = scaler.transform(X_num)
    X_scaled = pd.DataFrame(X_scaled, index=X_num.index, columns=X_num.columns)

    df_scaled = pd.concat([X_scaled, X_cat, df["stroke"]], axis=1)[df.columns]

    return df_scaled


def split_train_valid_test(data_df, seed: int, resample_training: bool):
    data_df = data_df.sample(frac=1, random_state=seed)

    test_set = data_df[round(len(data_df) * 0.85) :]
    train_validation_data = data_df[: round(len(data_df) * 0.85)].sample(
        frac=1, random_state=seed
    )

    train_set = train_validation_data[: round(len(data_df) * 0.70)]
    validation_set = train_validation_data[round(len(data_df) * 0.70) :]

    if resample_training:
        train_set = resample(
            train_validation_data[: round(len(data_df) * 0.70)],
            seed,
            categorical_features=CATEGORICAL_FEATURE_NAMES,
        )

    return train_set, validation_set, test_set


def split_train_valid_test_stratified(data_df, seed: int, resample_training: bool):
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=seed)
    d_x, d_y = split_label(data_df)

    train_index, valid_test_index = list(sss.split(d_x, d_y))[0]

    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=seed)

    v_t_x, v_t_y = split_label(data_df.iloc[valid_test_index])

    validation_index, test_index = list(sss.split(v_t_x, v_t_y))[0]

    train_df = data_df.iloc[train_index.tolist()]
    validation_df = data_df.iloc[validation_index.tolist()]
    test_df = data_df.iloc[test_index.tolist()]

    if resample_training:
        train_df = resample(
            train_df, seed, categorical_features=CATEGORICAL_FEATURE_NAMES
        )

    return train_df, validation_df, test_df


def prepare_data(seed: int, resample_training: bool):
    data_df = pd.read_csv(Path().resolve().joinpath("dataset/full_data_clean.csv"))

    train_df, validation_df, test_df = split_train_valid_test_stratified(
        data_df, seed, resample_training
    )

    train_df, validation_df, test_df = [
        scale(df) for df in [train_df, validation_df, test_df]
    ]

    return train_df, validation_df, test_df


In [3]:
# model hyperparameters

LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.0001
DROPOUT_RATE = 0.1
BATCH_SIZE = 256
NUM_EPOCHS = 100

MLP_MODEL_PATH = str(Path().resolve().joinpath("model/mlp_model"))
TABTRANSFORMER_MODEL_PATH = str(Path().resolve().joinpath("model/tabtransformer_model"))

TARGET_FEATURE_NAME = "stroke"
TARGET_LABELS = ["1", "0"]


In [4]:
# data proccessing pipeline

#target_label_lookup = layers.StringLookup(
#    vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0
#)

def prepare_example(features, target):
    # target_index = target_label_lookup(target)
    target_index = target
    return features, target_index


def get_dataset_from_csv(csv_file_path, batch_size=128, shuffle=False):
    """dataset from, csv"""
    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        label_name=TARGET_FEATURE_NAME,
        num_epochs=1,
        header=False,
        na_value="?",
        shuffle=shuffle,
    ).map(prepare_example, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
    return dataset.cache()


In [5]:
# training and evaluation


def train(
    model,
    train_data_file,
    test_data_file,
    model_output,
    num_epochs,
    EPOCHS_TO_WAIT_FOR_IMPROVE,
    learning_rate,
    batch_size,
):
    """Implement a training and evaluation procedure"""
    optimizer = tfa.optimizers.AdamW(
        learning_rate=learning_rate, weight_decay=WEIGHT_DECAY
    )

    train_dataset = get_dataset_from_csv(train_data_file, batch_size, shuffle=True)
    validation_dataset = get_dataset_from_csv(test_data_file, batch_size)

    metrics = [
            keras.metrics.BinaryAccuracy(name="acc"),
            keras.metrics.AUC(name="auc"),
        ]

    early_stop_callback = keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=EPOCHS_TO_WAIT_FOR_IMPROVE
    )
    # checkpoint_callback = keras.callbacks.ModelCheckpoint(model_output, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

    # early_stop_callback = keras.callbacks.EarlyStopping(
    #    monitor="val_auc", patience=EPOCHS_TO_WAIT_FOR_IMPROVE
    # )

    checkpoint_callback = keras.callbacks.ModelCheckpoint(
        model_output,
        monitor="val_loss",
        verbose=1,
        save_best_only=True,
    )

    model.compile(
        optimizer=optimizer,
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics
    )

    print("Start training the model...")

    history = model.fit(
        train_dataset,
        epochs=num_epochs,
        validation_data=validation_dataset,
        callbacks=[
            checkpoint_callback,
            early_stop_callback
        ],
    )

    print("Model training finished")

    model.save(model_output)

    _, accuracy, auc = model.evaluate(validation_dataset, verbose=0)

    print(f"Validation accuracy: {round(accuracy * 100, 2)}% AUC: {auc}")

    return history, model


In [6]:
RESULT_COLS = [
    "precision",
    "recall",
    "fscore",
    "accuracy",
    "auc",
    "miss_rate",
    "fall_out_rate",
]

train_data_path = Path().resolve().joinpath("dataset/train_data.csv")
validation_data_path = Path().resolve().joinpath("dataset/validation_data.csv")
test_data_path = Path().resolve().joinpath("dataset/test_data.csv")

train_data_file = str(train_data_path.absolute())
validation_data_file = str(validation_data_path.absolute())
test_data_file = str(test_data_path.absolute())

NUM_EXPERIMENTS = 10
EPOCHS_TO_WAIT_FOR_IMPROVE = 10

In [7]:
def run_experiment(
    experiment,
    tabtransformer_model,
    seed,
    train_data,
    validation_data,
    test_data,
    resampled: bool,
):
    # split labels
    train_data_file = str(
        Path().resolve().joinpath(f"dataset/train_data_tt_exp_{experiment}.csv")
    )
    validation_data_file = str(
        Path().resolve().joinpath(f"dataset/validation_data_tt_exp_{experiment}.csv")
    )
    test_data_file = str(
        Path().resolve().joinpath(f"dataset/test_data_tt_exp_{experiment}.csv")
    )

    train_data.to_csv(train_data_file, header=False, index=False)
    validation_data.to_csv(validation_data_file, header=False, index=False)
    test_data.to_csv(test_data_file, header=False, index=False)

    if resampled:
        model_output = str(
            Path().resolve().joinpath(f"model/tt_model_exp_{experiment}_resampled")
        )
    else:
        model_output = str(
            Path().resolve().joinpath(f"model/tt_model_exp_{experiment}_unsampled")
        )

    # train tabtransformer model on training data and evaluate on validation data
    history, tt_trained = train(
        model=tabtransformer_model,
        train_data_file=train_data_file,
        test_data_file=validation_data_file,
        model_output=model_output,
        num_epochs=NUM_EPOCHS,
        EPOCHS_TO_WAIT_FOR_IMPROVE=EPOCHS_TO_WAIT_FOR_IMPROVE,
        learning_rate=LEARNING_RATE,
        batch_size=BATCH_SIZE,
    )

    train_validation_data_file = str(
        Path()
        .resolve()
        .joinpath(f"dataset/train_validation_data_tt_exp_{experiment}.csv")
    )

    pd.concat([validation_data, train_data]).sample(frac=1, random_state=seed).to_csv(
        train_validation_data_file, index=False, header=False
    )

    # now, train tabtransformer model on validation data and evaluate on test data
    history, tt_trained = train(
        model=tt_trained,
        train_data_file=train_validation_data_file,
        test_data_file=test_data_file,
        model_output=model_output,
        num_epochs=NUM_EPOCHS,
        EPOCHS_TO_WAIT_FOR_IMPROVE=EPOCHS_TO_WAIT_FOR_IMPROVE,
        learning_rate=LEARNING_RATE,
        batch_size=BATCH_SIZE,
    )

    # cleanup
    os.remove(train_data_file)
    os.remove(validation_data_file)
    os.remove(train_validation_data_file)
    # os.remove(test_data_file)


In [8]:
# train model

tabtransformer_model = keras.models.load_model(TABTRANSFORMER_MODEL_PATH)

for experiment in range(NUM_EXPERIMENTS):
    seed = random.randint(0, 1000)

    print(f"Experiment {experiment}, seed {seed}")

    """ 
    train_data, validation_data, test_data = prepare_data(
        seed=seed, resample_training=False
    )

    run_experiment(
        experiment=experiment,
        tabtransformer_model=tabtransformer_model,
        seed=seed,
        train_data=train_data,
        validation_data=validation_data,
        test_data=test_data,
        resampled=False,
    )
    """

    train_data, validation_data, test_data = prepare_data(
        seed=seed, resample_training=True
    )

    run_experiment(
        experiment=experiment,
        tabtransformer_model=tabtransformer_model,
        seed=seed,
        train_data=train_data,
        validation_data=validation_data,
        test_data=test_data,
        resampled=True,
    )


2022-09-08 00:50:24.821778: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
2022-09-08 00:50:24.821875: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: haoming-t480s
2022-09-08 00:50:24.821901: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: haoming-t480s
2022-09-08 00:50:24.822395: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 515.65.1
2022-09-08 00:50:24.822492: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 515.65.1
2022-09-08 00:50:24.822515: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 515.65.1
2022-09-08 00:50:24.823235: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-c

Experiment 0, seed 753


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


Start training the model...
Epoch 1/100
    200/Unknown - 18s 47ms/step - loss: 0.2591 - acc: 0.9039 - auc: 0.9564
Epoch 1: val_loss improved from inf to 7.11451, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/tt_model_exp_0_resampled




INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/tt_model_exp_0_resampled/assets


INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/tt_model_exp_0_resampled/assets


Epoch 2/100
Epoch 2: val_loss improved from 7.11451 to 4.03554, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/tt_model_exp_0_resampled




INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/tt_model_exp_0_resampled/assets


INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/tt_model_exp_0_resampled/assets


Epoch 3/100
Epoch 3: val_loss improved from 4.03554 to 0.67729, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/tt_model_exp_0_resampled




INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/tt_model_exp_0_resampled/assets


INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/tt_model_exp_0_resampled/assets


Epoch 4/100
Epoch 4: val_loss improved from 0.67729 to 0.09735, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/tt_model_exp_0_resampled




In [None]:
def metrics_keras(model: keras.Model, test_data_file: str):
    model.compile(
        metrics=[
            keras.metrics.AUC(
                num_thresholds=200,
                curve="ROC",
            ),
            keras.metrics.BinaryAccuracy(),
            keras.metrics.Precision(),
            keras.metrics.Recall(),
            keras.metrics.TrueNegatives(),
            keras.metrics.FalsePositives(),
            keras.metrics.TrueNegatives(),
            keras.metrics.TruePositives(),
        ]
    )

    _, auc, accuracy, precision, recall, tn, fn, fp, tp = model.evaluate(
        get_dataset_from_csv(test_data_file)
    )

    # metrics
    fscore = 2 * tp / (2 * tp + fp + fn)
    miss_rate = fn / (tn + tp)
    fall_out_rate = fp / (fp + tn)

    # return
    return [precision, recall, fscore, accuracy, auc, miss_rate, fall_out_rate]


In [None]:
# evaluate unsampled
results = {}
results_resampling = {}

test_data = {}


for file in Path().resolve().joinpath("dataset/").iterdir():
    if str(file.name).startswith("test_data_tt_exp_"):
        exp_num = file.name.split("_")[4][0]

        test_data[exp_num] = str(file)

for file in Path().resolve().joinpath("model/").iterdir():
    if str(file).endswith("_unsampled") and str(file.name).startswith("tt_model_exp_"):
        exp_num = file.name.split("_")[3]

        model = tf.keras.models.load_model(file)

        results[exp_num] = list(metrics_keras(model, test_data[exp_num]))

    if str(file).endswith("_resampled") and str(file.name).startswith("tt_model_exp_"):
        exp_num = file.name.split("_")[3]

        model = tf.keras.models.load_model(file)

        results_resampling[exp_num] = list(metrics_keras(model, test_data[exp_num]))


results = pd.DataFrame().from_dict(results, orient="index", columns=RESULT_COLS)
results_resampling = pd.DataFrame().from_dict(
    results_resampling, orient="index", columns=RESULT_COLS
)

results["classifier"] = "TabTransformer"
results_resampling["classifier"] = "TabTransformer"

results.to_csv(Path().resolve().joinpath("results/TabTransformer_eval.csv"))
results_resampling.to_csv(
    Path().resolve().joinpath("results/TabTransformer_eval_resampled.csv")
)


