In [23]:
import keras_preprocessing
from keras import layers
import tensorflow as tf
from tensorflow import keras
import tensorflow_addons as tfa

from pathlib import Path
import os
import pandas as pd
import numpy as np

gpu_number = 0  #### GPU number
gpus = tf.config.list_physical_devices("GPU")
if gpus:
    tf.config.experimental.set_visible_devices(gpus[gpu_number], "GPU")
    logical_gpus = tf.config.experimental.list_logical_devices("GPU")
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")


2 Physical GPUs, 1 Logical GPU


In [24]:
import pandas as pd
import random
import sklearn

from pathlib import Path
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTENC


In [25]:
CSV_HEADER = [
    "gender",
    "age",
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "residence_type",
    "avg_glucose_level",
    "bmi",
    "smoking_status",
    "stroke",
]


CATEGORICAL_FEATURE_NAMES = [
    "gender",
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "residence_type",
    "smoking_status",
]

NUMERIC_FEATURE_NAMES = ["age", "avg_glucose_level", "bmi"]


def encode_int(data: pd.DataFrame, categorical_features: list[str]):
    return pd.get_dummies(data, columns=categorical_features, drop_first=True)


def split_label(data: pd.DataFrame):
    x = data.copy().drop("stroke", axis=1)
    y = data["stroke"]  # labels

    return x, y


def resample(data: pd.DataFrame, seed: int, categorical_features: list[str]):
    """oversample positive cases with SMOTE and undersample negative with EEN"""
    # encode categorical features first
    enc = OrdinalEncoder()
    data[categorical_features] = enc.fit_transform(data[categorical_features])

    X = data.drop(columns=["stroke"], axis=1)
    Y = data["stroke"]

    cat_features_indices = [
        data.columns.get_loc(label) for label in categorical_features
    ]

    smote_nc = SMOTENC(categorical_features=cat_features_indices, random_state=seed)
    smote_een = SMOTEENN(smote=smote_nc, random_state=seed, sampling_strategy="auto")

    x_resampled, y_resampled = smote_een.fit_resample(X, Y)
    x_resampled["stroke"] = y_resampled

    x_resampled[categorical_features] = enc.inverse_transform(
        x_resampled[categorical_features]
    )

    return pd.DataFrame(x_resampled, columns=data.columns)


def scale(df):
    X_num = df[NUMERIC_FEATURE_NAMES]
    X_cat = df[CATEGORICAL_FEATURE_NAMES]

    scaler = StandardScaler()
    scaler.fit(X_num)

    X_scaled = scaler.transform(X_num)
    X_scaled = pd.DataFrame(X_scaled, index=X_num.index, columns=X_num.columns)

    df_scaled = pd.concat([X_scaled, X_cat, df["stroke"]], axis=1)[df.columns]

    return df_scaled


def split_train_valid_test(data_df, seed: int, resample_training: bool):
    data_df = data_df.sample(frac=1, random_state=seed)

    test_set = data_df[round(len(data_df) * 0.85) :]
    train_validation_data = data_df[: round(len(data_df) * 0.85)].sample(
        frac=1, random_state=seed
    )

    train_set = train_validation_data[: round(len(data_df) * 0.70)]
    validation_set = train_validation_data[round(len(data_df) * 0.70) :]

    if resample_training:
        train_set = resample(
            train_validation_data[: round(len(data_df) * 0.70)],
            seed,
            categorical_features=CATEGORICAL_FEATURE_NAMES,
        )

    return train_set, validation_set, test_set


def split_train_valid_test_stratified(data_df, seed: int, resample_training: bool):
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=seed)
    d_x, d_y = split_label(data_df)

    train_index, valid_test_index = list(sss.split(d_x, d_y))[0]

    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=seed)

    v_t_x, v_t_y = split_label(data_df.iloc[valid_test_index])

    validation_index, test_index = list(sss.split(v_t_x, v_t_y))[0]

    train_df = data_df.iloc[train_index.tolist()]
    validation_df = data_df.iloc[validation_index.tolist()]
    test_df = data_df.iloc[test_index.tolist()]

    if resample_training:
        train_df = resample(
            train_df, seed, categorical_features=CATEGORICAL_FEATURE_NAMES
        )

    return train_df, validation_df, test_df



def prepare_data(seed: int, resample_training: bool = False):
    data_df = pd.read_csv(Path().resolve().joinpath("dataset/full_data_clean.csv"))

    train_df, validation_df, test_df = split_train_valid_test_stratified(
        data_df, seed, resample_training
    )

    train_df, validation_df, test_df = [
        scale(df) for df in [train_df, validation_df, test_df]
    ]

    return train_df, validation_df, test_df

In [26]:
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.0001
DROPOUT_RATE = 0.1
BATCH_SIZE = 256
NUM_EPOCHS = 100

MLP_MODEL_PATH = str(Path().resolve().joinpath("model/mlp_model"))
TABTRANSFORMER_MODEL_PATH = str(Path().resolve().joinpath("model/tabtransformer_model"))

TARGET_FEATURE_NAME = "stroke"
TARGET_LABELS = ["1", "0"]


In [27]:
# data proccessing pipeline

target_label_lookup = layers.StringLookup(
    vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0
)


def prepare_example(features, target):
    # target_index = target_label_lookup(target)
    target_index = target
    return features, target_index


def get_dataset_from_csv(csv_file_path, batch_size=128, shuffle=False):
    """dataset from, csv"""
    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        label_name=TARGET_FEATURE_NAME,
        num_epochs=1,
        header=False,
        na_value="?",
        shuffle=shuffle,
    ).map(prepare_example, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
    return dataset.cache()


def get_dataset_from_df(df, batch_size=128, shuffle=False):
    """dataset from, csv"""

    dataset = tf.data.Dataset.from_tensor_slices(
        (
            df[NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES].to_dict(
                orient="list"
            ),
            tf.constant(df.loc[:, TARGET_FEATURE_NAME]),
        )
    )

    return dataset.cache()


get_dataset_from_df(pd.read_csv("dataset/full_data_clean.csv"))

# tf.convert_to_tensor(pd.read_csv("dataset/full_data_clean.csv").loc[:, TARGET_FEATURE_NAME])
# get_dataset_from_csv("dataset/full_data_clean.csv")


  return bool(asarray(a1 == a2).all())


<CacheDataset element_spec=({'age': TensorSpec(shape=(), dtype=tf.float32, name=None), 'avg_glucose_level': TensorSpec(shape=(), dtype=tf.float32, name=None), 'bmi': TensorSpec(shape=(), dtype=tf.float32, name=None), 'gender': TensorSpec(shape=(), dtype=tf.string, name=None), 'hypertension': TensorSpec(shape=(), dtype=tf.string, name=None), 'heart_disease': TensorSpec(shape=(), dtype=tf.string, name=None), 'ever_married': TensorSpec(shape=(), dtype=tf.string, name=None), 'work_type': TensorSpec(shape=(), dtype=tf.string, name=None), 'residence_type': TensorSpec(shape=(), dtype=tf.string, name=None), 'smoking_status': TensorSpec(shape=(), dtype=tf.string, name=None)}, TensorSpec(shape=(), dtype=tf.int64, name=None))>

In [28]:
# training and evaluation


def train(
    model: keras.Model,
    train_data_file,
    test_data_file,
    model_output,
    num_epochs,
    EPOCHS_TO_WAIT_FOR_IMPROVE,
    learning_rate,
    batch_size,
):
    """Implement a training and evaluation procedure"""
    optimizer = tfa.optimizers.AdamW(
        learning_rate=learning_rate, weight_decay=WEIGHT_DECAY
    )

    train_dataset = get_dataset_from_csv(train_data_file, batch_size, shuffle=True)
    validation_dataset = get_dataset_from_csv(test_data_file, batch_size)

    metrics = (
        [
            keras.metrics.BinaryAccuracy(name="acc"),
            keras.metrics.AUC(name="auc"),
        ],
    )

    early_stop_callback = keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=EPOCHS_TO_WAIT_FOR_IMPROVE
    )
    # checkpoint_callback = keras.callbacks.ModelCheckpoint(model_output, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

    # early_stop_callback = keras.callbacks.EarlyStopping(
    #    monitor="val_auc", patience=EPOCHS_TO_WAIT_FOR_IMPROVE
    # )

    checkpoint_callback = keras.callbacks.ModelCheckpoint(
        model_output,
        monitor="val_loss",
        verbose=1,
        save_best_only=True,
        mode="min",
    )

    model.compile(
        optimizer=optimizer,
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics,
    )

    print("Start training the model...")

    history = model.fit(
        train_dataset,
        epochs=num_epochs,
        validation_data=validation_dataset,
        callbacks=[
            checkpoint_callback,
            early_stop_callback
        ],
    )

    print("Model training finished")

    _, accuracy, auc = model.evaluate(validation_dataset, verbose=0)

    print(f"Validation accuracy: {round(accuracy * 100, 2)}% AUC: {auc}")

    return history, model


In [29]:
RESULT_COLS = [
    "precision",
    "recall",
    "fscore",
    "accuracy",
    "auc",
    "miss_rate",
    "fall_out_rate",
]

train_data_path = Path().resolve().joinpath("dataset/train_data.csv")
validation_data_path = Path().resolve().joinpath("dataset/validation_data.csv")
test_data_path = Path().resolve().joinpath("dataset/test_data.csv")

train_data_file = str(train_data_path.absolute())
validation_data_file = str(validation_data_path.absolute())
test_data_file = str(test_data_path.absolute())

NUM_EXPERIMENTS = 10
EPOCHS_TO_WAIT_FOR_IMPROVE = 10


In [30]:
def run_experiment(experiment, mlp_model, seed, train_data, validation_data, test_data, resampled: bool):
    # split labels
    train_data_path = str(
        Path().resolve().joinpath(f"dataset/train_data_mlp_exp_{experiment}.csv")
    )
    validation_data_path = str(
        Path().resolve().joinpath(f"dataset/validation_data_mlp_exp_{experiment}.csv")
    )
    test_data_path = str(
        Path().resolve().joinpath(f"dataset/test_data_mlp_exp_{experiment}.csv")
    )

    train_data.to_csv(train_data_path, header=False, index=False)
    validation_data.to_csv(validation_data_path, header=False, index=False)
    test_data.to_csv(test_data_path, header=False, index=False)

    x_test, y_test = split_label(test_data)

    if resampled:
        model_output = str(
            Path().resolve().joinpath(f"model/mlp_model_exp_{experiment}_resampled")
        )
    else:
        model_output = str(
            Path().resolve().joinpath(f"model/mlp_model_exp_{experiment}_unsampled")
        )

    # train tabtransformer model on training data and evaluate on validation data
    history, mlp_trained = train(
        model=mlp_model,
        train_data_file=train_data_path,
        test_data_file=validation_data_path,
        model_output=model_output,
        num_epochs=NUM_EPOCHS,
        EPOCHS_TO_WAIT_FOR_IMPROVE=EPOCHS_TO_WAIT_FOR_IMPROVE,
        learning_rate=LEARNING_RATE,
        batch_size=BATCH_SIZE,
    )


    train_validation_data_path = str(
        Path()
        .resolve()
        .joinpath(f"dataset/train_validation_data_mlp_exp_{experiment}.csv")
    )

    pd.concat([validation_data]).sample(frac=1, random_state=seed).to_csv(
        train_validation_data_path, index=False, header=False
    )

    # now, train tabtransformer model on validation data and evaluate on test data
    history, mlp_trained = train(
        model=mlp_trained,
        model_output=model_output,
        train_data_file=train_validation_data_path,
        test_data_file=test_data_path,
        num_epochs=NUM_EPOCHS,
        EPOCHS_TO_WAIT_FOR_IMPROVE=EPOCHS_TO_WAIT_FOR_IMPROVE,
        learning_rate=LEARNING_RATE,
        batch_size=BATCH_SIZE,
    )

    # cleanup
    os.remove(train_data_path)
    os.remove(validation_data_path)
    os.remove(train_validation_data_path)
    #os.remove(test_data_path)

    return history




In [31]:
# model training

mlp_model = keras.models.load_model(MLP_MODEL_PATH)

for experiment in range(0, 9):

    seed = random.randint(0, 1000)

    print(f"Experiment {experiment} with seed {seed}")

    train_data, validation_data, test_data = prepare_data(
        seed=seed, resample_training=False
    )

    history = run_experiment(
        experiment=experiment,
        mlp_model=mlp_model,
        seed=seed,
        train_data=train_data,
        validation_data=validation_data,
        test_data=test_data,
        resampled=False,
    )

    train_data, validation_data, test_data = prepare_data(
        seed=seed, resample_training=True
    )

    history = run_experiment(
        experiment=experiment,
        mlp_model=mlp_model,
        seed=seed,
        train_data=train_data,
        validation_data=validation_data,
        test_data=test_data,
        resampled=True
    )



Experiment 0 with seed 596
Start training the model...
Epoch 1/100
    115/Unknown - 6s 11ms/step - loss: 0.0950 - acc: 0.9749 - auc: 0.7667
Epoch 1: val_loss improved from inf to 0.07034, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_0_unsampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_0_unsampled/assets
Epoch 2/100
Epoch 2: val_loss improved from 0.07034 to 0.06983, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_0_unsampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_0_unsampled/assets
Epoch 3/100
Epoch 3: val_loss improved from 0.06983 to 0.06967, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_0_unsampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_0_unsampled/

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


Start training the model...
Epoch 1/100
    200/Unknown - 7s 13ms/step - loss: 0.1895 - acc: 0.9446 - auc: 0.9820
Epoch 1: val_loss improved from inf to 3.36522, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_0_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_0_resampled/assets
Epoch 2/100
Epoch 2: val_loss did not improve from 3.36522
Epoch 3/100
Epoch 3: val_loss improved from 3.36522 to 2.29091, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_0_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_0_resampled/assets
Epoch 4/100
Epoch 4: val_loss improved from 2.29091 to 2.21008, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_0_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


Start training the model...
Epoch 1/100
    202/Unknown - 5s 12ms/step - loss: 0.1783 - acc: 0.9482 - auc: 0.9833
Epoch 1: val_loss improved from inf to 2.38406, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_1_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_1_resampled/assets
Epoch 2/100
Epoch 2: val_loss improved from 2.38406 to 1.98391, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_1_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_1_resampled/assets
Epoch 3/100
Epoch 3: val_loss improved from 1.98391 to 1.82833, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_1_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_1_resampled/assets
Epoch 4/100
Epoch 4:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


Start training the model...
Epoch 1/100
    197/Unknown - 6s 14ms/step - loss: 0.1993 - acc: 0.9433 - auc: 0.9807
Epoch 1: val_loss improved from inf to 2.16090, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_2_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_2_resampled/assets
Epoch 2/100
Epoch 2: val_loss improved from 2.16090 to 1.97166, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_2_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_2_resampled/assets
Epoch 3/100
Epoch 3: val_loss improved from 1.97166 to 1.87245, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_2_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_2_resampled/assets
Epoch 4/100
Epoch 4:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


Start training the model...
Epoch 1/100
    198/Unknown - 5s 13ms/step - loss: 0.1727 - acc: 0.9526 - auc: 0.9834
Epoch 1: val_loss improved from inf to 2.47945, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_3_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_3_resampled/assets
Epoch 2/100
Epoch 2: val_loss improved from 2.47945 to 2.22359, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_3_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_3_resampled/assets
Epoch 3/100
Epoch 3: val_loss improved from 2.22359 to 1.87808, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_3_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_3_resampled/assets
Epoch 4/100
Epoch 4:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


Start training the model...
Epoch 1/100
    199/Unknown - 6s 16ms/step - loss: 0.1952 - acc: 0.9506 - auc: 0.9817
Epoch 1: val_loss improved from inf to 2.52120, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_4_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_4_resampled/assets
Epoch 2/100
Epoch 2: val_loss improved from 2.52120 to 2.35805, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_4_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_4_resampled/assets
Epoch 3/100
Epoch 3: val_loss improved from 2.35805 to 1.91710, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_4_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_4_resampled/assets
Epoch 4/100
Epoch 4:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


Start training the model...
Epoch 1/100
    198/Unknown - 5s 11ms/step - loss: 0.1791 - acc: 0.9522 - auc: 0.9841
Epoch 1: val_loss improved from inf to 2.11704, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_5_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_5_resampled/assets
Epoch 2/100
Epoch 2: val_loss improved from 2.11704 to 2.10196, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_5_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_5_resampled/assets
Epoch 3/100
Epoch 3: val_loss improved from 2.10196 to 1.77362, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_5_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_5_resampled/assets
Epoch 4/100
Epoch 4:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


Start training the model...
Epoch 1/100
    198/Unknown - 5s 12ms/step - loss: 0.1815 - acc: 0.9505 - auc: 0.9829
Epoch 1: val_loss improved from inf to 2.37225, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_6_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_6_resampled/assets
Epoch 2/100
Epoch 2: val_loss did not improve from 2.37225
Epoch 3/100
Epoch 3: val_loss improved from 2.37225 to 1.82887, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_6_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_6_resampled/assets
Epoch 4/100
Epoch 4: val_loss improved from 1.82887 to 1.77098, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_6_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


Start training the model...
Epoch 1/100
    196/Unknown - 4s 9ms/step - loss: 0.1974 - acc: 0.9469 - auc: 0.9816
Epoch 1: val_loss improved from inf to 2.39912, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_7_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_7_resampled/assets
Epoch 2/100
Epoch 2: val_loss improved from 2.39912 to 2.37146, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_7_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_7_resampled/assets
Epoch 3/100
Epoch 3: val_loss improved from 2.37146 to 1.90492, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_7_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_7_resampled/assets
Epoch 4/100
Epoch 4: 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


Start training the model...
Epoch 1/100
    201/Unknown - 4s 10ms/step - loss: 0.2039 - acc: 0.9429 - auc: 0.9819
Epoch 1: val_loss improved from inf to 2.46170, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_8_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_8_resampled/assets
Epoch 2/100
Epoch 2: val_loss improved from 2.46170 to 2.39858, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_8_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_8_resampled/assets
Epoch 3/100
Epoch 3: val_loss improved from 2.39858 to 1.80817, saving model to /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_8_resampled
INFO:tensorflow:Assets written to: /home/haoming/Projects/python/brain-stroke-prediction/model/mlp_model_exp_8_resampled/assets
Epoch 4/100
Epoch 4:

In [32]:
def metrics_keras(model: keras.Model, test_data_file: str):
    model.compile(
        metrics=[
            keras.metrics.AUC(
                num_thresholds=200,
                curve="ROC",
            ),
            keras.metrics.BinaryAccuracy(),
            keras.metrics.Precision(),
            keras.metrics.Recall(),
            keras.metrics.TrueNegatives(),
            keras.metrics.FalseNegatives(),
            keras.metrics.FalsePositives(),
            keras.metrics.TruePositives(),
        ]
    )

    _, auc, accuracy, precision, recall, tn, fn, fp, tp = model.evaluate(
        get_dataset_from_csv(test_data_file)
    )

    # metrics
    fscore = 2 * tp / (2 * tp + fp + fn)
    miss_rate = fn / (tn + tp)
    fall_out_rate = fp / (fp + tn)

    # return
    return [precision, recall, fscore, accuracy, auc, miss_rate, fall_out_rate]


In [33]:
# evaluate unsampled
results = {}
results_resampling = {}

test_data = {}


for file in Path().resolve().joinpath("dataset/").iterdir():
    if str(file.name).startswith('test_data_mlp_exp_'):
        exp_num = file.name.split('_')[4][0]

        test_data[exp_num] = str(file)

for file in Path().resolve().joinpath("model/").iterdir():
    if str(file).endswith('_unsampled') and str(file.name).startswith('mlp_model_exp_'):
        exp_num = file.name.split('_')[3]

        model = tf.keras.models.load_model(file)

        results[exp_num] = list(metrics_keras(model, test_data[exp_num]))

    if str(file).endswith('_resampled') and str(file.name).startswith('mlp_model_exp_'):
        exp_num = file.name.split('_')[3]

        model = tf.keras.models.load_model(file)

        results_resampling[exp_num] = list(metrics_keras(model, test_data[exp_num]))


results = pd.DataFrame().from_dict(results, orient="index", columns=RESULT_COLS)
results_resampling = pd.DataFrame().from_dict(
    results_resampling, orient="index", columns=RESULT_COLS
)

results["classifier"] = "MLP"
results_resampling["classifier"] = "MLP"

results.to_csv(Path().resolve().joinpath("results/MLP_eval.csv"))
results_resampling.to_csv(Path().resolve().joinpath("results/MLP_eval_resampled.csv"))


