In [26]:
import keras_preprocessing
from keras import layers
import tensorflow as tf
from tensorflow import keras
import tensorflow_addons as tfa

from pathlib import Path
import os
import pandas as pd
import numpy as np

gpu_number = 0  #### GPU number
gpus = tf.config.list_physical_devices("GPU")
if gpus:
    tf.config.experimental.set_visible_devices(gpus[gpu_number], "GPU")
    logical_gpus = tf.config.experimental.list_logical_devices("GPU")
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")


2 Physical GPUs, 1 Logical GPU


In [27]:
import pandas as pd
import random
import sklearn

from pathlib import Path
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTENC


In [28]:
CSV_HEADER = [
    "gender",
    "age",
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "residence_type",
    "avg_glucose_level",
    "bmi",
    "smoking_status",
    "stroke",
]


CATEGORICAL_FEATURE_NAMES = [
    "gender",
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "residence_type",
    "smoking_status",
]

NUMERIC_FEATURE_NAMES = ["age", "avg_glucose_level", "bmi"]


def encode_int(data: pd.DataFrame, categorical_features: list[str]):
    return pd.get_dummies(data, columns=categorical_features, drop_first=True)


def split_label(data: pd.DataFrame):
    x = data.copy().drop("stroke", axis=1)
    y = data["stroke"]  # labels

    return x, y


def resample(data: pd.DataFrame, seed: int, categorical_features: list[str]):
    """oversample positive cases with SMOTE and undersample negative with EEN"""
    # encode categorical features first
    enc = OrdinalEncoder()
    data[categorical_features] = enc.fit_transform(data[categorical_features])

    X = data.drop(columns=["stroke"], axis=1)
    Y = data["stroke"]

    cat_features_indices = [
        data.columns.get_loc(label) for label in categorical_features
    ]

    smote_nc = SMOTENC(categorical_features=cat_features_indices, random_state=seed)
    smote_een = SMOTEENN(smote=smote_nc, random_state=seed, sampling_strategy="auto")

    x_resampled, y_resampled = smote_een.fit_resample(X, Y)
    x_resampled["stroke"] = y_resampled

    x_resampled[categorical_features] = enc.inverse_transform(
        x_resampled[categorical_features]
    )

    return pd.DataFrame(x_resampled, columns=data.columns)


def scale(df):
    X_num = df[NUMERIC_FEATURE_NAMES]
    X_cat = df[CATEGORICAL_FEATURE_NAMES]

    scaler = StandardScaler()
    scaler.fit(X_num)

    X_scaled = scaler.transform(X_num)
    X_scaled = pd.DataFrame(X_scaled, index=X_num.index, columns=X_num.columns)

    df_scaled = pd.concat([X_scaled, X_cat, df["stroke"]], axis=1)[df.columns]

    return df_scaled


def split_train_valid_test(data_df, seed: int, resample_training: bool):
    data_df = data_df.sample(frac=1, random_state=seed)

    test_set = data_df[round(len(data_df) * 0.85) :]
    train_validation_data = data_df[: round(len(data_df) * 0.85)].sample(
        frac=1, random_state=seed
    )

    train_set = train_validation_data[: round(len(data_df) * 0.70)]
    validation_set = train_validation_data[round(len(data_df) * 0.70) :]

    if resample_training:
        train_set = resample(
            train_validation_data[: round(len(data_df) * 0.70)],
            seed,
            categorical_features=CATEGORICAL_FEATURE_NAMES,
        )

    return train_set, validation_set, test_set


def prepare_data(seed: int, resample_training: bool):
    data_df = pd.read_csv(Path().resolve().joinpath("dataset/full_data_clean.csv"))

    train_df, validation_df, test_df = split_train_valid_test(
        data_df, seed, resample_training
    )

    train_df, validation_df, test_df = [
        scale(df) for df in [train_df, validation_df, test_df]
    ]

    return train_df, validation_df, test_df


In [29]:
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.0001
DROPOUT_RATE = 0.1
BATCH_SIZE = 32
NUM_EPOCHS = 100

MLP_MODEL_PATH = str(Path().resolve().joinpath("model/mlp_model"))
TABTRANSFORMER_MODEL_PATH = str(Path().resolve().joinpath("model/tabtransformer_model"))

TARGET_FEATURE_NAME = "stroke"
TARGET_LABELS = ["1", "0"]


In [30]:
# data proccessing pipeline

target_label_lookup = layers.StringLookup(
    vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0
)


def prepare_example(features, target):
    # target_index = target_label_lookup(target)
    target_index = target
    return features, target_index


def get_dataset_from_csv(csv_file_path, batch_size=128, shuffle=False):
    """dataset from, csv"""
    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        label_name=TARGET_FEATURE_NAME,
        num_epochs=1,
        header=False,
        na_value="?",
        shuffle=shuffle,
    ).map(prepare_example, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
    return dataset.cache()


def get_dataset_from_df(df, batch_size=128, shuffle=False):
    """dataset from, csv"""

    dataset = tf.data.Dataset.from_tensor_slices(
        (
            df[NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES].to_dict(
                orient="list"
            ),
            tf.constant(df.loc[:, TARGET_FEATURE_NAME]),
        )
    )

    return dataset.cache()


get_dataset_from_df(pd.read_csv("dataset/full_data_clean.csv"))

# tf.convert_to_tensor(pd.read_csv("dataset/full_data_clean.csv").loc[:, TARGET_FEATURE_NAME])
# get_dataset_from_csv("dataset/full_data_clean.csv")


  return bool(asarray(a1 == a2).all())


<CacheDataset element_spec=({'age': TensorSpec(shape=(), dtype=tf.float32, name=None), 'avg_glucose_level': TensorSpec(shape=(), dtype=tf.float32, name=None), 'bmi': TensorSpec(shape=(), dtype=tf.float32, name=None), 'gender': TensorSpec(shape=(), dtype=tf.string, name=None), 'hypertension': TensorSpec(shape=(), dtype=tf.string, name=None), 'heart_disease': TensorSpec(shape=(), dtype=tf.string, name=None), 'ever_married': TensorSpec(shape=(), dtype=tf.string, name=None), 'work_type': TensorSpec(shape=(), dtype=tf.string, name=None), 'residence_type': TensorSpec(shape=(), dtype=tf.string, name=None), 'smoking_status': TensorSpec(shape=(), dtype=tf.string, name=None)}, TensorSpec(shape=(), dtype=tf.int64, name=None))>

In [31]:
# training and evaluation


def train(
    model: keras.Model,
    train_data_file,
    test_data_file,
    model_output,
    num_epochs,
    EPOCHS_TO_WAIT_FOR_IMPROVE,
    learning_rate,
    batch_size,
):
    """Implement a training and evaluation procedure"""
    optimizer = tfa.optimizers.AdamW(
        learning_rate=learning_rate, weight_decay=WEIGHT_DECAY
    )

    train_dataset = get_dataset_from_csv(train_data_file, batch_size, shuffle=True)
    validation_dataset = get_dataset_from_csv(test_data_file, batch_size)

    metrics = (
        [
            keras.metrics.BinaryAccuracy(name="acc"),
            keras.metrics.AUC(name="auc"),
        ],
    )

    early_stop_callback = keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=EPOCHS_TO_WAIT_FOR_IMPROVE
    )
    # checkpoint_callback = keras.callbacks.ModelCheckpoint(model_output, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

    # early_stop_callback = keras.callbacks.EarlyStopping(
    #    monitor="val_auc", patience=EPOCHS_TO_WAIT_FOR_IMPROVE
    # )

    # checkpoint_callback = keras.callbacks.ModelCheckpoint(
    #    model_output,
    #    monitor="val_auc",
    #    verbose=1,
    #    save_best_only=True,
    #    mode="max",
    # )

    model.compile(
        optimizer=optimizer,
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics,
    )

    print("Start training the model...")

    history = model.fit(
        train_dataset,
        epochs=num_epochs,
        validation_data=validation_dataset,
        callbacks=[
            # checkpoint_callback,
            early_stop_callback
        ],
    )

    print("Model training finished")

    _, accuracy, auc = model.evaluate(validation_dataset, verbose=0)

    print(f"Validation accuracy: {round(accuracy * 100, 2)}% AUC: {auc}")

    return history, model


In [32]:
RESULT_COLS = [
    "classifier" "precision",
    "recall",
    "fscore",
    "accuracy",
    "auc",
    "miss_rate",
    "fall_out_rate",
]

train_data_path = Path().resolve().joinpath("dataset/train_data.csv")
validation_data_path = Path().resolve().joinpath("dataset/validation_data.csv")
test_data_path = Path().resolve().joinpath("dataset/test_data.csv")

train_data_file = str(train_data_path.absolute())
validation_data_file = str(validation_data_path.absolute())
test_data_file = str(test_data_path.absolute())

NUM_EXPERIMENTS = 10
EPOCHS_TO_WAIT_FOR_IMPROVE = 15


def metrics_keras(model: keras.Model, test_data_file: str):
    model.compile(
        metrics=[
            keras.metrics.AUC(
                num_thresholds=200,
                curve="ROC",
            ),
            keras.metrics.Precision(),
            keras.metrics.Recall(),
            keras.metrics.TrueNegatives(),
            keras.metrics.FalsePositives(),
            keras.metrics.TrueNegatives(),
            keras.metrics.TruePositives(),
        ]
    )

    _, auc, precision, recall, tn, fn, fp, tp = model.evaluate(
        get_dataset_from_csv(test_data_file)
    )

    # metrics
    fscore = 2 * tp / (2 * tp + fp + fn)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    miss_rate = fn / (tn + tp)
    fall_out_rate = fp / (fp + tn)

    # return
    return [precision, recall, fscore, accuracy, auc, miss_rate, fall_out_rate]


In [33]:
def run_experiment(experiment, mlp_model, seed, train_data, validation_data, test_data):
    # split labels
    train_data_path = str(
        Path().resolve().joinpath(f"dataset/train_data_mlp_exp_{experiment}.csv")
    )
    validation_data_path = str(
        Path().resolve().joinpath(f"dataset/validation_data_mlp_exp_{experiment}.csv")
    )
    test_data_path = str(
        Path().resolve().joinpath(f"dataset/test_data_mlp_exp_{experiment}.csv")
    )

    train_data.to_csv(train_data_path, header=False, index=False)
    validation_data.to_csv(validation_data_path, header=False, index=False)
    test_data.to_csv(test_data_path, header=False, index=False)

    x_test, y_test = split_label(test_data)

    # train tabtransformer model on training data and evaluate on validation data
    history, mlp_trained = train(
        model=mlp_model,
        train_data_file=train_data_path,
        test_data_file=validation_data_path,
        model_output=MLP_MODEL_PATH,
        num_epochs=NUM_EPOCHS,
        EPOCHS_TO_WAIT_FOR_IMPROVE=EPOCHS_TO_WAIT_FOR_IMPROVE,
        learning_rate=LEARNING_RATE,
        batch_size=BATCH_SIZE,
    )

    train_validation_data_path = str(
        Path()
        .resolve()
        .joinpath(f"dataset/train_validation_data_mlp_exp_{experiment}.csv")
    )

    pd.concat([train_data, validation_data]).sample(frac=1, random_state=seed).to_csv(
        train_validation_data_path, index=False, header=False
    )

    # now, train tabtransformer model on validation data and evaluate on test data
    history, mlp_trained = train(
        model=mlp_trained,
        train_data_file=train_validation_data_path,
        test_data_file=test_data_path,
        model_output=MLP_MODEL_PATH,
        num_epochs=NUM_EPOCHS,
        EPOCHS_TO_WAIT_FOR_IMPROVE=EPOCHS_TO_WAIT_FOR_IMPROVE,
        learning_rate=LEARNING_RATE,
        batch_size=BATCH_SIZE,
    )

    metrics = metrics_keras(model=mlp_model, test_data_file=test_data_file)

    # cleanup
    os.remove(train_data_path)
    os.remove(validation_data_path)
    os.remove(train_validation_data_path)
    os.remove(test_data_path)

    return metrics


In [34]:
mlp_model = keras.models.load_model(MLP_MODEL_PATH)

results = {}
results_resampling = {}

for experiment in range(NUM_EXPERIMENTS):
    seed = random.randint(0, 1000)

    train_data, validation_data, test_data = prepare_data(
        seed=seed, resample_training=False
    )

    results[experiment] = list(
        run_experiment(
            experiment=experiment,
            mlp_model=mlp_model,
            seed=seed,
            train_data=train_data,
            validation_data=validation_data,
            test_data=test_data,
        )
    )

    train_data, validation_data, test_data = prepare_data(
        seed=seed, resample_training=True
    )

    results_resampling[experiment] = list(
        run_experiment(
            experiment=experiment,
            mlp_model=mlp_model,
            seed=seed,
            train_data=train_data,
            validation_data=validation_data,
            test_data=test_data,
        )
    )


results = pd.DataFrame().from_dict(results, orient="index", columns=RESULT_COLS)
results_resampling = pd.DataFrame().from_dict(
    results_resampling, orient="index", columns=RESULT_COLS
)

results["classifier"] = "MLP"
results_resampling["classifier"] = "MLP"

results.to_csv(Path().resolve().joinpath("results/MLP_eval.csv"))
results_resampling.to_csv(Path().resolve().joinpath("results/MLP_eval_resampled.csv"))


Start training the model...
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100

In [None]:
results


In [None]:
results_resampling
