In [1]:
import pandas as pd
import random
import sklearn

from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

from imblearn.combine import SMOTEENN


In [2]:
CSV_HEADER = [
    "gender",
    "age",
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "residence_type",
    "avg_glucose_level",
    "bmi",
    "smoking_status",
    "stroke",
]

CATEGORICAL_FEATURE_NAMES = [
    "gender",
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "residence_type",
    "smoking_status",
]


def encode_int(data: pd.DataFrame, categorical_features: list[str]):
    return pd.get_dummies(data, columns=categorical_features, drop_first=True)


def split_label(data: pd.DataFrame):
    x = data.copy().drop("stroke", axis=1)
    y = data["stroke"]  # labels

    return x, y


def resample(data: pd.DataFrame, seed: int):
    """oversample positive cases with SMOTE and undersample negative with EEN"""
    X = data.drop(columns=["stroke"], axis=1)
    Y = data["stroke"]

    smote_een = SMOTEENN(random_state=seed, sampling_strategy="auto")

    x_resampled, y_resampled = smote_een.fit_resample(X, Y)
    x_resampled["stroke"] = y_resampled

    return pd.DataFrame(x_resampled, columns=data.columns)


def split_train_valid_test(data_df, seed: int):
    data_df = data_df.sample(frac=1, random_state=seed)

    test_set = data_df[4500:]
    train_validation_data = data_df[:4500].sample(frac=1, random_state=seed)

    train_set = resample(train_validation_data[:4000], seed)
    validation_set = train_validation_data[4000:]

    return train_set, validation_set, test_set


def prepare_data(seed: int):
    data_df = pd.read_csv(Path().resolve().joinpath("dataset/full_data_clean.csv"))
    data_df = encode_int(data=data_df, categorical_features=CATEGORICAL_FEATURE_NAMES)

    train_df, validation_df, test_df = split_train_valid_test(data_df, seed)

    return train_df, validation_df, test_df


In [3]:
RESULT_COLS = [
    "precision",
    "recall",
    "fscore",
    "accuracy",
    "auc",
    "miss_rate",
    "fall_out_rate",
]
NUM_EXPERIMENTS = 100


def metrics(y_pred, y_true):
    tn, fp, fn, tp = confusion_matrix(y_pred=y_pred, y_true=y_true).ravel()

    # metrics
    auc = roc_auc_score(y_true=y_true, y_score=y_pred)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    fscore = 2 * tp / (2 * tp + fp + fn)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    miss_rate = fn / (tn + tp)
    fall_out_rate = fp / (fp + tn)

    # return
    return [precision, recall, fscore, accuracy, auc, miss_rate, fall_out_rate]


In [4]:
def load_data(train_file, validation_file, test_file, headers, categorical_features):
    train_df = pd.read_csv(train_file, names=headers)
    validation_df = pd.read_csv(validation_file, names=headers)
    test_df = pd.read_csv(test_file, names=headers)

    encoded = [encode_int(df, categorical_features) for df in [train_df, validation_df, test_df]]

    return encoded[0], encoded[1], encoded[2]

train_data_path = Path().resolve().joinpath("dataset/train_data.csv")
validation_data_path = Path().resolve().joinpath("dataset/validation_data.csv")
test_data_path = Path().resolve().joinpath("dataset/test_data.csv")

train_data_file = str(train_data_path.absolute())
validation_data_file = str(validation_data_path.absolute())
test_data_file = str(test_data_path.absolute())

train_data, validation_data, test_data = load_data(
    train_file=train_data_file,
    validation_file=validation_data_file,
    test_file=test_data_file,
    headers=CSV_HEADER,
    categorical_features=CATEGORICAL_FEATURE_NAMES,
)

x, y = split_label(train_data)
x_valid, y_valid = split_label(validation_data)
x_test, y_test = split_label(test_data)


def eval_random_forest():
    rforest_clf = RandomForestClassifier()
    rforest_clf.fit(x, y)

    y_pred_rf = rforest_clf.predict(x_test)

    auc = roc_auc_score(y_score=rforest_clf.predict_proba(x_test)[:, 1], y_true=y_test)    # type: ignore
    result = {0: list(metrics(y_pred_rf, y_test))}

    return pd.DataFrame.from_dict(result, orient='index', columns=RESULT_COLS)

eval_random_forest()

Unnamed: 0,precision,recall,fscore,accuracy,auc,miss_rate,fall_out_rate
0,0.197802,0.62069,0.3,0.825364,0.729593,0.027708,0.161504


In [5]:
# evaluate RandomForest

results = {}

for experiment in range(NUM_EXPERIMENTS):
    seed = random.randint(0, 1000)

    train_data, validation_data, test_data = prepare_data(seed=seed)

    # split labels
    x, y = split_label(train_data)
    x_valid, y_valid = split_label(validation_data)
    x_test, y_test = split_label(test_data)

    # metrics
    clf = RandomForestClassifier()
    clf.fit(x, y)
    y_pred = clf.predict(x_test)

    results[experiment] = list(metrics(y_pred, y_test))

results = pd.DataFrame().from_dict(results, orient="index", columns=RESULT_COLS)

results.to_csv(Path().resolve().joinpath('results/RandomForest.csv'))

results

Unnamed: 0,precision,recall,fscore,accuracy,auc,miss_rate,fall_out_rate
0,0.211268,0.555556,0.306122,0.858628,0.716104,0.029056,0.123348
1,0.138889,0.434783,0.210526,0.844075,0.649706,0.032020,0.135371
2,0.220588,0.468750,0.300000,0.854470,0.675355,0.041363,0.118040
3,0.202899,0.538462,0.294737,0.860707,0.708791,0.028986,0.120879
4,0.200000,0.583333,0.297872,0.862786,0.730398,0.024096,0.122538
...,...,...,...,...,...,...,...
95,0.089552,0.285714,0.136364,0.841996,0.576553,0.037037,0.132609
96,0.236364,0.433333,0.305882,0.877339,0.670103,0.040284,0.093126
97,0.121951,0.500000,0.196078,0.829522,0.671909,0.025063,0.156182
98,0.194805,0.428571,0.267857,0.829522,0.644779,0.050125,0.139013
