In [19]:
import pandas as pd
import random
import sklearn

from pathlib import Path
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

from imblearn.combine import SMOTEENN


In [20]:
CATEGORICAL_FEATURE_NAMES = [
    "gender",
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "residence_type",
    "smoking_status",
]


def encode_int(data: pd.DataFrame, categorical_features: list[str]):
    return pd.get_dummies(data, columns=categorical_features, drop_first=True)


def split_label(data: pd.DataFrame):
    x = data.copy().drop("stroke", axis=1)
    y = data["stroke"]  # labels

    return x, y


def resample(data: pd.DataFrame, seed: int):
    """oversample positive cases with SMOTE and undersample negative with EEN"""
    X = data.drop(columns=["stroke"], axis=1)
    Y = data["stroke"]

    smote_een = SMOTEENN(random_state=seed, sampling_strategy="auto")

    x_resampled, y_resampled = smote_een.fit_resample(X, Y)
    x_resampled["stroke"] = y_resampled

    return pd.DataFrame(x_resampled, columns=data.columns)


def split_train_valid_test(data_df, seed: int):
    data_df = data_df.sample(frac=1, random_state=seed)

    test_set = data_df[4500:]
    train_validation_data = data_df[:4500].sample(frac=1, random_state=seed)

    train_set = resample(train_validation_data[:4000], seed)
    validation_set = train_validation_data[4000:]

    return train_set, validation_set, test_set


def prepare_data(seed: int):
    data_df = pd.read_csv(Path().resolve().joinpath("dataset/full_data_clean.csv"))
    data_df = encode_int(data=data_df, categorical_features=CATEGORICAL_FEATURE_NAMES)

    train_df, validation_df, test_df = split_train_valid_test(data_df, seed)

    return train_df, validation_df, test_df


In [21]:
RESULT_COLS = [
    "precision",
    "recall",
    "fscore",
    "accuracy",
    "auc",
    "miss_rate",
    "fall_out_rate",
]
NUM_EXPERIMENTS = 100


def metrics(y_pred, y_true):
    tn, fp, fn, tp = confusion_matrix(y_pred=y_pred, y_true=y_true).ravel()

    # metrics
    auc = roc_auc_score(y_true=y_true, y_score=y_pred)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    fscore = 2 * tp / (2 * tp + fp + fn)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    miss_rate = fn / (tn + tp)
    fall_out_rate = fp / (fp + tn)

    # return
    return [precision, recall, fscore, accuracy, auc, miss_rate, fall_out_rate]


In [22]:
# evaluate DecisionTree


results = {}

for experiment in range(NUM_EXPERIMENTS):
    seed = random.randint(0, 1000)

    train_data, validation_data, test_data = prepare_data(seed=experiment)

    # split labels
    x, y = split_label(train_data)
    x_valid, y_valid = split_label(validation_data)
    x_test, y_test = split_label(test_data)

    # metrics
    clf = DecisionTreeClassifier()
    clf.fit(x, y)
    y_pred = clf.predict(x_test)

    results[experiment] = list(metrics(y_pred, y_test))

results = pd.DataFrame().from_dict(results, orient="index", columns=RESULT_COLS)

results.to_csv(Path().resolve().joinpath("results/DecisionTree.csv"))

results


Unnamed: 0,precision,recall,fscore,accuracy,auc,miss_rate,fall_out_rate
0,0.151515,0.270270,0.194175,0.827443,0.572072,0.067839,0.126126
1,0.154930,0.333333,0.211538,0.829522,0.599702,0.055138,0.133929
2,0.138298,0.464286,0.213115,0.800416,0.642739,0.038961,0.178808
3,0.086022,0.666667,0.152381,0.814969,0.742715,0.010204,0.181237
4,0.164179,0.478261,0.244444,0.858628,0.677995,0.029056,0.122271
...,...,...,...,...,...,...,...
95,0.100000,0.333333,0.153846,0.839917,0.598188,0.034653,0.136957
96,0.147727,0.500000,0.228070,0.817048,0.667582,0.033079,0.164835
97,0.150685,0.523810,0.234043,0.850312,0.694513,0.024450,0.134783
98,0.144578,0.631579,0.235294,0.837838,0.738950,0.017370,0.153680
