In [25]:
import pandas as pd
import random
import sklearn

from pathlib import Path
import xgboost as xgb
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_absolute_error

from imblearn.combine import SMOTEENN



In [17]:
CATEGORICAL_FEATURE_NAMES = [
    "gender",
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "residence_type",
    "smoking_status",
]


def encode_int(data: pd.DataFrame, categorical_features: list[str]):
    return pd.get_dummies(data, columns=categorical_features, drop_first=True)


def split_label(data: pd.DataFrame):
    x = data.copy().drop("stroke", axis=1)
    y = data["stroke"]  # labels

    return x, y


def resample(data: pd.DataFrame, seed: int):
    """oversample positive cases with SMOTE and undersample negative with EEN"""
    X = data.drop(columns=["stroke"], axis=1)
    Y = data["stroke"]

    smote_een = SMOTEENN(random_state=seed, sampling_strategy="auto")

    x_resampled, y_resampled = smote_een.fit_resample(X, Y)
    x_resampled["stroke"] = y_resampled

    return pd.DataFrame(x_resampled, columns=data.columns)


def split_train_valid_test(data_df, seed: int):
    data_df = data_df.sample(frac=1, random_state=seed)

    test_set = data_df[4500:]
    train_validation_data = data_df[:4500].sample(frac=1, random_state=seed)

    train_set = resample(train_validation_data[:4000], seed)
    validation_set = train_validation_data[4000:]

    return train_set, validation_set, test_set


def prepare_data(seed: int):
    data_df = pd.read_csv(Path().resolve().joinpath("dataset/full_data_clean.csv"))
    data_df = encode_int(data=data_df, categorical_features=CATEGORICAL_FEATURE_NAMES)

    train_df, validation_df, test_df = split_train_valid_test(data_df, seed)

    return train_df, validation_df, test_df


In [18]:
RESULT_COLS = [
    "precision",
    "recall",
    "fscore",
    "accuracy",
    "auc",
    "miss_rate",
    "fall_out_rate",
]
NUM_EXPERIMENTS = 100
NUM_ROUNDS=50

def metrics(y_pred, y_true):
    tn, fp, fn, tp = confusion_matrix(y_pred=y_pred, y_true=y_true).ravel()

    # metrics
    auc = roc_auc_score(y_true=y_true, y_score=y_pred)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    fscore = 2 * tp / (2 * tp + fp + fn)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    miss_rate = fn / (tn + tp)
    fall_out_rate = fp / (fp + tn)

    # return
    return [precision, recall, fscore, accuracy, auc, miss_rate, fall_out_rate]


In [45]:
# evaluate xgboost
from xgboost.callback import EarlyStopping

results = {}

for experiment in range(NUM_EXPERIMENTS):
    seed = random.randint(0, 1000)

    train_data, validation_data, test_data = prepare_data(seed=experiment)

    # split labels
    x, y = split_label(train_data)
    x_valid, y_valid = split_label(validation_data)
    x_test, y_test = split_label(test_data)

    # metrics
    param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
    
    es = EarlyStopping(
        rounds=5,
        save_best=True,
        maximize=True,
        data_name="validation_0",
        metric_name="auc",
    )
    
    clf = xgb.sklearn.XGBClassifier(
        max_depth=param["max_depth"],
        eta=param['eta'],
        objective=param["objective"],
        gpu_id=0,
        predictor='gpu_predictor',
        eval_metric='auc',
        early_stopping_rounds=5,
        callbacks=[es]
    )

    clf.fit(
        x, y,
        eval_set=[(x_valid, y_valid), (x, y)]
    )

    y_pred = clf.predict(x_test)

    results[experiment] = list(metrics(y_pred, y_test))

results = pd.DataFrame().from_dict(results, orient="index", columns=RESULT_COLS)

results.to_csv(Path().resolve().joinpath("results/XGBoost.csv"))

results


[0]	validation_0-auc:0.81672	validation_1-auc:0.90667
[1]	validation_0-auc:0.81204	validation_1-auc:0.93274
[2]	validation_0-auc:0.82366	validation_1-auc:0.94178
[3]	validation_0-auc:0.82400	validation_1-auc:0.95367
[4]	validation_0-auc:0.83204	validation_1-auc:0.95900
[5]	validation_0-auc:0.84400	validation_1-auc:0.96287
[6]	validation_0-auc:0.82661	validation_1-auc:0.96566
[7]	validation_0-auc:0.83423	validation_1-auc:0.96907
[8]	validation_0-auc:0.81819	validation_1-auc:0.97018
[9]	validation_0-auc:0.82505	validation_1-auc:0.97202
[0]	validation_0-auc:0.79032	validation_1-auc:0.89991
[1]	validation_0-auc:0.78553	validation_1-auc:0.93592
[2]	validation_0-auc:0.78498	validation_1-auc:0.94423
[3]	validation_0-auc:0.77594	validation_1-auc:0.95526
[4]	validation_0-auc:0.80189	validation_1-auc:0.96123
[5]	validation_0-auc:0.78655	validation_1-auc:0.96561
[6]	validation_0-auc:0.78011	validation_1-auc:0.96863
[7]	validation_0-auc:0.78047	validation_1-auc:0.97133
[8]	validation_0-auc:0.78769

Unnamed: 0,precision,recall,fscore,accuracy,auc,miss_rate,fall_out_rate
0,0.144000,0.486486,0.222222,0.738046,0.622748,0.053521,0.240991
1,0.161290,0.454545,0.238095,0.800416,0.640219,0.046753,0.174107
2,0.159091,0.750000,0.262500,0.754678,0.752483,0.019284,0.245033
3,0.065693,0.750000,0.120805,0.727651,0.738539,0.008571,0.272921
4,0.145038,0.826087,0.246753,0.758836,0.790773,0.010959,0.244541
...,...,...,...,...,...,...,...
95,0.133929,0.714286,0.225564,0.785863,0.751708,0.015873,0.210870
96,0.153846,0.692308,0.251748,0.777547,0.737363,0.021390,0.217582
97,0.098485,0.619048,0.169935,0.735967,0.680176,0.022599,0.258696
98,0.102564,0.631579,0.176471,0.767152,0.702153,0.018970,0.227273
