In [2]:
import pandas as pd
import random
import sklearn

from pathlib import Path
import xgboost as xgb
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_absolute_error

from imblearn.combine import SMOTEENN



In [3]:
CATEGORICAL_FEATURE_NAMES = [
    "gender",
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "residence_type",
    "smoking_status",
]


def encode_int(data: pd.DataFrame, categorical_features: list[str]):
    return pd.get_dummies(data, columns=categorical_features, drop_first=True)


def split_label(data: pd.DataFrame):
    x = data.copy().drop("stroke", axis=1)
    y = data["stroke"]  # labels

    return x, y


def resample(data: pd.DataFrame, seed: int):
    """oversample positive cases with SMOTE and undersample negative with EEN"""
    X = data.drop(columns=["stroke"], axis=1)
    Y = data["stroke"]

    smote_een = SMOTEENN(random_state=seed, sampling_strategy="auto")

    x_resampled, y_resampled = smote_een.fit_resample(X, Y)
    x_resampled["stroke"] = y_resampled

    return pd.DataFrame(x_resampled, columns=data.columns)


def split_train_valid_test(data_df, seed: int):
    data_df = data_df.sample(frac=1, random_state=seed)

    test_set = data_df[4500:]
    train_validation_data = data_df[:4500].sample(frac=1, random_state=seed)

    train_set = resample(train_validation_data[:4000], seed)
    validation_set = train_validation_data[4000:]

    return train_set, validation_set, test_set


def prepare_data(seed: int):
    data_df = pd.read_csv(Path().resolve().joinpath("dataset/full_data_clean.csv"))
    data_df = encode_int(data=data_df, categorical_features=CATEGORICAL_FEATURE_NAMES)

    train_df, validation_df, test_df = split_train_valid_test(data_df, seed)

    return train_df, validation_df, test_df


In [4]:
RESULT_COLS = [
    "precision",
    "recall",
    "fscore",
    "accuracy",
    "auc",
    "miss_rate",
    "fall_out_rate",
]
NUM_EXPERIMENTS = 100
NUM_ROUNDS=50

def metrics(y_pred, y_true):
    tn, fp, fn, tp = confusion_matrix(y_pred=y_pred, y_true=y_true).ravel()

    # metrics
    auc = roc_auc_score(y_true=y_true, y_score=y_pred)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    fscore = 2 * tp / (2 * tp + fp + fn)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    miss_rate = fn / (tn + tp)
    fall_out_rate = fp / (fp + tn)

    # return
    return [precision, recall, fscore, accuracy, auc, miss_rate, fall_out_rate]


In [5]:
# evaluate xgboost
from xgboost.callback import EarlyStopping

results = {}

for experiment in range(NUM_EXPERIMENTS):
    seed = random.randint(0, 1000)

    train_data, validation_data, test_data = prepare_data(seed=experiment)

    # split labels
    x, y = split_label(train_data)
    x_valid, y_valid = split_label(validation_data)
    x_test, y_test = split_label(test_data)

    # metrics
    param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
    
    es = EarlyStopping(
        rounds=5,
        save_best=True,
        maximize=True,
        data_name="validation_0",
        metric_name="auc",
    )
    
    clf = xgb.sklearn.XGBClassifier(
        max_depth=param["max_depth"],
        eta=param['eta'],
        objective=param["objective"],
        gpu_id=0,
        predictor='gpu_predictor',
        eval_metric='auc',
        early_stopping_rounds=5,
        callbacks=[es]
    )

    clf.fit(
        x, y,
        eval_set=[(x_test, y_test), (x, y)]
    )

    y_pred = clf.predict(x_test)

    results[experiment] = list(metrics(y_pred, y_test))

results = pd.DataFrame().from_dict(results, orient="index", columns=RESULT_COLS)

results.to_csv(Path().resolve().joinpath("results/XGBoost.csv"))

results


[0]	validation_0-auc:0.77240	validation_1-auc:0.91073
[1]	validation_0-auc:0.76537	validation_1-auc:0.92068
[2]	validation_0-auc:0.77566	validation_1-auc:0.94110
[3]	validation_0-auc:0.76160	validation_1-auc:0.94976
[4]	validation_0-auc:0.76799	validation_1-auc:0.95507
[5]	validation_0-auc:0.77919	validation_1-auc:0.95884
[6]	validation_0-auc:0.77617	validation_1-auc:0.96287
[7]	validation_0-auc:0.76656	validation_1-auc:0.96599
[8]	validation_0-auc:0.76233	validation_1-auc:0.96849
[9]	validation_0-auc:0.76969	validation_1-auc:0.97014
[10]	validation_0-auc:0.77134	validation_1-auc:0.97186
[0]	validation_0-auc:0.80009	validation_1-auc:0.90378
[1]	validation_0-auc:0.76958	validation_1-auc:0.93550
[2]	validation_0-auc:0.75825	validation_1-auc:0.94391
[3]	validation_0-auc:0.76478	validation_1-auc:0.95523
[4]	validation_0-auc:0.77066	validation_1-auc:0.96118
[0]	validation_0-auc:0.75532	validation_1-auc:0.91769
[1]	validation_0-auc:0.73687	validation_1-auc:0.93720
[2]	validation_0-auc:0.7187

Unnamed: 0,precision,recall,fscore,accuracy,auc,miss_rate,fall_out_rate
0,0.165517,0.648649,0.263736,0.721414,0.688063,0.037464,0.272523
1,0.142857,0.575758,0.228916,0.733888,0.660647,0.039660,0.254464
2,0.126984,0.571429,0.207792,0.746362,0.664301,0.033426,0.242826
3,0.065693,0.750000,0.120805,0.727651,0.738539,0.008571,0.272921
4,0.123810,0.565217,0.203125,0.787942,0.682172,0.026385,0.200873
...,...,...,...,...,...,...,...
95,0.120690,0.666667,0.204380,0.773389,0.722464,0.018817,0.221739
96,0.156522,0.692308,0.255319,0.781705,0.739560,0.021277,0.213187
97,0.098361,0.571429,0.167832,0.752599,0.666149,0.024862,0.239130
98,0.082645,0.526316,0.142857,0.750520,0.643028,0.024931,0.240260
