In [23]:
import pandas as pd
import random
import sklearn

from pathlib import Path
import xgboost as xgb
from xgboost.callback import EarlyStopping
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTENC


In [24]:
CATEGORICAL_FEATURE_NAMES = [
    "gender",
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "residence_type",
    "smoking_status",
]

NUMERIC_FEATURE_NAMES = ["age", "avg_glucose_level", "bmi"]


def encode_int(data: pd.DataFrame, categorical_features: list[str]):
    return pd.get_dummies(data, columns=categorical_features, drop_first=True)


def split_label(data: pd.DataFrame):
    x = data.copy().drop("stroke", axis=1)
    y = data["stroke"]  # labels

    return x, y


def resample(data: pd.DataFrame, seed: int, categorical_features: list[str]):
    """oversample positive cases with SMOTE and undersample negative with EEN"""
    # encode categorical features first
    enc = OrdinalEncoder()
    data[categorical_features] = enc.fit_transform(data[categorical_features])

    X = data.drop(columns=["stroke"], axis=1)
    Y = data["stroke"]

    cat_features_indices = [
        data.columns.get_loc(label) for label in categorical_features
    ]

    smote_nc = SMOTENC(categorical_features=cat_features_indices, random_state=seed)
    smote_een = SMOTEENN(smote=smote_nc, random_state=seed, sampling_strategy="auto")

    x_resampled, y_resampled = smote_een.fit_resample(X, Y)
    x_resampled["stroke"] = y_resampled

    x_resampled[categorical_features] = enc.inverse_transform(
        x_resampled[categorical_features]
    )

    return pd.DataFrame(x_resampled, columns=data.columns)


def scale(df):
    X_num = df[NUMERIC_FEATURE_NAMES]
    X_cat = df[CATEGORICAL_FEATURE_NAMES]

    scaler = StandardScaler()
    scaler.fit(X_num)

    X_scaled = scaler.transform(X_num)
    X_scaled = pd.DataFrame(X_scaled, index=X_num.index, columns=X_num.columns)

    df_scaled = pd.concat([X_scaled, X_cat, df["stroke"]], axis=1)

    return df_scaled


def split_train_valid_test(data_df, seed: int, resample_training: bool):
    data_df = data_df.sample(frac=1, random_state=seed)

    test_set = data_df[round(len(data_df) * 0.85) :]
    train_validation_data = data_df[: round(len(data_df) * 0.85)].sample(
        frac=1, random_state=seed
    )

    train_set = train_validation_data[: round(len(data_df) * 0.70)]
    validation_set = train_validation_data[round(len(data_df) * 0.70) :]

    if resample_training:
        train_set = resample(
            train_set, seed, categorical_features=CATEGORICAL_FEATURE_NAMES
        )

    return train_set, validation_set, test_set


def one_hot_encode(
    data_df: pd.DataFrame,
    train_df: pd.DataFrame,
    validation_df: pd.DataFrame,
    test_df: pd.DataFrame,
):
    # ensure train, validation, and test all have the same columns after ohe

    ohe_columns = encode_int(data_df, CATEGORICAL_FEATURE_NAMES).columns

    train_df_e, validation_df_e, test_df_e = [
        pd.DataFrame(
            encode_int(data=df, categorical_features=CATEGORICAL_FEATURE_NAMES),
            columns=ohe_columns,
        ).fillna(0)
        for df in [train_df, validation_df, test_df]
    ]

    return train_df_e, validation_df_e, test_df_e


def prepare_data(seed: int, resample_training: bool):
    data_df = pd.read_csv(Path().resolve().joinpath("dataset/full_data_clean.csv"))

    train_df, validation_df, test_df = split_train_valid_test(
        data_df, seed, resample_training
    )

    train_df, validation_df, test_df = [
        scale(df) for df in [train_df, validation_df, test_df]
    ]

    return one_hot_encode(data_df, train_df, validation_df, test_df)


In [25]:
RESULT_COLS = [
    "classifier",
    "precision",
    "recall",
    "fscore",
    "accuracy",
    "auc",
    "miss_rate",
    "fall_out_rate",
]
NUM_EXPERIMENTS = 10
NUM_ROUNDS = 20


def metrics(y_pred, p_pred, y_true):
    tn, fp, fn, tp = confusion_matrix(y_pred=y_pred, y_true=y_true).ravel()

    # metrics
    auc = roc_auc_score(y_true=y_true, y_score=p_pred)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    fscore = 2 * tp / (2 * tp + fp + fn)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    miss_rate = fn / (tn + tp)
    fall_out_rate = fp / (fp + tn)

    # return
    return [precision, recall, fscore, accuracy, auc, miss_rate, fall_out_rate]


In [26]:
def run_experiment(seed, experiment, train_data, validation_data, test_data):
    # split labels
    train_validation = pd.concat([train_data, validation_data]).sample(
        frac=1, random_state=seed
    )

    x, y = split_label(train_validation)
    x_test, y_test = split_label(test_data)

    # metrics
    param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}

    es = EarlyStopping(
        rounds=3,
        save_best=True,
        maximize=True,
        data_name="validation_0",
        metric_name="auc",
    )

    clf = xgb.sklearn.XGBClassifier(
        learning_rate=0.1,
        objective=param["objective"],
        gpu_id=0,
        predictor="gpu_predictor",
    )

    clf.fit(x, y, eval_set=[(x_test, y_test), (x, y)])

    y_pred = clf.predict(x_test)
    p_pred = clf.predict_proba(x_test)[:, 1]

    return y_pred, p_pred, y_test


In [28]:
# evaluate xgboost

results = {}
results_resample = {}

for experiment in range(NUM_EXPERIMENTS):
    seed = random.randint(0, 1000)

    train_data, validation_data, test_data = prepare_data(
        seed=experiment, resample_training=False
    )

    y_pred, p_pred, y_test = run_experiment(
        seed, experiment, train_data, validation_data, test_data
    )

    results[experiment] = list(metrics(y_pred, p_pred, y_test))

    train_data, validation_data, test_data = prepare_data(
        seed=experiment, resample_training=True
    )

    y_pred, p_pred, y_test = run_experiment(
        seed, experiment, train_data, validation_data, test_data
    )

    results_resample[experiment] = list(metrics(y_pred, p_pred, y_test))


results = (
    pd.DataFrame()
    .from_dict(results, orient="index", columns=RESULT_COLS)
    .assign(classifier="XGBoost")
)
results_resample = (
    pd.DataFrame()
    .from_dict(results_resample, orient="index", columns=RESULT_COLS)
    .assign(classifier="XGBoost")
)

results.to_csv(Path().resolve().joinpath("results/XGBoost_eval.csv"))
results_resample.to_csv(Path().resolve().joinpath("results/XGBoost_eval_resampled.csv"))


[0]	validation_0-logloss:0.60470	validation_1-logloss:0.60458
[1]	validation_0-logloss:0.53225	validation_1-logloss:0.53207
[2]	validation_0-logloss:0.47204	validation_1-logloss:0.47171
[3]	validation_0-logloss:0.42115	validation_1-logloss:0.42079
[4]	validation_0-logloss:0.37791	validation_1-logloss:0.37738
[5]	validation_0-logloss:0.34087	validation_1-logloss:0.34007
[6]	validation_0-logloss:0.30877	validation_1-logloss:0.30783
[7]	validation_0-logloss:0.28103	validation_1-logloss:0.27981
[8]	validation_0-logloss:0.25671	validation_1-logloss:0.25538
[9]	validation_0-logloss:0.23548	validation_1-logloss:0.23399
[10]	validation_0-logloss:0.21679	validation_1-logloss:0.21517
[11]	validation_0-logloss:0.20041	validation_1-logloss:0.19861
[12]	validation_0-logloss:0.18593	validation_1-logloss:0.18401
[13]	validation_0-logloss:0.17318	validation_1-logloss:0.17108
[14]	validation_0-logloss:0.16197	validation_1-logloss:0.15963
[15]	validation_0-logloss:0.15200	validation_1-logloss:0.14951
[1

  precision = tp / (tp + fp)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[0]	validation_0-logloss:0.66084	validation_1-logloss:0.63840
[1]	validation_0-logloss:0.63752	validation_1-logloss:0.59322
[2]	validation_0-logloss:0.61621	validation_1-logloss:0.55557
[3]	validation_0-logloss:0.60097	validation_1-logloss:0.52365
[4]	validation_0-logloss:0.58635	validation_1-logloss:0.49672
[5]	validation_0-logloss:0.57554	validation_1-logloss:0.47229
[6]	validation_0-logloss:0.56749	validation_1-logloss:0.45100
[7]	validation_0-logloss:0.56020	validation_1-logloss:0.43224
[8]	validation_0-logloss:0.55643	validation_1-logloss:0.41609
[9]	validation_0-logloss:0.55402	validation_1-logloss:0.40201
[10]	validation_0-logloss:0.55070	validation_1-logloss:0.38891
[11]	validation_0-logloss:0.54674	validation_1-logloss:0.37749
[12]	validation_0-logloss:0.54584	validation_1-logloss:0.36670
[13]	validation_0-logloss:0.54248	validation_1-logloss:0.35743
[14]	validation_0-logloss:0.54041	validation_1-logloss:0.34937
[15]	validation_0-logloss:0.53837	validation_1-logloss:0.34110
[1

  precision = tp / (tp + fp)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[0]	validation_0-logloss:0.65936	validation_1-logloss:0.63498
[1]	validation_0-logloss:0.63505	validation_1-logloss:0.58792
[2]	validation_0-logloss:0.61496	validation_1-logloss:0.54817
[3]	validation_0-logloss:0.59996	validation_1-logloss:0.51369
[4]	validation_0-logloss:0.58842	validation_1-logloss:0.48486
[5]	validation_0-logloss:0.58115	validation_1-logloss:0.45924
[6]	validation_0-logloss:0.57489	validation_1-logloss:0.43720
[7]	validation_0-logloss:0.56841	validation_1-logloss:0.41805
[8]	validation_0-logloss:0.55922	validation_1-logloss:0.40061
[9]	validation_0-logloss:0.55802	validation_1-logloss:0.38622
[10]	validation_0-logloss:0.55410	validation_1-logloss:0.37280
[11]	validation_0-logloss:0.55466	validation_1-logloss:0.36148
[12]	validation_0-logloss:0.55065	validation_1-logloss:0.35046
[13]	validation_0-logloss:0.54874	validation_1-logloss:0.34057
[14]	validation_0-logloss:0.54877	validation_1-logloss:0.33182
[15]	validation_0-logloss:0.54904	validation_1-logloss:0.32433
[1

  precision = tp / (tp + fp)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[0]	validation_0-logloss:0.65808	validation_1-logloss:0.63709
[1]	validation_0-logloss:0.63039	validation_1-logloss:0.59054
[2]	validation_0-logloss:0.61211	validation_1-logloss:0.55139
[3]	validation_0-logloss:0.59402	validation_1-logloss:0.51746
[4]	validation_0-logloss:0.57838	validation_1-logloss:0.48823
[5]	validation_0-logloss:0.57191	validation_1-logloss:0.46418
[6]	validation_0-logloss:0.56572	validation_1-logloss:0.44307
[7]	validation_0-logloss:0.55989	validation_1-logloss:0.42435
[8]	validation_0-logloss:0.55146	validation_1-logloss:0.40654
[9]	validation_0-logloss:0.54625	validation_1-logloss:0.39213
[10]	validation_0-logloss:0.54182	validation_1-logloss:0.37953
[11]	validation_0-logloss:0.53720	validation_1-logloss:0.36692
[12]	validation_0-logloss:0.53730	validation_1-logloss:0.35684
[13]	validation_0-logloss:0.53327	validation_1-logloss:0.34685
[14]	validation_0-logloss:0.53175	validation_1-logloss:0.33769
[15]	validation_0-logloss:0.52845	validation_1-logloss:0.32891
[1

  precision = tp / (tp + fp)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[0]	validation_0-logloss:0.65959	validation_1-logloss:0.63708
[1]	validation_0-logloss:0.63033	validation_1-logloss:0.59027
[2]	validation_0-logloss:0.60634	validation_1-logloss:0.55145
[3]	validation_0-logloss:0.58628	validation_1-logloss:0.51796
[4]	validation_0-logloss:0.57460	validation_1-logloss:0.48946
[5]	validation_0-logloss:0.56123	validation_1-logloss:0.46459
[6]	validation_0-logloss:0.55217	validation_1-logloss:0.44240
[7]	validation_0-logloss:0.54554	validation_1-logloss:0.42375
[8]	validation_0-logloss:0.53895	validation_1-logloss:0.40605
[9]	validation_0-logloss:0.53358	validation_1-logloss:0.39090
[10]	validation_0-logloss:0.52690	validation_1-logloss:0.37761
[11]	validation_0-logloss:0.52395	validation_1-logloss:0.36578
[12]	validation_0-logloss:0.51910	validation_1-logloss:0.35453
[13]	validation_0-logloss:0.51786	validation_1-logloss:0.34453
[14]	validation_0-logloss:0.51807	validation_1-logloss:0.33608
[15]	validation_0-logloss:0.51698	validation_1-logloss:0.32791
[1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[0]	validation_0-logloss:0.65741	validation_1-logloss:0.63618
[1]	validation_0-logloss:0.62908	validation_1-logloss:0.58922
[2]	validation_0-logloss:0.60660	validation_1-logloss:0.55023
[3]	validation_0-logloss:0.58884	validation_1-logloss:0.51503
[4]	validation_0-logloss:0.57732	validation_1-logloss:0.48613
[5]	validation_0-logloss:0.56761	validation_1-logloss:0.46114
[6]	validation_0-logloss:0.56026	validation_1-logloss:0.43984
[7]	validation_0-logloss:0.55484	validation_1-logloss:0.42095
[8]	validation_0-logloss:0.55004	validation_1-logloss:0.40359
[9]	validation_0-logloss:0.54338	validation_1-logloss:0.38811
[10]	validation_0-logloss:0.53847	validation_1-logloss:0.37477
[11]	validation_0-logloss:0.53586	validation_1-logloss:0.36327
[12]	validation_0-logloss:0.53391	validation_1-logloss:0.35273
[13]	validation_0-logloss:0.53518	validation_1-logloss:0.34286
[14]	validation_0-logloss:0.53567	validation_1-logloss:0.33461
[15]	validation_0-logloss:0.53664	validation_1-logloss:0.32731
[1

  precision = tp / (tp + fp)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[0]	validation_0-logloss:0.65882	validation_1-logloss:0.63727
[1]	validation_0-logloss:0.63250	validation_1-logloss:0.59050
[2]	validation_0-logloss:0.60939	validation_1-logloss:0.55119
[3]	validation_0-logloss:0.58919	validation_1-logloss:0.51801
[4]	validation_0-logloss:0.57451	validation_1-logloss:0.48922
[5]	validation_0-logloss:0.56498	validation_1-logloss:0.46503
[6]	validation_0-logloss:0.55546	validation_1-logloss:0.44346
[7]	validation_0-logloss:0.54737	validation_1-logloss:0.42477
[8]	validation_0-logloss:0.53981	validation_1-logloss:0.40818
[9]	validation_0-logloss:0.53717	validation_1-logloss:0.39415
[10]	validation_0-logloss:0.53188	validation_1-logloss:0.37952
[11]	validation_0-logloss:0.52720	validation_1-logloss:0.36626
[12]	validation_0-logloss:0.52571	validation_1-logloss:0.35506
[13]	validation_0-logloss:0.52214	validation_1-logloss:0.34543
[14]	validation_0-logloss:0.52273	validation_1-logloss:0.33647
[15]	validation_0-logloss:0.52188	validation_1-logloss:0.32762
[1

  precision = tp / (tp + fp)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[0]	validation_0-logloss:0.66107	validation_1-logloss:0.63628
[1]	validation_0-logloss:0.63616	validation_1-logloss:0.58966
[2]	validation_0-logloss:0.61875	validation_1-logloss:0.55057
[3]	validation_0-logloss:0.60315	validation_1-logloss:0.51701
[4]	validation_0-logloss:0.59188	validation_1-logloss:0.48797
[5]	validation_0-logloss:0.58353	validation_1-logloss:0.46352
[6]	validation_0-logloss:0.57739	validation_1-logloss:0.44241
[7]	validation_0-logloss:0.56949	validation_1-logloss:0.42307
[8]	validation_0-logloss:0.56466	validation_1-logloss:0.40652
[9]	validation_0-logloss:0.56238	validation_1-logloss:0.39229
[10]	validation_0-logloss:0.56206	validation_1-logloss:0.38003
[11]	validation_0-logloss:0.56002	validation_1-logloss:0.36860
[12]	validation_0-logloss:0.55995	validation_1-logloss:0.35867
[13]	validation_0-logloss:0.56139	validation_1-logloss:0.34922
[14]	validation_0-logloss:0.56152	validation_1-logloss:0.34151
[15]	validation_0-logloss:0.56329	validation_1-logloss:0.33444
[1

  precision = tp / (tp + fp)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[0]	validation_0-logloss:0.66101	validation_1-logloss:0.63786
[1]	validation_0-logloss:0.63480	validation_1-logloss:0.59182
[2]	validation_0-logloss:0.61604	validation_1-logloss:0.55352
[3]	validation_0-logloss:0.59929	validation_1-logloss:0.52102
[4]	validation_0-logloss:0.58586	validation_1-logloss:0.49233
[5]	validation_0-logloss:0.57429	validation_1-logloss:0.46666
[6]	validation_0-logloss:0.56587	validation_1-logloss:0.44528
[7]	validation_0-logloss:0.56001	validation_1-logloss:0.42544
[8]	validation_0-logloss:0.55442	validation_1-logloss:0.40889
[9]	validation_0-logloss:0.55270	validation_1-logloss:0.39423
[10]	validation_0-logloss:0.54740	validation_1-logloss:0.38116
[11]	validation_0-logloss:0.54643	validation_1-logloss:0.36902
[12]	validation_0-logloss:0.54489	validation_1-logloss:0.35894
[13]	validation_0-logloss:0.54527	validation_1-logloss:0.34984
[14]	validation_0-logloss:0.54694	validation_1-logloss:0.34153
[15]	validation_0-logloss:0.54557	validation_1-logloss:0.33358
[1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[0]	validation_0-logloss:0.65451	validation_1-logloss:0.63459
[1]	validation_0-logloss:0.62471	validation_1-logloss:0.58672
[2]	validation_0-logloss:0.60039	validation_1-logloss:0.54599
[3]	validation_0-logloss:0.58223	validation_1-logloss:0.51213
[4]	validation_0-logloss:0.56853	validation_1-logloss:0.48348
[5]	validation_0-logloss:0.55863	validation_1-logloss:0.45917
[6]	validation_0-logloss:0.54898	validation_1-logloss:0.43625
[7]	validation_0-logloss:0.54351	validation_1-logloss:0.41740
[8]	validation_0-logloss:0.53763	validation_1-logloss:0.40115
[9]	validation_0-logloss:0.53354	validation_1-logloss:0.38582
[10]	validation_0-logloss:0.53135	validation_1-logloss:0.37323
[11]	validation_0-logloss:0.52727	validation_1-logloss:0.36026
[12]	validation_0-logloss:0.52402	validation_1-logloss:0.35004
[13]	validation_0-logloss:0.52154	validation_1-logloss:0.34036
[14]	validation_0-logloss:0.52148	validation_1-logloss:0.33237
[15]	validation_0-logloss:0.52021	validation_1-logloss:0.32390
[1

  precision = tp / (tp + fp)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[0]	validation_0-logloss:0.66163	validation_1-logloss:0.63793
[1]	validation_0-logloss:0.63558	validation_1-logloss:0.59159
[2]	validation_0-logloss:0.61633	validation_1-logloss:0.55303
[3]	validation_0-logloss:0.60165	validation_1-logloss:0.52005
[4]	validation_0-logloss:0.59132	validation_1-logloss:0.49218
[5]	validation_0-logloss:0.58030	validation_1-logloss:0.46809
[6]	validation_0-logloss:0.57248	validation_1-logloss:0.44655
[7]	validation_0-logloss:0.56679	validation_1-logloss:0.42829
[8]	validation_0-logloss:0.56268	validation_1-logloss:0.41137
[9]	validation_0-logloss:0.55953	validation_1-logloss:0.39692
[10]	validation_0-logloss:0.55573	validation_1-logloss:0.38423
[11]	validation_0-logloss:0.55337	validation_1-logloss:0.37327
[12]	validation_0-logloss:0.55439	validation_1-logloss:0.36227
[13]	validation_0-logloss:0.55383	validation_1-logloss:0.35296
[14]	validation_0-logloss:0.55301	validation_1-logloss:0.34254
[15]	validation_0-logloss:0.55312	validation_1-logloss:0.33547
[1