In [53]:
import pandas as pd
import random
import sklearn

from pathlib import Path
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit

from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTENC


In [54]:
CATEGORICAL_FEATURE_NAMES = [
    "gender",
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "residence_type",
    "smoking_status",
]

NUMERIC_FEATURE_NAMES = ["age", "avg_glucose_level", "bmi"]


def encode_int(data: pd.DataFrame, categorical_features: list[str]):
    return pd.get_dummies(data, columns=categorical_features, drop_first=True)


def split_label(data: pd.DataFrame):
    x = data.copy().drop("stroke", axis=1)
    y = data["stroke"]  # labels

    return x, y


def resample(data: pd.DataFrame, seed: int, categorical_features: list[str]):
    """oversample positive cases with SMOTE and undersample negative with EEN"""
    # encode categorical features first
    enc = OrdinalEncoder()
    data[categorical_features] = enc.fit_transform(data[categorical_features])

    X = data.drop(columns=["stroke"], axis=1)
    Y = data["stroke"]

    cat_features_indices = [
        data.columns.get_loc(label) for label in categorical_features
    ]

    smote_nc = SMOTENC(categorical_features=cat_features_indices, random_state=seed)
    smote_een = SMOTEENN(smote=smote_nc, random_state=seed, sampling_strategy="auto")

    x_resampled, y_resampled = smote_een.fit_resample(X, Y)
    x_resampled["stroke"] = y_resampled

    x_resampled[categorical_features] = enc.inverse_transform(
        x_resampled[categorical_features]
    )

    return pd.DataFrame(x_resampled, columns=data.columns)


def scale(df):
    X_num = df[NUMERIC_FEATURE_NAMES]
    X_cat = df[CATEGORICAL_FEATURE_NAMES]

    scaler = StandardScaler()
    scaler.fit(X_num)

    X_scaled = scaler.transform(X_num)
    X_scaled = pd.DataFrame(X_scaled, index=X_num.index, columns=X_num.columns)

    df_scaled = pd.concat([X_scaled, X_cat, df["stroke"]], axis=1)

    return df_scaled


def split_train_valid_test_stratified(data_df, seed: int, resample_training: bool):
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=seed)
    d_x, d_y = split_label(data_df)

    train_index, valid_test_index = list(sss.split(d_x, d_y))[0]

    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=seed)

    v_t_x, v_t_y = split_label(data_df.iloc[valid_test_index])

    validation_index, test_index = list(sss.split(v_t_x, v_t_y))[0]

    train_df = data_df.iloc[train_index.tolist()]
    validation_df = data_df.iloc[validation_index.tolist()]
    test_df = data_df.iloc[test_index.tolist()]

    if resample_training:
        train_set = resample(
            train_df, seed, categorical_features=CATEGORICAL_FEATURE_NAMES
        )

    return train_df, validation_df, test_df


def split_train_valid_test(data_df, seed: int, resample_training: bool):
    data_df = data_df.sample(frac=1, random_state=seed)

    test_set = data_df[round(len(data_df) * 0.85) :]
    train_validation_data = data_df[: round(len(data_df) * 0.85)].sample(
        frac=1, random_state=seed
    )

    train_set = train_validation_data[: round(len(data_df) * 0.70)]
    validation_set = train_validation_data[round(len(data_df) * 0.70) :]

    if resample_training:
        train_set = resample(
            train_set, seed, categorical_features=CATEGORICAL_FEATURE_NAMES
        )

    return train_set, validation_set, test_set


def one_hot_encode(
    data_df: pd.DataFrame,
    train_df: pd.DataFrame,
    validation_df: pd.DataFrame,
    test_df: pd.DataFrame,
):
    # ensure train, validation, and test all have the same columns after ohe

    ohe_columns = encode_int(data_df, CATEGORICAL_FEATURE_NAMES).columns

    train_df_e, validation_df_e, test_df_e = [
        pd.DataFrame(
            encode_int(data=df, categorical_features=CATEGORICAL_FEATURE_NAMES),
            columns=ohe_columns,
        ).fillna(0)
        for df in [train_df, validation_df, test_df]
    ]

    return train_df_e, validation_df_e, test_df_e


def prepare_data(seed: int, resample_training: bool):
    data_df = pd.read_csv(Path().resolve().joinpath("dataset/full_data_clean.csv"))

    train_df, validation_df, test_df = split_train_valid_test_stratified(
        data_df, seed, resample_training
    )

    train_df, validation_df, test_df = [
        scale(df) for df in [train_df, validation_df, test_df]
    ]

    return one_hot_encode(data_df, train_df, validation_df, test_df)


In [55]:
RESULT_COLS = [
    "precision",
    "recall",
    "fscore",
    "accuracy",
    "auc",
    "miss_rate",
    "fall_out_rate",
]
NUM_EXPERIMENTS = 10


def metrics(y_pred, p_pred, y_true):
    tn, fp, fn, tp = confusion_matrix(y_pred=y_pred, y_true=y_true).ravel()

    # metrics
    auc = roc_auc_score(y_true=y_true, y_score=p_pred)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    fscore = 2 * tp / (2 * tp + fp + fn)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    miss_rate = fn / (tn + tp)
    fall_out_rate = fp / (fp + tn)

    # return
    return [precision, recall, fscore, accuracy, auc, miss_rate, fall_out_rate]


In [56]:
def run_experiment(
    train_data: pd.DataFrame,
    validation_data: pd.DataFrame,
    test_data: pd.DataFrame,
    seed: int,
):
    train_validation_data = pd.concat([train_data, validation_data]).sample(
        frac=1, random_state=seed
    )

    # split labels
    x, y = split_label(train_validation_data)
    x_test, y_test = split_label(test_data)

    # metrics
    clf = RandomForestClassifier()
    clf.fit(x, y)
    y_pred = clf.predict(x_test)
    p_pred = clf.predict_proba(x_test)[:, 1]

    return y_pred, p_pred, y_test


In [57]:
# evaluate RandomForest

results = {}
results_resample = {}

for experiment in range(NUM_EXPERIMENTS):
    seed = random.randint(0, 1000)

    train_data, validation_data, test_data = prepare_data(
        seed=seed, resample_training=False
    )

    y_pred, p_pred, y_test = run_experiment(
        train_data, validation_data, test_data, seed
    )

    results[experiment] = list(metrics(y_pred, p_pred, y_test))

    train_data, validation_data, test_data = prepare_data(
        seed=seed, resample_training=True
    )

    y_pred, p_pred, y_test = run_experiment(
        train_data, validation_data, test_data, seed
    )

    results_resample[experiment] = list(metrics(y_pred, p_pred, y_test))


results = (
    pd.DataFrame()
    .from_dict(results, orient="index", columns=RESULT_COLS)
    .assign(classifier="RandomForest")
)
results_resample = (
    pd.DataFrame()
    .from_dict(results_resample, orient="index", columns=RESULT_COLS)
    .assign(classifier="RandomForest")
)

results.to_csv(Path().resolve().joinpath("results/RandomForest_eval.csv"))
results_resample.to_csv(
    Path().resolve().joinpath("results/RandomForest_eval_resampled.csv")
)


[33539  1294 31167 ... 36997  5501  6884] [28511 39878  5908 ... 15999 14078 33378]
[33539  1294 31167 ... 36997  5501  6884] [28511 39878  5908 ... 15999 14078 33378]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[33896  8181 25616 ... 23924 18837 14779] [18787 33480 17385 ...   312  7865 34771]
[33896  8181 25616 ... 23924 18837 14779] [18787 33480 17385 ...   312  7865 34771]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])
  precision = tp / (tp + fp)


[ 4766 40449 33256 ...  2718 10070 15640] [ 5184 23196 15916 ... 32816  6643 32780]
[ 4766 40449 33256 ...  2718 10070 15640] [ 5184 23196 15916 ... 32816  6643 32780]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])
  precision = tp / (tp + fp)


[30161 15092  2279 ...  6898  3219 13593] [ 6230 28826  5416 ... 32426 40044  7421]
[30161 15092  2279 ...  6898  3219 13593] [ 6230 28826  5416 ... 32426 40044  7421]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[36287  3891 11565 ... 35838 22645 28039] [24229 34845 32285 ... 13143 35187  1507]
[36287  3891 11565 ... 35838 22645 28039] [24229 34845 32285 ... 13143 35187  1507]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])
  precision = tp / (tp + fp)


[15896 26352 10947 ... 43014 12648 43247] [21772  9919  1989 ... 19998 11752 26622]
[15896 26352 10947 ... 43014 12648 43247] [21772  9919  1989 ... 19998 11752 26622]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[ 1498  3997 33398 ... 18501    82 39511] [21407 11747  2033 ... 20516 17850  6239]
[ 1498  3997 33398 ... 18501    82 39511] [21407 11747  2033 ... 20516 17850  6239]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[31418 19492 19407 ... 38294  3179  1630] [37333 15711 17613 ... 34451 14554 22575]
[31418 19492 19407 ... 38294  3179  1630] [37333 15711 17613 ... 34451 14554 22575]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])
  precision = tp / (tp + fp)


[18696 22285 14074 ... 34089 38563 42659] [24788 34362 16766 ... 15832 29618  4805]
[18696 22285 14074 ... 34089 38563 42659] [24788 34362 16766 ... 15832 29618  4805]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[ 9373 19657 17437 ... 13558  7460 12399] [37856 35636 43120 ... 26031  1614 26293]
[ 9373 19657 17437 ... 13558  7460 12399] [37856 35636 43120 ... 26031  1614 26293]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])
  precision = tp / (tp + fp)
