In [24]:
import pandas as pd
import random
import sklearn

from pathlib import Path
import xgboost as xgb
from xgboost.callback import EarlyStopping
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit

from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTENC


In [25]:
CATEGORICAL_FEATURE_NAMES = [
    "gender",
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "residence_type",
    "smoking_status",
]

NUMERIC_FEATURE_NAMES = ["age", "avg_glucose_level", "bmi"]


def encode_int(data: pd.DataFrame, categorical_features: list[str]):
    return pd.get_dummies(data, columns=categorical_features, drop_first=True)


def split_label(data: pd.DataFrame):
    x = data.copy().drop("stroke", axis=1)
    y = data["stroke"]  # labels

    return x, y


def resample(data: pd.DataFrame, seed: int, categorical_features: list[str]):
    """oversample positive cases with SMOTE and undersample negative with EEN"""
    # encode categorical features first
    enc = OrdinalEncoder()
    data[categorical_features] = enc.fit_transform(data[categorical_features])

    X = data.drop(columns=["stroke"], axis=1)
    Y = data["stroke"]

    cat_features_indices = [
        data.columns.get_loc(label) for label in categorical_features
    ]

    smote_nc = SMOTENC(categorical_features=cat_features_indices, random_state=seed)
    smote_een = SMOTEENN(smote=smote_nc, random_state=seed, sampling_strategy="auto")

    x_resampled, y_resampled = smote_een.fit_resample(X, Y)
    x_resampled["stroke"] = y_resampled

    x_resampled[categorical_features] = enc.inverse_transform(
        x_resampled[categorical_features]
    )

    return pd.DataFrame(x_resampled, columns=data.columns)


def scale(df):
    X_num = df[NUMERIC_FEATURE_NAMES]
    X_cat = df[CATEGORICAL_FEATURE_NAMES]

    scaler = StandardScaler()
    scaler.fit(X_num)

    X_scaled = scaler.transform(X_num)
    X_scaled = pd.DataFrame(X_scaled, index=X_num.index, columns=X_num.columns)

    df_scaled = pd.concat([X_scaled, X_cat, df["stroke"]], axis=1)

    return df_scaled


def split_train_valid_test(data_df, seed: int, resample_training: bool):
    data_df = data_df.sample(frac=1, random_state=seed)

    test_set = data_df[round(len(data_df) * 0.85) :]
    train_validation_data = data_df[: round(len(data_df) * 0.85)].sample(
        frac=1, random_state=seed
    )

    train_set = train_validation_data[: round(len(data_df) * 0.70)]
    validation_set = train_validation_data[round(len(data_df) * 0.70) :]

    if resample_training:
        train_set = resample(
            train_set, seed, categorical_features=CATEGORICAL_FEATURE_NAMES
        )

    return train_set, validation_set, test_set

def split_train_valid_test_stratified(data_df, seed: int, resample_training: bool):
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=seed)
    d_x, d_y = split_label(data_df)

    train_index, valid_test_index = list(sss.split(d_x, d_y))[0]

    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=seed)

    v_t_x, v_t_y = split_label(data_df.iloc[valid_test_index])

    validation_index, test_index = list(sss.split(v_t_x, v_t_y))[0]

    train_df = data_df.iloc[train_index.tolist()]
    validation_df = data_df.iloc[validation_index.tolist()]
    test_df = data_df.iloc[test_index.tolist()]

    if resample_training:
        train_set = resample(
            train_df, seed, categorical_features=CATEGORICAL_FEATURE_NAMES
        )

    return train_df, validation_df, test_df


def one_hot_encode(
    data_df: pd.DataFrame,
    train_df: pd.DataFrame,
    validation_df: pd.DataFrame,
    test_df: pd.DataFrame,
):
    # ensure train, validation, and test all have the same columns after ohe

    ohe_columns = encode_int(data_df, CATEGORICAL_FEATURE_NAMES).columns

    train_df_e, validation_df_e, test_df_e = [
        pd.DataFrame(
            encode_int(data=df, categorical_features=CATEGORICAL_FEATURE_NAMES),
            columns=ohe_columns,
        ).fillna(0)
        for df in [train_df, validation_df, test_df]
    ]

    return train_df_e, validation_df_e, test_df_e


def prepare_data(seed: int, resample_training: bool):
    data_df = pd.read_csv(Path().resolve().joinpath("dataset/full_data_clean.csv"))

    train_df, validation_df, test_df = split_train_valid_test_stratified(
        data_df, seed, resample_training
    )

    train_df, validation_df, test_df = [
        scale(df) for df in [train_df, validation_df, test_df]
    ]

    return one_hot_encode(data_df, train_df, validation_df, test_df)


In [26]:
RESULT_COLS = [
    "precision",
    "recall",
    "fscore",
    "accuracy",
    "auc",
    "miss_rate",
    "fall_out_rate",
]
NUM_EXPERIMENTS = 10
NUM_ROUNDS = 20


def metrics(y_pred, p_pred, y_true):
    tn, fp, fn, tp = confusion_matrix(y_pred=y_pred, y_true=y_true).ravel()

    # metrics
    auc = roc_auc_score(y_true=y_true, y_score=p_pred)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    fscore = 2 * tp / (2 * tp + fp + fn)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    miss_rate = fn / (tn + tp)
    fall_out_rate = fp / (fp + tn)

    # return
    return [precision, recall, fscore, accuracy, auc, miss_rate, fall_out_rate]


In [27]:
def run_experiment(seed, experiment, train_data, validation_data, test_data):
    # split labels
    train_validation = pd.concat([train_data, validation_data]).sample(
        frac=1, random_state=seed
    )

    x, y = split_label(train_validation)
    x_test, y_test = split_label(test_data)

    # metrics
    param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}

    es = EarlyStopping(
        rounds=3,
        save_best=True,
        maximize=True,
        data_name="validation_0",
        metric_name="auc",
    )

    clf = xgb.sklearn.XGBClassifier(
        learning_rate=0.1,
        objective=param["objective"],
        gpu_id=0,
        predictor="gpu_predictor",
    )

    clf.fit(x, y, eval_set=[(x_test, y_test), (x, y)])

    y_pred = clf.predict(x_test)
    p_pred = clf.predict_proba(x_test)[:, 1]

    return y_pred, p_pred, y_test


In [28]:
# evaluate xgboost

results = {}
results_resample = {}

for experiment in range(NUM_EXPERIMENTS):
    seed = random.randint(0, 1000)

    train_data, validation_data, test_data = prepare_data(
        seed=experiment, resample_training=False
    )

    y_pred, p_pred, y_test = run_experiment(
        seed, experiment, train_data, validation_data, test_data
    )

    results[experiment] = list(metrics(y_pred, p_pred, y_test))

    train_data, validation_data, test_data = prepare_data(
        seed=experiment, resample_training=True
    )

    y_pred, p_pred, y_test = run_experiment(
        seed, experiment, train_data, validation_data, test_data
    )

    results_resample[experiment] = list(metrics(y_pred, p_pred, y_test))



[0]	validation_0-logloss:0.60456	validation_1-logloss:0.60441
[1]	validation_0-logloss:0.53207	validation_1-logloss:0.53173
[2]	validation_0-logloss:0.47171	validation_1-logloss:0.47119
[3]	validation_0-logloss:0.42083	validation_1-logloss:0.42013
[4]	validation_0-logloss:0.37750	validation_1-logloss:0.37662
[5]	validation_0-logloss:0.34031	validation_1-logloss:0.33926
[6]	validation_0-logloss:0.30823	validation_1-logloss:0.30695
[7]	validation_0-logloss:0.28032	validation_1-logloss:0.27888
[8]	validation_0-logloss:0.25599	validation_1-logloss:0.25435
[9]	validation_0-logloss:0.23467	validation_1-logloss:0.23292
[10]	validation_0-logloss:0.21599	validation_1-logloss:0.21406
[11]	validation_0-logloss:0.19955	validation_1-logloss:0.19748
[12]	validation_0-logloss:0.18506	validation_1-logloss:0.18281
[13]	validation_0-logloss:0.17232	validation_1-logloss:0.16980
[14]	validation_0-logloss:0.16107	validation_1-logloss:0.15830
[15]	validation_0-logloss:0.15109	validation_1-logloss:0.14806
[1

  precision = tp / (tp + fp)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[0]	validation_0-logloss:0.60472	validation_1-logloss:0.60442
[1]	validation_0-logloss:0.53234	validation_1-logloss:0.53177
[2]	validation_0-logloss:0.47209	validation_1-logloss:0.47128
[3]	validation_0-logloss:0.42128	validation_1-logloss:0.42024
[4]	validation_0-logloss:0.37803	validation_1-logloss:0.37677
[5]	validation_0-logloss:0.34090	validation_1-logloss:0.33942
[6]	validation_0-logloss:0.30881	validation_1-logloss:0.30718
[7]	validation_0-logloss:0.28102	validation_1-logloss:0.27914
[8]	validation_0-logloss:0.25669	validation_1-logloss:0.25470
[9]	validation_0-logloss:0.23549	validation_1-logloss:0.23329
[10]	validation_0-logloss:0.21692	validation_1-logloss:0.21449
[11]	validation_0-logloss:0.20062	validation_1-logloss:0.19795
[12]	validation_0-logloss:0.18611	validation_1-logloss:0.18336
[13]	validation_0-logloss:0.17330	validation_1-logloss:0.17044
[14]	validation_0-logloss:0.16199	validation_1-logloss:0.15902
[15]	validation_0-logloss:0.15199	validation_1-logloss:0.14890
[1

  precision = tp / (tp + fp)


[0]	validation_0-logloss:0.60415	validation_1-logloss:0.60467
[1]	validation_0-logloss:0.53122	validation_1-logloss:0.53222
[2]	validation_0-logloss:0.47044	validation_1-logloss:0.47188
[3]	validation_0-logloss:0.41919	validation_1-logloss:0.42097
[4]	validation_0-logloss:0.37549	validation_1-logloss:0.37762
[5]	validation_0-logloss:0.33796	validation_1-logloss:0.34037
[6]	validation_0-logloss:0.30550	validation_1-logloss:0.30816
[7]	validation_0-logloss:0.27727	validation_1-logloss:0.28019
[8]	validation_0-logloss:0.25261	validation_1-logloss:0.25579
[9]	validation_0-logloss:0.23102	validation_1-logloss:0.23441
[10]	validation_0-logloss:0.21205	validation_1-logloss:0.21562
[11]	validation_0-logloss:0.19534	validation_1-logloss:0.19907
[12]	validation_0-logloss:0.18057	validation_1-logloss:0.18445
[13]	validation_0-logloss:0.16751	validation_1-logloss:0.17149
[14]	validation_0-logloss:0.15598	validation_1-logloss:0.16005
[15]	validation_0-logloss:0.14578	validation_1-logloss:0.14986
[1

  precision = tp / (tp + fp)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[0]	validation_0-logloss:0.60440	validation_1-logloss:0.60465
[1]	validation_0-logloss:0.53171	validation_1-logloss:0.53220
[2]	validation_0-logloss:0.47109	validation_1-logloss:0.47189
[3]	validation_0-logloss:0.42003	validation_1-logloss:0.42101
[4]	validation_0-logloss:0.37661	validation_1-logloss:0.37766
[5]	validation_0-logloss:0.33938	validation_1-logloss:0.34046
[6]	validation_0-logloss:0.30720	validation_1-logloss:0.30828
[7]	validation_0-logloss:0.27913	validation_1-logloss:0.28033
[8]	validation_0-logloss:0.25462	validation_1-logloss:0.25597
[9]	validation_0-logloss:0.23313	validation_1-logloss:0.23464
[10]	validation_0-logloss:0.21433	validation_1-logloss:0.21592
[11]	validation_0-logloss:0.19767	validation_1-logloss:0.19946
[12]	validation_0-logloss:0.18293	validation_1-logloss:0.18488
[13]	validation_0-logloss:0.16996	validation_1-logloss:0.17199
[14]	validation_0-logloss:0.15840	validation_1-logloss:0.16061
[15]	validation_0-logloss:0.14820	validation_1-logloss:0.15050
[1

  precision = tp / (tp + fp)


[0]	validation_0-logloss:0.60458	validation_1-logloss:0.60448
[1]	validation_0-logloss:0.53206	validation_1-logloss:0.53189
[2]	validation_0-logloss:0.47172	validation_1-logloss:0.47144
[3]	validation_0-logloss:0.42082	validation_1-logloss:0.42042
[4]	validation_0-logloss:0.37748	validation_1-logloss:0.37692
[5]	validation_0-logloss:0.34027	validation_1-logloss:0.33956
[6]	validation_0-logloss:0.30811	validation_1-logloss:0.30728
[7]	validation_0-logloss:0.28022	validation_1-logloss:0.27924
[8]	validation_0-logloss:0.25588	validation_1-logloss:0.25475
[9]	validation_0-logloss:0.23451	validation_1-logloss:0.23329
[10]	validation_0-logloss:0.21579	validation_1-logloss:0.21444
[11]	validation_0-logloss:0.19928	validation_1-logloss:0.19783
[12]	validation_0-logloss:0.18478	validation_1-logloss:0.18318
[13]	validation_0-logloss:0.17193	validation_1-logloss:0.17023
[14]	validation_0-logloss:0.16054	validation_1-logloss:0.15876
[15]	validation_0-logloss:0.15044	validation_1-logloss:0.14857
[1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[0]	validation_0-logloss:0.60481	validation_1-logloss:0.60449
[1]	validation_0-logloss:0.53239	validation_1-logloss:0.53190
[2]	validation_0-logloss:0.47207	validation_1-logloss:0.47146
[3]	validation_0-logloss:0.42114	validation_1-logloss:0.42046
[4]	validation_0-logloss:0.37793	validation_1-logloss:0.37702
[5]	validation_0-logloss:0.34085	validation_1-logloss:0.33969
[6]	validation_0-logloss:0.30869	validation_1-logloss:0.30744
[7]	validation_0-logloss:0.28070	validation_1-logloss:0.27940
[8]	validation_0-logloss:0.25639	validation_1-logloss:0.25498
[9]	validation_0-logloss:0.23504	validation_1-logloss:0.23357
[10]	validation_0-logloss:0.21635	validation_1-logloss:0.21480
[11]	validation_0-logloss:0.19992	validation_1-logloss:0.19825
[12]	validation_0-logloss:0.18536	validation_1-logloss:0.18366
[13]	validation_0-logloss:0.17250	validation_1-logloss:0.17071
[14]	validation_0-logloss:0.16123	validation_1-logloss:0.15928
[15]	validation_0-logloss:0.15116	validation_1-logloss:0.14914
[1

  precision = tp / (tp + fp)


[0]	validation_0-logloss:0.60447	validation_1-logloss:0.60448
[1]	validation_0-logloss:0.53188	validation_1-logloss:0.53187
[2]	validation_0-logloss:0.47147	validation_1-logloss:0.47138
[3]	validation_0-logloss:0.42046	validation_1-logloss:0.42032
[4]	validation_0-logloss:0.37704	validation_1-logloss:0.37679
[5]	validation_0-logloss:0.33974	validation_1-logloss:0.33944
[6]	validation_0-logloss:0.30758	validation_1-logloss:0.30713
[7]	validation_0-logloss:0.27959	validation_1-logloss:0.27906
[8]	validation_0-logloss:0.25518	validation_1-logloss:0.25456
[9]	validation_0-logloss:0.23385	validation_1-logloss:0.23309
[10]	validation_0-logloss:0.21513	validation_1-logloss:0.21425
[11]	validation_0-logloss:0.19863	validation_1-logloss:0.19766
[12]	validation_0-logloss:0.18405	validation_1-logloss:0.18301
[13]	validation_0-logloss:0.17123	validation_1-logloss:0.17003
[14]	validation_0-logloss:0.15989	validation_1-logloss:0.15853
[15]	validation_0-logloss:0.14980	validation_1-logloss:0.14833
[1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[0]	validation_0-logloss:0.60451	validation_1-logloss:0.60454
[1]	validation_0-logloss:0.53216	validation_1-logloss:0.53198
[2]	validation_0-logloss:0.47195	validation_1-logloss:0.47153
[3]	validation_0-logloss:0.42093	validation_1-logloss:0.42058
[4]	validation_0-logloss:0.37762	validation_1-logloss:0.37711
[5]	validation_0-logloss:0.34047	validation_1-logloss:0.33982
[6]	validation_0-logloss:0.30848	validation_1-logloss:0.30756
[7]	validation_0-logloss:0.28049	validation_1-logloss:0.27955
[8]	validation_0-logloss:0.25635	validation_1-logloss:0.25513
[9]	validation_0-logloss:0.23488	validation_1-logloss:0.23370
[10]	validation_0-logloss:0.21605	validation_1-logloss:0.21488
[11]	validation_0-logloss:0.19962	validation_1-logloss:0.19832
[12]	validation_0-logloss:0.18513	validation_1-logloss:0.18372
[13]	validation_0-logloss:0.17227	validation_1-logloss:0.17080
[14]	validation_0-logloss:0.16099	validation_1-logloss:0.15936
[15]	validation_0-logloss:0.15090	validation_1-logloss:0.14925
[1

  precision = tp / (tp + fp)


[0]	validation_0-logloss:0.60424	validation_1-logloss:0.60458
[1]	validation_0-logloss:0.53149	validation_1-logloss:0.53207
[2]	validation_0-logloss:0.47082	validation_1-logloss:0.47167
[3]	validation_0-logloss:0.41967	validation_1-logloss:0.42068
[4]	validation_0-logloss:0.37609	validation_1-logloss:0.37723
[5]	validation_0-logloss:0.33868	validation_1-logloss:0.33992
[6]	validation_0-logloss:0.30635	validation_1-logloss:0.30766
[7]	validation_0-logloss:0.27826	validation_1-logloss:0.27968
[8]	validation_0-logloss:0.25377	validation_1-logloss:0.25523
[9]	validation_0-logloss:0.23229	validation_1-logloss:0.23379
[10]	validation_0-logloss:0.21338	validation_1-logloss:0.21496
[11]	validation_0-logloss:0.19682	validation_1-logloss:0.19841
[12]	validation_0-logloss:0.18216	validation_1-logloss:0.18382
[13]	validation_0-logloss:0.16919	validation_1-logloss:0.17085
[14]	validation_0-logloss:0.15771	validation_1-logloss:0.15933
[15]	validation_0-logloss:0.14746	validation_1-logloss:0.14912
[1

  precision = tp / (tp + fp)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[0]	validation_0-logloss:0.60458	validation_1-logloss:0.60458
[1]	validation_0-logloss:0.53201	validation_1-logloss:0.53207
[2]	validation_0-logloss:0.47157	validation_1-logloss:0.47168
[3]	validation_0-logloss:0.42059	validation_1-logloss:0.42074
[4]	validation_0-logloss:0.37712	validation_1-logloss:0.37735
[5]	validation_0-logloss:0.33978	validation_1-logloss:0.34011
[6]	validation_0-logloss:0.30741	validation_1-logloss:0.30787
[7]	validation_0-logloss:0.27939	validation_1-logloss:0.27989
[8]	validation_0-logloss:0.25483	validation_1-logloss:0.25548
[9]	validation_0-logloss:0.23339	validation_1-logloss:0.23411
[10]	validation_0-logloss:0.21469	validation_1-logloss:0.21535
[11]	validation_0-logloss:0.19815	validation_1-logloss:0.19883
[12]	validation_0-logloss:0.18361	validation_1-logloss:0.18426
[13]	validation_0-logloss:0.17074	validation_1-logloss:0.17138
[14]	validation_0-logloss:0.15936	validation_1-logloss:0.16000
[15]	validation_0-logloss:0.14924	validation_1-logloss:0.14985
[1

  precision = tp / (tp + fp)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[0]	validation_0-logloss:0.60444	validation_1-logloss:0.60462
[1]	validation_0-logloss:0.53175	validation_1-logloss:0.53215
[2]	validation_0-logloss:0.47142	validation_1-logloss:0.47178
[3]	validation_0-logloss:0.42031	validation_1-logloss:0.42087
[4]	validation_0-logloss:0.37699	validation_1-logloss:0.37750
[5]	validation_0-logloss:0.33978	validation_1-logloss:0.34024
[6]	validation_0-logloss:0.30757	validation_1-logloss:0.30803
[7]	validation_0-logloss:0.27960	validation_1-logloss:0.28005
[8]	validation_0-logloss:0.25517	validation_1-logloss:0.25567
[9]	validation_0-logloss:0.23373	validation_1-logloss:0.23433
[10]	validation_0-logloss:0.21487	validation_1-logloss:0.21561
[11]	validation_0-logloss:0.19841	validation_1-logloss:0.19908
[12]	validation_0-logloss:0.18366	validation_1-logloss:0.18453
[13]	validation_0-logloss:0.17068	validation_1-logloss:0.17167
[14]	validation_0-logloss:0.15916	validation_1-logloss:0.16029
[15]	validation_0-logloss:0.14908	validation_1-logloss:0.15018
[1

  precision = tp / (tp + fp)


[0]	validation_0-logloss:0.60466	validation_1-logloss:0.60442
[1]	validation_0-logloss:0.53220	validation_1-logloss:0.53176
[2]	validation_0-logloss:0.47186	validation_1-logloss:0.47126
[3]	validation_0-logloss:0.42098	validation_1-logloss:0.42022
[4]	validation_0-logloss:0.37766	validation_1-logloss:0.37672
[5]	validation_0-logloss:0.34046	validation_1-logloss:0.33938
[6]	validation_0-logloss:0.30826	validation_1-logloss:0.30705
[7]	validation_0-logloss:0.28031	validation_1-logloss:0.27899
[8]	validation_0-logloss:0.25597	validation_1-logloss:0.25451
[9]	validation_0-logloss:0.23461	validation_1-logloss:0.23307
[10]	validation_0-logloss:0.21575	validation_1-logloss:0.21421
[11]	validation_0-logloss:0.19923	validation_1-logloss:0.19762
[12]	validation_0-logloss:0.18464	validation_1-logloss:0.18300
[13]	validation_0-logloss:0.17171	validation_1-logloss:0.17005
[14]	validation_0-logloss:0.16025	validation_1-logloss:0.15859
[15]	validation_0-logloss:0.15011	validation_1-logloss:0.14845
[1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[0]	validation_0-logloss:0.60472	validation_1-logloss:0.60447
[1]	validation_0-logloss:0.53237	validation_1-logloss:0.53185
[2]	validation_0-logloss:0.47207	validation_1-logloss:0.47140
[3]	validation_0-logloss:0.42128	validation_1-logloss:0.42038
[4]	validation_0-logloss:0.37800	validation_1-logloss:0.37695
[5]	validation_0-logloss:0.34084	validation_1-logloss:0.33967
[6]	validation_0-logloss:0.30880	validation_1-logloss:0.30742
[7]	validation_0-logloss:0.28095	validation_1-logloss:0.27944
[8]	validation_0-logloss:0.25662	validation_1-logloss:0.25499
[9]	validation_0-logloss:0.23544	validation_1-logloss:0.23359
[10]	validation_0-logloss:0.21676	validation_1-logloss:0.21479
[11]	validation_0-logloss:0.20040	validation_1-logloss:0.19827
[12]	validation_0-logloss:0.18594	validation_1-logloss:0.18367
[13]	validation_0-logloss:0.17320	validation_1-logloss:0.17077
[14]	validation_0-logloss:0.16185	validation_1-logloss:0.15931
[15]	validation_0-logloss:0.15189	validation_1-logloss:0.14921
[1

  precision = tp / (tp + fp)


[0]	validation_0-logloss:0.60457	validation_1-logloss:0.60439
[1]	validation_0-logloss:0.53210	validation_1-logloss:0.53175
[2]	validation_0-logloss:0.47176	validation_1-logloss:0.47125
[3]	validation_0-logloss:0.42087	validation_1-logloss:0.42020
[4]	validation_0-logloss:0.37754	validation_1-logloss:0.37671
[5]	validation_0-logloss:0.34036	validation_1-logloss:0.33937
[6]	validation_0-logloss:0.30824	validation_1-logloss:0.30706
[7]	validation_0-logloss:0.28035	validation_1-logloss:0.27899
[8]	validation_0-logloss:0.25598	validation_1-logloss:0.25448
[9]	validation_0-logloss:0.23468	validation_1-logloss:0.23302
[10]	validation_0-logloss:0.21596	validation_1-logloss:0.21415
[11]	validation_0-logloss:0.19944	validation_1-logloss:0.19751
[12]	validation_0-logloss:0.18497	validation_1-logloss:0.18284
[13]	validation_0-logloss:0.17218	validation_1-logloss:0.16986
[14]	validation_0-logloss:0.16088	validation_1-logloss:0.15837
[15]	validation_0-logloss:0.15081	validation_1-logloss:0.14816
[1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[0]	validation_0-logloss:0.60474	validation_1-logloss:0.60444
[1]	validation_0-logloss:0.53248	validation_1-logloss:0.53183
[2]	validation_0-logloss:0.47223	validation_1-logloss:0.47133
[3]	validation_0-logloss:0.42143	validation_1-logloss:0.42028
[4]	validation_0-logloss:0.37812	validation_1-logloss:0.37681
[5]	validation_0-logloss:0.34095	validation_1-logloss:0.33948
[6]	validation_0-logloss:0.30891	validation_1-logloss:0.30720
[7]	validation_0-logloss:0.28100	validation_1-logloss:0.27916
[8]	validation_0-logloss:0.25672	validation_1-logloss:0.25471
[9]	validation_0-logloss:0.23547	validation_1-logloss:0.23333
[10]	validation_0-logloss:0.21679	validation_1-logloss:0.21451
[11]	validation_0-logloss:0.20042	validation_1-logloss:0.19795
[12]	validation_0-logloss:0.18593	validation_1-logloss:0.18331
[13]	validation_0-logloss:0.17314	validation_1-logloss:0.17038
[14]	validation_0-logloss:0.16186	validation_1-logloss:0.15895
[15]	validation_0-logloss:0.15193	validation_1-logloss:0.14880
[1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[0]	validation_0-logloss:0.60438	validation_1-logloss:0.60462
[1]	validation_0-logloss:0.53168	validation_1-logloss:0.53214
[2]	validation_0-logloss:0.47112	validation_1-logloss:0.47178
[3]	validation_0-logloss:0.41989	validation_1-logloss:0.42088
[4]	validation_0-logloss:0.37644	validation_1-logloss:0.37749
[5]	validation_0-logloss:0.33899	validation_1-logloss:0.34025
[6]	validation_0-logloss:0.30657	validation_1-logloss:0.30804
[7]	validation_0-logloss:0.27844	validation_1-logloss:0.28009
[8]	validation_0-logloss:0.25386	validation_1-logloss:0.25571
[9]	validation_0-logloss:0.23238	validation_1-logloss:0.23437
[10]	validation_0-logloss:0.21351	validation_1-logloss:0.21565
[11]	validation_0-logloss:0.19688	validation_1-logloss:0.19910
[12]	validation_0-logloss:0.18220	validation_1-logloss:0.18456
[13]	validation_0-logloss:0.16923	validation_1-logloss:0.17168
[14]	validation_0-logloss:0.15763	validation_1-logloss:0.16026
[15]	validation_0-logloss:0.14740	validation_1-logloss:0.15014
[1

  precision = tp / (tp + fp)


[0]	validation_0-logloss:0.60446	validation_1-logloss:0.60441
[1]	validation_0-logloss:0.53195	validation_1-logloss:0.53176
[2]	validation_0-logloss:0.47154	validation_1-logloss:0.47127
[3]	validation_0-logloss:0.42061	validation_1-logloss:0.42026
[4]	validation_0-logloss:0.37722	validation_1-logloss:0.37676
[5]	validation_0-logloss:0.34000	validation_1-logloss:0.33940
[6]	validation_0-logloss:0.30776	validation_1-logloss:0.30712
[7]	validation_0-logloss:0.27983	validation_1-logloss:0.27906
[8]	validation_0-logloss:0.25545	validation_1-logloss:0.25457
[9]	validation_0-logloss:0.23408	validation_1-logloss:0.23311
[10]	validation_0-logloss:0.21532	validation_1-logloss:0.21427
[11]	validation_0-logloss:0.19885	validation_1-logloss:0.19768
[12]	validation_0-logloss:0.18430	validation_1-logloss:0.18298
[13]	validation_0-logloss:0.17142	validation_1-logloss:0.16998
[14]	validation_0-logloss:0.16002	validation_1-logloss:0.15844
[15]	validation_0-logloss:0.14991	validation_1-logloss:0.14823
[1

  precision = tp / (tp + fp)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[categorical_features] = enc.fit_transform(data[categorical_features])


[0]	validation_0-logloss:0.60457	validation_1-logloss:0.60449
[1]	validation_0-logloss:0.53208	validation_1-logloss:0.53190
[2]	validation_0-logloss:0.47205	validation_1-logloss:0.47144
[3]	validation_0-logloss:0.42112	validation_1-logloss:0.42045
[4]	validation_0-logloss:0.37776	validation_1-logloss:0.37701
[5]	validation_0-logloss:0.34055	validation_1-logloss:0.33972
[6]	validation_0-logloss:0.30861	validation_1-logloss:0.30748
[7]	validation_0-logloss:0.28069	validation_1-logloss:0.27949
[8]	validation_0-logloss:0.25632	validation_1-logloss:0.25505
[9]	validation_0-logloss:0.23501	validation_1-logloss:0.23365
[10]	validation_0-logloss:0.21633	validation_1-logloss:0.21487
[11]	validation_0-logloss:0.19996	validation_1-logloss:0.19834
[12]	validation_0-logloss:0.18550	validation_1-logloss:0.18373
[13]	validation_0-logloss:0.17272	validation_1-logloss:0.17081
[14]	validation_0-logloss:0.16141	validation_1-logloss:0.15933
[15]	validation_0-logloss:0.15141	validation_1-logloss:0.14908
[1

  precision = tp / (tp + fp)


In [29]:
results = (
    pd.DataFrame()
    .from_dict(results, orient="index", columns=RESULT_COLS)
    .assign(classifier="XGBoost")
)
results_resample = (
    pd.DataFrame()
    .from_dict(results_resample, orient="index", columns=RESULT_COLS)
    .assign(classifier="XGBoost")
)


In [30]:
results.to_csv(Path().resolve().joinpath("results/XGBoost_eval.csv"))
results_resample.to_csv(Path().resolve().joinpath("results/XGBoost_eval_resampled.csv"))
