In [1]:
import pandas as pd
from sklearn.metrics import roc_auc_score, brier_score_loss
import os
import glob
import csv
import numpy as np
import re

In [2]:
def calculate_stability(group, col="predicted"):
    """Return the mean absolute change between consecutive values of ``col``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows in the order in which the consecutive differences should be
        taken. The frame is not modified.
    col : str, default "predicted"
        Name of the column holding the values to compare.

    Returns
    -------
    float
        Mean of ``|x[i+1] - x[i]|`` over all adjacent row pairs, or NaN when
        the frame has fewer than two rows (there is no pair to compare).
    """
    # Series.diff() yields the same adjacent differences as
    # shift(-1) - series (only the position of the single NaN differs), but
    # does not require writing a scratch "diff" column into the caller's frame.
    return group[col].diff().abs().mean(skipna=True)

In [3]:
def extract_event_nr(s):
    """Parse the trailing ``_<1-2 digits>`` suffix of ``s`` as an integer.

    Returns 1 when ``s`` does not end in an underscore followed by exactly
    one or two digits.
    """
    suffix = re.match(r'.*_(\d{1,2})$', s)
    return int(suffix.group(1)) if suffix else 1

In [4]:
def extract_case_id(s):
    """Strip a trailing ``_<1-2 digits>`` suffix from ``s``.

    Returns ``s`` unchanged when it does not end in an underscore followed by
    exactly one or two digits.
    """
    parsed = re.match(r'(.*)_\d{1,2}$', s)
    if parsed is None:
        return s
    return parsed.group(1)

# Original

In [5]:
# Directory that receives the per-file AUC/stability CSVs written by the next cell.
results_dir = "results_stability"

In [6]:
# For every detailed test-result file, write one CSV holding the AUC per
# prefix length (nr_events) plus a single overall stability score
# (reported with nr_events = -1).

# open(..., 'w') below raises FileNotFoundError if the directory is missing.
os.makedirs(results_dir, exist_ok=True)

for filename in glob.glob("results_detailed/*test*"):
    print(filename)
    dt_results = pd.read_csv(filename, sep=";")
    dt_results.case_id = dt_results.case_id.astype(str)

    if "lstm" not in filename:
        if "single" in filename:
            # nr_events is taken from the numeric suffix of case_id.
            dt_results["nr_events"] = dt_results.case_id.apply(extract_event_nr)
        dt_results["case_id"] = dt_results.case_id.apply(extract_case_id)

    dataset_name = dt_results.dataset.iloc[0]
    if "params" in dt_results.columns:
        method_name = dt_results.params.iloc[0]
    else:
        method_name = dt_results.method.iloc[0]
    cls_method = dt_results.cls.iloc[0]

    out_path = os.path.join(results_dir, "results_auc_stability_%s_%s_%s.csv" % (dataset_name, method_name, cls_method))
    # newline='' is what the csv module expects for files it writes; without
    # it, platforms that translate "\n" emit an extra "\r" per row.
    with open(out_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=';', quoting=csv.QUOTE_NONE)
        writer.writerow(["dataset", "method", "cls", "nr_events", "metric", "score"])

        for nr_events, group in dt_results.groupby("nr_events"):
            # AUC is undefined when only one class is present in the group.
            auc = np.nan if len(set(group.actual)) < 2 else roc_auc_score(group.actual, group.predicted)
            writer.writerow([dataset_name, method_name, cls_method, nr_events, "auc", auc])

        # calculate_stability compares each row with the next one, so rows must
        # be ordered by prefix length within a case; do not rely on file order.
        ordered_results = dt_results.sort_values(["case_id", "nr_events"])
        stab_by_case = ordered_results.groupby("case_id").apply(calculate_stability)
        writer.writerow([dataset_name, method_name, cls_method, -1, "stability", 1 - stab_by_case.mean()])

results_detailed/detailed_results_test_xgboost_crm2_single_index.csv
results_detailed/detailed_results_test_xgboost_calibrated_crm2_single_index.csv


# Smoothing

In [7]:
import pickle

# Load the pickled mapping of test-case counts and reshape it into a long
# table with one row per (nr_events, dataset) pair.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# with a trusted n_test_cases.pickle.
with open("n_test_cases.pickle", "rb") as pickle_file:
    test_cases_dict = pickle.load(pickle_file)

df_test_cases = (
    pd.DataFrame(test_cases_dict)
    .stack()
    .reset_index()
)
df_test_cases.columns = ["nr_events", "dataset", "n_cases"]

In [8]:
# Rebinds results_dir: the next cell writes its smoothed-prediction CSVs here.
results_dir = "results_stability_smoothed"

In [9]:
# For every detailed test-result file, smooth each case's predictions across
# successive prefix lengths with
#     smoothed[k] = beta * smoothed[k-1] + (1 - beta) * predicted[k]
# and write, per beta, the case-count-weighted average AUC and the stability.

# open(..., 'w') below raises FileNotFoundError if the directory is missing.
os.makedirs(results_dir, exist_ok=True)

# Smoothing weights; with beta = 0 the smoothed value equals the raw prediction.
betas = [0, 0.1, 0.25, 0.5, 0.75, 0.9]

for filename in glob.glob("results_detailed/*test*"):
    print(filename)

    dt_results = pd.read_csv(filename, sep=";")
    dt_results.case_id = dt_results.case_id.astype(str)

    if "lstm" not in filename:
        if "single" in filename:
            # nr_events is taken from the numeric suffix of case_id.
            dt_results["nr_events"] = dt_results.case_id.apply(extract_event_nr)
        dt_results["case_id"] = dt_results.case_id.apply(extract_case_id)

    dataset_name = dt_results.dataset.iloc[0]
    if "params" in dt_results.columns:
        method_name = dt_results.params.iloc[0]
        dt_results = dt_results.drop(["params"], axis=1)
    else:
        method_name = dt_results.method.iloc[0]
        dt_results = dt_results.drop(["method"], axis=1)
    cls_method = dt_results.cls.iloc[0]

    dt_results = dt_results.drop(["dataset", "cls"], axis=1)
    dt_results.nr_events = dt_results.nr_events.astype(int)

    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of dt_results (avoids SettingWithCopyWarning).
    smoothed_preds = dt_results[dt_results.nr_events == 1].copy()
    for beta in betas:
        # At prefix length 1 there is no history, so start from the raw prediction.
        smoothed_preds["smoothed_pred_%s" % beta] = smoothed_preds["predicted"]

    for nr_events in range(2, dt_results.nr_events.max() + 1):
        # Smoothed values from the previous prefix length, keyed by case.
        previous = smoothed_preds[smoothed_preds.nr_events == (nr_events - 1)].drop(["predicted", "nr_events"], axis=1)
        tmp_merged = pd.merge(dt_results[dt_results.nr_events == nr_events], previous, on=["case_id", "actual"])
        for beta in betas:
            smoothed_col = "smoothed_pred_%s" % beta
            tmp_merged[smoothed_col] = beta * tmp_merged[smoothed_col] + (1 - beta) * tmp_merged.predicted
        smoothed_preds = pd.concat([smoothed_preds, tmp_merged], axis=0)

    out_path = os.path.join(results_dir, "results_auc_stability_%s_%s_%s.csv" % (dataset_name, method_name, cls_method))
    # newline='' is what the csv module expects for files it writes; without
    # it, platforms that translate "\n" emit an extra "\r" per row.
    with open(out_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=';', quoting=csv.QUOTE_NONE)
        writer.writerow(["dataset", "method", "cls", "beta", "metric", "score"])

        for beta in betas:
            smoothed_col = "smoothed_pred_%s" % beta

            aucs = []
            for nr_events, group in smoothed_preds.groupby("nr_events"):
                # AUC is undefined when only one class is present in the group.
                auc = np.nan if len(set(group.actual)) < 2 else roc_auc_score(group.actual, group[smoothed_col])
                aucs.append((auc, dataset_name, nr_events))

            dt_aucs = pd.DataFrame(aucs, columns=["score", "dataset", "nr_events"])
            dt_aucs = pd.merge(dt_aucs, df_test_cases, on=["dataset", "nr_events"])
            # Undefined AUCs count as 0 in the weighted average. Assign the
            # result back: a chained fillna(..., inplace=True) on a column
            # selection is not guaranteed to modify dt_aucs.
            dt_aucs["score"] = dt_aucs["score"].fillna(0)
            auc = np.average(dt_aucs["score"], weights=dt_aucs["n_cases"])
            writer.writerow([dataset_name, method_name, cls_method, beta, "auc", auc])

            # Rows of each case are already in increasing nr_events order,
            # because smoothed_preds was built by appending one prefix length
            # at a time.
            stab_by_case = smoothed_preds.groupby("case_id").apply(calculate_stability, col=smoothed_col)
            writer.writerow([dataset_name, method_name, cls_method, beta, "stability", 1 - stab_by_case.mean()])


results_detailed/detailed_results_test_xgboost_crm2_single_index.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


results_detailed/detailed_results_test_xgboost_calibrated_crm2_single_index.csv


# Brier scores

In [10]:
# Rebinds results_dir: the next cell writes its Brier-score CSVs here.
results_dir = "results_stability_brier"

In [13]:
# For every detailed test-result file, write one CSV holding the Brier score
# per prefix length (nr_events) plus the Brier score over all rows
# (reported with nr_events = -1).

# open(..., 'w') below raises FileNotFoundError if the directory is missing.
os.makedirs(results_dir, exist_ok=True)


def _suffix_as_event_nr(suffix):
    """Return ``suffix`` as an int when it is all digits, else unchanged."""
    return int(suffix) if suffix.isdigit() else suffix


for filename in glob.glob("results_detailed/*test*"):
    print(filename)
    dt_results = pd.read_csv(filename, sep=";")
    dt_results.case_id = dt_results.case_id.astype(str)

    if "lstm" not in filename:
        if "sepsis" in filename:
            # Remove the underscore from this placeholder id so it does not
            # count as an extra "_"-separated part below.
            dt_results.case_id = dt_results.case_id.str.replace("missing_caseid", "missing")
        # Largest number of "_"-separated parts in any case_id. Ids with fewer
        # parts are treated as carrying no event-number suffix (nr_events = 1).
        max_parts_in_caseid = dt_results.case_id.apply(lambda x: len(x.split("_"))).max()
        if "single" in filename:
            # Convert the suffix to int so nr_events is not a mixed int/str
            # column (which would order groups lexicographically: "10" < "2").
            dt_results["nr_events"] = dt_results.case_id.apply(
                lambda x: 1 if len(x.split("_")) < max_parts_in_caseid
                else _suffix_as_event_nr(x.rsplit("_", 1)[-1]))
        # Strip only the last part; split("_")[0] would also discard every
        # other underscore-separated part of the original case id.
        dt_results["case_id"] = dt_results.case_id.apply(
            lambda x: x if len(x.split("_")) < max_parts_in_caseid else x.rsplit("_", 1)[0])

    dataset_name = dt_results.dataset.iloc[0]
    if "params" in dt_results.columns:
        method_name = dt_results.params.iloc[0]
    else:
        method_name = dt_results.method.iloc[0]
    cls_method = dt_results.cls.iloc[0]

    out_path = os.path.join(results_dir, "results_auc_stability_%s_%s_%s.csv" % (dataset_name, method_name, cls_method))
    # newline='' is what the csv module expects for files it writes; without
    # it, platforms that translate "\n" emit an extra "\r" per row.
    with open(out_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=';', quoting=csv.QUOTE_NONE)
        writer.writerow(["dataset", "method", "cls", "nr_events", "metric", "score"])

        for nr_events, group in dt_results.groupby("nr_events"):
            brier = brier_score_loss(group.actual, group.predicted)
            writer.writerow([dataset_name, method_name, cls_method, nr_events, "brier", brier])

        # Overall score across all prefix lengths.
        brier = brier_score_loss(dt_results.actual, dt_results.predicted)
        writer.writerow([dataset_name, method_name, cls_method, -1, "brier", brier])

results_detailed/detailed_results_test_xgboost_bpic2017_cancelled_single_index.csv
results_detailed/detailed_results_test_xgboost_calibrated_bpic2017_cancelled_single_index.csv
