# Predict

In [None]:
import csv
from pathlib import Path

import numpy as np
from PIL import Image
import tensorflow as tf

tfk = tf.keras
tfkl = tfk.layers


def preprocess_img(filename):
    """Load one image and prepare it exactly as was done for training.

    The image is opened with PIL, converted to RGB, resized to 380x380 with
    Lanczos resampling and rescaled from the 8-bit range to [-1, 1].

    Parameters
    ----------
    filename : path-like
        Image file to read.

    Returns
    -------
    numpy.ndarray
        ``float32`` array holding the resized RGB image.
    """
    target_size = (380, 380)
    rgb = Image.open(filename).convert('RGB')
    resized = rgb.resize(size=target_size, resample=Image.LANCZOS)
    # 8-bit pixel values 0..255 map onto -1..1.
    scaled = np.asarray(resized) / 127.5 - 1.0
    return scaled.astype(np.float32)

In [None]:
# Preprocess all images in testing set.
# glob() yields entries in filesystem order, which is not stable across
# machines; sorting keeps the rows of `x` (and the CSVs written from `files`)
# in a reproducible order.
files = sorted(Path("../TEST_blind/TEST(blind)").glob("*.png"))
if not files:
    # Fail here with a clear message instead of inside np.stack.
    raise FileNotFoundError("No .png files found in ../TEST_blind/TEST(blind)")
# One preprocessed image per file, stacked along a new leading axis in the
# same order as `files`.
x = np.stack([preprocess_img(f) for f in files])

In [None]:
# Each SavedModel directory is paired with the CSV file that will receive
# the results of running that model on the test images.
mapping = {
    "savedmodels/augment": "outputs/augment/inference.csv",
    "savedmodels/no-augment": "outputs/no-augment/inference.csv",
    "savedmodels/augment-noise": "outputs/augment-noise/inference.csv",
}

for savedmodel, output in mapping.items():
    print(f"++ Running inference on {savedmodel} and saving to {output}")

    # exist_ok=False: this raises FileExistsError when the output directory
    # is already present.
    Path(output).parent.mkdir(parents=True, exist_ok=False)

    model = tfk.models.load_model(savedmodel, compile=False)
    # Flattened model output is used as the PCNSL probability
    # (GBM = 0, PCNSL = 1); its complement is written as the GBM probability.
    y_probs = model.predict(x, batch_size=8, verbose=True).flatten()
    y_preds = (y_probs > 0.5).astype(np.int32)

    y_preds_str = ["gbm" if pred == 0 else "pcnsl" for pred in y_preds]
    filenames = [path.name for path in files]
    rows = [
        ("filename", "prediction", "prob_gbm", "prob_pcnsl"),
        *zip(filenames, y_preds_str, 1 - y_probs, y_probs),
    ]

    with open(output, "w", newline="") as csvfile:
        csv.writer(csvfile).writerows(rows)

# ROC curves

https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html#sphx-glr-auto-examples-model-selection-plot-roc-py

AUC confidence intervals https://stackoverflow.com/a/19132400/5666087

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics

In [None]:
def auc_bootstrap(y_true, y_pred, n_bootstraps, seed=None):
    """Bootstrap the ROC AUC by resampling the samples with replacement.

    With help from https://stackoverflow.com/a/19132400/5666087

    Parameters
    ----------
    y_true : array-like
        True binary labels, one per sample.
    y_pred : array-like
        Scores for the positive class, one per sample.
    n_bootstraps : int
        Number of resamples to draw.
    seed : int or None
        Seed for the ``numpy.random.RandomState`` used to draw resamples.

    Returns
    -------
    numpy.ndarray
        Sorted 1-D float array of the AUCs that could be computed. A resample
        containing a single class has no defined AUC and is skipped, so the
        result may hold fewer than ``n_bootstraps`` values.
    """
    # Plain ndarrays so that `indices` always selects by position, also when
    # a pandas Series with a non-integer index is passed in.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n_samples = y_pred.shape[0]

    prng = np.random.RandomState(seed)
    # Collect only the AUCs actually computed; a preallocated np.empty array
    # would keep uninitialised values for every skipped resample.
    aucs = []
    for i in range(n_bootstraps):
        indices = prng.randint(0, n_samples, n_samples)
        if len(np.unique(y_true[indices])) < 2:
            continue
        aucs.append(metrics.roc_auc_score(y_true[indices], y_pred[indices]))
        print(f"{round((i + 1) / n_bootstraps * 100, 2)} % completed bootstrapping", end="\r")
    print()
    return np.sort(np.asarray(aucs, dtype=float))

def plot_roc(y_true, y_pred, positive_class, n_bootstraps=10000, seed=None):
    """Plot a ROC curve and report a bootstrapped AUC with a 95% interval.

    Parameters
    ----------
    y_true : array-like
        True binary labels, one per sample.
    y_pred : array-like
        Scores for the positive class, one per sample.
    positive_class : str
        Name of the positive class; only used in the plot title.
    n_bootstraps : int
        Number of bootstrap resamples passed to ``auc_bootstrap``.
    seed : int or None
        Seed passed to ``auc_bootstrap``.

    Returns
    -------
    fig : matplotlib figure
        Newly created figure containing the ROC curve and the diagonal.
    roc_auc : float
        Mean of the bootstrapped AUCs.
    confidence_95 : tuple of float
        Values at the 2.5% and 97.5% positions of the sorted bootstrapped AUCs.

    Raises
    ------
    ValueError
        If bootstrapping yields no AUC values.
    """
    # Use the `y_pred` argument throughout; the function must not depend on a
    # global variable that happens to be named `y_score`.
    fpr, tpr, _ = metrics.roc_curve(y_true=y_true, y_score=y_pred)

    aucs = auc_bootstrap(y_true, y_pred, n_bootstraps=n_bootstraps, seed=seed)
    n_aucs = aucs.shape[0]
    if n_aucs == 0:
        raise ValueError("bootstrapping produced no AUC values")
    roc_auc = aucs.mean()
    # `aucs` is sorted, so these positions are the empirical percentiles.
    confidence_95 = aucs[int(0.025 * n_aucs)], aucs[int(0.975 * n_aucs)]

    fig = plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='black', lw=lw)
    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC for GBM vs PCNSL ({positive_class} is positive class)')

    print(f"ROC curve (area = {roc_auc:0.02f})")
    print(f"95% CI = {confidence_95[0]:0.2f} - {confidence_95[1]:0.2f}")
    print(n_bootstraps, "bootstraps")

    return fig, roc_auc, confidence_95

In [None]:
N_BOOTSTRAPS = 10000
SEED = 42

# The ground-truth labels do not depend on the model, so the spreadsheet is
# read once. Only the 'class' column, indexed by 'filename', is used.
true_classes = pd.read_excel(
    "PCNSL_GBM_Predictions.xlsx", sheet_name=2, index_col='filename'
).loc[:, "class"]

for prediction_file in mapping.values():

    print("\n++ Calculating metrics for", prediction_file)

    prediction_file = Path(prediction_file)
    # ROC plots and metric reports are written next to the predictions CSV.
    save_dir = prediction_file.parents[0]

    df_probs = pd.read_csv(prediction_file, index_col="filename")
    # Assignment aligns on the shared 'filename' index.
    df_probs.loc[:, "class"] = true_classes

    # Evaluate twice, treating each class in turn as the positive class.
    for positive, negative in (("pcnsl", "gbm"), ("gbm", "pcnsl")):
        label = positive.upper()
        print(f"++ {label} == 1")
        y_true = (df_probs.loc[:, 'class'] == positive).astype(int)
        y_score = df_probs.loc[:, f"prob_{positive}"]
        fig, roc_auc, confidence_95 = plot_roc(
            y_true, y_score, label, n_bootstraps=N_BOOTSTRAPS, seed=SEED)
        fig.savefig(save_dir / f"{positive}_roc_curve.pdf")
        with (save_dir / f"{positive}_metrics.txt").open("w") as f:
            # Every line of the report, including the header, goes to the file.
            print(f"{label} results", file=f)
            print(f"ROC AUC = {roc_auc}", file=f)
            print(f"95% CI = {confidence_95[0]:0.2f} - {confidence_95[1]:0.2f}", file=f)
            print(f"Using {N_BOOTSTRAPS:,d} bootstraps", file=f)
            print(file=f)
            # target_names are ordered as label 0 (negative), label 1 (positive).
            print(
                metrics.classification_report(
                    y_true, y_score > 0.5, target_names=[negative.upper(), label]),
                file=f,
            )