# HLMA 408: ROC Curves - Tests 

***
> __Author__: Joseph Salmon <joseph.salmon@umontpellier.fr>

In this notebook, we illustrate the notion of ROC curves in the context of Covid-19 tests (the data come from a benchmark of five SARS-CoV-2 serology immunoassays; see **Sources** below).

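Notation, following the classical terminology (see *e.g.,* https://en.wikipedia.org/wiki/Receiver_operating_characteristic):
- $\mathrm {P}$: number of people in the study who are positive for Covid (Positive)
- $\mathrm {N}$: number of people in the study who are negative for Covid, *i.e.,* patient samples from before 2019 (Negative)
- $\mathrm {TP}$: number of positive people whose test result is positive (True Positive)
- $\mathrm {TN}$: number of negative people whose test result is negative (True Negative)
- $\mathrm {FP}$: number of negative people whose test result is positive (False Positive)
- $\mathrm {FN}$: number of positive people whose test result is negative (False Negative)
- ${\displaystyle \mathrm {TPR} ={\frac {\mathrm {TP} }{\mathrm {P} }}={\frac {\mathrm {TP} }{\mathrm {TP} +\mathrm {FN} }}}$ (True Positive Rate)
- ${\displaystyle \mathrm {TNR} ={\frac {\mathrm {TN} }{\mathrm {N} }}={\frac {\mathrm {TN} }{\mathrm {TN} +\mathrm {FP} }}}$ (True Negative Rate)
- ${\displaystyle \mathrm {FPR} ={\frac {\mathrm {FP} }{\mathrm {N} }}={\frac {\mathrm {FP} }{\mathrm {TN} +\mathrm {FP} }}=1-\mathrm {TNR} }$ (False Positive Rate, the horizontal axis of the ROC curve)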


**Sources**:
- Article: https://www.thelancet.com/action/showPdf?pii=S1473-3099%2820%2930634-4
- Data: https://figshare.com/articles/dataset/Dataset_Head-to-head_benchmark_evaluation_of_the_sensitivity_and_specificity_of_five_immunoassays_for_SARS-CoV-2_serology_on_1500_samples/12622172?backTo=/collections/Head-to-head_benchmark_evaluation_of_the_sensitivity_and_specificity_of_five_immunoassays_for_SARS-CoV-2_serology_on_1500_samples/5046032
- Data: https://ndownloader.figshare.com/files/23736068

Remark: the study from the article is hard to reproduce, because it relies on a large amount of pre-processing (extracting the patients from the raw data requires some care; this is disregarded for the moment).
See page 24 of https://www.thelancet.com/cms/10.1016/S1473-3099(20)30634-4/attachment/acf302a9-62fb-4680-844a-1387a081faa3/mmc1.pdf for more information.


In [1]:
import numpy as np
from scipy import stats 
import matplotlib.pyplot as plt
import ipywidgets  # ipywidgets>=7.5
import pandas as pd
from matplotlib.cm import get_cmap
from matplotlib import gridspec

In [2]:
# Qualitative "Set1" palette; `colors` is the list of its RGB tuples.
# `plt.get_cmap` is used for the lookup because `matplotlib.cm.get_cmap`
# was deprecated in Matplotlib 3.7.
name = "Set1"
cmap = plt.get_cmap(name)  # type: matplotlib.colors.ListedColormap
colors = cmap.colors  # type: list

In [3]:
# Keep DataFrame previews short: display at most 8 rows.
pd.set_option("display.max_rows", 8)

In [4]:
def make_box_layout():
    """Return a new ``ipywidgets.Layout`` describing a thin black framed box.

    A fresh ``Layout`` instance is created on every call.
    """
    frame = dict(
        border='solid 1px black',
        margin='0px 10px 10px 0px',
        padding='5px 5px 5px 5px',
    )
    return ipywidgets.Layout(**frame)

In [5]:
# Select the interactive `widget` (ipympl) Matplotlib backend.
%matplotlib widget

In [6]:
# Load the assay results from the local CSV copy of the dataset.
#
# If that CSV copy still has to be created, uncomment the lines below: they
# download the published spreadsheet and convert it to CSV.
#
# from download import download
#
# url = "https://ndownloader.figshare.com/files/23736068"
# path_target = "./supplementary_dataset2.xlsx"
# download(url, path_target, replace=False)
# pcr_df = pd.read_excel('./supplementary_dataset2.xlsx',
#                        comment='#', na_values="n/a")
# pcr_df.to_csv('./supplementary_dataset2.csv')

pcr_df = pd.read_csv(
    'supplementary_dataset2.csv',
    comment='#',      # '#' starts a comment in the file
    na_values="n/a",  # "n/a" entries are read as missing values
)


In [7]:
# --- Clean the raw table ------------------------------------------------------
# Remove the metadata columns that are not used afterwards, then discard every
# row that still contains a missing value. The result is re-bound to `pcr_df`
# (no `inplace=True`) and already-removed columns are ignored, so re-running
# this cell does not raise.
pcr_df = pcr_df.drop(
    columns=['participantnum', 'dayssincesymptoms',
             'dayssincepcrpos', 'diseaseseverity'],
    errors='ignore',
).dropna()

# Ground truth: samples of the 'obb_negatives' collection are labelled
# 'Negative', every other sample is labelled 'Positive'.
pcr_df['covid_status'] = np.where(
    pcr_df['samplecollection'] == 'obb_negatives', 'Negative', 'Positive')

# --- Replace censored / zero readings by numeric values -----------------------
# The result of `.replace` is assigned back to the column: calling
# `pcr_df[col].replace(..., inplace=True)` is chained assignment, which pandas
# deprecates and which leaves the DataFrame unchanged under copy-on-write.
pcr_df['abbott_result'] = pcr_df['abbott_result'].replace(
    {0: 0.005})  # replace 0 by min_value/2
pcr_df['diasorin_result'] = pcr_df['diasorin_result'].replace(
    {"<3.80": 1.9}  # replace <3.8 by 1.9
).astype(float)
pcr_df['siemens_result'] = pcr_df['siemens_result'].replace(
    {"> 10.00": 10.,   # replace >10 by 10
     "< 0.05": 0.03}   # replace <0.05 by 0.03
).astype(float)

# --- Two synthetic reference "assays" -----------------------------------------
# Random score, uniform in log-space on [1, 10). A dedicated seeded generator
# is used so the column is identical on every run and the global NumPy random
# state is left untouched.
pcr_df["random_result"] = 10**np.random.default_rng(123).random(len(pcr_df))
# Ideal score: 1 for positives, 0.001 for negatives (0.001 instead of 0 for
# display purposes).
pcr_df["perfect_result"] = np.where(
    pcr_df['covid_status'] == 'Positive', 1.0, 0.001)


# Sub-tables split by true status.
df_reality_pos = pcr_df[pcr_df['covid_status'] == 'Positive']
df_reality_neg = pcr_df[pcr_df['covid_status'] == 'Negative']


# Column name -> display name, and its inverse (display name -> column name).
# The inverse is derived so that the two mappings cannot drift apart.
dict_results_inv = {"abbott_result": "Abbott", "diasorin_result": "DiaSorin",
                    "oxford_result": "Oxford", "roche_result": "Roche", "siemens_result": "Siemens",
                    "random_result": "Random",  "perfect_result": "Ideal"}
dict_results = {label: column for column, label in dict_results_inv.items()}

In [8]:
def roc(thresholds, y_true, score):
    """Compute the points of a ROC curve on a grid of thresholds.

    A sample is predicted positive when its score is greater than or equal to
    the threshold, i.e. the same ``>=`` rule as the one used in this notebook
    to classify the plotted samples, so the displayed operating point lies on
    the curve even when a score equals the threshold.

    Parameters
    ----------
    thresholds : array-like of shape (n_thresholds,)
        Decision thresholds at which the rates are evaluated.
    y_true : array-like of shape (n_samples,)
        Ground truth; truthy entries mark the positive samples.
    score : array-like of shape (n_samples,)
        Numerical test result of each sample.

    Returns
    -------
    tpr : list of float
        True positive rate for each threshold.
    fpr : list of float
        False positive rate for each threshold.
        Note the order of the returned pair: ``(tpr, fpr)``.
        A rate is NaN when the corresponding class is absent from ``y_true``.

    Raises
    ------
    ValueError
        If ``y_true`` and ``score`` do not have the same number of entries.
    """
    # Adapted from https://stackoverflow.com/questions/25009284/how-to-plot-roc-curve-in-python
    is_positive = np.asarray(y_true, dtype=bool).ravel()
    scores = np.asarray(score).ravel()
    cutoffs = np.atleast_1d(np.asarray(thresholds)).ravel()
    if is_positive.shape != scores.shape:
        raise ValueError("y_true and score must have the same length")

    # number of positive and negative examples in the dataset
    n_pos = np.count_nonzero(is_positive)
    n_neg = is_positive.size - n_pos

    # predicted_pos[t, i] is True when sample i is declared positive at
    # threshold t; one broadcasted comparison replaces the double Python loop.
    predicted_pos = scores[np.newaxis, :] >= cutoffs[:, np.newaxis]
    n_true_pos = np.count_nonzero(predicted_pos & is_positive, axis=1)
    n_false_pos = np.count_nonzero(predicted_pos & ~is_positive, axis=1)

    # A rate is undefined (NaN) when its class is empty.
    undefined = np.full(cutoffs.shape, np.nan)
    tpr = n_true_pos / n_pos if n_pos else undefined
    fpr = n_false_pos / n_neg if n_neg else undefined
    return tpr.tolist(), fpr.tolist()

In [9]:
# Build the figure for the initial assay ('abbott_result'):
#   - left panel (ax0): assay values of the truly positive / truly negative
#     samples, classified as TP / FN / FP / TN by the current threshold;
#   - right panel (ax1): ROC curve and the operating point of that threshold.
# The artists created here (pts_*, line, roc_pts, roc_curve, disp) are the
# ones modified later by the widget callbacks.
output = ipywidgets.Output()
test_name = 'abbott_result'

# The figure is created while `output` is capturing.
with output:
    fig = plt.figure(figsize=(8, 6), num='ROC Curve for Covid19 tests')
    gs = gridspec.GridSpec(1, 2, width_ratios=[1, 2])
    ax0 = plt.subplot(gs[0])
    ax1 = plt.subplot(gs[1])

fig.tight_layout(pad=3.0)

fig.suptitle("PCR Test name : " + dict_results_inv[test_name])
np.random.seed(123)  # fixes the horizontal jitter drawn below
threshold = np.median(pcr_df[test_name])  # initial threshold: median value


# A sample is declared positive when its value is >= threshold.
idx_TP = df_reality_pos[test_name] >= threshold
idx_FN = df_reality_pos[test_name] < threshold
idx_FP = df_reality_neg[test_name] >= threshold
idx_TN = df_reality_neg[test_name] < threshold

n_reality_pos = len(df_reality_pos)
n_reality_neg = len(df_reality_neg)

# Horizontal jitter so that the points of each group do not overlap:
# positives are centred on x=0, negatives on x=0.6.
jitter = 0.3
x_pos = jitter * (np.random.rand(n_reality_pos,)-0.5)
x_neg = 0.6 + jitter * (np.random.rand(n_reality_neg,)-0.5)

fig.canvas.toolbar_visible = False
ax0.set_yscale('log')
ylims = [pcr_df[test_name].min() / 2, pcr_df[test_name].max() * 2]
ax0.set_ylim(ylims)
ax0.set_xticks([0, 0.6])
ax0.set_xticklabels(['Positive', 'Negative'])  # fixed typo ('Postive')

ax0.set_xlabel(' "True" covid status')
ax0.set_ylabel("Assay numerical result (log scale)")


# One scatter per outcome: circles for samples declared positive (TP, FP),
# crosses for samples declared negative (FN, TN); one colour per true status.
# No `cmap` is passed: the markers are coloured through `color`, and
# Matplotlib ignores (and warns about) `cmap` when no `c` data is given.
pts_TP = ax0.scatter(x_pos[idx_TP], df_reality_pos[idx_TP][test_name], alpha=1., s=25, linewidth=0.1,
                     edgecolor="black", marker='o', color=colors[1], label="TP")
pts_FN = ax0.scatter(x_pos[idx_FN], df_reality_pos[idx_FN][test_name], alpha=1., s=20, linewidth=0.1,
                     marker="x", color=colors[1], label="FN")
pts_FP = ax0.scatter(x_neg[idx_FP], df_reality_neg[idx_FP][test_name], alpha=1., s=25, linewidth=0.1,
                     edgecolor="black", marker='o', color=colors[0], label="FP")
pts_TN = ax0.scatter(x_neg[idx_TN], df_reality_neg[idx_TN][test_name], alpha=1., s=20, linewidth=0.1,
                     marker="x", color=colors[0], label="TN")

# Horizontal line showing the current threshold.
xlims = ax0.axes.get_xlim()
line, = ax0.plot(xlims,  [threshold, threshold],
                 '-', color='k', label="Threshold")


# Legend entries are listed explicitly (threshold last) instead of rotating
# the output of `get_legend_handles_labels()`, whose ordering is not the same
# across Matplotlib versions.
handles_list = [pts_TP, pts_FN, pts_FP, pts_TN, line]
labels_list = [handle.get_label() for handle in handles_list]
ax0.legend(handles_list, labels_list, loc='upper center', bbox_to_anchor=(0.5, 1.15),
           ncol=3, fancybox=True, shadow=True, prop={'size': 8})


TPR = np.sum(idx_TP) / (np.sum(idx_FN) + np.sum(idx_TP))
FPR = np.sum(idx_FP) / (np.sum(idx_TN) + np.sum(idx_FP))


# Operating point of the current threshold in the ROC plane.
roc_pts, = ax1.plot(FPR, TPR, "o", color='k')
ax1.set_xlim([-0.1, 1.1])
ax1.set_ylim([-0.1, 1.1])
ax1.set_ylabel(r"$\mathrm{TPR} = \frac{\mathrm{TP}}{\mathrm{TP}+\mathrm{FN}}$")
ax1.set_xlabel(r"$\mathrm{FPR} = \frac{\mathrm{FP}}{\mathrm{TN}+\mathrm{FP}}$")
ax1.set_title("ROC curve")

# Full ROC curve, evaluated on 100 thresholds log-spaced over the y-range
# of the left panel.
Positives = pcr_df['covid_status'] == 'Positive'
tpr, fpr = roc(np.logspace(np.log10(ylims[0]), np.log10(
    ylims[1]), num=100), Positives.values, pcr_df[test_name].values)
roc_curve, = ax1.plot(fpr, tpr, "-", color='k')

offset = 50  # NOTE(review): not used in the code visible here
bbox = dict(boxstyle="round", fc="0.99")
arrowprops = dict(arrowstyle="simple",
                  color="k",
                  shrinkB=5,
                  connectionstyle="arc3,rad=0.1",
                  )

# Text box (at a fixed position given in figure pixels) with an arrow
# pointing to the operating point.
disp = ax1.annotate('(FPR, TPR) = (%.3f, %.3f)' % (FPR, TPR),
                    (FPR, TPR), xytext=(550, 350), textcoords='figure pixels',
                    bbox=bbox, arrowprops=arrowprops)
plt.tight_layout()
plt.show()


def _apply_threshold(test_name, threshold):
    """Re-classify the samples at `threshold` and refresh the dependent artists.

    Updates the four scatter collections (TP / FN / FP / TN), the horizontal
    threshold line, the operating point on the ROC panel and its annotation.
    The canvas is not redrawn here; callers do it once they are done.

    Parameters
    ----------
    test_name : str
        Name of the `pcr_df` column holding the assay values.
    threshold : float
        Samples with a value >= threshold are declared positive.
    """
    idx_TP = df_reality_pos[test_name] >= threshold
    idx_FN = df_reality_pos[test_name] < threshold
    idx_FP = df_reality_neg[test_name] >= threshold
    idx_TN = df_reality_neg[test_name] < threshold

    pts_TP.set_offsets(
        np.c_[x_pos[idx_TP], df_reality_pos[idx_TP][test_name]])
    pts_FN.set_offsets(
        np.c_[x_pos[idx_FN], df_reality_pos[idx_FN][test_name]])
    pts_FP.set_offsets(
        np.c_[x_neg[idx_FP], df_reality_neg[idx_FP][test_name]])
    pts_TN.set_offsets(
        np.c_[x_neg[idx_TN], df_reality_neg[idx_TN][test_name]])

    line.set_ydata([threshold, threshold])

    TPR = np.sum(idx_TP) / (np.sum(idx_FN) + np.sum(idx_TP))
    FPR = np.sum(idx_FP) / (np.sum(idx_TN) + np.sum(idx_FP))

    disp.set_text('(FPR, TPR) = (%.3f, %.3f)' % (FPR, TPR))
    disp.xy = FPR, TPR
    # Line2D data must be sequences: passing bare scalars to
    # set_xdata/set_ydata is deprecated since Matplotlib 3.7.
    roc_pts.set_data([FPR], [TPR])


def update_threshold(change):
    """Slider callback: apply the new threshold to the selected assay.

    Parameters
    ----------
    change : object
        Change notification of the slider; `change.new` is the new threshold.
    """
    _apply_threshold(dict_results[radiobuttons.value], change.new)
    fig.canvas.draw()


def update_name(change):
    """Radio-button callback: switch the whole figure to another assay.

    Resets the threshold to the median value of the newly selected assay,
    rescales the left panel and the slider range, and recomputes the ROC curve.

    Parameters
    ----------
    change : object
        Change notification of the radio buttons; `change.new` is the display
        name of the selected assay (a key of `dict_results`).
    """
    test_name = dict_results[change.new]

    threshold = np.median(pcr_df[test_name])
    # XXX TODO change to harmonize the upper/lower gaps.
    ylims = [pcr_df[test_name].min() / 2, pcr_df[test_name].max() * 2]
    ax0.set_ylim(ylims)

    # The upper bound is lifted first to avoid a bug if min > max while the
    # range is being updated. `np.inf` is used because the `np.infty` alias
    # was removed in NumPy 2.0.
    threshold_slider.max = np.inf
    threshold_slider.min = np.log10(ylims[0])
    threshold_slider.max = np.log10(ylims[1])
    threshold_slider.value = threshold

    fig.suptitle("PCR Test name : " + dict_results_inv[test_name])

    _apply_threshold(test_name, threshold)

    # ROC curve of the new assay on 100 log-spaced thresholds.
    Positives = pcr_df['covid_status'] == 'Positive'
    tpr, fpr = roc(np.logspace(np.log10(ylims[0]), np.log10(
        ylims[1]), num=100), Positives.values, pcr_df[test_name].values)
    roc_curve.set_data(fpr, tpr)

    fig.canvas.draw()


# Cosmetic settings shared by the two controls.
style = {'description_width': '100px'}
layout = {'width': '90px'}

# Vertical logarithmic slider selecting the decision threshold. `min` and
# `max` are base-10 exponents; the initial value is the median assay result.
threshold_slider = ipywidgets.FloatLogSlider(
    min=np.log10(ylims[0]),
    max=np.log10(ylims[1]),
    value=np.median(pcr_df[test_name]),
    base=10,
    step=0.1,
    description='Threshold',
    style=style,
    layout=layout,
    orientation='vertical',
)

# Radio buttons selecting the assay, labelled with the display names.
radiobuttons = ipywidgets.RadioButtons(
    value='Abbott',
    options=list(dict_results_inv.values()),
    description='',
    style=style,
    layout=layout,
)

# Wire each control to its callback.
threshold_slider.observe(update_threshold, 'value')
radiobuttons.observe(update_name, 'value')

# Stack the controls in a framed column and frame the figure output as well.
controls = ipywidgets.VBox([threshold_slider, radiobuttons])
controls.layout = make_box_layout()
out_box = ipywidgets.Box([output])
output.layout = make_box_layout()

# Final widget: controls and figure output in one horizontal box.
box = ipywidgets.HBox([controls, output])

In [10]:
# Display the assembled widget (`box` holds the controls and the figure output).
box

HBox(children=(VBox(children=(FloatLogSlider(value=0.07, description='Threshold', layout=Layout(width='90px'),…