# HLMA 408: ROC Curves - Tests 

***
> __Author__: Joseph Salmon <joseph.salmon@umontpellier.fr>

We illustrate in this notebook the notion of ROC Curves in the context of Covid-19 PCR tests.

Important naming following the classical terminology (see *e.g.,* https://en.wikipedia.org/wiki/Receiver_operating_characteristic): 
- $\mathrm {P}$ : number of people in the study who are positive for Covid (Positive)
- $\mathrm {N}$ : number of people in the study who are negative for Covid, i.e., patient samples collected before 2019 (Negative)
- $\mathrm {TP}$ (True Positive)
- $\mathrm {TN}$ (True Negative)
- $\mathrm {FP}$ (False Positive)
- $\mathrm {FN}$ (False Negative)
- ${\displaystyle \mathrm {TPR} ={\frac {\mathrm {TP} }{\mathrm {P} }}={\frac {\mathrm {TP} }{\mathrm {TP} +\mathrm {FN} }}}$ (True Positive Rate)
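- ${\displaystyle \mathrm {TNR} ={\frac {\mathrm {TN} }{\mathrm {N} }}={\frac {\mathrm {TN} }{\mathrm {TN} +\mathrm {FP} }}}$ (True Negative Rate)
- ${\displaystyle \mathrm {FPR} ={\frac {\mathrm {FP} }{\mathrm {N} }}={\frac {\mathrm {FP} }{\mathrm {TN} +\mathrm {FP} }}=1-\mathrm {TNR} }$ (False Positive Rate, the horizontal axis of the ROC curve)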


**Sources**:
- Article: https://www.thelancet.com/action/showPdf?pii=S1473-3099%2820%2930634-4
- Data: https://figshare.com/articles/dataset/Dataset_Head-to-head_benchmark_evaluation_of_the_sensitivity_and_specificity_of_five_immunoassays_for_SARS-CoV-2_serology_on_1500_samples/12622172?backTo=/collections/Head-to-head_benchmark_evaluation_of_the_sensitivity_and_specificity_of_five_immunoassays_for_SARS-CoV-2_serology_on_1500_samples/5046032
- Data: https://ndownloader.figshare.com/files/23736068

Remark: the study in the article is hard to reproduce because of the large amount of pre-processing involved (extracting the patients from the raw data requires some care; this is disregarded for the moment).
See page 24 of https://www.thelancet.com/cms/10.1016/S1473-3099(20)30634-4/attachment/acf302a9-62fb-4680-844a-1387a081faa3/mmc1.pdf for more information.


In [1]:
import numpy as np
from scipy import stats 
import matplotlib.pyplot as plt
import ipywidgets  # ipywidgets>=7.5
import pandas as pd
from matplotlib.cm import get_cmap
from matplotlib import gridspec

In [2]:
# Colour palette for the plots: `colors` is the colour list of the named
# matplotlib colormap and is indexed later (colors[0], colors[1]) to draw the
# scatter markers.
name = "Set1"
cmap = get_cmap(name)  # type: matplotlib.colors.ListedColormap
colors = cmap.colors  # type: list


In [3]:
# Truncate DataFrame displays to at most 8 rows so that cell outputs stay short.
pd.set_option("display.max_rows", 8)

In [4]:
def make_box_layout():
    """Build the framed ``ipywidgets.Layout`` applied to the widget containers.

    Returns
    -------
    ipywidgets.Layout
        A new layout with a thin solid black border, a right/bottom margin
        and a uniform 5px padding.
    """
    box_style = {
        'border': 'solid 1px black',
        'margin': '0px 10px 10px 0px',
        'padding': '5px 5px 5px 5px',
    }
    return ipywidgets.Layout(**box_style)

In [5]:
# Interactive (ipympl) backend: the figure built below is updated in place by
# the slider / radio-button callbacks instead of being re-rendered.
%matplotlib widget

In [6]:
# Fetch the spreadsheet from figshare if it is not on disk yet.
# With `replace=False` an existing local copy is left untouched.

from download import download

url = "https://ndownloader.figshare.com/files/23736068"
path_target = "./supplementary_dataset2.xlsx"
download(url, path_target, replace=False)

Replace is False and data exists, so doing nothing. Use replace=True to re-download the data.


'./supplementary_dataset2.xlsx'

In [7]:
# Load the raw assay results; cells containing "n/a" are read as missing values.
pcr_df = pd.read_excel(
    'supplementary_dataset2.xlsx',
    comment='#',
    na_values="n/a",
)

In [8]:
# Preview of the first 24 raw rows (the display is truncated by `max_rows`).
pcr_df.head(24)

Unnamed: 0,samplebarcode,samplecollection,participantnum,dayssincesymptoms,dayssincepcrpos,diseaseseverity,platformorder,sample_type,abbott_result,abbott_interpretation,diasorin_result,diasorin_interpretation,roche_result,roche_interpretation,siemens_result,siemens_interpretation,label,oxford_result,oxford_interpretation
0,500001,oxford_positives,5b58e040ee35f3bcc6023fb7836c842e,12.0,2.0,death,diasorin_abbott,plasma,0.02,NOT detected,<3.80,NOT detected,0.074,NOT detected,0.1,NOT detected,500001,2738601.0,NOT detected
1,500002,oxford_positives,0e4033b8c0b56afbea35dc749ced4e1d,5.0,2.0,mild,diasorin_abbott,plasma,0.01,NOT detected,<3.80,NOT detected,0.065,NOT detected,< 0.05,NOT detected,500002,768641.0,NOT detected
2,500003,oxford_positives,d19d62a873f08af0488f0df720cfd293,9.0,5.0,severe,diasorin_abbott,plasma,5.20,DETECTED,27,DETECTED,11.210,DETECTED,5.04,DETECTED,500003,10947480.0,DETECTED
3,500004,oxford_positives,ee340e888492be0703f2bcc9abfb390c,12.0,2.0,critical,diasorin_abbott,plasma,5.60,DETECTED,21,DETECTED,10.200,DETECTED,> 10.00,DETECTED,500004,11173905.0,DETECTED
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20,500021,oxford_positives,091f9c87a83b30074ad9c46e777c68be,13.0,2.0,critical,diasorin_abbott,plasma,5.70,DETECTED,6,NOT detected,12.370,DETECTED,1.32,DETECTED,500021,10270554.0,DETECTED
21,500022,oxford_positives,49df3738cc60881c5c9ec52d026c9834,27.0,6.0,critical,diasorin_abbott,plasma,5.97,DETECTED,155,DETECTED,1.510,DETECTED,> 10.00,DETECTED,500022,10683744.0,DETECTED
22,500023,oxford_positives,0c3480e6514e939d74c5ffafb23e18db,7.0,1.0,severe,diasorin_abbott,plasma,2.14,DETECTED,8,NOT detected,3.890,DETECTED,2.18,DETECTED,500023,5960900.0,NOT detected
23,500024,oxford_positives,250e112653566dc2da8d03e71dc773f6,3.0,2.0,critical,diasorin_abbott,plasma,0.02,NOT detected,8,NOT detected,0.064,NOT detected,0.08,NOT detected,500024,1909549.0,NOT detected


In [9]:
# Summary statistics of the columns that were parsed as numeric.
pcr_df.describe()

Unnamed: 0,samplebarcode,dayssincesymptoms,dayssincepcrpos,abbott_result,roche_result,oxford_result
count,1757.0,382.0,703.0,1757.0,1753.0,1739.0
mean,726972.849744,23.243455,31.12091,1.970148,17.658256,6078679.0
std,198347.380063,16.984041,19.598852,2.694124,31.929679,4049094.0
min,500001.0,1.0,0.0,0.0,0.052,436655.0
25%,500443.0,10.0,8.0,0.02,0.069,2364344.0
50%,900117.0,18.0,38.0,0.07,0.077,3966071.0
75%,900559.0,35.0,46.0,4.33,20.99,10796330.0
max,901000.0,73.0,82.0,8.28,150.0,11707300.0


In [10]:
# --- Clean the raw table -----------------------------------------------------
# Drop the patient-description columns, then every row with a missing value.
# The frame is rebound (no `inplace`) and `errors='ignore'` is used so that
# this cell can be re-run without raising once the columns are already gone.
pcr_df = pcr_df.drop(
    columns=['participantnum', 'dayssincesymptoms',
             'dayssincepcrpos', 'diseaseseverity'],
    errors='ignore',
).dropna().copy()

# Ground truth: samples from the 'obb_negatives' collection are the negatives,
# every other collection is treated as positive.
pcr_df['covid_status'] = np.where(
    pcr_df['samplecollection'] == 'obb_negatives', 'Negative', 'Positive')

# Zero / censored readings are replaced by numeric placeholders so that the
# results can be drawn on a log scale. The replaced column is assigned back to
# the frame: calling `replace(..., inplace=True)` on `pcr_df[col]` is a chained
# assignment and does not modify `pcr_df` under pandas Copy-on-Write.
censored_values = {
    'abbott_result': {0: 0.005},                         # 0 -> min_value/2
    'diasorin_result': {"<3.80": 1.9},                   # <3.8 -> 1.9
    'siemens_result': {"> 10.00": 10., "< 0.05": 0.03},  # >10 -> 10, <0.05 -> 0.03
}
for column, mapping in censored_values.items():
    # `to_numeric` makes the dtype explicit instead of relying on `replace`
    # silently converting an object column to float.
    pcr_df[column] = pd.to_numeric(pcr_df[column].replace(mapping))

# Two synthetic "tests" used as references on the ROC plot:
#  - Random: scores independent of the true status (dedicated seeded generator,
#    so the column is reproducible and the global NumPy state is untouched);
#  - Ideal: 1 for positives, 0.001 (rather than 0, for log-scale display) for
#    negatives.
rng = np.random.default_rng(123)
pcr_df["random_result"] = rng.random(len(pcr_df))
pcr_df["perfect_result"] = np.where(
    pcr_df['covid_status'] == 'Positive', 1.0, 0.001)


df_reality_pos = pcr_df[pcr_df['covid_status'] == 'Positive']
df_reality_neg = pcr_df[pcr_df['covid_status'] == 'Negative']


# Column name -> display name, and the inverse mapping (derived, so the two
# dictionaries cannot drift apart).
dict_results_inv = {"abbott_result": "Abbott", "diasorin_result": "DiaSorin",
                    "oxford_result": "Oxford", "roche_result": "Roche", "siemens_result": "Siemens",
                    "random_result": "Random",  "perfect_result": "Ideal"}
dict_results = {label: column_name
                for column_name, label in dict_results_inv.items()}

In [11]:
# Show the cleaned table (truncated by the `max_rows` display option).
pcr_df

Unnamed: 0,samplebarcode,samplecollection,platformorder,sample_type,abbott_result,abbott_interpretation,diasorin_result,diasorin_interpretation,roche_result,roche_interpretation,siemens_result,siemens_interpretation,label,oxford_result,oxford_interpretation,covid_status,random_result,perfect_result
0,500001,oxford_positives,diasorin_abbott,plasma,0.02,NOT detected,1.9,NOT detected,0.074,NOT detected,0.10,NOT detected,500001,2738601.0,NOT detected,Positive,0.162930,1.000
1,500002,oxford_positives,diasorin_abbott,plasma,0.01,NOT detected,1.9,NOT detected,0.065,NOT detected,0.03,NOT detected,500002,768641.0,NOT detected,Positive,0.306324,1.000
2,500003,oxford_positives,diasorin_abbott,plasma,5.20,DETECTED,27.0,DETECTED,11.210,DETECTED,5.04,DETECTED,500003,10947480.0,DETECTED,Positive,0.514861,1.000
3,500004,oxford_positives,diasorin_abbott,plasma,5.60,DETECTED,21.0,DETECTED,10.200,DETECTED,10.00,DETECTED,500004,11173905.0,DETECTED,Positive,0.402285,1.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1753,900997,obb_negatives,abbott_diasorin,serum,0.09,NOT detected,4.0,NOT detected,0.082,NOT detected,0.09,NOT detected,OBB7998,1624313.0,NOT detected,Negative,0.520328,0.001
1754,900998,obb_negatives,abbott_diasorin,serum,0.06,NOT detected,1.9,NOT detected,0.067,NOT detected,0.03,NOT detected,OBB7999,1587419.0,NOT detected,Negative,0.682057,0.001
1755,900999,obb_negatives,abbott_diasorin,serum,0.02,NOT detected,1.9,NOT detected,0.070,NOT detected,0.03,NOT detected,OBB8000,3354589.0,NOT detected,Negative,0.243915,0.001
1756,901000,obb_negatives,abbott_diasorin,serum,0.03,NOT detected,1.9,NOT detected,0.065,NOT detected,0.18,NOT detected,OBB8001,2880579.0,NOT detected,Negative,0.438199,0.001


In [12]:
# Sanity check: largest DiaSorin reading after cleaning.
pcr_df["diasorin_result"].max()

289.0

In [13]:
def roc(thresholds, y_true, score):
    """Compute the points of a ROC curve for a list of decision thresholds.

    A sample is predicted positive when ``score >= threshold``, which is the
    same rule as the one used for the TP/FN/FP/TN split in the plotting cell,
    so the highlighted operating point always lies on the curve.

    Parameters
    ----------
    thresholds : iterable of float
        Decision thresholds; one (FPR, TPR) point is computed for each.
    y_true : array-like of bool, shape (n_samples,)
        True status of each sample (truthy = positive).
    score : array-like of float, shape (n_samples,)
        Numerical test result of each sample.

    Returns
    -------
    tpr, fpr : list of float
        True positive rates and false positive rates, **in that order**, one
        entry per threshold. A rate is ``nan`` when its class is absent
        (no positive, resp. no negative, sample).

    Raises
    ------
    ValueError
        If ``y_true`` and ``score`` do not have the same shape.
    """
    thresholds = np.asarray(list(thresholds), dtype=float)
    y_true = np.asarray(y_true).astype(bool)
    score = np.asarray(score, dtype=float)
    if y_true.shape != score.shape:
        raise ValueError("y_true and score must have the same shape, got "
                         "%s and %s" % (y_true.shape, score.shape))

    # number of positive and negative examples in the dataset
    n_pos = int(y_true.sum())
    n_neg = y_true.size - n_pos

    # (n_thresholds, n_samples) matrix: is sample j called positive at
    # threshold i?
    predicted_pos = score[np.newaxis, :] >= thresholds[:, np.newaxis]
    tp = (predicted_pos & y_true).sum(axis=1)
    fp = (predicted_pos & ~y_true).sum(axis=1)

    # A rate is undefined when its class is empty: report nan instead of
    # failing with a division by zero.
    undefined = np.full(thresholds.shape, np.nan)
    tpr = tp / n_pos if n_pos else undefined
    fpr = fp / n_neg if n_neg else undefined
    return tpr.tolist(), fpr.tolist()

In [14]:
# --- Build the interactive figure --------------------------------------------
# Left panel (ax0): assay results of positive / negative samples with the
# decision threshold. Right panel (ax1): ROC curve and current operating point.
output = ipywidgets.Output()
test_name = 'abbott_result'

# The figure is created inside the Output widget's context.
with output:
    fig = plt.figure(figsize=(8, 6))
    gs = gridspec.GridSpec(1, 2, width_ratios=[1, 2])
    ax0 = plt.subplot(gs[0])
    ax1 = plt.subplot(gs[1])

fig.tight_layout(pad=3.0)

fig.suptitle("PCR Test name : " + dict_results_inv[test_name])
np.random.seed(123)  # fixes the horizontal jitter drawn below
threshold = np.median(pcr_df[test_name])


# A sample is called positive by the test when its result is >= threshold.
idx_TP = df_reality_pos[test_name] >= threshold
idx_FN = df_reality_pos[test_name] < threshold
idx_FP = df_reality_neg[test_name] >= threshold
idx_TN = df_reality_neg[test_name] < threshold

n_reality_pos = len(df_reality_pos)
n_reality_neg = len(df_reality_neg)

# Horizontal jitter so that the points of each group do not overlap:
# positives are spread around x=0, negatives around x=0.6.
jitter = 0.3
x_pos = jitter * (np.random.rand(n_reality_pos,)-0.5)
x_neg = 0.6 + jitter * (np.random.rand(n_reality_neg,)-0.5)

fig.canvas.toolbar_visible = False
ax0.set_yscale('log')
ylims = [pcr_df[test_name].min() / 2, pcr_df[test_name].max() * 2]
ax0.set_ylim(ylims)
ax0.set_xticks([0, 0.6])
ax0.set_xticklabels(['Positive', 'Negative'])

ax0.set_xlabel(' "True" covid status')
ax0.set_ylabel("Assay numerical result (log scale)")


# Circles: samples at or above the threshold; crosses: samples below it.
# Colours are set with `color=` only (a `cmap` would be ignored here, since no
# data is given for colour-mapping).
pts_TP = ax0.scatter(x_pos[idx_TP], df_reality_pos.loc[idx_TP, test_name],
                     alpha=1., s=25, linewidth=0.1, edgecolor="black",
                     marker='o', color=colors[1], label="TP")
pts_FN = ax0.scatter(x_pos[idx_FN], df_reality_pos.loc[idx_FN, test_name],
                     alpha=1., s=20, linewidth=0.1,
                     marker="x", color=colors[1], label="FN")
pts_FP = ax0.scatter(x_neg[idx_FP], df_reality_neg.loc[idx_FP, test_name],
                     alpha=1., s=25, linewidth=0.1, edgecolor="black",
                     marker='o', color=colors[0], label="FP")
pts_TN = ax0.scatter(x_neg[idx_TN], df_reality_neg.loc[idx_TN, test_name],
                     alpha=1., s=20, linewidth=0.1,
                     marker="x", color=colors[0], label="TN")

xlims = ax0.axes.get_xlim()
line, = ax0.plot(xlims,  [threshold, threshold],
                 '-', color='k', label="Threshold")


# Explicit legend entries (threshold last): the order returned by
# `get_legend_handles_labels` is not the same across matplotlib versions.
handles_list = [pts_TP, pts_FN, pts_FP, pts_TN, line]
labels_list = [handle.get_label() for handle in handles_list]
ax0.legend(handles_list, labels_list, loc='upper center', bbox_to_anchor=(0.5, 1.15),
           ncol=3, fancybox=True, shadow=True, prop={'size': 8})


TPR = np.sum(idx_TP) / (np.sum(idx_FN) + np.sum(idx_TP))
FPR = np.sum(idx_FP) / (np.sum(idx_TN) + np.sum(idx_FP))


# Current operating point; given as length-1 sequences, like any Line2D data.
roc_pts, = ax1.plot([FPR], [TPR], "o", color='k')
ax1.set_xlim([-0.1, 1.1])
ax1.set_ylim([-0.1, 1.1])
ax1.set_ylabel(r"$\mathrm{TPR} = \frac{\mathrm{TP}}{\mathrm{TP}+\mathrm{FN}}$")
ax1.set_xlabel(r"$\mathrm{FPR} = \frac{\mathrm{FP}}{\mathrm{TN}+\mathrm{FP}}$")
ax1.set_title("ROC curve")

# Full ROC curve: 100 thresholds, log-spaced over the y-range of the left panel.
Positives = pcr_df['covid_status'] == 'Positive'
tpr, fpr = roc(np.logspace(np.log10(ylims[0]), np.log10(
    ylims[1]), num=100), Positives.values, pcr_df[test_name].values)
roc_curve, = ax1.plot(fpr, tpr, "-", color='k')

offset = 50  # NOTE(review): not used in the visible cells
bbox = dict(boxstyle="round", fc="0.99")
arrowprops = dict(arrowstyle="simple",
                  color="k",
                  shrinkB=5,
                  connectionstyle="arc3,rad=0.1",
                  )

# Text box at a fixed figure position, with an arrow to the operating point.
disp = ax1.annotate('(FPR, TPR) = (%.3f, %.3f)' % (FPR, TPR),
                    (FPR, TPR), xytext=(600, 400), textcoords='figure pixels',
                    bbox=bbox, arrowprops=arrowprops)
plt.tight_layout()
plt.show()


def _refresh_operating_point(test_name, threshold):
    """Redraw everything that depends on the (test, threshold) pair.

    Re-splits the samples into TP / FN / FP / TN (a sample is called positive
    when its result is >= threshold), moves the four scatter collections and
    the threshold line, and updates the ROC marker and its annotation.
    The canvas is not redrawn here; callers do it once they are done.
    """
    pos_scores = df_reality_pos[test_name]
    neg_scores = df_reality_neg[test_name]
    idx_TP = (pos_scores >= threshold).to_numpy()
    idx_FN = (pos_scores < threshold).to_numpy()
    idx_FP = (neg_scores >= threshold).to_numpy()
    idx_TN = (neg_scores < threshold).to_numpy()

    pts_TP.set_offsets(np.c_[x_pos[idx_TP], pos_scores[idx_TP]])
    pts_FN.set_offsets(np.c_[x_pos[idx_FN], pos_scores[idx_FN]])
    pts_FP.set_offsets(np.c_[x_neg[idx_FP], neg_scores[idx_FP]])
    pts_TN.set_offsets(np.c_[x_neg[idx_TN], neg_scores[idx_TN]])

    line.set_ydata([threshold, threshold])

    TPR = np.sum(idx_TP) / (np.sum(idx_FN) + np.sum(idx_TP))
    FPR = np.sum(idx_FP) / (np.sum(idx_TN) + np.sum(idx_FP))

    disp.set_text('(FPR, TPR) = (%.3f, %.3f)' % (FPR, TPR))
    disp.xy = FPR, TPR
    # Line2D data must be sequences: passing bare scalars to
    # set_xdata/set_ydata is rejected by recent matplotlib versions.
    roc_pts.set_data([FPR], [TPR])


def update_threshold(change):
    """Slider callback: apply the new threshold to the selected test.

    Parameters
    ----------
    change : traitlets change object
        Only ``change.new`` (the new threshold value) is read.
    """
    test_name = dict_results[radiobuttons.value]
    _refresh_operating_point(test_name, change.new)

    fig.canvas.draw()


def update_name(change):
    """Radio-button callback: switch the figure to another test.

    Resets the y-range of the left panel, the slider bounds and value (median
    of the new test's results), the title, the scatter plots and the whole
    ROC curve.

    Parameters
    ----------
    change : traitlets change object
        Only ``change.new`` (the display name of the selected test) is read.
    """
    test_name = dict_results[change.new]

    threshold = np.median(pcr_df[test_name])
    # XXX TODO change to harmonize the upper/lower gaps.
    ylims = [pcr_df[test_name].min() / 2, pcr_df[test_name].max() * 2]
    ax0.set_ylim(ylims)

    # done to avoid bug if min>max when updating.
    # (`np.inf`, not `np.infty`: the latter alias was removed in NumPy 2.0.)
    threshold_slider.max = np.inf
    threshold_slider.min = np.log10(ylims[0])
    threshold_slider.max = np.log10(ylims[1])
    threshold_slider.value = threshold

    fig.suptitle("PCR Test name : " + dict_results_inv[test_name])

    _refresh_operating_point(test_name, threshold)

    # Recompute the full ROC curve of the new test on 100 log-spaced thresholds.
    Positives = pcr_df['covid_status'] == 'Positive'
    tpr, fpr = roc(np.logspace(np.log10(ylims[0]), np.log10(
        ylims[1]), num=100), Positives.values, pcr_df[test_name].values)
    roc_curve.set_data(fpr, tpr)

    fig.canvas.draw()


# --- Widgets: threshold slider + test selector, laid out next to the figure ---
style = {'description_width': '100px'}
layout = {'width': '90px'}

# `min` / `max` are given as base-10 exponents (log10 of the y-limits), while
# `value` is the threshold itself.
threshold_slider = ipywidgets.FloatLogSlider(
    min=np.log10(ylims[0]),
    max=np.log10(ylims[1]),
    value=np.median(pcr_df[test_name]),
    base=10,
    step=0.1,
    description='Threshold',
    style=style,
    layout=layout,
    orientation='vertical',
)

# The initial selection is derived from `test_name` so that the buttons always
# match the test that is actually plotted.
radiobuttons = ipywidgets.RadioButtons(
    value=dict_results_inv[test_name],
    options=list(dict_results_inv.values()),
    description='',
    style=style, layout=layout
)

controls = ipywidgets.VBox([
    threshold_slider,
    radiobuttons
])


# Redraw the figure whenever the threshold or the selected test changes.
threshold_slider.observe(update_threshold, 'value')
radiobuttons.observe(update_name, 'value')


controls.layout = make_box_layout()
out_box = ipywidgets.Box([output])  # NOTE(review): not used in the visible cells
output.layout = make_box_layout()
box = ipywidgets.HBox([controls, output])
box

HBox(children=(VBox(children=(FloatLogSlider(value=0.07, description='Threshold', layout=Layout(width='90px'),…