# Calculate ROC-AUC, AUC-CI and Precision 

* Calculate the Area Under the Receiver Operating Characteristic curve (ROC-AUC) using the network proximity measures ($d_c$ and $Z_{d_c}$)
* Calculate 95% confidence intervals using the bootstrap technique with 2,000 resamplings with sample sizes of 150 each
* Calculate precision of the top 10 predictions, considering only the polyphenol-disease associations with relative distance $Z_{d_c} < -0.5$

In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from progressbar import ProgressBar
from multiprocessing import Pool
import scipy
import os
import sys
sys.path.append('../')

In [2]:
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import ShuffleSplit

In [3]:
## project root is the parent of the working directory; every folder path ends with '/'
basedir = os.path.abspath('../') + '/'
infolder = os.path.join(basedir, 'data/')
outfolder = os.path.join(basedir, 'output/')
dbs = os.path.join(infolder, 'databases/')

In [4]:
import utils.network_utils as network_utils


In [5]:
import warnings
## silence every warning for the rest of the session
## NOTE(review): this also hides deprecation warnings raised by the libraries used below
warnings.filterwarnings("ignore")

In [6]:
## input tables, both located in the data folder (infolder ends with '/')
proximity_file = os.path.join(infolder, 'SupplementaryData2.csv')
ctd_file = os.path.join(infolder, 'ctd_polyphenols_implicit_explicit.csv')

In [7]:
## ---- analysis settings ----

## proximity measures for which an AUC is calculated
measures = ['closest', 'z_closest']

## negative measures: lower values represent more significance
negative_measures = ['closest', 'z_closest']

## restrict the AUC calculation to these chemicals / diseases (None = no restriction)
selected_chemicals = ['gallic acid']
selected_diseases = None

## key columns used to merge the proximity table with the CTD table
target_columns = ['chemical', 'disease']

## column flagging explicit evidence
explicit = 'DirectEvidence'

## column flagging implicit evidence
implicit = 'therapeutic'

## ---- CTD associations (first CSV column is used as the index) ----
ctd = pd.read_csv(ctd_file, index_col=0)

In [8]:
## proximity calculations; chemical names are lower-cased before the merge
dt = pd.read_csv(proximity_file)
print(dt.shape)
dt['chemical'] = [name.lower() for name in dt['chemical']]

## merge proximity calculations and CTD associations
dx = pd.merge(dt[target_columns + measures], ctd, how='outer', on=target_columns)

## keep only the selected diseases / chemicals; when the merged table has an
## 'n_mapped_chemical' column, rows where it is missing are dropped by the same filter
has_mapping = 'n_mapped_chemical' in dx.columns
for key, wanted in (('disease', selected_diseases), ('chemical', selected_chemicals)):
    if not wanted:
        continue
    keep = dx[key].isin(wanted)
    if has_mapping:
        keep = keep & dx['n_mapped_chemical'].notnull()
    dx = dx[keep]

## drop pairs that have no value for the first proximity measure
dx = dx[dx[measures[0]].notnull()]
print(dx.shape)
dx.head()

(19734, 8)
(299, 6)


Unnamed: 0,chemical,disease,closest,z_closest,DirectEvidence,therapeutic
598,gallic acid,liver diseases,1.0,-1.720618,,1.0
599,gallic acid,lung diseases,2.0,0.661377,,
600,gallic acid,overweight,2.0,-0.030014,,
601,gallic acid,bone marrow diseases,2.0,0.6298,,1.0
602,gallic acid,tauopathies,2.0,0.416784,,


In [9]:
## Binarize the evidence columns: 1 = known association, 0 = unknown.
## dx is a filtered selection of the merged table, so take an explicit copy
## before assigning columns on it.
dx = dx.copy()
dx[implicit] = dx[implicit].fillna(0)
## any non-missing explicit annotation counts as known
## NOTE: not idempotent - after one run no value is missing any more, so
## re-running this cell would mark every pair as known.
dx[explicit] = dx[explicit].notnull().astype(int)


print ('Explicit')
print (int((dx[explicit] == 1).sum()), 'known')
print (int((dx[explicit] == 0).sum()), 'unknown')
print ('Implicit + Explicit')
print (int((dx[implicit] == 1).sum()), 'known')
print (int((dx[implicit] == 0).sum()), 'unknown')

Explicit
5 known
294 unknown
Implicit + Explicit
42 known
257 unknown


In [10]:
def _roc_auc(y_true, scores):
    """Return the area under the ROC curve of ``scores`` against ``y_true``."""
    fpr, tpr, _ = metrics.roc_curve(y_true, scores)
    return metrics.auc(fpr, tpr)


def _bootstrap_auc_ci(y_true, scores, n_bootstraps, sample_size, seed):
    """Return the 2.5th and 97.5th percentiles of bootstrapped ROC-AUC values.

    ``y_true`` and ``scores`` are equally long NumPy arrays; ``y_true`` must
    contain both classes, otherwise no valid resample can ever be drawn.
    """
    rng = np.random.RandomState(seed)
    n_rows = len(y_true)
    aucs = []
    for _ in range(n_bootstraps):
        # Sample row positions with replacement. randint's upper bound is
        # exclusive, so this covers the same range 0..n_rows-1 as the
        # deprecated random_integers(0, n_rows - 1, ...).
        idx = rng.randint(0, n_rows, size=sample_size)
        # A resample holding a single class has no defined ROC curve: redraw.
        while np.unique(y_true[idx]).size < 2:
            idx = rng.randint(0, n_rows, size=sample_size)
        aucs.append(_roc_auc(y_true[idx], scores[idx]))
    if not aucs:
        return np.nan, np.nan
    lower, upper = np.percentile(aucs, [2.5, 97.5])
    return lower, upper


def _precision_at_top(frame, label, col, z_threshold, top_ns):
    """Return ``{'prec_relative_top<N>': precision}`` for every N in ``top_ns``.

    Only rows whose ``'z_' + col`` value is below ``z_threshold`` are ranked;
    the ranking is by ascending ``col``.
    """
    z_col = 'z_%s' % col
    # The candidate set does not depend on N, so filter and rank it only once.
    # NOTE(review): ties in `col` are ordered by the default sort, so which
    # tied rows make it into the top N is arbitrary.
    ranked = frame.loc[frame[z_col] < z_threshold, [label, col]].sort_values(by=col)
    scores = {}
    for ntop in top_ns:
        top = ranked.iloc[:ntop]
        tp = int((top[label] == 1).sum())
        fp = int((top[label] == 0).sum())
        # precision is reported as 0 when nothing passes the z-score filter
        scores['prec_relative_top%d' % ntop] = 1. * tp / (tp + fp) if tp + fp else 0
    return scores


def calculcate_performance(chemical, measures=measures, label='therapeutic',
                           precision=True, n_bootstraps=2000, sample_size=150,
                           seed=42, z_threshold=-0.5, top_ns=(10, 25, 50)):
    """Evaluate how well each proximity measure ranks the known associations of one chemical.

    Reads the rows of the module-level ``dx`` table whose ``chemical`` column
    equals ``chemical``.

    Parameters
    ----------
    chemical : value matched against ``dx.chemical``.
    measures : names of the score columns to evaluate.
    label : name of the 0/1 column marking known associations.
    precision : when true, also compute top-N precision for every measure
        whose name does not contain ``'z'``.
    n_bootstraps, sample_size, seed : number of bootstrap resamples, rows per
        resample and RandomState seed used for the confidence interval.
    z_threshold : only rows with ``'z_' + measure`` below this value enter the
        precision ranking.
    top_ns : the N values for which top-N precision is reported.

    Returns
    -------
    pandas.DataFrame with one row per measure and the columns ``value``
    (ROC-AUC), ``measure``, ``ci_upper``, ``ci_lower``, ``error_l``,
    ``error_u``, the ``prec_relative_top<N>`` columns where computed, and
    ``chemical``. AUC and interval are NaN when the chemical's rows do not
    contain both label classes.

    Raises
    ------
    ValueError if ``sample_size`` is smaller than 2.
    """
    if sample_size < 2:
        # a resample of fewer than two rows can never contain both classes
        raise ValueError('sample_size must be at least 2')

    dw = dx[dx.chemical == chemical]

    res = {}
    for x, col in enumerate(measures):
        # Lower values are more significant, so the labels are flipped: the
        # ROC curve then scores how well high values pick out the unknown pairs.
        y_true = (1 - dw[label]).values
        scores = dw[col].values

        if np.unique(y_true).size < 2:
            # No rows, or a single class only: the AUC is undefined and the
            # bootstrap redraw loop could never terminate.
            roc_auc = s_lower = s_upper = np.nan
        else:
            roc_auc = _roc_auc(y_true, scores)
            s_lower, s_upper = _bootstrap_auc_ci(y_true, scores, n_bootstraps,
                                                 sample_size, seed)

        res[x] = {
            'value': roc_auc,
            'measure': col,
            'ci_upper': s_upper,
            'ci_lower': s_lower,
            'error_l': roc_auc - s_lower,
            'error_u': s_upper - roc_auc,
        }

        ## precision - top absolute proximity among pairs with z below the threshold
        if precision and 'z' not in col:
            res[x].update(_precision_at_top(dw, label, col, z_threshold, top_ns))

    table = pd.DataFrame.from_dict(res, orient='index')
    table['chemical'] = chemical
    return table

In [11]:
## evaluate every chemical present in dx, one task per chemical
## unique() keeps first-appearance order, so the result rows come out in the
## same order on every run (iteration order of a set of strings can vary
## between interpreter sessions)
samples = dx.chemical.unique().tolist()
## the with-block releases the worker processes even if a task raises
with Pool(8) as p:
    res = p.map(calculcate_performance, samples)
df = pd.concat(res)

In [12]:
## combine the per-chemical tables in res into a single table
df = pd.concat(res)


In [13]:
## display the combined performance table
df

Unnamed: 0,value,measure,ci_upper,ci_lower,error_l,error_u,prec_relative_top10,prec_relative_top25,prec_relative_top50,chemical
0,0.695711,closest,0.809161,0.580788,0.114923,0.113451,0.5,0.4,0.3,gallic acid
1,0.603344,z_closest,0.749706,0.455282,0.148062,0.146362,,,,gallic acid


In [None]:
## save the performance table to the working directory, leaving out the row index
df.to_csv('Performance.csv', index=False)