# Sensitivity analysis
For research gaps found, we simulate what would have been the mapping of RCTs within regions across diseases if the misclassification of RCTs towards groups of diseases was corrected, given the sensitivities and specificities of the classifier to identify each group of disease.

To estimate the performances of the classifier for each group of diseases, we dispose a test set with 2,763 trials manually classified towards the 27-class grouping of diseases used in this work. The test set is described at Atal et al. BMC Bioinformatics 2016.

The method used is based on the method presented at Fox et al. Int J Epidemiol 2005.

To do so, for each disease for which we found a local research gap we will:

1. Calculating sensitivity and specificity of the classifier to identify the disease and other studies relevant to the burden of diseases, and the number of success and number of trials to derive beta distributions
2. Doing N=60k times the following simulation
    * Randomly choose a sens and spec based on beta distribution for identifying the disease and identifying another disease (no correlation between sens and spec, neither between disease and another disease both)
    * Derive Positive and Negative Predictive Values (PPV and NPV) for each.
    * Simulate the correction of the classification based on PPVs and NPVs
    * Derive the proportion of RCTs concerning the disease among all RCTs concerning the burden of disease in the region
3. Derive 95% upper-bond simulation interval of the proportion of RCTs concerning the disease among all RCTs concerning the burden of diseases

## 1. Sensitivities and specificities based on test set

In [1]:
test_set <- read.table("/media/igna/Elements/HotelDieu/Cochrane/MetaMapBurden/Paper_classifier/NCT_data_classified_to28cats.txt")
dim(test_set)

In [2]:
#We supress injuries from trials concerning the burden of diseases
test_set$GBDnp <- sapply(strsplit(as.character(test_set$GBDnp),"&&"),function(x){paste(x[x!="28"],collapse="&")})
test_set$GBD28 <- sapply(strsplit(as.character(test_set$GBD28),"&"),function(x){paste(x[x!="28"],collapse="&")})

In [3]:
tst <- strsplit(test_set$GBDnp,"&")
alg <- strsplit(test_set$GBD28,"&")
tst <- lapply(tst,as.numeric)
alg <- lapply(alg,as.numeric)

In [4]:
source('Evaluation_metrics.R')

In [5]:
dis <- 1:27
Mgbd <- read.table("/home/igna/Desktop/Programs GBD/Classifier_Trial_GBD/Databases/Taxonomy_DL/GBD_data/GBD_ICD.txt")

In [6]:
#For each category in 1:27, TP, TN, FP and FN of finding the disease and of finding another disease
set.seed(7212)

dis <- as.character(1:27)

PERF_F  <- data.frame()
for(i in dis){
    ALG <- lapply(alg,function(x){rs <- c()
                                  if(i%in%x) rs <- c(1)
                                  if(sum(setdiff(dis,i)%in%x)!=0) rs <- c(rs,2)
                                  return(rs)
                                      })

    DT <- lapply(tst,function(x){rs <- c()
                                if(i%in%x) rs <- c(1)
                                if(sum(setdiff(dis,i)%in%x)!=0) rs <- c(rs,2)
                                return(rs)
                                    })

    CM <- conf_matrix(ALG,DT,c(1,2))

    PERF <- c(CM[1,],CM[2,])
    PERF_F <- rbind(PERF_F,PERF)
}


In [7]:
PERF_F <- data.frame(PERF_F)
names(PERF_F) <- paste(rep(c("TP","FP","TN","FN"),2),rep(c("_Dis","_Oth"),each=4),sep="")

In [8]:
PERF_F$dis <- dis
PERF_F$GBD <- Mgbd$cause_name[-28]

In [9]:
PERF_F <- PERF_F[,c(9,10,1:8)]

In [10]:
PERF_F

Unnamed: 0,dis,GBD,TP_Dis,FP_Dis,TN_Dis,FN_Dis,TP_Oth,FP_Oth,TN_Oth,FN_Oth
1,1,Tuberculosis,14,2,2745,2,2142,204,267,150
2,2,HIV/AIDS,86,7,2659,11,2072,214,333,144
3,3,"Diarrhea, lower respiratory infections, meningitis, and other common infectious diseases",40,21,2693,9,2113,207,299,144
4,4,Malaria,14,1,2748,0,2142,204,267,150
5,5,Neglected tropical diseases excluding malaria,6,0,2756,1,2150,203,261,149
6,6,Maternal disorders,17,5,2715,26,2130,210,289,134
7,7,Neonatal disorders,4,7,2746,6,2148,205,262,148
8,8,Nutritional deficiencies,11,15,2732,5,2140,201,272,150
9,9,Sexually transmitted diseases excluding HIV,0,3,2759,1,2155,203,255,150
10,10,Hepatitis,14,4,2742,3,2141,208,262,152


In [11]:
write.csv(PERF_F,'Tables/Performances_per_27disease_data.csv')

## 2. Simulating correction of misclassification

In [1]:
data <- read.table('/media/igna/Elements/HotelDieu/Cochrane/Mapping_Cancer/Flowchart/database_all_diseases_final_ok.txt')
N <- nrow(data)

In [2]:
regs <- sort(unique(unlist(strsplit(as.character(data$Regions),"&"))))
LR <- lapply(regs,function(x){1:nrow(data)%in%grep(x,data$Regions)})
LR <- do.call('cbind',LR)

In [3]:
Lgbd <- lapply(as.character(data$GBD28),function(x){as.numeric(unlist(strsplit(x,"&")))})
Lgbd <- lapply(Lgbd,function(x){x[x!=28]})

In [4]:
PERF <- read.csv('Tables/Performances_per_27disease_data.csv')

In [6]:
GAPS <- read.table('/media/igna/Elements/HotelDieu/Cochrane/Mapping_Cancer/Tables/Local_research_gaps.txt')

In [8]:
#For each disease for which we found a gap, we will simulate the mapping across regions of trials concerning
#the disease or concerning other diseases
dis <- levels(GAPS$Disease)

In [9]:
length(dis)

In [7]:
#NK <- 60000
NK <- 5000
set.seed(7212)

In [8]:
#For each disease
t0 <- proc.time()

for(g in dis){

    PERF_g <- PERF[PERF$GBD==g,]
    
    #which trials concern the disease
    is_dis <- sapply(Lgbd,function(x){PERF_g$dis%in%x})
    #which trials concern another disease
    is_oth <- sapply(Lgbd,function(x){sum(setdiff(1:27,PERF_g$dis)%in%x)>0})

    #PPV et NPVs for finding the disease
    sens_r <- PERF_g$TP_Dis
    sens_n <- PERF_g$TP_Dis + PERF_g$FN_Dis
    spec_r <- PERF_g$TN_Dis
    spec_n <- PERF_g$TN_Dis + PERF_g$FP_Dis
    sens <- rbeta(NK,sens_r+1,sens_n-sens_r+1)
    spec <- rbeta(NK,spec_r+1,spec_n-spec_r+1)

    a_dis <- sum(is_dis)
    b_dis <- N-a_dis
    As <- (a_dis-(1-spec)*N)/(sens - (1-spec))
    Bs <- N-As
    T1 <- sens*As
    T0 <- spec*Bs
    F1 <- (1-spec)*Bs
    F0 <- (1-sens)*As
    PPV_dis <- T1/(T1+F1)
    NPV_dis <- T0/(T0+F0)

    #PPV and NPVs for finding another disease
    sens_r <- PERF_g$TP_Oth
    sens_n <- PERF_g$TP_Oth + PERF_g$FN_Oth
    spec_r <- PERF_g$TN_Oth
    spec_n <- PERF_g$TN_Oth + PERF_g$FP_Oth
    sens <- rbeta(NK,sens_r+1,sens_n-sens_r+1)
    spec <- rbeta(NK,spec_r+1,spec_n-spec_r+1)

    a_oth <- sum(is_oth)
    b_oth <- N-a_oth
    As <- (a_oth-(1-spec)*N)/(sens - (1-spec))
    Bs <- N-As
    T1 <- sens*As
    T0 <- spec*Bs
    F1 <- (1-spec)*Bs
    F0 <- (1-sens)*As
    PPV_oth <- T1/(T1+F1)
    NPV_oth <- T0/(T0+F0)

    #Some values of sens and spec may lead to impossible values of PPV or NPV (>1 or <0)
    #If PPV_dis < 0 (and NPV_dis > 1), sens_dis and spec_dis are such that expected value
    #of nb_trials concerning disease is negative. Suppressing that iteration is conservative
    #if regard of our objective
    #If PPV_dis > 1 (and NPV_dis < 0), suppressing that iteration is not conservative
    #Similarly, if PPV_oth < 0 (and NPV_oth > 1), suppressing that iteration is conservative, and so on
    #We count conservative and non conservative deleted iterations
    #If more than 10% of iteration are deleted because they are not conservative, we can not conduct simulation
    #Conservative iterations are suppressed, and no problem
    cons_false_it <- PPV_dis<0 | NPV_dis>1 | PPV_oth>1 | NPV_oth<0
    non_cons_false_it <- PPV_dis>1 | NPV_dis<0 | PPV_oth<0 | NPV_oth>1

    if(sum(non_cons_false_it)>0.1*NK){ print(paste(c(g,
                                                     "has", 
                                                     sum(non_cons_false_it), 
                                                     "(too many) non conservative false iterations, and",
                                                     sum(cons_false_it),
                                                     "conservative false iterations"
                                                    ),collapse=" "))
                          next
                          }
    
    print(paste(c(g,
                  "has",
                  sum(non_cons_false_it),
                  "non conservative false iterations, and",
                  sum(cons_false_it),
                  "conservative false iterations"
                 ),collapse=" "))
    
    PPV_dis <- PPV_dis[!cons_false_it & !non_cons_false_it]
    NPV_dis <- NPV_dis[!cons_false_it & !non_cons_false_it]
    PPV_oth <- PPV_oth[!cons_false_it & !non_cons_false_it]
    NPV_oth <- NPV_oth[!cons_false_it & !non_cons_false_it]

    L <- list()
    #Simulation: reclassifying each trial
        for(k in 1:sum(!cons_false_it & !non_cons_false_it)){

            AR <- matrix(0, nrow=length(regs), ncol=2)
            tp_dis <- runif(a_dis)
            tn_dis <- runif(b_dis)
            recl_dis <- is_dis
            recl_dis[recl_dis==TRUE][tp_dis>PPV_dis[k]] <- FALSE
            recl_dis[recl_dis==FALSE][tn_dis>NPV_dis[k]] <- TRUE
            #Rq: we count all trials (even those with more than 3 diseases)
            #it is a conservative choice
            rt <- as.numeric(recl_dis)

            if(sum(recl_dis)==0) AR[,1] <- c(rep(0,7))
            else{   if(sum(recl_dis)==1) AR[,1] <- as.numeric(LR[recl_dis,])
                    else AR[,1] <- apply(LR[recl_dis,],2,sum)
            }
                
            #Oth_dis
            tp_oth <- runif(a_oth)
            tn_oth <- runif(b_oth)
            recl_oth <- is_oth
            recl_oth[recl_oth==TRUE][tp_oth>PPV_oth[k]] <- FALSE
            recl_oth[recl_oth==FALSE][tn_oth>NPV_oth[k]] <- TRUE
            rt <- rt + as.numeric(recl_oth)

            if(sum(recl_oth)==0) AR[,2] <- c(rep(0,7))
            else{    if(sum(recl_oth)==1) AR[,2] <- as.numeric(LR[recl_oth,])
                     else AR[,2] <- apply(LR[rt!=0,],2,sum)
            }

            L[[k]] <- AR

        }
   
    T <- do.call('rbind',L)
    write.table(T,paste(c("/media/igna/Elements/HotelDieu/Cochrane/Mapping_Cancer/Incertitude_mapping/Simulations/Res_simul27_",as.character(PERF_g$dis),".txt"),collapse=""))

}

t1 <- proc.time()
    
print(t1-t0)/60

[1] "Cardiovascular and circulatory diseases has 0 non conservative false iterations, and 0 conservative false iterations"
[1] "Congenital anomalies has 0 non conservative false iterations, and 693 conservative false iterations"
[1] "Diarrhea, lower respiratory infections, meningitis, and other common infectious diseases has 0 non conservative false iterations, and 0 conservative false iterations"
[1] "Neglected tropical diseases excluding malaria has 0 non conservative false iterations, and 2 conservative false iterations"
[1] "Neonatal disorders has 0 non conservative false iterations, and 2 conservative false iterations"
[1] "Nutritional deficiencies has 0 non conservative false iterations, and 1 conservative false iterations"
[1] "Sexually transmitted diseases excluding HIV has 16 non conservative false iterations, and 505 conservative false iterations"
[1] "Tuberculosis has 0 non conservative false iterations, and 37 conservative false iterations"
    user   system  elapsed 
2952.

       user      system     elapsed 
49.21641667  0.07173333 49.48468333 

In [11]:
(49*60/5)/60

For 60,000 simulations it will take 10h approx

## 3. Deriving 95% upper bound simulation intervals

In [66]:
GAPS <- read.table('/media/igna/Elements/HotelDieu/Cochrane/Mapping_Cancer/Tables/Local_research_gaps.txt')

In [67]:
regs <- levels(GAPS$Region)

In [68]:
Mgbd <- read.table("/home/igna/Desktop/Programs GBD/Classifier_Trial_GBD/Databases/Taxonomy_DL/GBD_data/GBD_ICD.txt")

In [69]:
sims <- list.files("/media/igna/Elements/HotelDieu/Cochrane/Mapping_Cancer/Incertitude_mapping/Simulations")

In [70]:
#diseases for which we have a simulation
dis <- as.character(Mgbd$cause_name[as.numeric(substr(sims,13,nchar(sims)-4))])

In [71]:
GAPS$SimMn_NbRCTs <- NA
GAPS$Sim95_NbRCTs <- NA
GAPS$SimMn_PrRCTs <- NA
GAPS$Sim95_PrRCTs <- NA

In [72]:
for(g in dis){

    T <- tryCatch(read.table(paste(c("/media/igna/Elements/HotelDieu/Cochrane/Mapping_Cancer/Incertitude_mapping/Simulations/Res_simul27_",
                            as.character(which(Mgbd$cause_name==g)),".txt"),collapse="")),error=NULL)

    if(length(T)!=0){

        #Mean and 95% upper-bound number of RCTs by simulation
        M <- matrix(T[,1],ncol=7,byrow=TRUE)
        SIM95 <- apply(M,2,function(x){quantile(x,0.95)})
        SIM95 <- data.frame(SIM95)
        GAPS$Sim95_NbRCTs[GAPS$Disease==g] <- SIM95$SIM95[regs%in%GAPS$Region[GAPS$Disease==g]]
        SIMMn <- apply(M,2,function(x){mean(x)})
        SIMMn <- data.frame(SIMMn)
        GAPS$SimMn_NbRCTs[GAPS$Disease==g] <- SIMMn$SIMMn[regs%in%GAPS$Region[GAPS$Disease==g]]

        #Mean and 95% upper-bound proportion of RCTs by simulation
        M <- matrix(T[,1]/T[,2],ncol=7,byrow=TRUE)
        SIM95 <- apply(M,2,function(x){quantile(x,0.95)})
        SIM95 <- data.frame(SIM95)
        GAPS$Sim95_PrRCTs[GAPS$Disease==g] <- SIM95$SIM95[regs%in%GAPS$Region[GAPS$Disease==g]]
        SIMMn <- apply(M,2,function(x){mean(x)})
        SIMMn <- data.frame(SIMMn)
        GAPS$SimMn_PrRCTs[GAPS$Disease==g] <- SIMMn$SIMMn[regs%in%GAPS$Region[GAPS$Disease==g]]
    }   
}

In [73]:
GAPS$Robust <- FALSE

In [74]:
GAPS$Robust[2*GAPS$Sim95_PrRCTs<GAPS$Pr_GBD] <- TRUE

In [75]:
GAPS[,c(7,9,12,13)] <- round(100*GAPS[,c(7,9,12,13)],digits=2)
GAPS[,c(10)] <- round(GAPS[,c(10)],digits=1)

In [76]:
GAPS[GAPS$Robust,c(4,3,6,10,11,7,12,13,9)]

Unnamed: 0,Region,Disease,RCT,SimMn_NbRCTs,Sim95_NbRCTs,Pr_RCT,SimMn_PrRCTs,Sim95_PrRCTs,Pr_GBD
1,"Central Europe, Eastern Europe, and Central Asia",Cardiovascular and circulatory diseases,789,768.7,843,11.27,12.56,13.77,35.09
3,"Central Europe, Eastern Europe, and Central Asia",Tuberculosis,21,17.2,27,0.3,0.28,0.44,1.83
5,Latin America and Caribbean,Congenital anomalies,55,12.7,27,1.15,0.3,0.62,3.13
6,Latin America and Caribbean,Neonatal disorders,41,75.4,149,0.86,1.77,3.47,8.72
8,North Africa and Middle East,Congenital anomalies,147,32.3,66,1.69,0.4,0.82,4.98
9,North Africa and Middle East,"Diarrhea, lower respiratory infections, meningitis, and other common infectious diseases",253,262.2,319,2.91,3.33,4.06,10.55
10,North Africa and Middle East,Neonatal disorders,181,213.1,366,2.08,2.68,4.56,10.06
12,South Asia,"Diarrhea, lower respiratory infections, meningitis, and other common infectious diseases",357,308.8,353,7.36,7.04,8.06,21.43
13,South Asia,Neonatal disorders,86,109.2,195,1.77,2.46,4.35,16.52
14,South Asia,Tuberculosis,54,38.3,54,1.11,0.86,1.22,3.87


In [77]:
GAPS[!GAPS$Robust,c(4,3,6,10,11,7,12,13,9)]

Unnamed: 0,Region,Disease,RCT,SimMn_NbRCTs,Sim95_NbRCTs,Pr_RCT,SimMn_PrRCTs,Sim95_PrRCTs,Pr_GBD
2,"Central Europe, Eastern Europe, and Central Asia",Neonatal disorders,38,90.6,187.0,0.54,1.48,3.06,2.56
4,High-income,Neonatal disorders,553,1113.3,2229.9,0.82,1.8,3.59,1.65
7,Latin America and Caribbean,Nutritional deficiencies,78,65.9,99.0,1.64,1.55,2.31,3.76
11,North Africa and Middle East,Nutritional deficiencies,134,119.1,180.0,1.54,1.5,2.26,4.12
15,"Southeast Asia, East Asia and Oceania",Neglected tropical diseases excluding malaria,65,72.2,95.1499999999996,0.67,0.83,1.1,1.55
16,"Southeast Asia, East Asia and Oceania",Neonatal disorders,47,132.3,279.299999999999,0.49,1.53,3.2,6.06
21,Sub-Saharian Africa,Sexually transmitted diseases excluding HIV,9,40.8,131.0,0.35,1.71,5.67,1.27


In [62]:
GAPS <- GAPS[,c(4,3,6,10,11,7,12,13,9,14)]
GAPS <- GAPS[order(GAPS$Robust,decreasing=TRUE),]

In [63]:
GAPS

Unnamed: 0,Region,Disease,RCT,SimMn_NbRCTs,Sim95_NbRCTs,Pr_RCT,SimMn_PrRCTs,Sim95_PrRCTs,Pr_GBD,Robust
1,"Central Europe, Eastern Europe, and Central Asia",Cardiovascular and circulatory diseases,789,768.7,843.0,11.27,12.56,13.77,35.09,1
3,"Central Europe, Eastern Europe, and Central Asia",Tuberculosis,21,17.2,27.0,0.3,0.28,0.44,1.83,1
5,Latin America and Caribbean,Congenital anomalies,55,12.7,27.0,1.15,0.3,0.62,3.13,1
6,Latin America and Caribbean,Neonatal disorders,41,75.4,149.0,0.86,1.77,3.47,8.72,1
8,North Africa and Middle East,Congenital anomalies,147,32.3,66.0,1.69,0.4,0.82,4.98,1
9,North Africa and Middle East,"Diarrhea, lower respiratory infections, meningitis, and other common infectious diseases",253,262.2,319.0,2.91,3.33,4.06,10.55,1
10,North Africa and Middle East,Neonatal disorders,181,213.1,366.0,2.08,2.68,4.56,10.06,1
12,South Asia,"Diarrhea, lower respiratory infections, meningitis, and other common infectious diseases",357,308.8,353.0,7.36,7.04,8.06,21.43,1
13,South Asia,Neonatal disorders,86,109.2,195.0,1.77,2.46,4.35,16.52,1
14,South Asia,Tuberculosis,54,38.3,54.0,1.11,0.86,1.22,3.87,1


In [64]:
write.csv(GAPS,'Tables/Gaps_RCTs_and_robustness.csv')