Mapping the effort of health research
-------------------------------------
Database: 
          1. All RCTs registered at WHO ICTRP by Jan 1st 2016, 
          2. with start date between 2006 and 2015
          3. with study type and design corresponding to RCT
          4. with at least one country location among the 187 countries included in the GBD2010 study

In [1]:
library(data.table)
#Upload database
setwd('/media/igna/Elements/HotelDieu/Cochrane')
data0 <- read.table("Mapping_Cancer/Flowchart/database_all_diseases_final_ok.txt")
nrow(data0)
names(data0)

- TrialID: unique trial ID from WHOICTRP
- Regions: 7 epidemiological regions from GBD 2010 study
- GBD28: classification according to 28 categories defined in Atal et al. BMC Bioinformatics (2016): This classification includes the injuries category, we exclude it

In [2]:
data <- data.table(data0)
tables()

     NAME    NROW NCOL  MB
[1,] data 117,180   28 103
     COLS                                                                            
[1,] TrialID,brief_title,official_title,Primary_sponsor,Source_Register,Recruitment_S
     KEY
[1,]    
Total: 103MB


In [4]:
#Upload traduction names/label categories
Mgbd <- read.table("/home/igna/Desktop/Programs GBD/Classifier_Trial_GBD/Databases/Taxonomy_DL/GBD_data/GBD_ICD.txt")
grep("Injur",Mgbd$cause_name)

In [5]:
#We supress from GBD28 the label 28
GBD27 <- sapply(strsplit(as.character(data$GBD28),"&"),function(x){paste(x[x!="28"],collapse="&")})
head(GBD27)
#Number of trials relevant to the burden of diseases
table(GBD27=="")
table(data$GBD28=="")
data <- data[GBD27!="",]
GBD27 <- GBD27[GBD27!=""]


FALSE  TRUE 
91763 25417 


FALSE  TRUE 
92547 24633 

In [6]:
#Trials including sample size information
table(is.na(data$Sample[GBD27!=""]))


FALSE  TRUE 
85358  6405 

In [7]:
head(data$Countries)

In [8]:
ctrs <- unlist(strsplit(as.character(data$Countries[GBD27!=""]),"&"))

In [9]:
length(unique(ctrs))
length(grep("&",as.character(data$Regions[GBD27!=""])))
length(grep("&",as.character(data$Regions[GBD27!=""])))/sum(GBD27!="")

In [10]:
regs <- unlist(strsplit(as.character(data$Regions[GBD27!=""]),"&"))

In [11]:
head(regs)

In [12]:
table(regs)
#We count the number of RCTs per region before suppressing RCTs with more than 4 diseases

regs
Central Europe, Eastern Europe, and Central Asia 
                                            6998 
                                     High-income 
                                           67659 
                     Latin America and Caribbean 
                                            4766 
                    North Africa and Middle East 
                                            8683 
                                      South Asia 
                                            4848 
           Southeast Asia, East Asia and Oceania 
                                            9639 
                             Sub-Saharian Africa 
                                            2585 

In [13]:
L <- strsplit(GBD27,"&")

In [14]:
table(sapply(L,length))
table(sapply(L,length)>1)
table(sapply(L,length)>1)/length(GBD27)
table(sapply(L,length)>4)
table(sapply(L,length)>4)/length(GBD27)


    1     2     3     4     5     6     7     8     9 
82914  7122  1401   259    44    12     7     3     1 


FALSE  TRUE 
82914  8849 


    FALSE      TRUE 
0.9035668 0.0964332 


FALSE  TRUE 
91696    67 


       FALSE         TRUE 
0.9992698582 0.0007301418 

In [15]:
#We supress RCTs with more than 4 diseases
data <- data[sapply(L,length)<=4,]
GBD27 <- GBD27[sapply(L,length)<=4]

In [16]:
L <- L[sapply(L,length)<=4]

In [17]:
sort(table(unlist(L)))


   27    11    21     9     1     5     4     7    10    26     6     8    15 
    3    99   233   282   365   428   482   914   925  1068  1126  1509  1566 
    2    23    20     3    24    14    16    25    17    22    18    13    19 
 1647  1756  2494  3696  4463  5168  6109  6444  7038  7761 10080 10547 12045 
   12 
14149 

In [18]:
Mgbd$cause_name[c(12,19,13)]

In [19]:
Mgbd$cause_name[c(27,11,21)]

### Database number RCTs per region and disease

In [20]:
#Database number RCTs per region and disease
L <- lapply(L,as.numeric)
#Region names
RGS <- sort(unique(regs))
RGS
length(L)
#We count only RCTs concerning 3 or less diseases
dim(data)

In [21]:

Mt <- matrix(0,ncol=length(RGS),nrow=nrow(Mgbd)-1)

t0 <- proc.time()
for(i in 1:length(RGS)){
for(j in 1:(nrow(Mgbd)-1)){
Mt[j,i] <- sum((1:nrow(data)%in%grep(RGS[i],data$Regions)) & unlist(lapply(L,function(x){j%in%x})))
}
}

Mt <- data.frame(Mt)
t1 <- proc.time()
(t1 - t0)/60

        user       system      elapsed 
2.374317e+00 6.666667e-05 2.376367e+00 

In [22]:
names(Mt) <- RGS
rownames(Mt) <- as.character(Mgbd$cause_name)[-28]

In [23]:
head(Mt)

Unnamed: 0,"Central Europe, Eastern Europe, and Central Asia",High-income,Latin America and Caribbean,North Africa and Middle East,South Asia,"Southeast Asia, East Asia and Oceania",Sub-Saharian Africa
Tuberculosis,21,91,39,19,54,70,151
HIV/AIDS,76,1076,105,35,64,145,395
"Diarrhea, lower respiratory infections, meningitis, and other common infectious diseases",288,2534,248,253,357,437,150
Malaria,0,71,12,0,48,68,321
Neglected tropical diseases excluding malaria,5,136,78,36,50,65,73
Maternal disorders,13,511,48,387,79,84,43


We add column Tot with total number of RCTs per disease and row Tot with total number of RCTs per region (different from addition of rows and columns because of international trials and trials concerning several diseases)



In [24]:
Mt <- rbind(Mt,table(regs))
rownames(Mt)[28] <- "Tot"
#In regs, are also counted RCTs concerning more than 4 diseases, as we want to compare
#regions across RCTs concerning the burden of diseases

In [25]:
Mt <- cbind(Mt,c(table(unlist(L)),nrow(data)))
#When counting nb RCTs per disease, we do not count RCTs concerning more than 4 diseases

In [26]:
names(Mt)[8] <- "Tot"

In [27]:
head(Mt)

Unnamed: 0,"Central Europe, Eastern Europe, and Central Asia",High-income,Latin America and Caribbean,North Africa and Middle East,South Asia,"Southeast Asia, East Asia and Oceania",Sub-Saharian Africa,Tot
Tuberculosis,21,91,39,19,54,70,151,365
HIV/AIDS,76,1076,105,35,64,145,395,1647
"Diarrhea, lower respiratory infections, meningitis, and other common infectious diseases",288,2534,248,253,357,437,150,3696
Malaria,0,71,12,0,48,68,321,482
Neglected tropical diseases excluding malaria,5,136,78,36,50,65,73,428
Maternal disorders,13,511,48,387,79,84,43,1126


In [28]:
tail(Mt)

Unnamed: 0,"Central Europe, Eastern Europe, and Central Asia",High-income,Latin America and Caribbean,North Africa and Middle East,South Asia,"Southeast Asia, East Asia and Oceania",Sub-Saharian Africa,Tot
Congenital anomalies,69,1409,55,147,53,143,16,1756
Skin and subcutaneous diseases,359,3289,207,430,343,349,98,4463
Sense organ diseases,365,5256,304,366,301,493,106,6444
Oral disorders,27,535,95,216,124,93,7,1068
Sudden infant death syndrome,0,3,0,0,0,0,0,3
Tot,6998,67659,4766,8683,4848,9639,2585,91696


In [29]:
write.table(Mt,"Mapping_Cancer/Tables/RCTs_data_per_region_and_27_diseases_2005_2015.txt")


## Number of patients enrolled per region and disease
We count the number of patients enrolled per region and disease. 
1. We divide the number of patients across regions depending on nb of countries per region
2. We count several times patients enrolled in RCTs concerning several diseases (up to 3) to take into account co-morbidity
3. We will also count the number of patients enrolled per region, takinginto account trials concerning more than 3 dis.


In [30]:
data <- data0
GBD27 <- sapply(strsplit(as.character(data$GBD28),"&"),function(x){paste(x[x!="28"],collapse="&")})
data$GBD27 <- GBD27

#Trials relevant to the burden of diseases
data <- data[data$GBD27!="",]
#Trials with sample size information
data <- data[!is.na(data$Sample),]

dim(data)

In [33]:
summary(data$Sample)
table(data$Sample>=150000)
table(data$Sample<20)
#head(data[order(data$Sample,decreasing=TRUE),])

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
       1       48      100      770      234 20120000 


FALSE  TRUE 
85331    27 


FALSE  TRUE 
80863  4495 

In [34]:
data <- data[data$Sample>=20 & data$Sample<=150000,]

In [35]:
#We suppress the trial with more than 20million patients, it must be an error
#And trials with 1 patient enrolled (also an error)

dim(data)
summary(data$Sample)


    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
    20.0     54.0    100.0    402.9    244.0 145000.0 

In [36]:
#Nb of patients per region
nb_ctrs <- lapply(strsplit(as.character(data$Nb_ctr_per_reg),'&'),as.numeric)
RGs <-strsplit(as.character(data$Regions),'&')

In [37]:
length(unlist(nb_ctrs))
length(unlist(RGs))

In [38]:
dsp <- data.frame(
             nb_ctrs = unlist(nb_ctrs),
             sample = rep(data$Sample,sapply(nb_ctrs,length)),
             tt_nb_ctrs = rep(sapply(nb_ctrs,sum),sapply(nb_ctrs,length)))
dsp$reg <- unlist(RGs)

In [39]:
head(dsp)

Unnamed: 0,nb_ctrs,sample,tt_nb_ctrs,reg
1,1,1175,1,High-income
2,1,20,1,High-income
3,1,62,1,High-income
4,1,373,1,High-income
5,1,106,1,Latin America and Caribbean
6,1,50,1,High-income


In [40]:
dsp$spl_ctrs <- (dsp$sample*dsp$nb_ctrs)/dsp$tt_nb_ctrs

In [41]:
tot <- tapply(dsp$spl_ctrs,dsp$reg,sum)
tot
sum(dsp$spl_ctrs)
sum(data$Sample)


In [42]:
sort(tot)

In [43]:
#For each disease nb of patients per region
#We supress trials concerning more than 3 diseases
#Patients in trials concerning several diseases are counted several times
L <- lapply(strsplit(data$GBD27,"&"),as.numeric)
data <- data[sapply(L,length)<=4,]
L <- L[sapply(L,length)<=4]
dim(data)

In [44]:
nb_ctrs <- lapply(strsplit(as.character(data$Nb_ctr_per_reg),'&'),as.numeric)
RGs <-strsplit(as.character(data$Regions),'&')


In [45]:
library(gdata)

gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.

gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.

Attaching package: ‘gdata’

The following object is masked from ‘package:data.table’:

    last

The following object is masked from ‘package:stats’:

    nobs

The following object is masked from ‘package:utils’:

    object.size

The following object is masked from ‘package:base’:

    startsWith



In [46]:
Mt <- tot
t0 <- proc.time()

for(i in 1:27){

    trs <- sapply(L,function(x){i %in% x})
    dsp <- data.frame(nb_ctrs = unlist(nb_ctrs[trs]),
                      sample = rep(data$Sample[trs],sapply(nb_ctrs[trs],length)),
                      tt_nb_ctrs = rep(sapply(nb_ctrs[trs],sum),sapply(nb_ctrs[trs],length)))
    dsp$reg <- unlist(RGs[trs])
    dsp$reg <- factor(dsp$reg)
    levels(dsp$reg) <- c(levels(dsp$reg),setdiff(levels(dsp$reg),RGS))
    dsp$reg <- reorder(dsp$reg,new.order=RGS)
    dsp$spl_ctrs <- (dsp$sample*dsp$nb_ctrs)/dsp$tt_nb_ctrs
   
    Mt <- rbind(Mt,tapply(dsp$spl_ctrs,dsp$reg,sum))

}
t1 <- proc.time() - t0
t1
#12s


   user  system elapsed 
 17.220   0.000  17.234 

In [47]:
rownames(Mt) <- c("Tot",as.character(Mgbd$cause_name)[-28])

In [48]:
Mt

Unnamed: 0,"Central Europe, Eastern Europe, and Central Asia",High-income,Latin America and Caribbean,North Africa and Middle East,South Asia,"Southeast Asia, East Asia and Oceania",Sub-Saharian Africa
Tot,1713426.7,20877389.2,932054.3,899010.8,1758042.6,3340414.4,3045884.0
Tuberculosis,4657.024,19467.7,46191.902,3501.17,20352.418,38466.521,293629.265
HIV/AIDS,16842.588,348941.491,19676.789,3156.296,64300.766,52819.9,396633.169
"Diarrhea, lower respiratory infections, meningitis, and other common infectious diseases",139200.46,1717617.22,127516.11,33284.17,343141.78,235693.09,212975.16
Malaria,,5707.0,1509.35,,19380.31,91860.59,886637.75
Neglected tropical diseases excluding malaria,2005.0,49035.07,136902.43,6006.0,32386.0,79727.0,720456.5
Maternal disorders,6212.056,207783.2,21603.489,65770.478,141330.667,59289.167,118040.944
Neonatal disorders,4723.14,218263.89,12924.29,21759.18,191895.63,12550.24,71058.64
Nutritional deficiencies,19727.84,177183.23,28688.77,22856.03,171603.92,122520.83,187059.38
Sexually transmitted diseases excluding HIV,579.3333,142601.1667,8047.3333,9666.0,4663.0,24964.1667,9788.0


We add column Tot with the total number of patients included per disease


In [49]:
Mt[is.na(Mt)] <- 0

In [50]:
Mt <- data.frame(Mt)

In [51]:
Mt$Tot <- unlist(apply(Mt,1,sum))

In [52]:
Mt

Unnamed: 0,Central.Europe..Eastern.Europe..and.Central.Asia,High.income,Latin.America.and.Caribbean,North.Africa.and.Middle.East,South.Asia,Southeast.Asia..East.Asia.and.Oceania,Sub.Saharian.Africa,Tot
Tot,1713426.7,20877389.2,932054.3,899010.8,1758042.6,3340414.4,3045884.0,32566222.0
Tuberculosis,4657.024,19467.7,46191.902,3501.17,20352.418,38466.521,293629.265,426266.0
HIV/AIDS,16842.588,348941.491,19676.789,3156.296,64300.766,52819.9,396633.169,902371.0
"Diarrhea, lower respiratory infections, meningitis, and other common infectious diseases",139200.46,1717617.22,127516.11,33284.17,343141.78,235693.09,212975.16,2809428.0
Malaria,0.0,5707.0,1509.35,0.0,19380.31,91860.59,886637.75,1005095.0
Neglected tropical diseases excluding malaria,2005.0,49035.07,136902.43,6006.0,32386.0,79727.0,720456.5,1026518.0
Maternal disorders,6212.056,207783.2,21603.489,65770.478,141330.667,59289.167,118040.944,620030.0
Neonatal disorders,4723.14,218263.89,12924.29,21759.18,191895.63,12550.24,71058.64,533175.0
Nutritional deficiencies,19727.84,177183.23,28688.77,22856.03,171603.92,122520.83,187059.38,729640.0
Sexually transmitted diseases excluding HIV,579.3333,142601.1667,8047.3333,9666.0,4663.0,24964.1667,9788.0,200309.0


In [53]:
head(Mt[order(Mt$Tot,decreasing=TRUE),c(1,ncol(Mt))]/1e6)

Unnamed: 0,Central.Europe..Eastern.Europe..and.Central.Asia,Tot
Tot,1.713427,32.566222
Neoplasms,0.2638351,6.532214
Cardiovascular and circulatory diseases,0.3776904,4.851601
"Diabetes, urinary diseases and male infertility",0.2687159,4.167593
"Diarrhea, lower respiratory infections, meningitis, and other common infectious diseases",0.1392005,2.809428
Mental and behavioral disorders,0.09350885,2.51528


In [54]:
tail(Mt[order(Mt$Tot,decreasing=TRUE),c(1,ncol(Mt))]/1e6)

Unnamed: 0,Central.Europe..Eastern.Europe..and.Central.Asia,Tot
Congenital anomalies,0.01177503,0.326955
Oral disorders,0.003384989,0.271083
Sexually transmitted diseases excluding HIV,0.0005793333,0.200309
Leprosy,0.001905067,0.05581
Hemoglobinopathies and hemolytic anemias,0.0002436667,0.022559
Sudden infant death syndrome,0.0,0.001377


In [55]:
write.table(Mt,"Mapping_Cancer/Tables/Patients_data_per_region_and_27_diseases_2005_2015.txt")