In [10]:
library(AnnotationGx)
library(data.table)
library(httr)
library(jsonlite)

In [2]:
ctrp_pubchem <- readRDS("/home/bioinf/bhklab/jermiah/projects/annotationScripts/results/ctrp/pubchem_preproccessed.RDS")

ctrp.treatment.metadata <- ctrp_pubchem$ctrp.treatment.metadata
successful_master_cpd_ids <- ctrp_pubchem$successful_master_cpd_ids
failed_master_cpd_ids <- ctrp_pubchem$failed_master_cpd_ids

In [6]:
merged <- merge.data.table(ctrp.treatment.metadata ,successful_master_cpd_ids, by.x = "master_cpd_id", by.y = "master_cpd_id", all.x = TRUE, all.y = FALSE)


In [12]:
buildURL <- function(...) paste0(na.omit(unlist(list(...))), collapse='/')

parseJSON <- function(response, ..., encoding='UTF-8', query_only=FALSE) {
    if (isTRUE(query_only)) return(response)
    tryCatch({
        fromJSON(content(response, ..., as='text', type='JSON',
            encoding=encoding))
    },
    error=function(e) {
        fromJSON(content(response, ..., type='JSON', encoding=encoding))
    })
}

checkThrottlingStatus <- function(result){
    message <- headers(result)$`x-throttling-control`
    matches <- regmatches(message, gregexpr("\\((.*?)%\\)", message))  # Extracts text within parentheses
    percentages <- gsub("\\(|%|\\)", "", unlist(matches[1:3]))
    # print(percentages)
    percentage <- max(as.numeric(percentages))
    if(as.integer(percentage) > 15 && as.integer(percentage) < 30){
        Sys.sleep(15)
    }else if (as.integer(percentage) > 30 && as.integer(percentage) < 50){
        Sys.sleep(20)
    }else if (as.integer(percentage) > 50 && as.integer(percentage) < 75) {
        print(paste0("Throttling at ", percentage, "%. Sleeping for 30 seconds."))
        Sys.sleep(30)
    }else if (as.integer(percentage) > 75) {
        print(paste0("Throttling at ", percentage, "%. Sleeping for 60 seconds."))
        Sys.sleep(30)
    }else{
        Sys.sleep(1)
    }   
}

In [14]:
getPubChemCHEMBL <- function(
    compound,
    url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound',
    output = 'JSON',
    type = 'ChEMBL ID',
    timeout_s = 29,
    retries = 3,
    quiet = TRUE
    ){
        if(type == "DILI") queryURL <- paste0(buildURL(url, compound, output), '?heading=', "Drug Induced Liver Injury")
        else queryURL <- paste0(buildURL(url, compound, output), '?heading=', type)

        tryCatch({
            result <- RETRY('GET', URLencode(queryURL), times = retries, quiet = quiet)
        }, error=function(e) {
            print(paste0("Error: ", e$message))
            return(NULL)
        })
        
        checkThrottlingStatus(result)
        result <- parseJSON(result)

        if (type == 'ChEMBL ID') {
            result <- result$Record$Reference$SourceID
            result <- gsub("::Compound", "", result)
        }else if (type == 'NSC Number'){
            result <- result$Record$Reference$SourceID[1]
            result <- gsub(" ", "", result)
        }else if (type == 'DILI'){
            if(length(result$Record$Section) == 0){
                result <- "NA"
                
            }else{
                dt_ <- as.data.table(result$Record$Section)
                dt_ <- as.data.table(dt_)$Section[[1]]
                dt_ <- as.data.table(dt_)$Section
                dt_ <- as.data.table(dt_)
                dt_ <- as.data.table(dt_)$Information
                # print(as.data.table(dt_)[1:3,  .(Name,unlist(Value))])
                section <- as.data.table(dt_)[1:3, "DILI" := paste0(unlist(Name), ":", unlist(Value))]
                section <- paste0(section[1:4, DILI], collapse= "; ")

                # create a list for each row as Name:Value string with no spaces and no new lines
                reference <- paste0("LTKBID:", result$Record$Reference$SourceID)
                result <- c(section, reference)
                result <- paste0(result, collapse = "; ")
            }
        }else if (type == 'CAS'){
            result <- result$Record$Reference$SourceID[1]
        }else if (type == 'ATC Code'){
            if(length(result$Record$Section) == 0){
                result <- "NA"
                
            }else{dt_ <- as.data.table(result$Record$Section)
            dt_ <- as.data.table(dt_)$Section[[1]]
            dt_ <- as.data.table(dt_)$Information
            dt_ <- as.data.table(dt_)$Value
            dt_ <- as.data.table(dt_[1])
            result <- paste0("ATC:", dt_)}
        }
        
        if (is.null(result)) result <- list(compound, "N/A")
        else result <- list(compound,result)
        names(result) <- c("cid", type)
        return(result)
    }
merged <- merged[!lapply(merged$cids, is.null) == TRUE,]
test_CID <-  2375  

annotations <- c('ChEMBL ID', 'NSC Number', 'DILI', 'CAS', 'ATC Code')
# getPubChemCHEMBL(merged$cids[13], type = "DILI")
# lapply(annotations, function(x) as.data.table(getPubChemCHEMBL(test_CID, type = x)))

In [17]:
# merged$cids[1:10]

annotations <- c('ChEMBL ID', 'NSC Number', 'DILI', 'CAS', 'ATC Code')
# parallelRun<- BiocParallel::MulticoreParam(workers=8, progressbar=TRUE, stop.on.error=FALSE)

result <- 
    suppressWarnings(rbindlist(
        BiocParallel::bptry(
                BiocParallel::bplapply(
            c(merged$cids[1:50]), function(y){
                BiocParallel::bptry(
                    annotationCID <- BiocParallel::bplapply(annotations, function(x) {
                        as.data.table(getPubChemCHEMBL(y, type = x))
                        },
                        BPPARAM = BiocParallel::MulticoreParam(workers = 5, stop.on.error=FALSE)
                    )
                )
                Reduce(function(x, y) merge(x, y, allow.cartesian = TRUE), annotationCID)
            },
            BPPARAM = BiocParallel::MulticoreParam(workers = 6, progressbar = TRUE, stop.on.error=FALSE)
        )
    ),
    fill = TRUE,
))
# save result to RDS file 
saveRDS(result, "ctrp_pubchem_annotations.RDS")




In [19]:
result

cid,ChEMBL ID,NSC Number,DILI,CAS,ATC Code
<int>,<chr>,<chr>,<chr>,<chr>,<chr>
6623618,CHEMBL492468,,,,
7326481,CHEMBL1300397,,,,
1641662,,,,,
2842253,CHEMBL1526042,,,,
2729026,CHEMBL1402326,,,,
613000,CHEMBL4303295,,,32703-82-5,
444795,CHEMBL38,NSC759631,Compound:tretinoin; DILI Annotation:Ambiguous DILI-concern; Severity Grade:3; NA; LTKBID:LT00338,302-79-4,ATC:D10AD51
64971,CHEMBL269277,NSC677578,,472-15-1,
4788,CHEMBL45068,NSC407292,,60-82-2,
5426,CHEMBL468,NSC758479,Compound:thalidomide; DILI Annotation:Less-DILI-Concern; Severity Grade:4; NA; LTKBID:LT00450,21096,ATC:L04AX02


In [94]:
test <- merged[540:545, c("cpd_name","cids")]


test



cpd_name,cids
<chr>,<list>
vincristine,5978
vorapaxar,10077130
vorinostat,5311
zebularine,100016


In [21]:
subs[lapl]

In [3]:
# get all paths in metadata folder
metadata_paths <- list.files(path = "metadata", pattern = "*.RDS", full.names = TRUE)
print(metadata_paths)

[1] "metadata/cellosaurus.RDS"                           
[2] "metadata/pubchem_annotations_ATC Code.RDS"          
[3] "metadata/pubchem_annotations_FDA Approved Drugs.RDS"
[4] "metadata/pubchem_annotations_NSC Number.RDS"        


In [16]:
# read in 2-9
atc <- as.data.table(readRDS(metadata_paths[2]))
parsed_atc<- AnnotationGx:::.parseATCannotations(atc)
print(dim(parsed_atc))
parsed_atc[1:5]

[1] 1586    4


SourceName,SourceID,ATC_code,CID
<chr>,<chr>,<chr>,<int>
European Medicines Agency (EMA),EMEA/H/C/000073_1,L01CD02,148124
European Medicines Agency (EMA),EMEA/H/C/000082_1,L04AA06,5281078
European Medicines Agency (EMA),EMEA/H/C/000088_1,"A10AB04, A10AD04",118984450
European Medicines Agency (EMA),EMEA/H/C/000089_1,L01DB,31703
European Medicines Agency (EMA),EMEA/H/C/000089_1,L01DB,443939


In [21]:
names(dili)

In [22]:
dili$Data