## Drug Use Frequency

### ASD Drugs Pharmacy Data
To evaluate the use of each of the target study drugs in the ASD cohort, we built a table for all of the insurance pharmacy claims data for every member in the ASD cohort.

In [None]:
library( ggplot2 )
library( ggalluvial )
library( stringr )
library( UpSetR )
library(RColorBrewer)
library(networkD3)
library(webshot)

First, we read the file with the medication names and groups that we will study, and we create a table with the pharmacy claims data for every member of the ASD cohort.

In [None]:
# Medication list (tab-separated, all columns read as character). The columns
# used below are `medicationName` and `Group`.
medInputList <- read.delim("./medInputList", header = TRUE, sep = "\t", colClasses = "character")
groups <- as.character(unique(medInputList$Group))

# Build ASDPharmacyClaims: every pharmacy claim whose MemberId appears in
# ASDMembers. The table is dropped first, like every other derived table in
# this notebook, so that the cell can be re-run (SELECT ... INTO needs a target
# table that does not exist yet).
dbSendUpdate(cn, "DROP TABLE IF EXISTS ASDPharmacyClaims")
dbSendUpdate(cn, "SELECT Fills.*
INTO ASDPharmacyClaims
FROM ASDMembers
INNER JOIN PharmacyClaims Fills
  ON ASDMembers.MemberId = Fills.MemberId")

Then, we mapped all of the individual study ASD-associated drugs to the many variations of National Drug Codes (NDC) to ensure that we had every NDC linked to each drug. Each drug’s NDC map was used to obtain the pharmacy claims data for each member to calculate the number of annual pharmacy claims for each drug.

To build a Sankey Diagram that depicts individual member changes in the ASD study drugs over time, we used our NDC maps unique to each drug to obtain the pharmacy claims data associated with each of the seven target study drugs.

In [None]:
# For every medication group this cell:
#   1. rebuilds <Group>DrugCodeMap: the distinct NDC codes whose description
#      contains any medication name of the group (case-insensitive);
#   2. counts the claims per dispense year that use one of those codes;
#   3. rebuilds <Group>ASD: the distinct members with a claim for those codes.
# The yearly counts of all groups are merged on Year (full outer join) and
# written to ./medOutputCount.txt.
stopifnot(length(groups) > 0)

yearlyCounts <- vector("list", length(groups))
for (i in seq_along(groups)) {
  print(i)
  group <- groups[i]
  mapTable <- paste0(group, "DrugCodeMap")
  memberTable <- paste0(group, "ASD")

  # Double any single quote so a medication name cannot end the SQL literal.
  medNames <- tolower(medInputList[medInputList$Group == group, "medicationName"])
  medNames <- gsub("'", "''", medNames, fixed = TRUE)
  meds <- paste(medNames, collapse = "%' OR LOWER(NdcDescription)  like '%")

  query <- paste0(
    "SELECT DISTINCT NationalDrugCode, '", group, "' AS DrugName INTO ", mapTable,
    " FROM ASDPharmacyClaims WHERE LOWER(NdcDescription) like '%", meds, "%'"
  )
  dbSendUpdate(cn, paste0("DROP TABLE IF EXISTS ", mapTable))
  dbSendUpdate(cn, query)

  countQuery <- paste0(
    "SELECT YEAR ( A.DispenseDate ) AS Year, COUNT(*) AS ", group,
    " FROM ASDPharmacyClaims A, ", mapTable, " B",
    " WHERE A.NationalDrugCode = B.NationalDrugCode AND B.DrugName= '", group, "'",
    " GROUP BY YEAR ( A.DispenseDate ) ORDER BY YEAR ( A.DispenseDate )"
  )
  yearlyCounts[[i]] <- dbGetQuery(cn, countQuery)

  # table with the distinct members of the ASD cohort that have a claim for
  # this group
  dbSendUpdate(cn, paste0("DROP TABLE IF EXISTS ", memberTable))
  dbSendUpdate(cn, paste0(
    "SELECT DISTINCT( MemberId ) INTO ", memberTable,
    " FROM ASDPharmacyClaims WHERE NationalDrugCode IN (SELECT NationalDrugCode FROM ", mapTable, " )"
  ))
}

# One row per year, one column per group; a year missing for a group gives NA.
outputCount <- Reduce(
  function(left, right) merge(left, right, by = "Year", all = TRUE),
  yearlyCounts
)
write.table(outputCount, file = "./medOutputCount.txt", col.names = TRUE, row.names = FALSE, sep = "\t", quote = FALSE)


We create a table with the pharmacy claims for the study medications for each member in the ASD cohort.

In [None]:
# Build PharmacySubsetTest: the ASD-cohort claims whose NDC description
# contains any of the study medication names.
# The names are lower-cased, so every LIKE term (including the first one) is
# matched against LOWER(NdcDescription); otherwise the first medication would
# be compared case-sensitively on a case-sensitive collation.
allMeds <- paste(tolower(unique(medInputList$medicationName)), collapse = "%' OR LOWER(NdcDescription)  like '%")
dbSendUpdate(cn, "DROP TABLE IF EXISTS PharmacySubsetTest")
dbSendUpdate(cn, paste0("SELECT MemberId, DispenseDate, NdcDescription 
                    INTO PharmacySubsetTest
                    FROM ASDPharmacyClaims
                    WHERE
                    LOWER(NdcDescription) like '%", allMeds, "%'"))

Select only those from 2014 to 2019

In [None]:
# Build PharmacySubsetTest2014: the study-medication claims dispensed from
# 2014 to 2019 (both included). The table is dropped first so the cell can be
# re-run, as is done for the other derived tables.
dbSendUpdate(cn, "DROP TABLE IF EXISTS PharmacySubsetTest2014")
dbSendUpdate(cn, "SELECT *
                      INTO PharmacySubsetTest2014
                      FROM PharmacySubsetTest
                      WHERE YEAR(DispenseDate) BETWEEN 2014 AND 2019
                      ORDER BY MemberId, YEAR (DispenseDate)")

We extract the information and prepare the data as required for the Sankey diagram (TBA). 

In [None]:
# NOTE(review): this is the only query that uses a fully qualified table name
# (agf9.dbo.); confirm it points to the table created in the previous cell.
drugData <- dbGetQuery(cn, "SELECT * FROM agf9.dbo.PharmacySubsetTest2014")

# therapy: first space-delimited token of the NDC description.
drugData$therapy <- vapply(
  strsplit(as.character(drugData$NdcDescription), " "), `[`, character(1), 1
)
# timeperiod: first "-"-delimited token of the dispense date, as a number
# (assumes dates print as YYYY-MM-DD - TODO confirm).
drugData$timeperiod <- as.numeric(vapply(
  strsplit(as.character(drugData$DispenseDate), "[-]"), `[`, character(1), 1
))

# Keep claims before 2020. The comparison is done on numbers (it used to be a
# string comparison) and rows without a usable year are dropped.
drugData <- drugData[!is.na(drugData$timeperiod) & drugData$timeperiod < 2020, ]

# One row per member, year and therapy.
drugDataSubset <- unique(drugData[, c("MemberId", "timeperiod", "therapy")])
drugDataSubset$therapy <- as.factor(drugDataSubset$therapy)
drugDataSubset$MemberId <- as.factor(drugDataSubset$MemberId)

# consider the patients taking only one drug per year: member-year pairs that
# appear exactly once in the de-duplicated table
drugDataSubset$pair <- paste0(drugDataSubset$MemberId, "-", drugDataSubset$timeperiod)
output <- as.data.frame(table(drugDataSubset$pair))
onePerYear <- output[output$Freq == 1, ]
subset <- drugDataSubset[drugDataSubset$pair %in% onePerYear$Var1, ]

# save the subset
save(subset, file = "./outputGraphic.RData")

Plotting the Sankey diagram

In [None]:
# Build `num` node colours and the matching link colours for the Sankey plot.
#
# Colours are taken, in order, from the qualitative RColorBrewer palettes. When
# more colours are requested than those palettes provide, they are sampled
# with replacement (so the result is random and may contain repeats).
#
# num: number of colours wanted (a single non-negative number).
# Returns a list with two character vectors of length `num`:
#   nodeColor - "rgba(r,g,b,1)"   (opaque, for nodes)
#   linkColor - "rgba(r,g,b,0.4)" (same colour, semi-transparent, for links)
randomColor <- function(num){
  stopifnot(is.numeric(num), length(num) == 1, !is.na(num), num >= 0)
  if (num == 0) {
    return(list(nodeColor = character(0), linkColor = character(0)))
  }

  # get the most distinctive color codes
  qual_color_pals <- brewer.pal.info[brewer.pal.info$category == 'qual',]
  color_vector <- unlist(mapply(brewer.pal, qual_color_pals$maxcolors, rownames(qual_color_pals)))
  if (num > length(color_vector)) {
    color_vector <- sample(color_vector, num, replace = TRUE)
  } else {
    color_vector <- color_vector[seq_len(num)]
  }

  # 3 x num matrix with one column of red/green/blue values per colour
  rgb_values <- col2rgb(color_vector, alpha = FALSE)

  # "r,g,b" for every colour; iterating over the columns that exist (instead of
  # 1:(num-1)) is what makes num = 1 work
  rgb_text <- vapply(
    seq_len(ncol(rgb_values)),
    function(k) paste0(rgb_values[1:3, k], collapse = ','),
    character(1)
  )

  list(
    nodeColor = paste0("rgba(", rgb_text, ",1)"),
    linkColor = paste0("rgba(", rgb_text, ",0.4)")
  )
}
#randomColor(90)
# ---------------------------------
### plot in R studio (out of o2)
#load("./outputGraphic.RData")

# show sankey with/without drop off
# show sankey with/without drop off
dropoff <- TRUE # TRUE FALSE

start_year <- 2014
end_year <- 2019
# The years that are plotted are the ones present in the data.
years <- sort(unique(subset$timeperiod))

# sort the groups based on Primary Symptoms condition
medInputList <- medInputList[order(medInputList$Primary.Symptoms.Condition),]
groups <- as.character(unique(medInputList$Group))

# replace each medication with its group
# therapy is worked on as character: assigning a group name to a factor whose
# levels do not contain it would silently produce NA.
subset$therapy <- as.character(subset$therapy)
for (i in seq_len(nrow(medInputList))) {
  isMedication <- grepl(paste0('^', medInputList$medicationName[i]), toupper(subset$therapy))
  subset$therapy[isMedication] <- medInputList$Group[i]
}
treatments <- sort(unique(subset$therapy))
# they should be same! identical() also fails when the two sets differ in
# size or when no treatment is left, which an element-wise == would not catch.
stopifnot(identical(treatments, sort(groups)))
# keep the order given by the medication list
treatments <- groups

# structuring the data for Sankey
# treatms_df gets one row per link:
#   source - "<year>-<treatment>" the members were on
#   target - "<next year>-<treatment>" they are on the following year
#   value  - number of members (numeric)
# When `dropoff` is TRUE, the members of a source that have no row in the
# following year are kept as a link from the source to itself.
nYears <- length(years)
nTreatments <- length(treatments)
nTransitions <- max(nYears - 1, 0)

# Upper bound: per year and source, one link per target plus one self-link.
maxLinks <- nTransitions * nTreatments * (nTreatments + 1)
linkSource <- character(maxLinks)
linkTarget <- character(maxLinks)
linkValue <- numeric(maxLinks)
nLinks <- 0

for (i in seq_len(nTransitions)) {
  currentYear <- subset[which(subset$timeperiod == years[i]), ]
  nextYear <- subset[which(subset$timeperiod == years[i + 1]), ]

  for (j in seq_len(nTreatments)) {
    fromNode <- paste0(years[i], "-", treatments[j])
    members <- currentYear$MemberId[which(currentYear$therapy == treatments[j])]
    followed <- 0

    for (k in seq_len(nTreatments)) {
      moved <- sum(
        nextYear$therapy == treatments[k] & nextYear$MemberId %in% members,
        na.rm = TRUE
      )
      if (moved != 0) {
        nLinks <- nLinks + 1
        linkSource[nLinks] <- fromNode
        linkTarget[nLinks] <- paste0(years[i + 1], "-", treatments[k])
        linkValue[nLinks] <- moved
      }
      followed <- followed + moved
    }

    dropped <- length(members) - followed
    if (dropoff && dropped > 0) {
      # self-link: source and target are the same node
      nLinks <- nLinks + 1
      linkSource[nLinks] <- fromNode
      linkTarget[nLinks] <- fromNode
      linkValue[nLinks] <- dropped
    }
  }
}

kept <- seq_len(nLinks)
treatms_df <- data.frame(
  source = linkSource[kept],
  target = linkTarget[kept],
  value = linkValue[kept],
  stringsAsFactors = FALSE
)
# Node table: every distinct link end point ("<year>-<treatment>"), in order of
# first appearance.
nodes <- data.frame(
  name = unique(c(as.character(treatms_df$source), as.character(treatms_df$target)))
)

# order the nodes
#nodes <-  data.frame(name=nodes[order(nodes$name),])
# Position of each end point in `nodes`, shifted to start at 0.
treatms_df$IDsource <- match(treatms_df$source, nodes$name) - 1
treatms_df$IDtarget <- match(treatms_df$target, nodes$name) - 1

# Strip the leading "<year>-" so that nodes are labelled with the treatment.
nodes$name <- sub('^[0-9]+-', '', nodes$name)

# Add a 'group' column to each link: "<treatment>_L" of its source. The
# treatment is obtained by stripping the "<year>-" prefix, exactly as for the
# node names, so treatment names that contain a hyphen are kept whole.
linkTreatment <- sub('^[0-9]+-', '', as.character(treatms_df$source))
treatms_df$group <- if (length(linkTreatment) > 0) paste0(linkTreatment, '_L') else character(0)
# Links from a node to itself get their own group.
treatms_df$group[treatms_df$IDsource == treatms_df$IDtarget] <- "SELFNODELINK"

# Add a 'group' column to each node
nodes$group <- as.factor(nodes$name)

# Colours: one per treatment for the nodes, and the matching one for the links.
sankeyPalette <- randomColor(length(treatments))
nodeColors <- sankeyPalette$nodeColor
linkColors <- sankeyPalette$linkColor

# Turns c("a", "b") into the text 'a','b'.
singleQuoted <- function(x) paste0("'", paste(x, collapse = "','"), "'")

# Link group names are the treatment names followed by "_L".
linkGroupNames <- paste0(treatments, "_L")
# Echo the quoted link groups.
singleQuoted(linkGroupNames)

# d3 ordinal scale: the domain lists the node groups, the link groups and
# SELFNODELINK; the range lists their colours in the same order, with a fully
# transparent colour for SELFNODELINK.
my_color <- paste(
  "d3.scaleOrdinal().domain([",
  singleQuoted(treatments),
  ",",
  singleQuoted(linkGroupNames),
  ",'SELFNODELINK']).range([",
  singleQuoted(nodeColors),
  ",",
  singleQuoted(linkColors),
  ",'rgba(211, 211, 211, 0)'])"
)

# Draw the Sankey diagram.
sn <- sankeyNetwork(
  Links = treatms_df,
  Nodes = nodes,
  Source = 'IDsource',
  Target = 'IDtarget',
  Value = 'value',
  NodeID = 'name',
  colourScale = my_color,
  NodeGroup = "group",
  LinkGroup = "group",
  fontSize = 9,
  nodeWidth = 15,
  sinksRight = FALSE
) #,iterations = 0)

# Write the widget to an HTML file.
saveNetwork(sn, "sankey_plot.html")

# Use Viewer tab --> Export --> save as image (adjust width and height!)

# you may need to run this line: webshot::install_phantomjs()
# webshot("sankey_plot.html", "sankey_plot.png", vwidth = 1096, vheight = 826,)

# onRender(
#   sn,
#   '
#   function(el, x) {
#     d3.selectAll(".node text").attr("text-anchor", "begin").attr("x", 20);
#   }
#   '
# )

### ASD Drug Use Over Time
To analyze the use of each target study drug over time, we obtained a count of distinct members from the ASD cohort that also had valid pharmacy claims between 2014 and 2019.

In [None]:
# Build PharmacySubsetTest2014to2019_counts: one row per member and dispense
# year, with one column n_<Group> per medication group holding the number of
# claims whose NDC description contains a medication name of that group.
stopifnot(length(groups) > 0)

# One "sum(case ...) as n_<Group>" expression per group. The medication names
# are lower-cased, so every LIKE term is matched against
# LOWER(NdcDescription), including the first one of each group.
caseClauses <- vapply(groups, function(group) {
  meds <- paste(
    tolower(medInputList[medInputList$Group == group, "medicationName"]),
    collapse = "%' OR LOWER(NdcDescription)  like '%"
  )
  paste0(
    "sum( case when LOWER(NdcDescription) like '%", meds,
    "%' then 1 else 0 end) as n_", group
  )
}, character(1), USE.NAMES = FALSE)

# The INTO / FROM / GROUP BY tail is appended once, whatever the number of
# groups (it used to be missing when there was a single group).
queryStart <- paste0(
  "SELECT MemberId, YEAR(DispenseDate) AS DispenseYear, ",
  paste(caseClauses, collapse = ", "),
  " into PharmacySubsetTest2014to2019_counts",
  " from PharmacySubsetTest2014 group by MemberID, YEAR(DispenseDate)"
)

dbSendUpdate(cn, "DROP TABLE IF EXISTS PharmacySubsetTest2014to2019_counts")
dbSendUpdate(cn, queryStart)

Single- and Two-Drug Regimen Use Across All Years (2014 -2019)

First, we obtained the counts for members taking only one of the target study drugs (e.g., methylphenidate only; without pharmacy claims for atomoxetine, guanfacine, etc.). The sum of distinct member counts between 2014 and 2019 was obtained.

Second, we used a similar query to determine the number of distinct members from this same sample subset that were on a two-drug regimen (e.g., methylphenidate and atomoxetine, without prescriptions for the other target drugs). The sum of distinct member counts between 2014 and 2019 was obtained.

In [None]:
# For each year from 2014 to 2019, count the members on exactly one or exactly
# two medication groups, per combination of groups.
# `final` has one row per year and combination:
#   Count       - number of members
#   Combination - group names joined by "&"
#   year        - dispense year
yearlyCombinations <- list()
for (i in c(2014:2019)) {
  print(i)
  dyear <- dbGetQuery(cn, paste0("SELECT * FROM PharmacySubsetTest2014to2019_counts where DispenseYear = ", i))

  # Columns 1-2 are MemberId and DispenseYear; every other column is a
  # per-group count named n_<Group>, however many groups there are.
  drugCols <- setdiff(seq_along(dyear), 1:2)
  stopifnot(length(drugCols) > 0)
  # Only the leading "n_" is removed, so group names containing "n_" survive.
  drugs <- sub("^n_", "", colnames(dyear)[drugCols])

  # "Yes"/"No" per group, joined with "-" into one key per member.
  flags <- lapply(dyear[drugCols], function(x) ifelse(x > 0, "Yes", "No"))
  dyear$combination <- do.call(paste, c(unname(flags), sep = "-"))
  dyear$counts <- str_count(dyear$combination, "Yes")

  dyearSubset <- dyear[dyear$counts %in% c(1, 2), ]

  # table() counts every combination; summary() of a factor would lump the
  # combinations beyond its `maxsum` limit into a single "(Other)" entry.
  comboCounts <- table(dyearSubset$combination)
  comboKeys <- as.character(names(comboCounts))
  comboLabels <- vapply(
    strsplit(comboKeys, "-", fixed = TRUE),
    function(flag) paste(drugs[flag == "Yes"], collapse = "&"),
    character(1)
  )

  output <- data.frame(
    Count = as.integer(comboCounts),
    Combination = comboLabels,
    year = rep(i, length(comboCounts)),
    row.names = NULL,
    stringsAsFactors = FALSE
  )
  yearlyCombinations[[length(yearlyCombinations) + 1]] <- output
}

final <- do.call(rbind, yearlyCombinations)

#### UpSetR plots
We prepare the data to be plotted using UpSetR (https://github.com/hms-dbmi/UpSetR), extracting the information by year and putting it all together in a table that we called "totalData".

In [None]:
allGroups <- unique(medInputList$Group)

# Per year: turn the combination counts of `final` into the UpSetR input
# format, tag the rows with "Year_<year>" in column x, and add a zero column
# for every group that has no column yet. The yearly tables are then stacked
# into `totalData`.
yearlySets <- lapply(c(2014:2019), function(yr) {
  print(yr)
  inYear <- final$year == yr
  combinationCounts <- final[inYear, "Count"]
  names(combinationCounts) <- final[inYear, "Combination"]

  sets <- fromExpression(combinationCounts)
  sets$x <- paste0("Year_", yr)

  absentGroups <- allGroups[!allGroups %in% colnames(sets)]
  sets[, absentGroups] <- 0
  sets
})

totalData <- Reduce(rbind, yearlySets)

Finally, we plot the UpSetR diagram.

In [None]:
totalData$x <- as.factor(totalData$x)

# One `elements` query per colour. The first query selects all six years and
# each following query selects one year less, ending with Year_2014 only.
yearLevels <- paste0("Year_", 2019:2014)
yearColors <- c("#b54e75", "#e69f00", "#58ad97", "#566fa8", "#2a2369", grey(0.7))
yearQueries <- lapply(seq_along(yearLevels), function(k) {
  list(
    query = elements,
    params = list("x", yearLevels[k:length(yearLevels)]),
    color = yearColors[k],
    active = TRUE
  )
})

upset(
  as.data.frame(totalData),
  queries = yearQueries,
  nsets = 25,
  order.by = "freq",
  cutoff = 0,
  query.legend = "bottom",
  point.size = 1.1,
  line.size = 0.35,
  text.scale = 0.5
)