# Sex differences in Autism Spectrum Disorder, a Comorbidity Pattern Analysis in National Scale Data: validation in electronic health record data

## Load the libraries

In [3]:
library( "RJDBC" )
library( "knitr" )
library( "dplyr" )
library( "tidyr" )
library("lubridate")

## Set up the DB server connection

In [None]:
# Open a JDBC connection to the database server.
#
# Arguments:
#   driverClss  JDBC driver class name, passed to RJDBC::JDBC( driverClass = ).
#   classPth    Path to the driver jar, passed to RJDBC::JDBC( classPath = ).
#   usr, psw    Credentials passed to RJDBC::dbConnect().
#   outputPath  Accepted for backward compatibility; not used by this function.
#   verbose     Accepted for backward compatibility; not used by this function.
#   srv         JDBC connection URL. When NULL (default) the value of a variable
#               named `srv` visible from the environment where this function was
#               defined is used, which is what the function silently relied on
#               before this argument existed.
#
# Side effect: the connection is stored in the variable `conn` of the enclosing
# (normally global) environment, because the rest of the notebook reads `conn`.
# Returns the connection invisibly.
connectToDB <- function( driverClss, classPth, usr, psw, outputPath, verbose = FALSE, srv = NULL ){
    
    # Resolve the server URL before touching the driver so that a missing
    # URL fails early with an explicit message.
    if( is.null( srv ) ){
        definingEnv <- parent.env( environment() )
        if( ! exists( "srv", envir = definingEnv ) ){
            stop( "connectToDB: no server URL available; pass `srv` or define a variable named `srv`", 
                  call. = FALSE )
        }
        srv <- get( "srv", envir = definingEnv )
    }
    
    drv <- RJDBC::JDBC( driverClass = driverClss , 
                        classPath   = classPth )
    
    # `<<-` is kept on purpose: later cells use the global `conn`.
    conn <<- RJDBC::dbConnect( drv, srv, usr, psw )
    
    invisible( conn )
}

# Open the database connection. connectToDB() stores it in the global `conn`,
# which the queries below use.
# NOTE(review): XXXX looks like a redacted placeholder; presumably each one must
# be replaced with the site-specific driver class, jar path, credentials and
# output path before running. Confirm.
connectToDB(
  driverClss  = XXXX,
  classPth    = XXXX,
  usr         = XXXX,
  psw         = XXXX,
  outputPath  = XXXX
)

## Patient selection 
Select those patients with at least one ICD9-CM code. 

In [None]:
# Retrieve every observation whose concept code starts with 'ICD9:'.
# One row per observation: patient identifier, observation start date and the
# concept code, which still carries its 'ICD9:' prefix.
icd9Patients <- dbGetQuery( conn, "SELECT PATIENT_NUM, START_DATE, CONCEPT_CD 
                                   FROM OBS_FACT 
                                   WHERE CONCEPT_CD LIKE 'ICD9:%'" )


### Remove those patients that could be in the claim database
To guarantee that the two sets analyzed are completely independent, we remove from the electronic health record database those patients that could potentially be in the claim database. To do this we look for the concepts that contain the name of the claim database, and remove the patients that have any of those concept identifiers associated. 

In [None]:
# Retrieve sex and birth date for every patient in PATIENT_DIM that has no
# observation with one of the listed 'INS:' concept codes.
# NOTE(review): per the accompanying text these 'INS:' codes identify the claim
# database; the mapping of each code is not visible here, so confirm it.
mydataselection <- dbGetQuery(conn, "SELECT PATIENT_NUM,SEX_CD,BIRTH_DATE 
                                     FROM PATIENT_DIM
                                     WHERE PATIENT_NUM 
                                     NOT IN (SELECT distinct PATIENT_NUM  
                                             FROM OBS_FACT 
                                             WHERE CONCEPT_CD IN  ('INS:4325','INS:2','INS:3','INS:7070','INS:4100'))")

Merge both data sets, to have all the information (patient identifier, sex, birth date, start date and ICD code) for the selected patients. 

In [None]:
# Attach each ICD9-CM observation to the demographics of its patient. The inner
# join keeps only the patients present in both query results.
dataSet <- inner_join( mydataselection, icd9Patients, by = "PATIENT_NUM" )      

# NOTE(review): the SQL below is commented out and never executed; it appears to
# be a leftover reference query for the ASD ICD9-CM codes.
#SELECT distinct PATIENT_NUM FROM ASD_GI.OBSERVATION_FACT 
# WHERE CONCEPT_CD IN ('ICD9:299.0','ICD9:299.01','ICD9:299.00','ICD9:299.80','ICD9:299.8','ICD9:299.81','ICD9:299.9','ICD9:299.90','ICD9:299.91');

Select those diagnoses done from January 1st 2008. 

In [None]:
# Keep only the diagnoses recorded on or after January 1st 2008.
# The dates are parsed so the comparison is chronological instead of relying on
# string ordering, and which() drops rows with a missing START_DATE instead of
# turning them into all-NA rows.
dataSet <- dataSet[ which( as.Date( dataSet$START_DATE ) >= as.Date( "2008-01-01" ) ), ]

Select those diagnoses made when the patient was between 0 and 18 years old.


In [None]:
# Age (completed years) of the patient at the time of each diagnosis.
# The calendar-aware interval length is used instead of days / 365, which
# ignores leap years and rounds a patient up to the next age a few days before
# the birthday.
dataSet$Age <- floor( lubridate::time_length( 
                          lubridate::interval( as.Date( dataSet$BIRTH_DATE ), 
                                               as.Date( dataSet$START_DATE ) ), 
                          unit = "years" ) )

# Keep the diagnoses made between 0 and 18 years old; which() drops the rows
# whose age could not be computed instead of turning them into all-NA rows.
dataSet     <- dataSet[ which( dataSet$Age >= 0 & dataSet$Age <= 18 ), ]

From those, select the patients with at least 12 months of coverage (diagnoses recorded in at least 12 distinct calendar months) between 0 and 18 years old.

In [None]:
#estimate the months of information
# Calendar month ("YYYY-MM") in which each diagnosis was recorded.
dataSet$monthsOfInformation <- format( as.Date( dataSet$START_DATE ), "%Y-%m" )

# One row per patient and calendar month, then the number of distinct months
# with at least one diagnosis for every patient.
monthsOfInfo <- dataSet[ , c( "PATIENT_NUM", "monthsOfInformation" ) ]
monthsOfInfo <- monthsOfInfo[ ! duplicated( monthsOfInfo ), ]
months       <- as.data.frame( table( monthsOfInfo$PATIENT_NUM ) )

# "At least 12 months" means >= 12 (the previous `> 12` required 13 months).
selection <- months[ months$Freq >= 12, "Var1" ]


#select those patients with at least 12 months of information
# The trailing comma makes this a row filter; without it the logical vector was
# used to select columns.
dataSet <- dataSet[ dataSet$PATIENT_NUM %in% selection, ]

Generate two different data sets, one with those patients with Autism Spectrum Disorder ICD9-CM codes (ASD patients) and another one with those patients without ASD. 

In [None]:
# ICD9-CM codes that define Autism Spectrum Disorder (without the 'ICD9:' prefix).
autismIcd9codes <- c( '299.0', '299.00', '299.01', '299.8', '299.80', '299.81', '299.9', '299.90', '299.91' )

# CONCEPT_CD values come from the database as 'ICD9:<code>', so the prefix is
# removed before comparing; comparing the raw values matched no patient.
asdPatients <- unique( dataSet[ sub( "^ICD9:", "", dataSet$CONCEPT_CD ) %in% autismIcd9codes, 
                                "PATIENT_NUM" ] )

# All the diagnoses of the patients with / without at least one ASD code.
AsdData   <- dataSet[ dataSet$PATIENT_NUM %in% asdPatients, ]
noAsdData <- dataSet[ ! dataSet$PATIENT_NUM %in% asdPatients, ] 

## Patients with ASD: group generation
Select patients diagnosed at least 3 different times with an ASD ICD9-CM code. 

In [None]:
# ASD diagnoses of each patient. The code lives in CONCEPT_CD (there is no
# `code` column) and the 'ICD9:' prefix is removed on both sides so the
# comparison works whether or not the values carry it.
asdBch3diag <- AsdData[ sub( "^ICD9:", "", AsdData$CONCEPT_CD ) %in% 
                            sub( "^ICD9:", "", autismIcd9codes ), 
                        c( "PATIENT_NUM", "START_DATE" ) ]

# Count the distinct dates with an ASD code per patient and keep the patients
# with at least 3 of them.
asdBch3diag <- asdBch3diag[ ! duplicated( asdBch3diag ), ]
asdBch3diag <- as.data.frame( table( asdBch3diag$PATIENT_NUM ) )
asdBch3diag <- asdBch3diag[ asdBch3diag$Freq >= 3, ]

# All the diagnoses of the selected patients (taken from AsdData; the previous
# `bchRawData` is not defined anywhere in this notebook).
ASD <- AsdData[ AsdData$PATIENT_NUM %in% asdBch3diag$Var1, ]
length( unique( ASD$PATIENT_NUM ) )

Generate two sets, one for ASD females and another one for ASD males. 

In [None]:
# Split the ASD patients by sex. The data set holds the ICD code in CONCEPT_CD
# and the age in Age (there are no `code` / `age` columns); %in% drops the rows
# with a missing sex instead of turning them into all-NA rows.
femaleASD <- ASD[ ASD$SEX_CD %in% "F",  c( "PATIENT_NUM", "CONCEPT_CD", "START_DATE", "SEX_CD", "BIRTH_DATE", "Age" ) ]
maleASD   <- ASD[ ASD$SEX_CD %in% "M",  c( "PATIENT_NUM", "CONCEPT_CD", "START_DATE", "SEX_CD", "BIRTH_DATE", "Age" ) ]

#reformat the ICD codes properly (ICD9:XXX to XXX), as done for the non-ASD
#groups, so they can be matched against the ICD9 to PheWAS mapping
femaleASD$CONCEPT_CD <- sub( "^ICD9:", "", femaleASD$CONCEPT_CD )
maleASD$CONCEPT_CD   <- sub( "^ICD9:", "", maleASD$CONCEPT_CD )

#re-name the column names to the ones expected by the comorbidity analysis
colnames( femaleASD ) <- c( "MemberId", "Icd", "DateServiceStarted", "Gender", "BirthYear", "age")
colnames( maleASD )   <- c( "MemberId", "Icd", "DateServiceStarted", "Gender", "BirthYear", "age")

femaleASD$Caco <- "Female"
maleASD$Caco   <- "Male"

## Patients without ASD: group generation
Generate two sets, one for females without ASD and another one for males without ASD

In [None]:
# Split the patients without ASD by sex (taken from noAsdData; the previous
# `noAsDBch` is not defined anywhere in this notebook). %in% drops the rows
# with a missing sex instead of turning them into all-NA rows.
femaleNonASD <- noAsdData[ noAsdData$SEX_CD %in% "F", 
                           c("PATIENT_NUM", "CONCEPT_CD","START_DATE", "SEX_CD", "BIRTH_DATE", "Age")]
maleNonASD <- noAsdData[ noAsdData$SEX_CD %in% "M", 
                           c("PATIENT_NUM", "CONCEPT_CD","START_DATE", "SEX_CD", "BIRTH_DATE", "Age")]

#reformat the ICD codes properly (ICD9:XXX to XXX)
femaleNonASD$CONCEPT_CD <- sub( "^ICD9:", "", femaleNonASD$CONCEPT_CD )
maleNonASD$CONCEPT_CD   <- sub( "^ICD9:", "", maleNonASD$CONCEPT_CD )
 
#re-name the column names
colnames( femaleNonASD ) <- c( "MemberId", "Icd","DateServiceStarted", "Gender", "BirthYear", "age" )
colnames( maleNonASD )   <- c( "MemberId", "Icd","DateServiceStarted", "Gender", "BirthYear", "age" )

maleNonASD$Caco   <- "MaleNonASD"
femaleNonASD$Caco <- "FemaleNonASD"

## Comorbidity analysis
We create a function, called "comorbidityAnalysis", that allows us to run the comorbidity analysis having as input:
- two groups to compare: cases and controls
- the age range
- a symbol to define/identify cases and controls
- the minimum number of times of diagnosis to consider a patient having a phenotype

As an output it generates a data frame with one PheWAS code per row and the number of cases and controls having or not having the phecode. It also shows the p-value (raw and Bonferroni-corrected), the confidence interval and the odds ratio. 

In [5]:
# Run a PheWAS-style comorbidity analysis comparing cases against controls.
#
# Arguments:
#   cases, controls   Data frames with the columns MemberId, Icd,
#                     DateServiceStarted, Gender, age and Caco. If a column
#                     named Phenotype is already present it is used directly
#                     and the ICD9-CM to phecode mapping is skipped.
#   minAge, maxAge    Inclusive age range of the diagnoses to analyse.
#   casesSymbol       Value of Caco that identifies the cases (case-insensitive).
#   controlsSymbol    Value of Caco that identifies the controls (case-insensitive).
#   minimunDiagnosis  Minimum number of distinct service dates with a phecode
#                     for a patient to be counted as having it (default 3).
#
# Files read from the working directory:
#   "phecode_icd9_rolled.csv"     columns ICD9 and PheCode (only when mapping).
#   "phecode_definitions1.2.csv"  columns jd_code and sex.
#
# Patients with a phecode specific to the opposite sex are removed (females
# with male-specific codes and males with female-specific codes).
#
# Returns a data frame sorted by phecode with one row per phecode and the
# columns phecode, caseYes, caseNo, caseExclude, controlYes, controlNo,
# controlExclude, pValue, confIntL, confIntH, OR (Fisher exact test, two-sided)
# and pAdjust (Bonferroni). The statistics are numeric columns.
# Stops with an error when no diagnosis is left after filtering.
comorbidityAnalysis <- function( cases, controls, minAge, maxAge, casesSymbol, controlsSymbol, minimunDiagnosis = 3){
    
    myCompleteData <- rbind( cases, controls )
  
    #select diagnostics in the age range (which() drops the rows without age)
    myCompleteData <- myCompleteData[ which( myCompleteData$age >= minAge &
                                             myCompleteData$age <= maxAge ), ]
    
    #map from the ICD9-CM to the PheWAS category
    if(! "Phenotype" %in% colnames( myCompleteData ) ){
        
        phemapFile <- read.csv( "phecode_icd9_rolled.csv", header =TRUE, colClasses = "character" )
        phemapFile <- phemapFile[, c( "ICD9", "PheCode") ]
        colnames( phemapFile ) <- c( "Icd", "Phenotype")
        
        myCompleteDataFinal <- inner_join( myCompleteData, phemapFile, by = "Icd" )
    }else{
        myCompleteDataFinal <- myCompleteData
    }
    rm( myCompleteData )
 
    
    #load the sex-specific phecodes; %in% ignores rows without sex annotation
    sexSpecificPheCodes    <- read.csv("phecode_definitions1.2.csv", header = TRUE, colClasses = "character" )
    femaleSpecificPheCodes <- sexSpecificPheCodes[ sexSpecificPheCodes$sex %in% "Female", ]
    maleSpecificPheCodes   <- sexSpecificPheCodes[ sexSpecificPheCodes$sex %in% "Male", ]
    
    #remove males with females ASD diagnosis
    message( "Remove males with females ASD diagnosis ..." )
    maleDiagnosisError <- myCompleteDataFinal[ myCompleteDataFinal$Phenotype %in% femaleSpecificPheCodes$jd_code & 
                                               myCompleteDataFinal$Gender %in% "M", ]
    
    #remove females with males ASD diagnosis
    message( "Remove females with males ASD diagnosis ..." )
    femaleDiagnosisError <- myCompleteDataFinal[ myCompleteDataFinal$Phenotype %in% maleSpecificPheCodes$jd_code & 
                                                 myCompleteDataFinal$Gender %in% "F", ]

    #a patient with any sex-inconsistent code is removed completely
    inconsistentPatients <- c( femaleDiagnosisError$MemberId, maleDiagnosisError$MemberId )
    myCompleteDataFinal  <- myCompleteDataFinal[ ! myCompleteDataFinal$MemberId %in% inconsistentPatients, ]
    
    if( nrow( myCompleteDataFinal ) == 0 ){
        stop( "comorbidityAnalysis: no diagnoses left after the age, mapping and sex filters", call. = FALSE )
    }
    
    message( paste0( "Total number of patients in the data set: ", length( unique( myCompleteDataFinal$MemberId ) ) ) )
    message( paste0( "Total number of cases: ", 
                    length( unique( myCompleteDataFinal$MemberId[ tolower( myCompleteDataFinal$Caco ) %in% 
                                                                  tolower( casesSymbol ) ] ) ) ) )
    message( paste0( "Total number of controls: ", 
                    length( unique( myCompleteDataFinal$MemberId[ tolower( myCompleteDataFinal$Caco ) %in% 
                                                                  tolower( controlsSymbol ) ] ) ) ) )
    
    # PheWAS phenotype dataframe: id, phewasCode and counts #
    message("Generating the file containing patient identifier, phewas code and counts ...")
    
    #one row per patient, phecode and service date, so that the number of rows of
    #a patient-phecode pair is the number of distinct dates it was diagnosed
    phenotypeSubset <- myCompleteDataFinal[, c( "MemberId", "Phenotype", "DateServiceStarted" ) ]
    phenotypeSubset <- phenotypeSubset[! duplicated( phenotypeSubset ), ]
    phenotype <- phenotypeSubset[, c( "MemberId", "Phenotype" ) ]
    colnames( phenotype ) <- c( "patientId", "phecode" )
  
    phenotype$pair   <- paste( phenotype$patientId, phenotype$phecode, sep = "*" )
    pairCounts       <- table( phenotype$pair )
    phenotype$counts <- as.integer( pairCounts[ phenotype$pair ] )
  
    phenotypeFinal <- phenotype[ , c( "patientId", "phecode", "counts" ) ]
    phenotypeFinal <- phenotypeFinal[ ! duplicated( phenotypeFinal ), ]
    
    caco <- myCompleteDataFinal[ , c("MemberId", "Caco")]
    colnames( caco ) <- c("patientId", "Caco")
    caco <- caco[!duplicated(caco), ]
    finaleSet4ComorbidityAnalysis <- inner_join( phenotypeFinal, caco, by = "patientId" )
    
    #flags computed once for the whole table instead of once per phecode
    isCase       <- tolower( finaleSet4ComorbidityAnalysis$Caco ) %in% tolower( casesSymbol )
    isControl    <- tolower( finaleSet4ComorbidityAnalysis$Caco ) %in% tolower( controlsSymbol )
    meetsMinimum <- finaleSet4ComorbidityAnalysis$counts >= minimunDiagnosis
    
    totalCases    <- length( unique( finaleSet4ComorbidityAnalysis$patientId[ isCase ] ) )
    totalControls <- length( unique( finaleSet4ComorbidityAnalysis$patientId[ isControl ] ) )
    
    totalPhewasCodes <- unique( finaleSet4ComorbidityAnalysis$phecode )
    totalPhewasCodes <- as.character( totalPhewasCodes[ ! is.na( totalPhewasCodes ) ] )
    
    #number of distinct patients per phecode among the selected rows
    countPatientsByPhecode <- function( keep ){
        selected <- finaleSet4ComorbidityAnalysis[ which( keep ), c( "patientId", "phecode" ) ]
        selected <- selected[ ! duplicated( selected ), ]
        as.integer( table( factor( selected$phecode, levels = totalPhewasCodes ) ) )
    }
    
    caseYes        <- countPatientsByPhecode( isCase    &   meetsMinimum )
    caseExclude    <- countPatientsByPhecode( isCase    & ! meetsMinimum )
    controlYes     <- countPatientsByPhecode( isControl &   meetsMinimum )
    controlExclude <- countPatientsByPhecode( isControl & ! meetsMinimum )
    
    #create a table to save for each phenotype the patients having or not the phecode
    myPhewasResults <- data.frame( phecode        = totalPhewasCodes, 
                                   caseYes        = caseYes, 
                                   caseNo         = totalCases - caseExclude - caseYes, 
                                   caseExclude    = caseExclude, 
                                   controlYes     = controlYes, 
                                   controlNo      = totalControls - controlExclude - controlYes, 
                                   controlExclude = controlExclude, 
                                   stringsAsFactors = FALSE )
    
    #fisher test per phecode on the 2x2 table (rows: yes/no, columns: case/control);
    #extract the p-value, confidence interval and OR as numbers
    fishers <- t( vapply( seq_len( nrow( myPhewasResults ) ), function( i ){
        mat <- matrix( c( myPhewasResults$caseYes[i],    myPhewasResults$caseNo[i], 
                          myPhewasResults$controlYes[i], myPhewasResults$controlNo[i] ), ncol = 2 )
        f <- fisher.test( mat, alternative = "two.sided" )
        unname( c( f$p.value, f$conf.int[1], f$conf.int[2], f$estimate ) )
    }, numeric( 4 ) ) )
    colnames( fishers ) <- c( "pValue", "confIntL", "confIntH", "OR" )

    #one row per phecode, sorted by phecode
    finalTable <- cbind( myPhewasResults, as.data.frame( fishers ) )
    finalTable <- finalTable[ order( finalTable$phecode ), ]
    rownames( finalTable ) <- NULL
    
    #adjust the p-value by bonferroni
    finalTable$pAdjust <- p.adjust( finalTable$pValue, method = "bonferroni" ) 
    
    return( finalTable )
}

Apply the function to the different subgroups analyzed.

In [None]:
##################################################
# Comorbidity analysis: ASD females vs ASD males #
##################################################
# Label the two groups so the analysis can tell cases from controls.
femaleASD$Caco <- "female"
maleASD$Caco   <- "male"

# One entry per age band: age bounds, name of the result object and output file.
asdSexRuns <- list(
    list( minAge = 0,  maxAge = 2,  object = "maleVsFemale0to2",   file = "ASD_maleVsFemaleASD0to2.RData" ),
    list( minAge = 3,  maxAge = 5,  object = "maleVsFemale3to5",   file = "ASD_maleVsFemaleASD3to5.RData" ),
    list( minAge = 6,  maxAge = 11, object = "maleVsFemale6to11",  file = "ASD_maleVsFemaleASD6to11.RData" ),
    list( minAge = 12, maxAge = 18, object = "maleVsFemale12to18", file = "ASD_maleVsFemaleASD12to18.RData" )
)

# Run the analysis for each band (females as cases, males as controls), keep
# the result under its own name and save it to its own file.
for( asdSexRun in asdSexRuns ){
    assign( asdSexRun$object, 
            comorbidityAnalysis( cases            = femaleASD, 
                                 controls         = maleASD, 
                                 minAge           = asdSexRun$minAge, 
                                 maxAge           = asdSexRun$maxAge, 
                                 casesSymbol      = "female", 
                                 controlsSymbol   = "male", 
                                 minimunDiagnosis = 3 ) )
    save( list = asdSexRun$object, file = asdSexRun$file )
}

# Leave only the result objects in the workspace.
rm( asdSexRuns, asdSexRun )

In [None]:
########################################################
# Comorbidity analysis: ASD females vs Non-ASD females #
########################################################
# Label the two groups so the analysis can tell cases from controls.
femaleASD$Caco    <- "ASD"
femaleNonASD$Caco <- "nonASD"

# One entry per age band: age bounds, name of the result object and output file.
femaleAsdRuns <- list(
    list( minAge = 0,  maxAge = 2,  object = "asdVSnonAS0to2",   file = "females_asdVSnonASD0to2.RData" ),
    list( minAge = 3,  maxAge = 5,  object = "asdVSnonAS3to5",   file = "females_asdVSnonASD3to5.RData" ),
    list( minAge = 6,  maxAge = 11, object = "asdVSnonAS6to11",  file = "females_asdVSnonASD6to11.RData" ),
    list( minAge = 12, maxAge = 18, object = "asdVSnonAS12to18", file = "females_asdVSnonASD12to18.RData" )
)

# Run the analysis for each band (ASD females as cases, non-ASD females as
# controls), keep the result under its own name and save it to its own file.
for( femaleAsdRun in femaleAsdRuns ){
    assign( femaleAsdRun$object, 
            comorbidityAnalysis( cases            = femaleASD, 
                                 controls         = femaleNonASD, 
                                 minAge           = femaleAsdRun$minAge, 
                                 maxAge           = femaleAsdRun$maxAge, 
                                 casesSymbol      = "ASD", 
                                 controlsSymbol   = "nonASD", 
                                 minimunDiagnosis = 3 ) )
    save( list = femaleAsdRun$object, file = femaleAsdRun$file )
}

# Leave only the result objects in the workspace.
rm( femaleAsdRuns, femaleAsdRun )

In [None]:
##########################################################
# Comorbidity analysis: Non-ASD females vs Non-ASD males #
##########################################################
# Label the two groups so the analysis can tell cases from controls.
maleNonASD$Caco   <- "male"
femaleNonASD$Caco <- "female"

# One entry per age band: age bounds, name of the result object and output file.
nonAsdSexRuns <- list(
    list( minAge = 0,  maxAge = 2,  object = "noASDmaleVsFemale0to2",   file = "nonASD_maleVsFemaleASD0to2.RData" ),
    list( minAge = 3,  maxAge = 5,  object = "noASDmaleVsFemale3to5",   file = "nonASD_maleVsFemaleASD3to5.RData" ),
    list( minAge = 6,  maxAge = 11, object = "noASDmaleVsFemale6to11",  file = "nonASD_maleVsFemaleASD6to11.RData" ),
    list( minAge = 12, maxAge = 18, object = "noASDmaleVsFemale12to18", file = "nonASD_maleVsFemaleASD12to18.RData" )
)

# Run the analysis for each band (non-ASD females as cases, non-ASD males as
# controls), keep the result under its own name and save it to its own file.
for( nonAsdSexRun in nonAsdSexRuns ){
    assign( nonAsdSexRun$object, 
            comorbidityAnalysis( cases            = femaleNonASD, 
                                 controls         = maleNonASD, 
                                 minAge           = nonAsdSexRun$minAge, 
                                 maxAge           = nonAsdSexRun$maxAge, 
                                 casesSymbol      = "female", 
                                 controlsSymbol   = "male", 
                                 minimunDiagnosis = 3 ) )
    save( list = nonAsdSexRun$object, file = nonAsdSexRun$file )
}

# Leave only the result objects in the workspace.
rm( nonAsdSexRuns, nonAsdSexRun )