In [1]:

%load_ext rpy2.ipython

In [2]:
%%R
getwd()

[1] "/usr/data/extTADA"


In [3]:
%%R
source("scripts/TADA/TADA.R")

#################################################################
# Application of TADA
#################################################################

# Model parameters: two categories of mutations - LoF and mis3 mutations ("probably damaging" by PolyPhen2)
mu.frac <- c(0.074, 0.32)
gamma.mean.dn <- c(20, 4.7)
beta.dn <- c(1,1)
gamma.mean.CC <- c(2.3, 1.00)
beta.CC <- c(4.0, 1000)
rho1 <- c(0.1, 0.5)
nu1 <- c(200, 100)
rho0 <- c(0.1, 0.5)
nu0 <- c(200, 100)
hyperpar <- as.array(rbind(gamma.mean.dn, beta.dn, gamma.mean.CC, beta.CC, rho1, nu1, rho0, nu0))
l <- 100
pi0 <- 0.94 # the fraction of non-risk genes

# ASC (Autism Sequencing Consortium) data
# The file name contains the sample size information
# The only relevant counts are dn.LoF and dn.mis3
data <- read.csv("scripts/TADA/data/ASC_2231trios_1333trans_1601cases_5397controls.csv", header=TRUE, as.is=TRUE)
ntrio <- 2231  # number of trios
ncase <- 1601  # number of cases
nctrl <- 5397  # number of controls
ntrans <- 1333 # number of subjects with transmission data
N <- list(dn=ntrio, ca=ntrans+ncase, cn=ntrans+nctrl)



In [4]:
%%R
# Running TADA
counts <- as.array(cbind(data$dn.LoF, data$case.LoF+data$trans.LoF, data$ctrl.LoF+data$ntrans.LoF, data$dn.mis3, data$case.mis3+data$trans.mis3, data$ctrl.mis3+data$ntrans.mis3))
rs <- TADA(counts, N, data$mut.rate, mu.frac, hyperpar)
data$BF <- rs$BF.total

# Estimating p-values of BFs (this is optional and slow)
#rsp <- TADAp(counts, N, data$mut.rate, mu.frac, hyperpar, l=100)
#data$pval.TADA <- rsp$pval

# FDR estimation
data <- data[order(-data$BF),]
data$qvalue <- Bayesian.FDR(data$BF, pi0)$FDR
write.csv(data, "data/TADA_results.csv", row.names=FALSE)




In [5]:
%%bash
cat data/TADA_results.csv|awk -F"," '$NF<0.1'|wc

     34      34    2640


In [11]:
!mkdir TestOutTADA

####Change gamma.mean.CC ( $\bar{\gamma}_{CC}$ )

In [16]:
%%R

seqGamma <- seq(4.2, 8, by = 0.2)

listGenes <- NULL

for (ii in 1:length(seqGamma)){
    mu.frac <- c(0.074, 0.32)
gamma.mean.dn <- c(20, 4.7)
    
beta.dn <- c(1,1)
#gamma.mean.CC <- c(2.3, 1.00)
    
    gamma.mean.CC <- c(as.numeric(seqGamma[ii]), 1.00)
beta.CC <- c(4.0, 1000)
rho1 <- c(0.1, 0.5)
nu1 <- c(200, 100)
rho0 <- c(0.1, 0.5)
nu0 <- c(200, 100)
hyperpar <- as.array(rbind(gamma.mean.dn, beta.dn, gamma.mean.CC, beta.CC, rho1, nu1, rho0, nu0))
l <- 100
pi0 <- 0.94 # the fraction of non-risk genes
    
    data <- read.csv("scripts/TADA/data/ASC_2231trios_1333trans_1601cases_5397controls.csv", header=TRUE, as.is=TRUE)
ntrio <- 2231  # number of trios
ncase <- 1601  # number of cases
nctrl <- 5397  # number of controls
ntrans <- 1333 # number of subjects with transmission data
N <- list(dn=ntrio, ca=ntrans+ncase, cn=ntrans+nctrl)


# Running TADA
counts <- as.array(cbind(data$dn.LoF, data$case.LoF+data$trans.LoF, data$ctrl.LoF+data$ntrans.LoF, data$dn.mis3, data$case.mis3+data$trans.mis3, data$ctrl.mis3+data$ntrans.mis3))
rs <- TADA(counts, N, data$mut.rate, mu.frac, hyperpar)
data$BF <- rs$BF.total

# Estimating p-values of BFs (this is optional and slow)
#rsp <- TADAp(counts, N, data$mut.rate, mu.frac, hyperpar, l=100)
#data$pval.TADA <- rsp$pval

# FDR estimation
data <- data[order(-data$BF),]
data$qvalue <- Bayesian.FDR(data$BF, pi0)$FDR
    
    listGenes[[ii]] <- data[data$qvalue < 0.1, 1]
    
    write.csv(data, 
          paste("TestOutTADA/TADA_results_gammaCC.", seqGamma[ii], ".csv", sep = ""), 
          row.names=FALSE)

    }




###Change gamma.mean.CC ($\bar{\gamma}_{CC}$) and gamma.mean.denovo ($\bar{\gamma}_{dn}$)

###

RR of the de novo mutations > RR of the inherited variants

RR of LoF mutations > RR of missense mutations


/Users/hoang/Documents/Packages/TADA/extTADA/TestOutTADA

In [6]:
%%R

seqGamma <- seq(0.5, 4, by = 0.2)
seqGammaDenovo <- seq(5, 25, by = 0.2)


listGenes <- NULL

for (ii in 1:length(seqGamma)){
    
    for (jj in 1:length(seqGammaDenovo)){
        
        
    mu.frac <- c(0.074, 0.32)

#        gamma.mean.dn <- c(20, 4.7)
        gamma.mean.dn <- c(as.numeric(seqGammaDenovo[jj]), 4.7)
    
beta.dn <- c(1,1)
#gamma.mean.CC <- c(2.3, 1.00)
    
    gamma.mean.CC <- c(as.numeric(seqGamma[ii]), 1.00)
beta.CC <- c(4.0, 1000)
rho1 <- c(0.1, 0.5)
nu1 <- c(200, 100)
rho0 <- c(0.1, 0.5)
nu0 <- c(200, 100)
hyperpar <- as.array(rbind(gamma.mean.dn, beta.dn, gamma.mean.CC, beta.CC, rho1, nu1, rho0, nu0))
l <- 100
pi0 <- 0.94 # the fraction of non-risk genes
    
    data <- read.csv("scripts/TADA/data/ASC_2231trios_1333trans_1601cases_5397controls.csv", header=TRUE, as.is=TRUE)
ntrio <- 2231  # number of trios
ncase <- 1601  # number of cases
nctrl <- 5397  # number of controls
ntrans <- 1333 # number of subjects with transmission data
N <- list(dn=ntrio, ca=ntrans+ncase, cn=ntrans+nctrl)


# Running TADA
counts <- as.array(cbind(data$dn.LoF, data$case.LoF+data$trans.LoF, data$ctrl.LoF+data$ntrans.LoF, data$dn.mis3, data$case.mis3+data$trans.mis3, data$ctrl.mis3+data$ntrans.mis3))
rs <- TADA(counts, N, data$mut.rate, mu.frac, hyperpar)
data$BF <- rs$BF.total

# Estimating p-values of BFs (this is optional and slow)
#rsp <- TADAp(counts, N, data$mut.rate, mu.frac, hyperpar, l=100)
#data$pval.TADA <- rsp$pval

# FDR estimation
data <- data[order(-data$BF),]
data$qvalue <- Bayesian.FDR(data$BF, pi0)$FDR
    
    listGenes[[ii]] <- data[data$qvalue < 0.1, 1]
    
    write.csv(data, 
          paste("TestOutTADA/TADA_results_gammaCC.", seqGamma[ii], 
                ".gammaDenovo.", seqGammaDenovo[jj],
                ".csv", sep = ""), 
          row.names=FALSE)

    }
    }




In [19]:
#!gist -p RunTADAwithDifferentParameters.ipynb

In [20]:
!gist -u https://gist.github.com/3d1b501c543ff67dda85 RunTADAwithDifferentParameters.ipynb

https://gist.github.com/3d1b501c543ff67dda85
