In [2]:
%load_ext rpy2.ipython

###Change gamma.CC

In [3]:
%%R
# ASC (Autism Sequencing Consortium) data
# The file name contains the sample size information
# The only relevant counts are dn.LoF and dn.mis3
#data <- read.csv("data/ASC_2231trios_1333trans_1601cases_5397controls.csv", header=TRUE, as.is=TRUE)
#ntrio <- 2231  # number of trios
#ncase <- 1601  # number of cases
#nctrl <- 5397  # number of controls
#ntrans <- 1333 # number of subjects with transmission data
#N <- list(dn=ntrio, ca=ntrans+ncase, cn=ntrans+nctrl)

data <- read.csv("data/nature_aut_id_epi.merged.txt", header = TRUE, sep = " ")
#source("TADA/TADA.R")
ntrio <- 2270 + 32 + 0 + 2508 ##AUT
ntrio <- ntrio + 0 + 356 #add EPI
ntrio <- ntrio + 100 + 0 + 41 + 51 ##add ID samples
ncase <- 1601 #Only from AUT paper
nctrl <- 5397  #Only from AUT paper
ntrans <- 1298 ##Check this information, Only use transmitted information from AUT paper nature13772
#ntrans <- ntrio

N <- list(dn=ntrio, ca=ntrans+ncase, cn=ntrans+nctrl)


print(head(data))

sum(unlist(N))

    Gene mut.rate dn.LoF case.LoF ctrl.LoF trans.LoF ntrans.LoF dn.mis3
1   A1BG 2.36e-05      0        0        1         0          0       0
2   A1CF 2.17e-05      0        1        2         1          1       0
3  A2LD1 6.09e-06      0        0        0         0          0       0
4    A2M 5.64e-05      0        1        5         1          0       1
5  A2ML1 5.70e-05      0        3       16         2          6       1
6 A4GALT 2.42e-05      0        0        2         1          0       0
  case.mis3 ctrl.mis3 trans.mis3 ntrans.mis3     BF.dn         BF qvalue.dn
1         0         4          7           3 0.7641762 0.54953615 0.9308429
2         0         7          3           7 0.7799631 0.75499580 0.9294060
3         0         0          0           0 0.9322423 0.50824558 0.8592336
4         4        22          6           6 0.5305202 0.29674445 0.9388012
5         5        30          6          10 0.5270516 0.05155223 0.9388542
6         1         5          2        

In [4]:
%%R

source("scripts/TADA/TADA.R")

#################################################################
# Application of TADA
#################################################################

# Model parameters: two categories of mutations - LoF and mis3 mutations ("probably damaging" by PolyPhen2)
mu.frac <- c(0.074, 0.32) ##Mutation rates for specific category
#gamma.mean.dn <- c(20, 4.7)
gamma.mean.dn <- c(14, 2.33)
#beta.dn <- c(1,1)
beta.dn <- c(4.8, 2.325)


gamma.mean.CC <- c(2.3, 1.00)

#gamma.mean.CC <- c(2.0, 1.00)


beta.CC <- c(4.0, 1000)
rho1 <- c(0.1, 0.5)
nu1 <- c(200, 100)
rho0 <- c(0.1, 0.5)
nu0 <- c(200, 100)
hyperpar <- as.array(rbind(gamma.mean.dn, beta.dn, gamma.mean.CC, beta.CC, rho1, nu1, rho0, nu0))
l <- 100
#pi0 <- 0.94 # the fraction of non-risk genes
pi0 <- 1 - 0.07419

hyperpar

               [,1]     [,2]
gamma.mean.dn  14.0    2.330
beta.dn         4.8    2.325
gamma.mean.CC   2.3    1.000
beta.CC         4.0 1000.000
rho1            0.1    0.500
nu1           200.0  100.000
rho0            0.1    0.500
nu0           200.0  100.000


In [7]:
%%R

# Running TADA
counts <- as.array(cbind(data$dn.LoF, ##Xd
                         data$case.LoF+data$trans.LoF, data$ctrl.LoF+data$ntrans.LoF, ##X1, X0
                         data$dn.mis3, 
                         data$case.mis3+data$trans.mis3, data$ctrl.mis3+data$ntrans.mis3))
head(counts)

     [,1] [,2] [,3] [,4] [,5] [,6]
[1,]    0    0    1    0    7    7
[2,]    0    2    3    0    3   14
[3,]    0    0    0    0    0    0
[4,]    0    2    5    1   10   28
[5,]    0    5   22    1   11   40
[6,]    0    1    2    0    3    6


In [24]:
%%R

seqGamma <- seq(2, 2.5, by = 0.1)

listGenes <- NULL

for (ii in 1:length(seqGamma)){
    
    mu.frac <- c(0.074, 0.32) ##Mutation rates for specific category
gamma.mean.dn <- c(20, 4.7)

    #gamma.mean.dn <- c(14, 2.33)

    #beta.dn <- c(1,1)
beta.dn <- c(4.8, 2.325)
    
    gamma.mean.CC <- c(as.numeric(seqGamma[ii]), 1.00)

#gamma.mean.CC <- c(2.0, 1.00)


beta.CC <- c(4.0, 1000)
rho1 <- c(0.1, 0.5)
nu1 <- c(200, 100)
rho0 <- c(0.1, 0.5)
nu0 <- c(200, 100)
hyperpar <- as.array(rbind(gamma.mean.dn, beta.dn, gamma.mean.CC, beta.CC, rho1, nu1, rho0, nu0))
l <- 100
#pi0 <- 0.94 # the fraction of non-risk genes
pi0 <- 1 - 0.07419

hyperpar
    
    
    
    print(hyperpar)
    rs <- TADA(counts, N, data$mut.rate, mu.frac, hyperpar)
    data$BF <- rs$BF.total
    
    
# Estimating p-values of BFs (this is optional and slow)
#rsp <- TADAp(counts, N, data$mut.rate, mu.frac, hyperpar, l=100)
#data$pval.TADA <- rsp$pval

# FDR estimation

    data <- data[order(-data$BF),]

    data$qvalue <- Bayesian.FDR(data$BF, pi0)$FDR

    listGenes[[ii]] <- data[data$qvalue < 0.1, 1]
    
    write.csv(data, 
          paste("data/OutTestGammaCC/ext_TADA_results_gammaCC.", seqGamma[ii], ".csv", sep = ""), 
          row.names=FALSE)

    }



               [,1]     [,2]
gamma.mean.dn  20.0    4.700
beta.dn         4.8    2.325
gamma.mean.CC   2.0    1.000
beta.CC         4.0 1000.000
rho1            0.1    0.500
nu1           200.0  100.000
rho0            0.1    0.500
nu0           200.0  100.000
               [,1]     [,2]
gamma.mean.dn  20.0    4.700
beta.dn         4.8    2.325
gamma.mean.CC   2.1    1.000
beta.CC         4.0 1000.000
rho1            0.1    0.500
nu1           200.0  100.000
rho0            0.1    0.500
nu0           200.0  100.000
               [,1]     [,2]
gamma.mean.dn  20.0    4.700
beta.dn         4.8    2.325
gamma.mean.CC   2.2    1.000
beta.CC         4.0 1000.000
rho1            0.1    0.500
nu1           200.0  100.000
rho0            0.1    0.500
nu0           200.0  100.000
               [,1]     [,2]
gamma.mean.dn  20.0    4.700
beta.dn         4.8    2.325
gamma.mean.CC   2.3    1.000
beta.CC         4.0 1000.000
rho1            0.1    0.500
nu1           200.0  100.000
rho0          

In [23]:
%%R
dataOut <- data[data$qvalue < 0.1,]
head(dataOut)

       Gene    mut.rate dn.LoF case.LoF ctrl.LoF trans.LoF ntrans.LoF dn.mis3
7610  HYDIN 0.000212595      0        1        3         0          0       0
4462   DENR 0.000004540      0        0        0         0          0       1
11971  PER1 0.000064200      0        3        3         0          4       0
17978 ZBED5 0.000027200      0        0        0         0          0       0
13312 RAMP2 0.000005040      0        0        0         1          2       0
4521  DHRS9 0.000011900      0        3        6         1          0       0
      case.mis3 ctrl.mis3 trans.mis3 ntrans.mis3     BF.dn           BF
7610          2        11          2           2 0.1076329 8.812003e+15
4462          0         1          0           0 4.4315745 7.505474e+09
11971        10        59         12          10 0.4872958 6.865462e+09
17978         0         0          0           0 0.7333870 2.304832e+08
13312         0         0          0           2 0.9435590 3.273862e+07
4521          3       

In [13]:
%%R
seqGamma
ii

[1] 23


###Change pi

In [19]:
%%R

seqPi <- seq(0.01, 0.2, by = 0.01)

listGenes <- NULL
rs <- TADA(counts, N, data$mut.rate, mu.frac, hyperpar)
data$BF <- rs$BF.total


for (ii in 1:length(seqPi)){
    
    
    
    
# Estimating p-values of BFs (this is optional and slow)
#rsp <- TADAp(counts, N, data$mut.rate, mu.frac, hyperpar, l=100)
#data$pval.TADA <- rsp$pval

# FDR estimation

    data <- data[order(-data$BF),]
    
    piTemp <- seqPi[ii]

    data$qvalue <- Bayesian.FDR(data$BF, 1 - piTemp)$FDR

    listGenes[[ii]] <- data[data$qvalue < 0.1, 1]
    
    write.csv(data, 
          paste("data/OutTestGammaCC/ext_TADA_results_pi.", piTemp, ".csv", sep = ""), 
          row.names=FALSE)

    }



In [11]:
%%R
listGenes

[[1]]
  [1] AGPAT9   NCCRP1   ERBB2IP  KLF11    PHF8     SGCB     ONECUT3  GABRA5  
  [9] C2orf40  MGAT3    KIR2DS4  KLHL10   GSR      MZB1     SRMS     OR3A2   
 [17] ZBTB47   RPS20    ZFPL1    A1BG     CEP290   TMX4     NEUROG1  TNFSF15 
 [25] MAP2K7   TAS2R19  NXPH4    EVC2     RNF38    CCDC151  LYST     HIPK1   
 [33] NUDT16   PJA2     ARL3     GATAD2A  NNMT     ZNF616   SMC6     DMBX1   
 [41] FNDC7    MAB21L1  CPEB1    ANO2     GALNTL1  NCAPD2   GNG5     SPANXN3 
 [49] FLJ42280 NDUFB2   C12orf12 PLAG1    RP2      ZCWPW2   TUBB1    AEBP2   
 [57] TMEM192  RGR      PRPF19   BTBD9    DGCR2    PTGER4   SMC3     OR1L8   
 [65] ZNF354A  CCNB1IP1 FAM170B  OR1F1    C16orf72 GPR89A   TMEM120A C2orf56 
 [73] MRS2     WSB2     OR11H12  C14orf49 C8A      XCL1     KIF9     TNFRSF1B
 [81] PEMT     TTC37    SMARCB1  TMEM114  RAX2     GLUD1    GRK4     MDM1    
 [89] FAM210A  SERPINE1 OR51F2   BHLHA15  ZNF254   C11orf10 IQCF6    LRRC66  
 [97] UBXN10   E2F6     LPAR6    ZC3H7A   PFKM     MRPL10 

In [132]:
%%R
tempData <- data.frame(data$BF, data$pval.TADA)

tempData <- tempData[tempData[, 1] < quantile(tempData[, 1], 0.99), ]

plot(tempData[, 1], tempData[, 2], xlab = "BF", ylab = "p")

Error in data.frame(data$BF, data$pval.TADA) : 
  arguments imply differing number of rows: 18735, 0


###Extract genes whose FDRs < 0.1

In [133]:
%%bash
cat data/ext_TADA_results.csv |awk -F"," '$(NF-1)<0.1'|\
    awk -F"," '{print $1}'|sed 's/"//g'|grep -v "Gene" |sort > data/list.genesWithNewPiFDR01.txt
    
cat data/list.genesWithNewPiFDR01.txt|wc -l
cat data/list.genesWithNewPiFDR01.txt|tr "\n" " "

32
ADNP ANK2 APH1A ARID1B ASH1L ASXL3 BCL11A BIRC6 CACNA2D3 CHD8 CUL3 DPP3 DYRK1A GABRB3 GALNTL4 GRIN2B KATNAL2 KIAA0182 MLL3 MYO9B NAA15 NR3C2 POGZ PTEN PYHIN1 RELN SCN2A SUV420H1 SYNGAP1 TBR1 TGM1 TRIO 

###Intersect this gene set with the 33 genes in AUT

In [134]:
%%bash
join -1 1 -2 1 data/list.genesWithNewPiFDR01.txt data/list.33genes.FDR0.1.txt|wc -l

26


####Test with known risk genes from three diseases: ASD + EPI + ID

In [94]:
%%R
#################################################################
# Application of TADA
#################################################################

# Model parameters: two categories of mutations - LoF and mis3 mutations ("probably damaging" by PolyPhen2)
mu.frac <- c(0.074, 0.32) ##Mutation rates for specific category
#gamma.mean.dn <- c(20, 4.7)
gamma.mean.dn <- c(14, 2.33)
#beta.dn <- c(1,1)
beta.dn <- c(4.8, 2.325)


gamma.mean.CC <- c(2.3, 1.00)
beta.CC <- c(4.0, 1000)
rho1 <- c(0.1, 0.5)
nu1 <- c(200, 100)
rho0 <- c(0.1, 0.5)
nu0 <- c(200, 100)
hyperpar1 <- as.array(rbind(gamma.mean.dn, beta.dn, gamma.mean.CC, beta.CC, rho1, nu1, rho0, nu0))
l <- 100
#pi0 <- 0.94 # the fraction of non-risk genes
pi0 <- 1 - 0.07419


In [100]:
%%R
#Running TADA
rs <- TADA(counts, N, data$mut.rate, mu.frac, hyperpar1)
data$BF <- rs$BF.total
# Estimating p-values of BFs (this is optional and slow)

#rsp1 <- TADAp(counts, N, data1$mut.rate, mu.frac, hyperpar1, l=100)
#data1$pval.TADA <- rsp1$pval




In [101]:
%%R
# FDR estimation
data <- data[order(-data$BF),]
data$qvalue <- Bayesian.FDR(data$BF, pi0)$FDR



In [102]:
%%R
write.csv(data1, "data/ext_TADA_results_usingThreeGeneSets.csv", row.names=FALSE)

getwd()

[1] "/usr/data/extTADA"


In [103]:
%%bash
cat data/ext_TADA_results_usingThreeGeneSets.csv |awk -F"," '$(NF-1)<0.1'|\
    awk -F"," '{print $1}'|sed 's/"//g'|grep -v "Gene" |\
    sort > data/list.genesWithNewPiFDR01.usingThreeGeneSets.txt
    
cat data/list.genesWithNewPiFDR01.usingThreeGeneSets.txt|wc -l
cat data/list.genesWithNewPiFDR01.usingThreeGeneSets.txt|tr "\n" " "


105
ACTL6B AIFM3 ARHGEF10L ARL5C ASS1 ATAD2 BAMBI BEND3 BPIFB3 BRIX1 C12orf68 C1orf27 C20orf197 C21orf33 C2CD4A C2orf53 C3orf26 C3orf79 C6orf72 CCL18 CD27 CD300A CD9 CLRN3 CLTCL1 CYP4F8 DLG2 DUSP28 DYNC1LI2 EDARADD EIF2S1 EIF4E1B ESPNL EXOC7 F2R FAM178B FAM26F FAM55B FGFR1 GALNT2 GGTLC2 GJD2 GSTT1 GUCY2C HINT1 HIST1H2AJ HLA-DPB1 HN1L HSPA1A IHH KATNAL2 KCNK2 KCNN1 KIAA0226 KIF14 KRTAP20-2 KRTAP21-1 LITAF LRTOMT MAN2B2 METTL20 MGP MKL1 MPPED2 MRC1 MRPL2 NDUFA3 NIPSNAP3A NKX6-2 NTPCR OGDHL OR10A2 OR2T1 OVCH1 PCDHAC1 PDHA2 PHOX2B PLA2G4A PLA2G4C PNMA1 PRKAG3 RABGAP1L RDH10 RHOG RNASEH2A SBDS SEMA4B SLC30A2 SLC9C2 SMPX SPDYE2 SRP19 SUZ12 TCF23 TMEM126B TMEM176B TNFRSF12A TNFRSF6B TSPYL5 UNC5B VAX1 WBSCR22 ZDHHC19 ZNF714 ZXDC 

In [104]:
%%bash
join -1 1 -2 1 data/list.genesWithNewPiFDR01.usingThreeGeneSets.txt data/list.33genes.FDR0.1.txt|wc -l

1


In [25]:
#!gist -p Re_run_TADA_exTADA_testParameters.ipynb

https://gist.github.com/7eeb2d36815d0b40922f


In [26]:
!gist -u https://gist.github.com/7eeb2d36815d0b40922f Re_run_TADA_exTADA_testParameters.ipynb

https://gist.github.com/7eeb2d36815d0b40922f
