Merge branch 'master' of github.com:kylec/mutation-analysis
kylec committed Dec 16, 2014
2 parents 06dc61d + 7518c48 commit 86a9336
Showing 5 changed files with 164 additions and 34 deletions.
38 changes: 33 additions & 5 deletions README.md
@@ -17,6 +17,15 @@ cut -f1,7 ../samples.txt | while read sample bam; do echo $sample $bam; vcf=`ec

## Somatic mutation
### Mutect
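Nothing is recorded under this heading yet; as a placeholder, a minimal sketch of a MuTect 1.x run over the tumor/normal pairs in `pairs.txt` (reference path, jar name, memory setting, and exact flags are assumptions and may differ by MuTect version):
```shell
REF=~/references/hg19.fasta
BEDFILE=~/references/IAD59895_182_Designed.bed
# call somatic SNVs for each tumor/normal pair, restricted to the design
cat ../pairs.txt | while read pat tum nrm; do
echo $pat $tum $nrm
java -Xmx4g -jar muTect.jar --analysis_type MuTect \
  --reference_sequence $REF --intervals $BEDFILE \
  --input_file:normal ../bam/$nrm.bam --input_file:tumor ../bam/$tum.bam \
  --vcf $tum.mutect.vcf --out $tum.mutect.stats.txt
done
```
Downstream steps expect `*.mutect.pass.vcf`, so filtering the raw calls to PASS records would follow.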
### Varscan
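The filter below assumes per-pair `*.snp.vcf` calls already exist; a hedged sketch of one way to produce them with samtools mpileup and VarScan `somatic` (reference path, jar name, and option values are assumptions):
```shell
REF=~/references/hg19.fasta
# joint normal/tumor mpileup, then VarScan somatic in mpileup mode
cat ../pairs.txt | while read pat tum nrm; do
echo $pat
samtools mpileup -f $REF -q 1 -B ../bam/$nrm.bam ../bam/$tum.bam > $pat.mpileup
java -jar VarScan.jar somatic $pat.mpileup $pat --mpileup 1 --output-vcf 1
done
```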
```shell
# filter VarScan output to the designed target regions
BEDFILE=~/references/IAD59895_182_Designed.bed
for VCF in *.snp.vcf; do
OUTFILE=`basename $VCF | sed 's/.vcf/.tgt.vcf/'`
bedtools intersect -a $VCF -b $BEDFILE > $OUTFILE
done
```
## Germline mutation
### GATK
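No command is recorded here yet; a minimal sketch of per-sample germline calling with GATK 3.x HaplotypeCaller over the design regions (reference path and jar location are assumptions):
```shell
REF=~/references/hg19.fasta
BEDFILE=~/references/IAD59895_182_Designed.bed
for BAM in ../bam/*.bam; do
SAMPLE=`basename $BAM | cut -d. -f1`
java -Xmx4g -jar GenomeAnalysisTK.jar -T HaplotypeCaller \
  -R $REF -I $BAM -L $BEDFILE -o $SAMPLE.germline.vcf
done
```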
## Copynumber
@@ -37,17 +46,36 @@ for a in ../bam/*.bam; do echo $a; sample=`basename $a | cut -d. -f1`; bedtools
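The context line above is truncated in this view; as a purely hypothetical reconstruction (not the actual command), a per-sample coverage calculation over the design regions might look like:
```shell
BEDFILE=~/references/IAD59895_182_Designed.bed
for a in ../bam/*.bam; do
echo $a
sample=`basename $a | cut -d. -f1`
bedtools coverage -abam $a -b $BEDFILE > $sample.cov.txt
done
```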

## RNAseq
### Tuxedo Protocol

```shell
mkdir sourcedata sample_groups thout cqout clout cdout
# count files per sample directory to compare the source fasta copies with the local copy
a_path=/rsrch1/epi/scheet/PROJECTS/Vilar_Lynch/Project_EBF_LynchSyn_RNA48
b_path=.
for b in `ls $b_path`; do
a_count=`ls $a_path/$b | wc -l`
b_count=`ls $b | wc -l`
echo -e "$b\t$a_count\t$b_count"
done
```
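For orientation, a condensed sketch of the standard Tuxedo steps that the `thout`/`clout`/`cdout` directories above are named for; the bowtie2 index, GTF, FASTQ names, and sample lists are assumptions, not taken from this project:
```shell
BOWTIE_INDEX=~/references/hg19/bowtie2/hg19
GTF=~/references/hg19/genes.gtf
# align each sample with tophat, then assemble transcripts with cufflinks
for SAMPLE in polyp1 polyp2 normal1 normal2; do
tophat -p 8 -G $GTF -o thout/$SAMPLE $BOWTIE_INDEX sourcedata/${SAMPLE}_1.fastq sourcedata/${SAMPLE}_2.fastq
cufflinks -p 8 -o clout/$SAMPLE thout/$SAMPLE/accepted_hits.bam
done
# merge assemblies, then test differential expression between groups
ls clout/*/transcripts.gtf > assemblies.txt
cuffmerge -g $GTF -p 8 assemblies.txt
cuffdiff -p 8 -L colon_polyp,colon_normal -o cdout/colon_polyp,colon_normal merged_asm/merged.gtf \
  thout/polyp1/accepted_hits.bam,thout/polyp2/accepted_hits.bam \
  thout/normal1/accepted_hits.bam,thout/normal2/accepted_hits.bam
```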
## Ampliseq
```shell
# torrent mutations
cat ../pairs.txt | while read pat tum nrm; do echo $tum $nrm; bedtools intersect -a $tum.vcf -b $nrm.vcf -v -header > $tum.somatic.vcf; done
for a in `ls *somatic.vcf` ; do echo $a; sample=`echo $a | cut -d. -f1`; echo "python ~/mutation-analysis/adjustAf2vcf.py -i $a -b ../bam/$sample.bam > $sample.somatic.adjaf.vcf"; done
# mutect mutations
BAMDIR=../../../0012-merge-and-filter/output
for PAT in `ls *.pass.vcf | cut -d. -f1`; do
echo $PAT;
python ~/mutation-analysis/adjustAf2vcf.py -i $PAT.mutect.pass.vcf -b $BAMDIR/$PAT.bam > $PAT.mutect.pass.adjaf.vcf
done
# mutect vtools
for a in `ls ~/fap_ampliseq_syqada/mutect-downsample-200/0001-mutect/output/*.adjaf.vcf`; do echo $a; tum=`basename $a | cut -d. -f1`; nrm=`grep $tum ../pairs2.txt | cut -f3`; echo vtools import --build hg19 --format control/mutect_fap_vcf.fmt $a --sample_name $tum $nrm; done
less control/fap_torrent_report.header | vtools export variant --format control/fap_mutect_report.fmt --header - --output mutect.report --samples 'sample_name like "%-P"'

# torrent vtools
cut -f1,2,3,5 ../samples.txt | while read sample id pat type; do vcf=`ls ../torrent_somatic_low_stringency/*.somatic.adjaf.vcf | grep $sample`; if [ "$vcf" != "" ]; then vtools import --build hg19 --format control/torrent_fap_vcf.fmt $vcf --sample_name $pat-$id-$type; fi; done
less control/fap_torrent_report.header | vtools export variant --format control/fap_torrent_report.fmt --header - --output torrent.report --samples 'sample_name like "%EB%"'
python ~/mutation-analysis/addAf2Report.py -i torrent.report > torrent.report.coding
awk -F"\t" '$71>0 || $1=="chr"' torrent.report.coding > torrent.report.coding.afcutoff

# replace the sample name in the VCF header (not needed if you import one sample at a time, since the name can be passed directly to vtools)
cut -f1,2 ../samples.txt | while read curr prev; do echo $curr $prev; awk -v var1=$curr -v var2=$prev '{FS=OFS="\t"; if($1 ~ /#CHROM/){gsub(var2, var1, $10)}; print $0}' $curr.vcf > tmp; mv tmp $curr.vcf; done
# import
cut -f1,2,3,5 ../samples.txt | while read sample id pat type; do vcf=`ls ../torrent/*.somatic.vcf | grep $sample`; if [ "$vcf" != "" ]; then vtools import --build hg19 --format control/torrent_fap_vcf.fmt $vcf --sample_name $pat-$id-$type; fi; done
# export vtools report
less control/fap_torrent_report.header | vtools export variant --format control/fap_torrent_report.fmt --header - --output torrent.report --samples 'sample_name like "%polyp"'
# export novel 1% report
vtools select variant "(thousandGenomesEBI.EUR_AF_INFO is null or thousandGenomesEBI.EUR_AF_INFO < 0.01) and (evs.EuropeanAmericanMaf is null or 1.0*evs.EuropeanAmericanAltCount/(evs.EuropeanAmericanAltCount+evs.EuropeanAmericanRefCount) < 0.01)" -t variant_novel_01
less control/fap_torrent_report.header | vtools export variant_novel_01 --format control/fap_torrent_report.fmt --header - --output torrent_novel_01.report --samples 'sample_name like "%polyp"'
73 changes: 66 additions & 7 deletions chat.R
@@ -106,6 +106,7 @@ for (i in 1:length(samples$V2)) {


#save segmentation data as .rdata
print("save segmentation data")
seg.data=paste0(sampleid,".Rdata")
seg_sAGP.data=paste0(sampleid,"_sAGP.Rdata")
agp.txt = paste0(sampleid, "_AGP.txt")
@@ -116,6 +117,7 @@ mode(dd.dat)='numeric'
save(dd.dat, file=seg.data)

#AGP inference
print("AGP inference")
para <- getPara()
para$datafile <- seg.data
para$thr.penalty <- 300
@@ -130,25 +132,80 @@ para.s$savedata <- seg_sAGP.data
getsAGP(para=para.s)

#CCF
print("get ccf")
load(seg_sAGP.data)
getCCF('vcf/',sAGPfile=seg_sAGP.data,nt=FALSE,output="Test.vcf",TCGA=FALSE,nc=10,tc=11,AD=2,filter=FALSE)

#fit
## @knitr plot1
# fit clusters
library(KernSmooth)
library(Ckmeans.1d.dp)
library(CHAT)

# read vcf
setwd("~/Analysis/fap/chat")
vcf = read.table("Test.vcf", sep="\t")

# read vtools report (downloaded from hap0)
# TODO: make new one on hpc
report = read.table("fap_mutect.report", header=T, sep="\t", quote="", na.strings="NA", fill=T)
annot=report[,c("chr", "hg19_pos", "ref", "alt", "id", "region_type", "region_name")]

# merge annotation and vcf
vcf = merge(vcf, annot, by.x=c("V1","V2","V4","V5"), by.y=c("chr", "hg19_pos", "ref", "alt"))

# extract ccf column
info = strsplit(as.character(vcf[,8]), ";")
dat = as.data.frame(matrix(unlist(info), nrow=nrow(vcf), byrow=T))

# gene list
vogelstein = system("cat ~/Analysis/fap/crc_genes/vogelstein.txt", intern=T)

# dpfit priors
data(mcmc)
data(prior)
# run dpfit for each sample
result = NULL

# 1-dimensional k-means clustering
#TODO determine cluster with local minima instead of visual inspection
numClust = c(2,1,2,1,2,1,1,2,2,1,3,1,1,2,2,2,2)
# index into numClust for the current sample
index = 1
for (i in unique(dat[,1])) {
#i = "Vilar14"
snv= vcf[which(dat$V1==i & dat$V6!="NA"),]

# sample and ccf not null
d=dat[which(dat$V1==i & dat$V6!="NA"),]
ccf = as.numeric(as.character(dat[which(dat$V1==i & dat$V6!="NA"),]$V6))

# gene labels for plots
gene_name = as.character(snv[which(snv$region_type=="exonic"),]$region_name)
gene_y = ccf[which(snv$region_type=="exonic")]
gene_x = which(snv$region_type=="exonic")

# count snv
print(paste0(i, "=", length(ccf)))
print(paste0(i, " number of SNVs = ", length(ccf)))

# kernel density
#par(mfrow=c(2,1))
dens <- bkde(x=ccf)
plot(dens, xlab="Cancer Cell Fraction (CCF)", ylab="Density function", main=i)

result1 = Ckmeans.1d.dp(ccf, numClust[index])
plot(ccf, col = result1$cluster, xlab= "SNV", ylab="Cancer Cell Fraction (CCF)", main = "SNV Clustering")
text(gene_x, gene_y, gene_name)
index = index + 1


print(gene_name)
print("")
print("")
#TODO: doesn't always work when finding minimum...
#v=optimize(approxfun(dens$x,dens$y),interval=c(0,1))$minimum
#abline(v=v, col="blue")
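# A possible sketch (not from the original script): take interior minima of the
# kernel density estimate where its slope changes sign from negative to positive,
# instead of picking cluster cuts by eye.
#find_density_minima <- function(dens) {
#  dy <- diff(dens$y)
#  valleys <- which(dy[-length(dy)] < 0 & dy[-1] > 0) + 1
#  dens$x[valleys]
#}
#abline(v=find_density_minima(dens), col="blue")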

# dpfit
if (FALSE) {
fit = getDPfit(ccf, alpha = 0.05, low.thr = 0.05,prior,mcmc)
numClones = 0
if (fit$model == 2) {
@@ -157,13 +214,14 @@ for (i in unique(dat[,1])) {
numClones = 1
}
result = rbind(result, c(i, numClones))
}
}
write.table(result, "clones.txt", sep="\t", quote=FALSE, row.names=FALSE)


##### TEST #####
if (FALSE) {
fit = kmeans(ccf, 5)
library(cluster)
clusplot(ccf, fit$cluster, color=TRUE, shade=TRUE,
@@ -426,4 +484,5 @@ for(k in 1:n){
tmp.dd<-tmp.vcf[vv,]
tmp.dd[,8]<-INFO
MutInfo<-rbind(MutInfo,tmp.dd)
}
}
64 changes: 51 additions & 13 deletions plots/allele_fraction.R
@@ -2,7 +2,11 @@

library(ggplot2)
library(reshape2)
library(grid)
library(gridExtra)


setwd("/Users/kchang3/Analysis/fap/expands/snv-only")
# count loh after filtering
samples = system("ls *.ann.tsv | cut -d. -f1", intern=T)
for (i in 1:length(samples)) {
@@ -22,32 +26,66 @@ for (i in 1:length(samples)) {
}
}

genes=c("KRAS", "APC", "GNAS", "TP53", "AKT1", "SOX9", "ARID1A", "CNOT3", "CDC27", "FBXW7", "TCF7L2", "GNAQ")
# plot AF density/count
plot_list = NULL
for (i in 1:length(samples)) {
basename = samples[i]
a = read.table(paste0(basename,".sps.ann.tsv"), header=T, sep="\t", stringsAsFactors=F)
head(a)


# allele frequency
b = a
b = b[grep("exonic", b$region_type),]
# keep important gene names and plot them
b$region_name[b$region_name %in% genes == FALSE] = ""
# plot gene names with AF_Tumor (allele fraction)
p = ggplot(b) +
geom_histogram(aes(x=AF_Tumor), binwidth=.01, fill="blue", alpha=.2) +
geom_density(aes(x=AF_Tumor), alpha=.2) +
scale_x_continuous(limits=c(0,1)) +
scale_y_continuous(limits=c(0,15)) +
xlab("Allele_Fraction") + ggtitle(basename) + theme(text=element_text(size=14)) +
geom_text(aes(x=AF_Tumor, y=1, label=region_name, angle=90))
p

# absolute AF
b = a
b = b[which(b$PN_B==0 & b$f!="NA" & b$f <= 1),]
b = b[grep("exonic", b$region_type),]
b$region_name[b$region_name %in% genes == FALSE] = ""

# number of clones
sp = length(sort(unique(a$SP[!is.na(a$SP)])))

p1 = ggplot(b) +
geom_histogram(aes(x=f), binwidth=.01, fill="red", alpha=.2) +
geom_density(aes(x=f), alpha=.2) +
scale_x_continuous(limits=c(0,1)) +
scale_y_continuous(limits=c(0,15)) +
xlab("Absolute Allele_Fraction") + ggtitle(basename) + theme(text=element_text(size=14)) +
geom_text(aes(x=f, y=1, label=region_name, angle=90))
p1

#g = arrangeGrob(p)
g = arrangeGrob(p,p1)
grid.arrange(g, ncol=1)
# save image
ggsave(paste0(basename, ".af.png"), g, width=10, height=15)

#plot_list[[i]] = p
# using melt
#c=cbind(b$AF_Tumor,b$f)
#colnames(c)=c('a','b')
#head(c)
#d=melt(c)
#ggplot(d, aes(x=value, fill=Var2)) + geom_density(alpha = .3)
}

# plot all together
names(plot_list) = samples
png("test2.png", width=2000, height=2000)
arg_list <- c(plot_list, list(nrow=4, ncol=4))
do.call(grid.arrange, arg_list)
dev.off()
4 changes: 2 additions & 2 deletions plots/fap_clones_af.R
@@ -1,6 +1,6 @@
# read list of sample groupings
#library(ggplot2)
f = system("cat /Users/kyle_air/Dropbox/lab_vilar/fap/samples_group.txt", intern=T)
f = system("cat ~/Dropbox/lab_vilar/fap/samples_group.txt", intern=T)

for (i in 1:length(f)) {

@@ -18,7 +18,7 @@ for (i in 1:length(f)) {
par(mfrow=c(1,length(samples)))
for (i in 1:length(samples)) {
file_name = strsplit(samples[i], "_")[[1]][1]
fin = paste(file_name,"ann.tsv",sep=".")
fin = paste(file_name,"sps.ann.tsv",sep=".")
a = read.table(fin, sep="\t", header=T, stringsAsFactors=F)

sp = sort(unique(a$SP[!is.na(a$SP)]))
19 changes: 12 additions & 7 deletions plots/fap_gene_expression.R
@@ -21,22 +21,22 @@ plot_heatmap = function(path, heatmap_name) {
# csVolcano(genes(t_vs_n_cuff_data),'colon_polyp','colon_normal', alpha=0.05, showSignificant=T)

# http://seqanswers.com/forums/showthread.php?t=19278
# stringent filter for sig de genes id
sigGeneIds <- getSig(t_vs_n_cuff_data, level="genes", alpha=0.05)

# return gene object of a cuffset, then return diff gene dataframe - value1 and value2 are average FPKM in genes.read_group_tracking
t_vs_n_diff_genes <- diffData(genes(t_vs_n_cuff_data),features=TRUE)

# anthony filter for sig de genes id
t_vs_n_diff_genes_sig_ids <- t_vs_n_diff_genes[t_vs_n_diff_genes$q_value <= 0.05 & abs(t_vs_n_diff_genes$log2_fold_change) >= 1,]$gene_id

# write de gene file by 1) getSig, 2) anthony's filter
write.table(t_vs_n_diff_genes[ t_vs_n_diff_genes$gene_id %in% sigGeneIds, ], file="de_gene_getSig.tsv", quote=FALSE, row.names=FALSE, col.names=TRUE, sep="\t")
write.table(t_vs_n_diff_genes[ t_vs_n_diff_genes$gene_id %in% t_vs_n_diff_genes_sig_ids, ], file="de_gene_anthony.tsv", quote=FALSE, row.names=FALSE, col.names=TRUE, sep="\t")

# returns "internal_scaled_frags" in file genes.read_group_tracking - Estimated number of fragments originating from the object, after transforming to the internal common count scale (for comparison between replicates of this condition.)
# cuffgeneset using anthony's filter for de genes
t_vs_n_de_genes <- getGenes(t_vs_n_cuff_data, t_vs_n_diff_genes_sig_ids)
#csHeatmap(t_vs_n_de_genes,cluster='both', replicates=T)
t_matrix <- repCountMatrix(t_vs_n_de_genes)

# cluster fragments count
@@ -64,7 +64,7 @@ plot_heatmap = function(path, heatmap_name) {
)
dev.off()

# getSig genes clustering using stringent alpha cutoff
t_vs_n_de_genes <- getGenes(t_vs_n_cuff_data, sigGeneIds)
t_matrix <- repCountMatrix(t_vs_n_de_genes)
# cluster fragments count
@@ -103,6 +103,11 @@ path="~/Analysis/fap/rnaseq/cdout/duodenum_polyp,duodenum_normal/"
heatmap_name="duodenum_polyp_normal_heatmap"
plot_heatmap(path, heatmap_name)

### lynch
path="~/Analysis/lynch/rnaseq-human/cdout/colon_polyp,colon_normal/"
heatmap_name="colon_polyp_normal_heatmap"
plot_heatmap(path, heatmap_name)

### polyp vs normal
#TODO: filter polyp matrix for colon and duodenum gene signature

