Merge branch 'master' of github.com:kylec/mutation-analysis
kylec committed Dec 16, 2014
2 parents 06dc61d + 7518c48 commit 86a9336
Showing 5 changed files with 164 additions and 34 deletions.
38 changes: 33 additions & 5 deletions README.md
@@ -17,6 +17,15 @@ cut -f1,7 ../samples.txt | while read sample bam; do echo $sample $bam; vcf=`ec

## Somatic mutation
### Mutect
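Nothing is recorded under this heading yet; as a placeholder, a minimal sketch of a MuTect 1.x run over the tumor/normal pairs in `pairs.txt` (reference path, jar name, memory setting, and exact flags are assumptions and may differ by MuTect version):
```shell
REF=~/references/hg19.fasta
BEDFILE=~/references/IAD59895_182_Designed.bed
# call somatic SNVs for each tumor/normal pair, restricted to the design
cat ../pairs.txt | while read pat tum nrm; do
echo $pat $tum $nrm
java -Xmx4g -jar muTect.jar --analysis_type MuTect \
  --reference_sequence $REF --intervals $BEDFILE \
  --input_file:normal ../bam/$nrm.bam --input_file:tumor ../bam/$tum.bam \
  --vcf $tum.mutect.vcf --out $tum.mutect.stats.txt
done
```
Downstream steps expect `*.mutect.pass.vcf`, so filtering the raw calls to PASS records would follow.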
### Varscan
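The filter below assumes per-pair `*.snp.vcf` calls already exist; a hedged sketch of one way to produce them with samtools mpileup and VarScan `somatic` (reference path, jar name, and option values are assumptions):
```shell
REF=~/references/hg19.fasta
# joint normal/tumor mpileup, then VarScan somatic in mpileup mode
cat ../pairs.txt | while read pat tum nrm; do
echo $pat
samtools mpileup -f $REF -q 1 -B ../bam/$nrm.bam ../bam/$tum.bam > $pat.mpileup
java -jar VarScan.jar somatic $pat.mpileup $pat --mpileup 1 --output-vcf 1
done
```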
```shell
# filter VarScan output to the designed target regions
BEDFILE=~/references/IAD59895_182_Designed.bed
for VCF in *.snp.vcf; do
OUTFILE=`basename $VCF | sed 's/.vcf/.tgt.vcf/'`
bedtools intersect -a $VCF -b $BEDFILE > $OUTFILE
done
```
## Germline mutation
### GATK
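No command is recorded here yet; a minimal sketch of per-sample germline calling with GATK 3.x HaplotypeCaller over the design regions (reference path and jar location are assumptions):
```shell
REF=~/references/hg19.fasta
BEDFILE=~/references/IAD59895_182_Designed.bed
for BAM in ../bam/*.bam; do
SAMPLE=`basename $BAM | cut -d. -f1`
java -Xmx4g -jar GenomeAnalysisTK.jar -T HaplotypeCaller \
  -R $REF -I $BAM -L $BEDFILE -o $SAMPLE.germline.vcf
done
```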
## Copynumber
@@ -37,17 +46,36 @@ for a in ../bam/*.bam; do echo $a; sample=`basename $a | cut -d. -f1`; bedtools
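The context line above is truncated in this view; as a purely hypothetical reconstruction (not the actual command), a per-sample coverage calculation over the design regions might look like:
```shell
BEDFILE=~/references/IAD59895_182_Designed.bed
for a in ../bam/*.bam; do
echo $a
sample=`basename $a | cut -d. -f1`
bedtools coverage -abam $a -b $BEDFILE > $sample.cov.txt
done
```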

## RNAseq
### Tuxedo Protocol

```shell
mkdir sourcedata sample_groups thout cqout clout cdout
# count files per sample directory to compare the source fasta copies with the local copy
a_path=/rsrch1/epi/scheet/PROJECTS/Vilar_Lynch/Project_EBF_LynchSyn_RNA48
b_path=.
for b in `ls $b_path`; do
a_count=`ls $a_path/$b | wc -l`
b_count=`ls $b | wc -l`
echo -e "$b\t$a_count\t$b_count"
done
```
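For orientation, a condensed sketch of the standard Tuxedo steps that the `thout`/`clout`/`cdout` directories above are named for; the bowtie2 index, GTF, FASTQ names, and sample lists are assumptions, not taken from this project:
```shell
BOWTIE_INDEX=~/references/hg19/bowtie2/hg19
GTF=~/references/hg19/genes.gtf
# align each sample with tophat, then assemble transcripts with cufflinks
for SAMPLE in polyp1 polyp2 normal1 normal2; do
tophat -p 8 -G $GTF -o thout/$SAMPLE $BOWTIE_INDEX sourcedata/${SAMPLE}_1.fastq sourcedata/${SAMPLE}_2.fastq
cufflinks -p 8 -o clout/$SAMPLE thout/$SAMPLE/accepted_hits.bam
done
# merge assemblies, then test differential expression between groups
ls clout/*/transcripts.gtf > assemblies.txt
cuffmerge -g $GTF -p 8 assemblies.txt
cuffdiff -p 8 -L colon_polyp,colon_normal -o cdout/colon_polyp,colon_normal merged_asm/merged.gtf \
  thout/polyp1/accepted_hits.bam,thout/polyp2/accepted_hits.bam \
  thout/normal1/accepted_hits.bam,thout/normal2/accepted_hits.bam
```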
## Ampliseq
```shell
# torrent mutations
cat ../pairs.txt | while read pat tum nrm; do echo $tum $nrm; bedtools intersect -a $tum.vcf -b $nrm.vcf -v -header > $tum.somatic.vcf; done
for a in `ls *somatic.vcf` ; do echo $a; sample=`echo $a | cut -d. -f1`; echo "python ~/mutation-analysis/adjustAf2vcf.py -i $a -b ../bam/$sample.bam > $sample.somatic.adjaf.vcf"; done
# mutect mutations
BAMDIR=../../../0012-merge-and-filter/output
for PAT in `ls *.pass.vcf | cut -d. -f1`; do
echo $PAT;
python ~/mutation-analysis/adjustAf2vcf.py -i $PAT.mutect.pass.vcf -b $BAMDIR/$PAT.bam > $PAT.mutect.pass.adjaf.vcf
done
# mutect vtools
for a in `ls ~/fap_ampliseq_syqada/mutect-downsample-200/0001-mutect/output/*.adjaf.vcf`; do echo $a; tum=`basename $a | cut -d. -f1`; nrm=`grep $tum ../pairs2.txt | cut -f3`; echo vtools import --build hg19 --format control/mutect_fap_vcf.fmt $a --sample_name $tum $nrm; done
less control/fap_torrent_report.header | vtools export variant --format control/fap_mutect_report.fmt --header - --output mutect.report --samples 'sample_name like "%-P"'

# torrent vtools
cut -f1,2,3,5 ../samples.txt | while read sample id pat type; do vcf=`ls ../torrent_somatic_low_stringency/*.somatic.adjaf.vcf | grep $sample`; if [ "$vcf" != "" ]; then vtools import --build hg19 --format control/torrent_fap_vcf.fmt $vcf --sample_name $pat-$id-$type; fi; done
less control/fap_torrent_report.header | vtools export variant --format control/fap_torrent_report.fmt --header - --output torrent.report --samples 'sample_name like "%EB%"'
python ~/mutation-analysis/addAf2Report.py -i torrent.report > torrent.report.coding
awk -F"\t" '$71>0 || $1=="chr"' torrent.report.coding > torrent.report.coding.afcutoff

# replace the sample name in the VCF header (not needed if you import one sample at a time, since the name can be passed directly to vtools)
cut -f1,2 ../samples.txt | while read curr prev; do echo $curr $prev; awk -v var1=$curr -v var2=$prev '{FS=OFS="\t"; if($1 ~ /#CHROM/){gsub(var2, var1, $10)}; print $0}' $curr.vcf > tmp; mv tmp $curr.vcf; done
# import
cut -f1,2,3,5 ../samples.txt | while read sample id pat type; do vcf=`ls ../torrent/*.somatic.vcf | grep $sample`; if [ "$vcf" != "" ]; then vtools import --build hg19 --format control/torrent_fap_vcf.fmt $vcf --sample_name $pat-$id-$type; fi; done
# export vtools report
less control/fap_torrent_report.header | vtools export variant --format control/fap_torrent_report.fmt --header - --output torrent.report --samples 'sample_name like "%polyp"'
# export novel 1% report
vtools select variant "(thousandGenomesEBI.EUR_AF_INFO is null or thousandGenomesEBI.EUR_AF_INFO < 0.01) and (evs.EuropeanAmericanMaf is null or 1.0*evs.EuropeanAmericanAltCount/(evs.EuropeanAmericanAltCount+evs.EuropeanAmericanRefCount) < 0.01)" -t variant_novel_01
less control/fap_torrent_report.header | vtools export variant_novel_01 --format control/fap_torrent_report.fmt --header - --output torrent_novel_01.report --samples 'sample_name like "%polyp"'
73 changes: 66 additions & 7 deletions chat.R
@@ -106,6 +106,7 @@ for (i in 1:length(samples$V2)) {


#save segmentation data as .rdata
print("save segmentation data")
seg.data=paste0(sampleid,".Rdata")
seg_sAGP.data=paste0(sampleid,"_sAGP.Rdata")
agp.txt = paste0(sampleid, "_AGP.txt")
@@ -116,6 +117,7 @@ mode(dd.dat)='numeric'
save(dd.dat, file=seg.data)

#AGP inference
print("AGP inference")
para <- getPara()
para$datafile <- seg.data
para$thr.penalty <- 300
@@ -130,25 +132,80 @@ para.s$savedata <- seg_sAGP.data
getsAGP(para=para.s)

#CCF
print("get ccf")
load(seg_sAGP.data)
getCCF('vcf/',sAGPfile=seg_sAGP.data,nt=FALSE,output="Test.vcf",TCGA=FALSE,nc=10,tc=11,AD=2,filter=FALSE)

#fit
## @knitr plot1
# fit clusters
library(KernSmooth)
library(Ckmeans.1d.dp)
library(CHAT)

# read vcf
setwd("~/Analysis/fap/chat")
vcf = read.table("Test.vcf", sep="\t")

# read vtools report (downloaded from hap0)
# TODO: make new one on hpc
report = read.table("fap_mutect.report", header=T, sep="\t", quote="", na.strings="NA", fill=T)
annot=report[,c("chr", "hg19_pos", "ref", "alt", "id", "region_type", "region_name")]

# merge annotation and vcf
vcf = merge(vcf, annot, by.x=c("V1","V2","V4","V5"), by.y=c("chr", "hg19_pos", "ref", "alt"))

# extract ccf column
info = strsplit(as.character(vcf[,8]), ";")
dat = as.data.frame(matrix(unlist(info), nrow=nrow(vcf), byrow=T))

# gene list
vogelstein = system("cat ~/Analysis/fap/crc_genes/vogelstein.txt", intern=T)

# dpfit priors
data(mcmc)
data(prior)
# run dpfit for each sample
result = NULL

# 1-dimensional k-means clustering
#TODO determine cluster with local minima instead of visual inspection
numClust = c(2,1,2,1,2,1,1,2,2,1,3,1,1,2,2,2,2)
# index into numClust for the current sample
index = 1
for (i in unique(dat[,1])) {
#i = "Vilar14"
snv= vcf[which(dat$V1==i & dat$V6!="NA"),]

# sample and ccf not null
d=dat[which(dat$V1==i & dat$V6!="NA"),]
ccf = as.numeric(as.character(dat[which(dat$V1==i & dat$V6!="NA"),]$V6))

# gene labels for plots
gene_name = as.character(snv[which(snv$region_type=="exonic"),]$region_name)
gene_y = ccf[which(snv$region_type=="exonic")]
gene_x = which(snv$region_type=="exonic")

# count snv
print(paste0(i, "=", length(ccf)))
print(paste0(i, " number of SNVs = ", length(ccf)))

# kernel density
#par(mfrow=c(2,1))
dens <- bkde(x=ccf)
plot(dens, xlab="Cancer Cell Fraction (CCF)", ylab="Density function", main=i)

result1 = Ckmeans.1d.dp(ccf, numClust[index])
plot(ccf, col = result1$cluster, xlab= "SNV", ylab="Cancer Cell Fraction (CCF)", main = "SNV Clustering")
text(gene_x, gene_y, gene_name)
index = index + 1


print(gene_name)
print("")
print("")
#TODO: doesn't always work when finding minimum...
#v=optimize(approxfun(dens$x,dens$y),interval=c(0,1))$minimum
#abline(v=v, col="blue")
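# A possible sketch (not from the original script): take interior minima of the
# kernel density estimate where its slope changes sign from negative to positive,
# instead of picking cluster cuts by eye.
#find_density_minima <- function(dens) {
#  dy <- diff(dens$y)
#  valleys <- which(dy[-length(dy)] < 0 & dy[-1] > 0) + 1
#  dens$x[valleys]
#}
#abline(v=find_density_minima(dens), col="blue")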

# dpfit
if (FALSE) {
fit = getDPfit(ccf, alpha = 0.05, low.thr = 0.05,prior,mcmc)
numClones = 0
if (fit$model == 2) {
@@ -157,13 +214,14 @@ for (i in unique(dat[,1])) {
numClones = 1
}
result = rbind(result, c(i, numClones))
}
}
write.table(result, "clones.txt", sep="\t", quote=FALSE, row.names=FALSE)


##### TEST #####
if (FALSE) {
fit = kmeans(ccf, 5)
library(cluster)
clusplot(ccf, fit$cluster, color=TRUE, shade=TRUE,
@@ -426,4 +484,5 @@ for(k in 1:n){
tmp.dd<-tmp.vcf[vv,]
tmp.dd[,8]<-INFO
MutInfo<-rbind(MutInfo,tmp.dd)
}
}
64 changes: 51 additions & 13 deletions plots/allele_fraction.R
@@ -2,7 +2,11 @@

library(ggplot2)
library(reshape2)
library(grid)
library(gridExtra)


setwd("/Users/kchang3/Analysis/fap/expands/snv-only")
# count loh after filtering
samples = system("ls *.ann.tsv | cut -d. -f1", intern=T)
for (i in 1:length(samples)) {
@@ -22,32 +26,66 @@ for (i in 1:length(samples)) {
}
}

genes=c("KRAS", "APC", "GNAS", "TP53", "AKT1", "SOX9", "ARID1A", "CNOT3", "CDC27", "FBXW7", "TCF7L2", "GNAQ")
# plot AF density/count
plot_list = NULL
for (i in 1:length(samples)) {
basename = samples[i]
a = read.table(paste0(basename,".sps.ann.tsv"), header=T, sep="\t", stringsAsFactors=F)
head(a)


# allele frequency
b = a
b = b[grep("exonic", b$region_type),]
# keep important gene names and plot them
b$region_name[b$region_name %in% genes == FALSE] = ""
# plot gene names with AF_Tumor (allele fraction)
p = ggplot(b) +
geom_histogram(aes(x=AF_Tumor), binwidth=.01, fill="blue", alpha=.2) +
geom_density(aes(x=AF_Tumor), alpha=.2) +
scale_x_continuous(limits=c(0,1)) +
scale_y_continuous(limits=c(0,15)) +
xlab("Allele_Fraction") + ggtitle(basename) + theme(text=element_text(size=14)) +
geom_text(aes(x=AF_Tumor, y=1, label=region_name, angle=90))
p

# absolute AF
b = a
b = b[which(b$PN_B==0 & b$f!="NA" & b$f <= 1),]
b = b[grep("exonic", b$region_type),]
b$region_name[b$region_name %in% genes == FALSE] = ""

# number of clones
sp = length(sort(unique(a$SP[!is.na(a$SP)])))

p1 = ggplot(b) +
geom_histogram(aes(x=f), binwidth=.01, fill="red", alpha=.2) +
geom_density(aes(x=f), alpha=.2) +
scale_x_continuous(limits=c(0,1)) +
scale_y_continuous(limits=c(0,15)) +
xlab("Absolute Allele_Fraction") + ggtitle(basename) + theme(text=element_text(size=14)) +
geom_text(aes(x=f, y=1, label=region_name, angle=90))
p1

#g = arrangeGrob(p)
g = arrangeGrob(p,p1)
grid.arrange(g, ncol=1)
# save image
ggsave(paste0(basename, ".af.png"), g, width=10, height=15)

#plot_list[[i]] = p
# using melt
#c=cbind(b$AF_Tumor,b$f)
#colnames(c)=c('a','b')
#head(c)
#d=melt(c)
#ggplot(d, aes(x=value, fill=Var2)) + geom_density(alpha = .3)
}

# plot all together
names(plot_list) = samples
png("test2.png", width=2000, height=2000)
arg_list <- c(plot_list, list(nrow=4, ncol=4))
do.call(grid.arrange, arg_list)
dev.off()
4 changes: 2 additions & 2 deletions plots/fap_clones_af.R
@@ -1,6 +1,6 @@
# read list of sample groupings
#library(ggplot2)
f = system("cat /Users/kyle_air/Dropbox/lab_vilar/fap/samples_group.txt", intern=T)
f = system("cat ~/Dropbox/lab_vilar/fap/samples_group.txt", intern=T)

for (i in 1:length(f)) {

@@ -18,7 +18,7 @@ for (i in 1:length(f)) {
par(mfrow=c(1,length(samples)))
for (i in 1:length(samples)) {
file_name = strsplit(samples[i], "_")[[1]][1]
fin = paste(file_name,"ann.tsv",sep=".")
fin = paste(file_name,"sps.ann.tsv",sep=".")
a = read.table(fin, sep="\t", header=T, stringsAsFactors=F)

sp = sort(unique(a$SP[!is.na(a$SP)]))
19 changes: 12 additions & 7 deletions plots/fap_gene_expression.R
@@ -21,22 +21,22 @@ plot_heatmap = function(path, heatmap_name) {
# csVolcano(genes(t_vs_n_cuff_data),'colon_polyp','colon_normal', alpha=0.05, showSignificant=T)

# http://seqanswers.com/forums/showthread.php?t=19278
# stringent filter for sig de genes id
sigGeneIds <- getSig(t_vs_n_cuff_data, level="genes", alpha=0.05)

# return gene object of a cuffset, then return diff gene dataframe - value1 and value2 are average FPKM in genes.read_group_tracking
t_vs_n_diff_genes <- diffData(genes(t_vs_n_cuff_data),features=TRUE)

# anthony filter for sig de genes id
t_vs_n_diff_genes_sig_ids <- t_vs_n_diff_genes[t_vs_n_diff_genes$q_value <= 0.05 & abs(t_vs_n_diff_genes$log2_fold_change) >= 1,]$gene_id

# write de gene file by 1) getSig, 2) anthony's filter
write.table(t_vs_n_diff_genes[ t_vs_n_diff_genes$gene_id %in% sigGeneIds, ], file="de_gene_getSig.tsv", quote=FALSE, row.names=FALSE, col.names=TRUE, sep="\t")
write.table(t_vs_n_diff_genes[ t_vs_n_diff_genes$gene_id %in% t_vs_n_diff_genes_sig_ids, ], file="de_gene_anthony.tsv", quote=FALSE, row.names=FALSE, col.names=TRUE, sep="\t")

# returns "internal_scaled_frags" in file genes.read_group_tracking - Estimated number of fragments originating from the object, after transforming to the internal common count scale (for comparison between replicates of this condition.)
# cuffgeneset using anthony's filter for de genes
t_vs_n_de_genes <- getGenes(t_vs_n_cuff_data, t_vs_n_diff_genes_sig_ids)
#csHeatmap(t_vs_n_de_genes,cluster='both', replicates=T)
t_matrix <- repCountMatrix(t_vs_n_de_genes)

# cluster fragments count
@@ -64,7 +64,7 @@ plot_heatmap = function(path, heatmap_name) {
)
dev.off()

# getSig genes clustering using stringent alpha cutoff
t_vs_n_de_genes <- getGenes(t_vs_n_cuff_data, sigGeneIds)
t_matrix <- repCountMatrix(t_vs_n_de_genes)
# cluster fragments count
@@ -103,6 +103,11 @@ path="~/Analysis/fap/rnaseq/cdout/duodenum_polyp,duodenum_normal/"
heatmap_name="duodenum_polyp_normal_heatmap"
plot_heatmap(path, heatmap_name)

### lynch
path="~/Analysis/lynch/rnaseq-human/cdout/colon_polyp,colon_normal/"
heatmap_name="colon_polyp_normal_heatmap"
plot_heatmap(path, heatmap_name)

### polyp vs normal
#TODO: filter polyp matrix for colon and duodenum gene signature

